alexnasa committed on
Commit
8a819c3
verified · 1 Parent(s): 6880805

Update app.py

Files changed (1)
  1. app.py +1115 -1112
app.py CHANGED
@@ -1,1113 +1,1116 @@
1
-
2
- import subprocess
3
- from huggingface_hub import snapshot_download, hf_hub_download
4
-
5
- def sh(cmd): subprocess.check_call(cmd, shell=True)
6
-
7
- snapshot_download(
8
- repo_id = "alexnasa/outofsync",
9
- local_dir = "./outofsync"
10
- )
11
-
12
- sh("cd outofsync && pip install . && cd ..")
13
- sh("pip uninstall onnxruntime onnxruntime-gpu -y && pip install onnxruntime-gpu")
14
-
15
- import os
16
- import shutil
17
-
18
- src = "checkpoints" # your source folder
19
- dst = "/home/user/.cache/torch/hub/checkpoints"
20
-
21
- # Create destination folder if it doesn't exist
22
- os.makedirs(dst, exist_ok=True)
23
-
24
- # Copy each item from src → dst
25
- for item in os.listdir(src):
26
- s = os.path.join(src, item)
27
- d = os.path.join(dst, item)
28
-
29
- if os.path.isdir(s):
30
- # Copy directory
31
- shutil.copytree(s, d, dirs_exist_ok=True)
32
- else:
33
- # Copy file
34
- shutil.copy2(s, d)
35
-
36
- print("✓ Done copying checkpoints!")
37
-
38
- import spaces
39
- import io
40
- import torch
41
- import inspect
42
- import pyannote.audio.core.task as task_module
43
- from pathlib import Path
44
- from pydub import AudioSegment
45
- import math
46
-
47
- # Collect all classes from pyannote.audio.core.task
48
- safe_globals = [torch.torch_version.TorchVersion]
49
- for name, obj in inspect.getmembers(task_module):
50
- if inspect.isclass(obj):
51
- safe_globals.append(obj)
52
-
53
- # Allow these classes to be used when unpickling weights with weights_only=True
54
- torch.serialization.add_safe_globals(safe_globals)
55
-
56
- from typing import List, Dict
57
- import time
58
- from time_util import timer
59
- import os, pathlib, sys, ctypes
60
- import uuid
61
- # preload the CNN component
62
-
63
- ctypes.CDLL("/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn.so.9")
64
-
65
-
66
- # print(os.environ.get('LD_LIBRARY_PATH', ''))
67
- import torch, ctranslate2, os
68
-
69
- import numpy as np
70
- from pydub import AudioSegment
71
- from faster_whisper import WhisperModel
72
- from pyannote.audio import Pipeline
73
- from pyannote.audio.pipelines.utils.hook import ProgressHook
74
- import gradio as gr
75
-
76
- from pydub import AudioSegment
77
- import srt
78
- import io
79
- from pydub import AudioSegment
80
- import math
81
- from datetime import timedelta
82
- import torchaudio
83
- import tigersound.look2hear.models
84
-
85
- @spaces.GPU()
86
- def print_ort():
87
-
88
- import onnxruntime as ort
89
- print(ort.get_available_providers())
90
-
91
- print_ort()
92
-
93
- current_dir = os.path.dirname(os.path.abspath(__file__))
94
- snapshot_download("IndexTeam/IndexTTS-2", local_dir=os.path.join(current_dir,"checkpoints"))
95
-
96
- dnr_model = tigersound.look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR").to("cuda").eval()
97
-
98
- sh(f"pip install --no-deps git+https://github.com/OutofAi/index-tts.git")
99
-
100
- from indextts.infer_v2 import IndexTTS2
101
-
102
- MODE = 'local'
103
- tts = IndexTTS2(model_dir="./checkpoints",
104
- cfg_path=os.path.join("./checkpoints", "config.yaml"),
105
- use_fp16=True,
106
- use_deepspeed=False,
107
- use_cuda_kernel=False,
108
- )
109
-
110
-
111
- os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"
112
-
113
- from lipsync import apply_lipsync
114
-
115
-
116
- def split_subtitles_max_duration(
117
- subtitles,
118
- max_seconds: float = 10.0,
119
- min_last_chunk_seconds: float = 1.0,
120
- ):
121
- """
122
- Take a list of srt.Subtitle and return a new list where
123
- no subtitle duration is longer than max_seconds, except that
124
- the *last* chunk is allowed to exceed max_seconds slightly
125
- if the leftover duration would otherwise be less than
126
- min_last_chunk_seconds.
127
-
128
- Text is split by words roughly evenly across the chunks.
129
- """
130
- max_td = timedelta(seconds=max_seconds)
131
- new_subs = []
132
- new_index = 1
133
-
134
- for sub in subtitles:
135
- start = sub.start
136
- end = sub.end
137
- duration = end - start
138
- total_secs = duration.total_seconds()
139
-
140
- # If already short enough, just copy it
141
- if total_secs <= max_seconds:
142
- new_subs.append(
143
- srt.Subtitle(
144
- index=new_index,
145
- start=start,
146
- end=end,
147
- content=sub.content,
148
- )
149
- )
150
- new_index += 1
151
- continue
152
-
153
- # Need to split this subtitle
154
- words = sub.content.split()
155
- if not words:
156
- # No text, skip
157
- continue
158
-
159
- # --- Determine number of chunks, avoiding tiny last chunk ---
160
- base_chunks = int(total_secs // max_seconds)
161
- remainder = total_secs - base_chunks * max_seconds
162
-
163
- if base_chunks == 0:
164
- # total_secs > max_seconds due to earlier check, but just in case
165
- num_chunks = 1
166
- else:
167
- if remainder == 0:
168
- num_chunks = base_chunks
169
- elif remainder < min_last_chunk_seconds:
170
- # Don't create a tiny last chunk; merge its time into previous chunks
171
- num_chunks = base_chunks
172
- else:
173
- num_chunks = base_chunks + 1
174
-
175
- # Ensure at least one chunk
176
- num_chunks = max(1, num_chunks)
177
-
178
- # Words per chunk (roughly even)
179
- words_per_chunk = max(1, int(math.ceil(len(words) / num_chunks)))
180
-
181
- chunk_start = start
182
- word_idx = 0
183
-
184
- for chunk_idx in range(num_chunks):
185
- # Last chunk takes us all the way to the original end,
186
- # so it can be slightly > max_seconds if needed.
187
- if chunk_idx == num_chunks - 1:
188
- chunk_end = end
189
- else:
190
- chunk_end = min(end, chunk_start + max_td)
191
-
192
- if chunk_end <= chunk_start:
193
- break
194
-
195
- chunk_words = words[word_idx:word_idx + words_per_chunk]
196
- word_idx += words_per_chunk
197
-
198
- if not chunk_words:
199
- break
200
-
201
- new_subs.append(
202
- srt.Subtitle(
203
- index=new_index,
204
- start=chunk_start,
205
- end=chunk_end,
206
- content=" ".join(chunk_words),
207
- )
208
- )
209
- new_index += 1
210
-
211
- chunk_start = chunk_end
212
-
213
- return new_subs
214
-
215
-
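Illustrative sketch only: a minimal usage example for split_subtitles_max_duration, assuming the definition above; the subtitle values are made up and the srt/timedelta imports mirror the ones already in this file.

import srt
from datetime import timedelta

long_sub = srt.Subtitle(
    index=1,
    start=timedelta(seconds=0),
    end=timedelta(seconds=25),  # 25 s, longer than the 10 s cap
    content="one two three four five six seven eight nine ten",
)
for sub in split_subtitles_max_duration([long_sub], max_seconds=10.0):
    print(sub.index, sub.start, sub.end, sub.content)
# Expected: three chunks of roughly 10 s, 10 s and 5 s, with the ten words spread across them.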
216
- def split_text_into_chunks(text, max_chars=400):
217
- """
218
- Rough splitter: breaks text into chunks <= max_chars,
219
- preferring to split at sentence boundaries, then spaces.
220
- """
221
- text = text.strip()
222
- chunks = []
223
-
224
- while len(text) > max_chars:
225
- # Try to split at the last sentence end before max_chars
226
- split_at = max(
227
- text.rfind(". ", 0, max_chars),
228
- text.rfind("! ", 0, max_chars),
229
- text.rfind("? ", 0, max_chars),
230
- )
231
-
232
- # If there was no sentence boundary, fall back to last space
233
- if split_at == -1:
234
- split_at = text.rfind(" ", 0, max_chars)
235
-
236
- # If still nothing, just hard cut
237
- if split_at == -1:
238
- split_at = max_chars
239
-
240
- chunk = text[:split_at + 1].strip()
241
- chunks.append(chunk)
242
- text = text[split_at + 1 :].strip()
243
-
244
- if text:
245
- chunks.append(text)
246
-
247
- return chunks
248
-
249
-
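Illustrative sketch only: a quick run of split_text_into_chunks with a small max_chars, assuming the definition above; the sample string is made up.

sample = "First sentence here. Second sentence is a bit longer! Third one?"
for i, chunk in enumerate(split_text_into_chunks(sample, max_chars=30), start=1):
    print(i, repr(chunk))
# Each printed chunk stays at or under ~30 characters, splitting at sentence ends first, then spaces.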
250
- def sh(cmd): subprocess.check_call(cmd, shell=True)
251
-
252
- # sh("find / -name \"libcudnn*\" 2>/dev/null")
253
- # --------------------
254
- # CONFIG
255
- # --------------------
256
- MODEL_SIZE = "medium" # e.g. "small", "medium", "large-v2"
257
- MIN_SEGMENT_SECONDS = 0.5 # only transcribe segments longer than this
258
-
259
- # If your pyannote pipeline needs a HF token, set it here or via env var:
260
- # HUGGINGFACE_TOKEN = "hf_..."
261
- HF_TOKEN = os.getenv("HF_TOKEN", None)
262
-
263
- # --------------------
264
- # LOAD GLOBAL MODELS (ONCE)
265
- # --------------------
266
- device = "cuda" if torch.cuda.is_available() else "cpu"
267
-
268
- print(f"Loading pyannote diarization model...")
269
- diarization_pipeline = Pipeline.from_pretrained(
270
- "pyannote/speaker-diarization-3.1"
271
- )
272
-
273
- # --------------------
274
- # HELPERS
275
- # --------------------
276
- def format_timestamp(ts: float) -> str:
277
- """Convert seconds to SRT timestamp format."""
278
- hrs = int(ts // 3600)
279
- mins = int((ts % 3600) // 60)
280
- secs = int(ts % 60)
281
- ms = int((ts - int(ts)) * 1000)
282
- return f"{hrs:02d}:{mins:02d}:{secs:02d},{ms:03d}"
283
-
284
-
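Illustrative sketch only: a quick sanity check of format_timestamp, assuming the definition above; the inputs are chosen so the millisecond part is exact in floating point.

print(format_timestamp(0.0))     # 00:00:00,000
print(format_timestamp(75.25))   # 00:01:15,250
print(format_timestamp(3661.5))  # 01:01:01,500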
285
- def extract_audio_to_wav(input_video: str, output_dir: str):
286
-
287
- audio_file = os.path.join(output_dir, "audio_og.wav")
288
- background_file = os.path.join(output_dir, "background_og.wav")
289
- vocal_file = os.path.join(output_dir, "vocal_og.wav")
290
- effect_file = os.path.join(output_dir, "effect_og.wav")
291
-
292
- audio_16k_file = os.path.join(output_dir, "audio_16k.wav")
293
-
294
- video_path = input_video
295
- separator_dir = Path(os.path.join(output_dir, "separator_directory"))
296
- os.makedirs(separator_dir, exist_ok=True)
297
-
298
-
299
- # Extract raw audio
300
- cmd = [
301
- "ffmpeg",
302
- "-loglevel", "error",
303
- "-i", video_path,
304
- "-vn",
305
- "-acodec", "pcm_s16le",
306
- "-ar", "44100",
307
- "-ac", "2",
308
- audio_file
309
- ]
310
- subprocess.run(cmd, check=True)
311
-
312
- audio, sr = torchaudio.load(audio_file)
313
- audio = audio.to("cuda")
314
-
315
- with torch.no_grad():
316
- dialog, effect, music = dnr_model(audio[None])
317
-
318
- torchaudio.save(vocal_file, dialog.cpu(), sr)
319
- torchaudio.save(effect_file, effect.cpu(), sr)
320
- torchaudio.save(background_file, music.cpu(), sr)
321
-
322
- # Convert vocals to 16k mono
323
- cmd = [
324
- "ffmpeg",
325
- "-loglevel", "error",
326
- "-y",
327
- "-i", vocal_file,
328
- "-ac", "1",
329
- "-ar", "16000",
330
- "-acodec", "pcm_s16le",
331
- audio_16k_file
332
- ]
333
- subprocess.run(cmd, check=True)
334
-
335
- return audio_file, effect_file, background_file, audio_16k_file, vocal_file
336
-
337
-
338
- def diarize_audio(audio_path: str) -> List[Dict]:
339
- """Run pyannote diarization and return segments."""
340
-
341
- diarization_pipeline.to(torch.device(device))
342
-
343
- with ProgressHook() as hook:
344
- diarization_result = diarization_pipeline(audio_path, hook=hook)
345
-
346
- segments = []
347
- for segment, _, speaker in diarization_result.itertracks(yield_label=True):
348
- duration = segment.end - segment.start
349
- if duration >= MIN_SEGMENT_SECONDS:
350
- segments.append(
351
- {
352
- "start": float(segment.start),
353
- "end": float(segment.end),
354
- "speaker": speaker,
355
- }
356
- )
357
-
358
- segments.sort(key=lambda x: x["start"])
359
- return segments
360
-
361
-
362
- def chunk_to_float32(chunk: AudioSegment) -> np.ndarray:
363
- """Convert a pydub chunk to mono 16kHz float32 numpy array in [-1, 1]."""
364
- chunk = chunk.set_frame_rate(16000).set_channels(1)
365
- samples = np.array(chunk.get_array_of_samples())
366
-
367
- # Normalize based on sample width
368
- if chunk.sample_width == 2: # 16-bit
369
- samples = samples.astype(np.float32) / 32768.0
370
- elif chunk.sample_width == 4: # 32-bit
371
- samples = samples.astype(np.float32) / 2147483648.0
372
- else:
373
- samples = samples.astype(np.float32)
374
-
375
- return samples
376
-
377
-
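Illustrative sketch only: a minimal call to chunk_to_float32, assuming the definition above; one second of synthetic pydub silence stands in for real extracted audio, and the imports mirror the ones already in this file.

from pydub import AudioSegment
import numpy as np

silence = AudioSegment.silent(duration=1000, frame_rate=44100)  # 1 s of 16-bit mono silence
samples = chunk_to_float32(silence)
print(samples.dtype, samples.shape)   # float32, about (16000,) after resampling to 16 kHz mono
print(float(np.abs(samples).max()))   # 0.0 for silence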
378
- def transcribe_segment(whisper_model, samples: np.ndarray) -> str:
379
- """Transcribe+translate a single segment with faster-whisper."""
380
- segment_text_parts = []
381
-
382
-
383
- segments, info = whisper_model.transcribe(
384
- samples,
385
- beam_size=1,
386
- vad_filter=False, # diarization already detected speech
387
- condition_on_previous_text=True, # carry context across decoded segments
388
- task="translate", # translate to English
389
- word_timestamps=True,
390
- )
391
-
392
- for seg in segments:
393
- if seg.text:
394
- segment_text_parts.append(seg.text.strip())
395
-
396
- return " ".join(segment_text_parts)
397
-
398
- def transcribe_segment_words(
399
- whisper_model,
400
- samples: np.ndarray,
401
- offset_sec: float,
402
- speaker: str | None = None,
403
- ):
404
- """
405
- Transcribe+translate a single diarization segment, returning a
406
- list of word dicts with absolute timestamps.
407
- """
408
- words_out = []
409
-
410
- segments, info = whisper_model.transcribe(
411
- samples,
412
- beam_size=1,
413
- vad_filter=False, # diarization already detected speech
414
- condition_on_previous_text=False, # better for hard cuts / segments
415
- task="translate",
416
- word_timestamps=True,
417
- )
418
-
419
- for seg in segments:
420
- if not seg.words:
421
- continue
422
- for w in seg.words:
423
- words_out.append(
424
- {
425
- "start": offset_sec + float(w.start),
426
- "end": offset_sec + float(w.end),
427
- "text": w.word,
428
- "speaker": speaker,
429
- }
430
- )
431
-
432
- return words_out
433
-
434
- def words_to_subtitles(words, max_seconds: float = 10.0):
435
- """
436
- Group word-level timings into SRT subtitles, each up to max_seconds long,
437
- cutting ONLY at word boundaries, AND never mixing speakers in the same subtitle.
438
- Whenever the speaker changes, we close the current subtitle and start a new one.
439
-
440
- Expects each word dict to have:
441
- - "start" (float, seconds)
442
- - "end" (float, seconds)
443
- - "text" (str)
444
- - "speaker" (str or None)
445
- """
446
- # sort just in case
447
- words = sorted(words, key=lambda w: w["start"])
448
-
449
- subtitles = []
450
- current_words = []
451
- current_start = None
452
- current_speaker = None
453
-
454
- index = 1
455
-
456
- for w in words:
457
- w_start = w["start"]
458
- w_end = w["end"]
459
- w_speaker = w.get("speaker")
460
-
461
- if current_start is None:
462
- # start first subtitle
463
- current_start = w_start
464
- current_words = [w]
465
- current_speaker = w_speaker
466
- continue
467
-
468
- speaker_changed = (w_speaker != current_speaker)
469
- duration_if_added = w_end - current_start
470
- exceeds_max = duration_if_added > max_seconds
471
-
472
- # If adding this word would:
473
- # - exceed max_seconds, OR
474
- # - cross into a different speaker,
475
- # then we close the current subtitle and start a new one.
476
- if (speaker_changed or exceeds_max) and current_words:
477
- text = " ".join(x["text"] for x in current_words).strip()
478
- sub_start = current_start
479
- sub_end = current_words[-1]["end"]
480
-
481
- subtitles.append(
482
- srt.Subtitle(
483
- index=index,
484
- start=timedelta(seconds=sub_start),
485
- end=timedelta(seconds=sub_end),
486
- content=text,
487
- )
488
- )
489
- index += 1
490
-
491
- # start new subtitle from this word
492
- current_start = w_start
493
- current_words = [w]
494
- current_speaker = w_speaker
495
- else:
496
- current_words.append(w)
497
-
498
- # flush last subtitle
499
- if current_words:
500
- text = " ".join(x["text"] for x in current_words).strip()
501
- sub_start = current_start
502
- sub_end = current_words[-1]["end"]
503
- subtitles.append(
504
- srt.Subtitle(
505
- index=index,
506
- start=timedelta(seconds=sub_start),
507
- end=timedelta(seconds=sub_end),
508
- content=text,
509
- )
510
- )
511
-
512
- return subtitles
513
-
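Illustrative sketch only: a minimal run of words_to_subtitles with hand-written word timings (speaker labels and times are made up), assuming the definition above plus the srt import from this file; it shows the cut forced by a speaker change.

words = [
    {"start": 0.0, "end": 0.4, "text": "Hello", "speaker": "SPEAKER_00"},
    {"start": 0.5, "end": 0.9, "text": "there", "speaker": "SPEAKER_00"},
    {"start": 1.2, "end": 1.6, "text": "Hi",    "speaker": "SPEAKER_01"},
]
subs = words_to_subtitles(words, max_seconds=10.0)
print(srt.compose(subs))
# Two subtitles: "Hello there" for SPEAKER_00, then "Hi" for SPEAKER_01 (the speaker change closes the first one).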
514
- def build_srt(segments: List[Dict], audio_wav: str, out_srt_path: str):
515
- """
516
- Generate SRT file from diarized segments and audio,
517
- using word-level timestamps and grouping into ~10s subtitles.
518
- """
519
- audio = AudioSegment.from_file(audio_wav)
520
-
521
- print(f"Loading faster-whisper model ({MODEL_SIZE})...")
522
- whisper_model = WhisperModel(
523
- MODEL_SIZE,
524
- device="cuda",
525
- compute_type="float16",
526
- )
527
-
528
- all_words = []
529
-
530
- for i, seg in enumerate(segments, start=1):
531
- start_sec = seg["start"]
532
- end_sec = seg["end"]
533
- speaker = seg["speaker"]
534
-
535
- start_ms = int(start_sec * 1000)
536
- end_ms = int(end_sec * 1000)
537
- chunk = audio[start_ms:end_ms]
538
-
539
- samples = chunk_to_float32(chunk)
540
-
541
- # get words for this diar segment, with absolute times
542
- seg_words = transcribe_segment_words(
543
- whisper_model,
544
- samples,
545
- offset_sec=start_sec,
546
- speaker=speaker,
547
- )
548
-
549
- all_words.extend(seg_words)
550
- print(f"Diar segment {i} ({speaker}): {len(seg_words)} words")
551
-
552
- # group words into ≤10s subtitles, word aligned
553
- subtitles = words_to_subtitles(all_words, max_seconds=10.0)
554
-
555
- # write SRT
556
- with open(out_srt_path, "w", encoding="utf-8") as f:
557
- f.write(srt.compose(subtitles))
558
-
559
- def translate_video(video_file):
560
-
561
- return process_video(video_file, False)
562
-
563
- def translate_lipsync_video(video_file):
564
-
565
- return process_video(video_file, True)
566
-
567
- def run_example(video_file, allow_lipsync, duration):
568
-
569
- with timer("processed"):
570
- result = process_video(video_file, allow_lipsync, duration)
571
-
572
- return result
573
-
574
- @spaces.GPU(duration=350)
575
- def process_video(video_file, allow_lipsync, duration = 30):
576
- """
577
- Gradio callback:
578
- - video_file: temp file object/path from Gradio
579
- - returns path to generated SRT file (for download)
580
- """
581
- if video_file is None:
582
- raise gr.Error("Please upload an MP4 video.")
583
-
584
- session_id = uuid.uuid4().hex
585
-
586
- output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
587
- os.makedirs(output_dir, exist_ok=True)
588
-
589
- # Gradio's File/Video component gives dict or str depending on version
590
- if isinstance(video_file, dict):
591
- video_path = video_file.get("name") or video_file.get("path")
592
- else:
593
- video_path = video_file
594
-
595
- if video_path is None or not os.path.exists(video_path):
596
- raise gr.Error("Could not read uploaded video file.")
597
-
598
- # Create temp directory to hold WAV + SRT
599
- srt_path = os.path.join(output_dir, "diarized_translated.srt")
600
-
601
- src_video_path = video_file
602
-
603
- cropped_video_path = os.path.join(output_dir, "input_30s.mp4")
604
-
605
- duration_s = int(duration)
606
-
607
- cmd = [
608
- "ffmpeg",
609
- "-y",
610
- "-i", src_video_path,
611
- "-t", f"{duration_s}",
612
- "-c", "copy", # stream copy, no re-encode
613
- cropped_video_path,
614
- ]
615
- subprocess.run(cmd, check=True)
616
- video_path = cropped_video_path
617
-
618
- # 1. Extract audio
619
- audio_wav, effect_wav, background_wav, audio_16k_wav, vocal_wav = extract_audio_to_wav(video_path, output_dir)
620
-
621
- # 2. Diarization
622
- segments = diarize_audio(audio_16k_wav)
623
- if not segments:
624
- raise gr.Error("No valid speech segments found for diarization.")
625
-
626
- # 3. Build SRT from diarized segments + whisper
627
- with timer("Generating srt"):
628
- build_srt(segments, audio_16k_wav, srt_path)
629
-
630
- # ---- ORIGINAL SRT (used for TTS) ----
631
- with open(srt_path, "r", encoding="utf-8") as f:
632
- srt_data = f.read()
633
-
634
- subtitles = list(srt.parse(srt_data))
635
-
636
- # Keep this list as-is for TTS timing
637
- tts_subtitles = subtitles
638
-
639
- # ---- CREATE 10s-MAX SRT FOR DOWNLOAD ----
640
- max10_subtitles = tts_subtitles
641
- # max10_subtitles = split_subtitles_max_duration(subtitles, max_seconds=10.0)
642
-
643
- tts_subtitles = max10_subtitles
644
-
645
- srt_10s_path = os.path.join(output_dir, "diarized_translated_max10s.srt")
646
- with open(srt_10s_path, "w", encoding="utf-8") as f:
647
- f.write(srt.compose(max10_subtitles))
648
-
649
- # ---- TTS USING ORIGINAL SRT ----
650
- last_end_seconds = tts_subtitles[-1].end.total_seconds()
651
- total_ms = int((last_end_seconds + 1) * 1000)
652
-
653
- timeline = AudioSegment.silent(duration=total_ms)
654
-
655
- original_audio = AudioSegment.from_file(audio_wav)
656
-
657
- MAX_BATCH_MS = 300_000 # ~5 minutes of target subtitle duration per batch
658
-
659
- with timer("Generating speech"):
660
- num_subs = len(tts_subtitles)
661
- idx = 0
662
-
663
- while idx < num_subs:
664
- spk_prompts = [] # paths to src_prompt_*.wav
665
- texts = [] # subtitle texts for this batch
666
- out_paths = [] # where IndexTTS2 will save generated wavs
667
- starts_ms = [] # for overlaying later
668
- target_ms_list = [] # per-subtitle target durations
669
- batch_ms_sum = 0
670
-
671
- batch_start = idx
672
-
673
- # ---- fill one batch until we hit ~MAX_BATCH_MS ----
674
- while idx < num_subs:
675
- sub = tts_subtitles[idx]
676
-
677
- start_ms = int(sub.start.total_seconds() * 1000)
678
- end_ms = int(sub.end.total_seconds() * 1000)
679
- target_ms = max(end_ms - start_ms, 0)
680
-
681
- # If adding this subtitle would exceed the limit and we already
682
- # have something in the batch, stop and process the current batch.
683
- if batch_ms_sum + target_ms > MAX_BATCH_MS and len(target_ms_list) > 0:
684
- break
685
-
686
- global_idx = idx
687
-
688
- # 1) prompt audio for this subtitle
689
- src_chunk = original_audio[start_ms:end_ms]
690
- src_prompt_path = os.path.join(output_dir, f"src_prompt_{global_idx}.wav")
691
- src_chunk.export(src_prompt_path, format="wav")
692
-
693
- # 2) text + output path
694
- text = sub.content.replace("\n", " ")
695
- out_path = os.path.join(output_dir, f"gen_{global_idx}.wav")
696
-
697
- spk_prompts.append(src_prompt_path)
698
- texts.append(text)
699
- out_paths.append(out_path)
700
- starts_ms.append(start_ms)
701
- target_ms_list.append(target_ms)
702
-
703
- batch_ms_sum += target_ms
704
- idx += 1
705
-
706
- print(f"batch from {batch_start} to {idx - 1}, batch_ms_sum: {batch_ms_sum}")
707
-
708
- # --- call batched TTS once for this batch ---
709
- do_sample = True
710
- top_p = 0.8
711
- top_k = 30
712
- temperature = 0.8
713
- length_penalty = 0.0
714
- num_beams = 3
715
- repetition_penalty = 10.0
716
- max_mel_tokens = 1500
717
-
718
- # target_length_ms is passed per subtitle below (target_ms_list); an aggregate
719
- # such as avg or max could be used instead if the API expects a single value.
720
- tts_outputs = tts.infer_batch(
721
- spk_audio_prompts=spk_prompts,
722
- texts=texts,
723
- output_paths=out_paths,
724
- emo_audio_prompts=None,
725
- emo_alpha=1.0,
726
- emo_vectors=None,
727
- use_emo_text=False,
728
- emo_texts=None,
729
- use_random=False,
730
- interval_silence=200,
731
- verbose=False,
732
- max_text_tokens_per_segment=120,
733
- speed=1.0,
734
- target_length_ms=target_ms_list,
735
- do_sample=do_sample,
736
- top_p=top_p,
737
- top_k=top_k,
738
- temperature=temperature,
739
- length_penalty=length_penalty,
740
- num_beams=num_beams,
741
- repetition_penalty=repetition_penalty,
742
- max_mel_tokens=max_mel_tokens,
743
- )
744
-
745
- # --- read generated wavs and overlay them ---
746
- for local_idx, out_path in enumerate(tts_outputs):
747
- start_ms = starts_ms[local_idx]
748
-
749
- seg = AudioSegment.from_file(out_path, format="wav")
750
- seg = seg - 2
751
- timeline = timeline.overlay(seg, position=start_ms)
752
-
753
- # cleanup
754
- os.remove(out_path)
755
- os.remove(spk_prompts[local_idx])
756
-
757
- # -------------------------------------------------------
758
- # Bring back original dialog in the *gaps* (grunts, etc.)
759
- # -------------------------------------------------------
760
- # Load separated dialog track
761
- dialog = AudioSegment.from_file(vocal_wav)
762
-
763
- # Make sure it matches the TTS timeline parameters
764
- dialog = dialog.set_frame_rate(timeline.frame_rate).set_channels(timeline.channels)
765
-
766
- total_len_ms = len(timeline)
767
-
768
- # Collect speech regions from subtitles (approximate "where TTS will speak")
769
- speech_regions = []
770
- for sub in tts_subtitles:
771
- start_ms = int(sub.start.total_seconds() * 1000)
772
- end_ms = int(sub.end.total_seconds() * 1000)
773
- # clamp to track length
774
- start_ms = max(0, min(start_ms, total_len_ms))
775
- end_ms = max(0, min(end_ms, total_len_ms))
776
- if end_ms > start_ms:
777
- speech_regions.append((start_ms, end_ms))
778
-
779
- # Merge overlapping/adjacent regions
780
- speech_regions.sort()
781
- merged = []
782
- for s, e in speech_regions:
783
- if not merged:
784
- merged.append([s, e])
785
- else:
786
- last_s, last_e = merged[-1]
787
- if s <= last_e: # overlap or touch
788
- merged[-1][1] = max(last_e, e)
789
- else:
790
- merged.append([s, e])
791
-
792
- # Compute the complement: regions where there's NO subtitle (gaps)
793
- gaps = []
794
- cursor = 0
795
- for s, e in merged:
796
- if cursor < s:
797
- gaps.append((cursor, s))
798
- cursor = max(cursor, e)
799
- if cursor < total_len_ms:
800
- gaps.append((cursor, total_len_ms))
801
-
802
- # Overlay original dialog only in those gaps
803
- MIN_GAP_MS = 10 # ignore ultra-tiny gaps
804
-
805
- for g_start, g_end in gaps:
806
- if g_end - g_start < MIN_GAP_MS:
807
- continue
808
-
809
- # Extract that piece of the original dialog
810
- original_chunk = dialog[g_start:g_end]
811
- original_chunk = original_chunk + 6
812
-
813
- timeline = timeline.overlay(original_chunk, position=g_start)
814
-
815
-
816
- video_in = video_file
817
- audio_in = output_dir + "/final_output.wav"
818
- audio_16k_in = output_dir + "/final_16k_output.wav"
819
-
820
- # ---------- 5. Mix background + new TTS vocal ----------
821
-
822
- if background_wav is not None:
823
- eff = AudioSegment.from_file(effect_wav)
824
- bg = AudioSegment.from_file(background_wav)
825
-
826
-
827
-
828
- # If background is shorter than the TTS timeline, loop it
829
- if len(eff) < len(timeline):
830
- loops = math.ceil(len(timeline) / len(eff))
831
- eff = eff * loops
832
-
833
- if len(bg) < len(timeline):
834
- loops = math.ceil(len(timeline) / len(bg))
835
- bg = bg * loops
836
-
837
-
838
-
839
- # Cut or match to TTS length
840
- eff = eff[:len(timeline)]
841
- bg = bg[:len(timeline)]
842
-
843
-
844
- bg = bg + 6
845
- eff = eff + 6
846
-
847
- eff_timeline = eff.overlay(timeline)
848
- final_audio = bg.overlay(eff_timeline)
849
- final_16k_audio = timeline.set_frame_rate(16000).set_channels(1)
850
- else:
851
- # Fallback: no background found, just use TTS
852
- final_audio = timeline
853
- final_16k_audio = timeline
854
-
855
- final_audio.export(audio_in, format="wav")
856
- final_16k_audio.export(audio_16k_in, format="wav")
857
-
858
- print(f"Done! Saved to {audio_in}")
859
-
860
- lipsynced_video = output_dir + "/output_with_lipsync_16k.mp4"
861
-
862
- if allow_lipsync:
863
- apply_lipsync(video_in, audio_16k_in, lipsynced_video)
864
- else:
865
- lipsynced_video = video_in
866
-
867
- video_out = output_dir + "/output_with_lipsync.mp4"
868
-
869
-
870
- cmd = [
871
- "ffmpeg",
872
- "-loglevel", "error",
873
- "-y", # overwrite output file
874
- "-i", lipsynced_video, # input video
875
- "-i", audio_in, # new audio
876
- "-c:v", "copy", # do not re-encode video
877
- "-map", "0:v:0", # take video from input 0
878
- "-map", "1:a:0", # take audio from input 1
879
- "-shortest", # stop when either track ends
880
- video_out,
881
- ]
882
-
883
- subprocess.run(cmd, check=True)
884
-
885
-
886
- # IMPORTANT: return the 10s-max SRT for download
887
- return video_out, srt_10s_path, audio_16k_in
888
-
889
-
890
-
891
- css = """
892
- #col-container {
893
- margin: 0 auto;
894
- max-width: 1600px;
895
- }
896
- #modal-container {
897
- width: 100vw; /* Take full viewport width */
898
- height: 100vh; /* Take full viewport height (optional) */
899
- display: flex;
900
- justify-content: center; /* Center content horizontally */
901
- align-items: center; /* Center content vertically if desired */
902
- }
903
- #modal-content {
904
- width: 100%;
905
- max-width: 700px; /* Limit content width */
906
- margin: 0 auto;
907
- border-radius: 8px;
908
- padding: 1.5rem;
909
- }
910
- #step-column {
911
- padding: 10px;
912
- border-radius: 8px;
913
- box-shadow: var(--card-shadow);
914
- margin: 10px;
915
- }
916
- #col-showcase {
917
- margin: 0 auto;
918
- max-width: 1100px;
919
- }
920
- .button-gradient {
921
- background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
922
- border: none;
923
- padding: 14px 28px;
924
- font-size: 16px;
925
- font-weight: bold;
926
- color: white;
927
- border-radius: 10px;
928
- cursor: pointer;
929
- transition: 0.3s ease-in-out;
930
- animation: 2s linear 0s infinite normal none running gradientAnimation;
931
- box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
932
- }
933
- .toggle-container {
934
- display: inline-flex;
935
- background-color: #ffd6ff; /* light pink background */
936
- border-radius: 9999px;
937
- padding: 4px;
938
- position: relative;
939
- width: fit-content;
940
- font-family: sans-serif;
941
- }
942
- .toggle-container input[type="radio"] {
943
- display: none;
944
- }
945
- .toggle-container label {
946
- position: relative;
947
- z-index: 2;
948
- flex: 1;
949
- text-align: center;
950
- font-weight: 700;
951
- color: #4b2ab5; /* dark purple text for unselected */
952
- padding: 6px 22px;
953
- border-radius: 9999px;
954
- cursor: pointer;
955
- transition: color 0.25s ease;
956
- }
957
- /* Moving highlight */
958
- .toggle-highlight {
959
- position: absolute;
960
- top: 4px;
961
- left: 4px;
962
- width: calc(50% - 4px);
963
- height: calc(100% - 8px);
964
- background-color: #4b2ab5; /* dark purple background */
965
- border-radius: 9999px;
966
- transition: transform 0.25s ease;
967
- z-index: 1;
968
- }
969
- /* When "True" is checked */
970
- #true:checked ~ label[for="true"] {
971
- color: #ffd6ff; /* light pink text */
972
- }
973
- /* When "False" is checked */
974
- #false:checked ~ label[for="false"] {
975
- color: #ffd6ff; /* light pink text */
976
- }
977
- /* Move highlight to right side when False is checked */
978
- #false:checked ~ .toggle-highlight {
979
- transform: translateX(100%);
980
- }
981
- """
982
-
983
-
984
- with gr.Blocks(css=css) as demo:
985
-
986
- with gr.Column(elem_id="col-container"):
987
- gr.HTML(
988
- """
989
- <div style="text-align: center;">
990
- <p style="font-size:16px; display: inline; margin: 0;">
991
- <strong>OutofSync </strong>
992
- </p>
993
- <p style="font-size:16px; display: inline; margin: 0;">
994
- -- HF Space By:
995
- </p>
996
- <a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
997
- <img src="https://img.shields.io/badge/🤗-Follow Me-yellow.svg">
998
- </a>
999
- <a href="https://www.buymeacoffee.com/outofai" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a>
1000
- </div>
1001
- """
1002
- )
1003
-
1004
- with gr.Row():
1005
- with gr.Column(elem_id="step-column"):
1006
- gr.HTML("""
1007
- <div>
1008
- <span style="font-size: 24px;">1. Upload a Video</span><br>
1009
- </div>
1010
- """)
1011
-
1012
- video_input = gr.Video(
1013
- label="OG Clip",
1014
- height=512
1015
- )
1016
-
1017
- with gr.Column(elem_id="step-column"):
1018
- gr.HTML("""
1019
- <div>
1020
- <span style="font-size: 24px;">2. Translate + 💋 </span><br>
1021
- </div>
1022
- """)
1023
-
1024
- video_output = gr.Video(label="Output", height=512)
1025
- lipsync = gr.Checkbox(label="Lipsync", value=False, visible=False)
1026
- duration = gr.Slider(0, 30, 30, step=10)
1027
- translate_btn = gr.Button("🤹‍♂️ Translate")
1028
- translate_lipsync_btn = gr.Button("🤹‍♂️ Translate + 💋 Lipsync", variant='primary', elem_classes="button-gradient")
1029
-
1030
- with gr.Column(elem_id="step-column"):
1031
- vocal_16k_output = gr.File(label="Vocal 16k", visible=False)
1032
- srt_output = gr.File(label="Download translated diarized SRT", visible=False)
1033
-
1034
- cached_examples = gr.Examples(
1035
- examples=[
1036
-
1037
- [
1038
- "assets/popup-2.mp4",
1039
- False,
1040
- 10
1041
- ],
1042
-
1043
- [
1044
- "assets/popup-2.mp4",
1045
- False,
1046
- 20
1047
- ],
1048
-
1049
- [
1050
- "assets/popup-2.mp4",
1051
- False,
1052
- 30
1053
- ],
1054
-
1055
- [
1056
- "assets/german.mp4",
1057
- True,
1058
- 10
1059
- ],
1060
-
1061
- [
1062
- "assets/popup-2.mp4",
1063
- True,
1064
- 20
1065
- ],
1066
-
1067
- [
1068
- "assets/popup-2.mp4",
1069
- True,
1070
- 30
1071
- ],
1072
-
1073
- [
1074
- "assets/popup-2.mp4",
1075
- True,
1076
- 10
1077
- ],
1078
-
1079
- [
1080
- "assets/italian.mp4",
1081
- True,
1082
- 10
1083
- ],
1084
-
1085
- [
1086
- "assets/french-movie.mp4",
1087
- True,
1088
- 10
1089
- ],
1090
-
1091
- ],
1092
- label="Cached Examples",
1093
- fn=process_video,
1094
- inputs=[video_input, lipsync, duration],
1095
- outputs=[video_output, srt_output, vocal_16k_output],
1096
- cache_examples=True
1097
- )
1098
-
1099
- translate_btn.click(
1100
- fn=translate_video,
1101
- inputs=[video_input],
1102
- outputs=[video_output, srt_output, vocal_16k_output],
1103
- )
1104
-
1105
- translate_lipsync_btn.click(
1106
- fn=translate_lipsync_video,
1107
- inputs=[video_input],
1108
- outputs=[video_output, srt_output, vocal_16k_output],
1109
- )
1110
-
1111
- if __name__ == "__main__":
1112
- demo.queue()
 
 
 
1113
  demo.launch()
 
1
+
2
+ import subprocess
3
+ from huggingface_hub import snapshot_download, hf_hub_download
4
+
5
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
6
+
7
+ snapshot_download(
8
+ repo_id = "alexnasa/outofsync",
9
+ local_dir = "./outofsync"
10
+ )
11
+
12
+ sh("cd outofsync && pip install . && cd ..")
13
+ sh("pip uninstall onnxruntime onnxruntime-gpu -y && pip install onnxruntime-gpu")
14
+
15
+ import os
16
+ import shutil
17
+
18
+ src = "checkpoints" # your source folder
19
+ dst = "/home/user/.cache/torch/hub/checkpoints"
20
+
21
+ # Create destination folder if it doesn't exist
22
+ os.makedirs(dst, exist_ok=True)
23
+
24
+ # Copy each item from src → dst
25
+ for item in os.listdir(src):
26
+ s = os.path.join(src, item)
27
+ d = os.path.join(dst, item)
28
+
29
+ if os.path.isdir(s):
30
+ # Copy directory
31
+ shutil.copytree(s, d, dirs_exist_ok=True)
32
+ else:
33
+ # Copy file
34
+ shutil.copy2(s, d)
35
+
36
+ print("✓ Done copying checkpoints!")
37
+
38
+ import spaces
39
+ import io
40
+ import torch
41
+ import inspect
42
+ import pyannote.audio.core.task as task_module
43
+ from pathlib import Path
44
+ from pydub import AudioSegment
45
+ import math
46
+
47
+ # Collect all classes from pyannote.audio.core.task
48
+ safe_globals = [torch.torch_version.TorchVersion]
49
+ for name, obj in inspect.getmembers(task_module):
50
+ if inspect.isclass(obj):
51
+ safe_globals.append(obj)
52
+
53
+ # Allow these classes to be used when unpickling weights with weights_only=True
54
+ torch.serialization.add_safe_globals(safe_globals)
55
+
56
+ from typing import List, Dict
57
+ import time
58
+ from time_util import timer
59
+ import os, pathlib, sys, ctypes
60
+ import uuid
61
+ # preload the CNN component
62
+
63
+ ctypes.CDLL("/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn.so.9")
64
+
65
+
66
+ # print(os.environ.get('LD_LIBRARY_PATH', ''))
67
+ import torch, ctranslate2, os
68
+
69
+ import numpy as np
70
+ from pydub import AudioSegment
71
+ from faster_whisper import WhisperModel
72
+ from pyannote.audio import Pipeline
73
+ from pyannote.audio.pipelines.utils.hook import ProgressHook
74
+ import gradio as gr
75
+
76
+ from pydub import AudioSegment
77
+ import srt
78
+ import io
79
+ from pydub import AudioSegment
80
+ import math
81
+ from datetime import timedelta
82
+ import torchaudio
83
+ import tigersound.look2hear.models
84
+
85
+ @spaces.GPU()
86
+ def print_ort():
87
+
88
+ import onnxruntime as ort
89
+ print(ort.get_available_providers())
90
+
91
+ print_ort()
92
+
93
+ current_dir = os.path.dirname(os.path.abspath(__file__))
94
+ snapshot_download("IndexTeam/IndexTTS-2", local_dir=os.path.join(current_dir,"checkpoints"))
95
+
96
+ dnr_model = tigersound.look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR").to("cuda").eval()
97
+
98
+ sh(f"pip install --no-deps git+https://github.com/OutofAi/index-tts.git")
99
+
100
+ from indextts.infer_v2 import IndexTTS2
101
+
102
+ MODE = 'local'
103
+ tts = IndexTTS2(model_dir="./checkpoints",
104
+ cfg_path=os.path.join("./checkpoints", "config.yaml"),
105
+ use_fp16=True,
106
+ use_deepspeed=False,
107
+ use_cuda_kernel=False,
108
+ )
109
+
110
+
111
+ os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"
112
+
113
+ from lipsync import apply_lipsync
114
+
115
+
116
+ def split_subtitles_max_duration(
117
+ subtitles,
118
+ max_seconds: float = 10.0,
119
+ min_last_chunk_seconds: float = 1.0,
120
+ ):
121
+ """
122
+ Take a list of srt.Subtitle and return a new list where
123
+ no subtitle duration is longer than max_seconds, except that
124
+ the *last* chunk is allowed to exceed max_seconds slightly
125
+ if the leftover duration would otherwise be less than
126
+ min_last_chunk_seconds.
127
+
128
+ Text is split by words roughly evenly across the chunks.
129
+ """
130
+ max_td = timedelta(seconds=max_seconds)
131
+ new_subs = []
132
+ new_index = 1
133
+
134
+ for sub in subtitles:
135
+ start = sub.start
136
+ end = sub.end
137
+ duration = end - start
138
+ total_secs = duration.total_seconds()
139
+
140
+ # If already short enough, just copy it
141
+ if total_secs <= max_seconds:
142
+ new_subs.append(
143
+ srt.Subtitle(
144
+ index=new_index,
145
+ start=start,
146
+ end=end,
147
+ content=sub.content,
148
+ )
149
+ )
150
+ new_index += 1
151
+ continue
152
+
153
+ # Need to split this subtitle
154
+ words = sub.content.split()
155
+ if not words:
156
+ # No text, skip
157
+ continue
158
+
159
+ # --- Determine number of chunks, avoiding tiny last chunk ---
160
+ base_chunks = int(total_secs // max_seconds)
161
+ remainder = total_secs - base_chunks * max_seconds
162
+
163
+ if base_chunks == 0:
164
+ # total_secs > max_seconds due to earlier check, but just in case
165
+ num_chunks = 1
166
+ else:
167
+ if remainder == 0:
168
+ num_chunks = base_chunks
169
+ elif remainder < min_last_chunk_seconds:
170
+ # Don't create a tiny last chunk; merge its time into previous chunks
171
+ num_chunks = base_chunks
172
+ else:
173
+ num_chunks = base_chunks + 1
174
+
175
+ # Ensure at least one chunk
176
+ num_chunks = max(1, num_chunks)
177
+
178
+ # Words per chunk (roughly even)
179
+ words_per_chunk = max(1, int(math.ceil(len(words) / num_chunks)))
180
+
181
+ chunk_start = start
182
+ word_idx = 0
183
+
184
+ for chunk_idx in range(num_chunks):
185
+ # Last chunk takes us all the way to the original end,
186
+ # so it can be slightly > max_seconds if needed.
187
+ if chunk_idx == num_chunks - 1:
188
+ chunk_end = end
189
+ else:
190
+ chunk_end = min(end, chunk_start + max_td)
191
+
192
+ if chunk_end <= chunk_start:
193
+ break
194
+
195
+ chunk_words = words[word_idx:word_idx + words_per_chunk]
196
+ word_idx += words_per_chunk
197
+
198
+ if not chunk_words:
199
+ break
200
+
201
+ new_subs.append(
202
+ srt.Subtitle(
203
+ index=new_index,
204
+ start=chunk_start,
205
+ end=chunk_end,
206
+ content=" ".join(chunk_words),
207
+ )
208
+ )
209
+ new_index += 1
210
+
211
+ chunk_start = chunk_end
212
+
213
+ return new_subs
214
+
215
+
216
+ def split_text_into_chunks(text, max_chars=400):
217
+ """
218
+ Rough splitter: breaks text into chunks <= max_chars,
219
+ preferring to split at sentence boundaries, then spaces.
220
+ """
221
+ text = text.strip()
222
+ chunks = []
223
+
224
+ while len(text) > max_chars:
225
+ # Try to split at the last sentence end before max_chars
226
+ split_at = max(
227
+ text.rfind(". ", 0, max_chars),
228
+ text.rfind("! ", 0, max_chars),
229
+ text.rfind("? ", 0, max_chars),
230
+ )
231
+
232
+ # If there was no sentence boundary, fall back to last space
233
+ if split_at == -1:
234
+ split_at = text.rfind(" ", 0, max_chars)
235
+
236
+ # If still nothing, just hard cut
237
+ if split_at == -1:
238
+ split_at = max_chars
239
+
240
+ chunk = text[:split_at + 1].strip()
241
+ chunks.append(chunk)
242
+ text = text[split_at + 1 :].strip()
243
+
244
+ if text:
245
+ chunks.append(text)
246
+
247
+ return chunks
248
+
249
+
250
+ def sh(cmd): subprocess.check_call(cmd, shell=True)
251
+
252
+ # sh("find / -name \"libcudnn*\" 2>/dev/null")
253
+ # --------------------
254
+ # CONFIG
255
+ # --------------------
256
+ MODEL_SIZE = "medium" # e.g. "small", "medium", "large-v2"
257
+ MIN_SEGMENT_SECONDS = 0.5 # only transcribe segments longer than this
258
+
259
+ # If your pyannote pipeline needs a HF token, set it here or via env var:
260
+ # HUGGINGFACE_TOKEN = "hf_..."
261
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
262
+
263
+ # --------------------
264
+ # LOAD GLOBAL MODELS (ONCE)
265
+ # --------------------
266
+ device = "cuda" if torch.cuda.is_available() else "cpu"
267
+
268
+ print(f"Loading pyannote diarization model...")
269
+ diarization_pipeline = Pipeline.from_pretrained(
270
+ "pyannote/speaker-diarization-3.1"
271
+ )
272
+
273
+ # --------------------
274
+ # HELPERS
275
+ # --------------------
276
+ def format_timestamp(ts: float) -> str:
277
+ """Convert seconds to SRT timestamp format."""
278
+ hrs = int(ts // 3600)
279
+ mins = int((ts % 3600) // 60)
280
+ secs = int(ts % 60)
281
+ ms = int((ts - int(ts)) * 1000)
282
+ return f"{hrs:02d}:{mins:02d}:{secs:02d},{ms:03d}"
283
+
284
+
285
+ def extract_audio_to_wav(input_video: str, output_dir: str):
286
+
287
+ audio_file = os.path.join(output_dir, "audio_og.wav")
288
+ background_file = os.path.join(output_dir, "background_og.wav")
289
+ vocal_file = os.path.join(output_dir, "vocal_og.wav")
290
+ effect_file = os.path.join(output_dir, "effect_og.wav")
291
+
292
+ audio_16k_file = os.path.join(output_dir, "audio_16k.wav")
293
+
294
+ video_path = input_video
295
+ separator_dir = Path(os.path.join(output_dir, "separator_directory"))
296
+ os.makedirs(separator_dir, exist_ok=True)
297
+
298
+
299
+ # Extract raw audio
300
+ cmd = [
301
+ "ffmpeg",
302
+ "-loglevel", "error",
303
+ "-i", video_path,
304
+ "-vn",
305
+ "-acodec", "pcm_s16le",
306
+ "-ar", "44100",
307
+ "-ac", "2",
308
+ audio_file
309
+ ]
310
+ subprocess.run(cmd, check=True)
311
+
312
+ audio, sr = torchaudio.load(audio_file)
313
+ audio = audio.to("cuda")
314
+
315
+ with torch.no_grad():
316
+ dialog, effect, music = dnr_model(audio[None])
317
+
318
+ torchaudio.save(vocal_file, dialog.cpu(), sr)
319
+ torchaudio.save(effect_file, effect.cpu(), sr)
320
+ torchaudio.save(background_file, music.cpu(), sr)
321
+
322
+ # Convert vocals to 16k mono
323
+ cmd = [
324
+ "ffmpeg",
325
+ "-loglevel", "error",
326
+ "-y",
327
+ "-i", vocal_file,
328
+ "-ac", "1",
329
+ "-ar", "16000",
330
+ "-acodec", "pcm_s16le",
331
+ audio_16k_file
332
+ ]
333
+ subprocess.run(cmd, check=True)
334
+
335
+ return audio_file, effect_file, background_file, audio_16k_file, vocal_file
336
+
337
+
338
+ def diarize_audio(audio_path: str) -> List[Dict]:
339
+ """Run pyannote diarization and return segments."""
340
+
341
+ diarization_pipeline.to(torch.device(device))
342
+
343
+ with ProgressHook() as hook:
344
+ diarization_result = diarization_pipeline(audio_path, hook=hook)
345
+
346
+ segments = []
347
+ for segment, _, speaker in diarization_result.itertracks(yield_label=True):
348
+ duration = segment.end - segment.start
349
+ if duration >= MIN_SEGMENT_SECONDS:
350
+ segments.append(
351
+ {
352
+ "start": float(segment.start),
353
+ "end": float(segment.end),
354
+ "speaker": speaker,
355
+ }
356
+ )
357
+
358
+ segments.sort(key=lambda x: x["start"])
359
+ return segments
360
+
361
+
362
+ def chunk_to_float32(chunk: AudioSegment) -> np.ndarray:
363
+ """Convert a pydub chunk to mono 16kHz float32 numpy array in [-1, 1]."""
364
+ chunk = chunk.set_frame_rate(16000).set_channels(1)
365
+ samples = np.array(chunk.get_array_of_samples())
366
+
367
+ # Normalize based on sample width
368
+ if chunk.sample_width == 2: # 16-bit
369
+ samples = samples.astype(np.float32) / 32768.0
370
+ elif chunk.sample_width == 4: # 32-bit
371
+ samples = samples.astype(np.float32) / 2147483648.0
372
+ else:
373
+ samples = samples.astype(np.float32)
374
+
375
+ return samples
376
+
377
+
378
+ def transcribe_segment(whisper_model, samples: np.ndarray) -> str:
379
+ """Transcribe+translate a single segment with faster-whisper."""
380
+ segment_text_parts = []
381
+
382
+
383
+ segments, info = whisper_model.transcribe(
384
+ samples,
385
+ beam_size=1,
386
+ vad_filter=False, # diarization already detected speech
387
+ condition_on_previous_text=True, # carry context across decoded segments
388
+ task="translate", # translate to English
389
+ word_timestamps=True,
390
+ )
391
+
392
+ for seg in segments:
393
+ if seg.text:
394
+ segment_text_parts.append(seg.text.strip())
395
+
396
+ return " ".join(segment_text_parts)
397
+
398
+ def transcribe_segment_words(
399
+ whisper_model,
400
+ samples: np.ndarray,
401
+ offset_sec: float,
402
+ speaker: str | None = None,
403
+ ):
404
+ """
405
+ Transcribe+translate a single diarization segment, returning a
406
+ list of word dicts with absolute timestamps.
407
+ """
408
+ words_out = []
409
+
410
+ segments, info = whisper_model.transcribe(
411
+ samples,
412
+ beam_size=1,
413
+ vad_filter=False, # diarization already detected speech
414
+ condition_on_previous_text=False, # better for hard cuts / segments
415
+ task="translate",
416
+ word_timestamps=True,
417
+ )
418
+
419
+ for seg in segments:
420
+ if not seg.words:
421
+ continue
422
+ for w in seg.words:
423
+ words_out.append(
424
+ {
425
+ "start": offset_sec + float(w.start),
426
+ "end": offset_sec + float(w.end),
427
+ "text": w.word,
428
+ "speaker": speaker,
429
+ }
430
+ )
431
+
432
+ return words_out
433
+
434
+ def words_to_subtitles(words, max_seconds: float = 10.0):
435
+ """
436
+ Group word-level timings into SRT subtitles, each up to max_seconds long,
437
+ cutting ONLY at word boundaries, AND never mixing speakers in the same subtitle.
438
+ Whenever the speaker changes, we close the current subtitle and start a new one.
439
+
440
+ Expects each word dict to have:
441
+ - "start" (float, seconds)
442
+ - "end" (float, seconds)
443
+ - "text" (str)
444
+ - "speaker" (str or None)
445
+ """
446
+ # sort just in case
447
+ words = sorted(words, key=lambda w: w["start"])
448
+
449
+ subtitles = []
450
+ current_words = []
451
+ current_start = None
452
+ current_speaker = None
453
+
454
+ index = 1
455
+
456
+ for w in words:
457
+ w_start = w["start"]
458
+ w_end = w["end"]
459
+ w_speaker = w.get("speaker")
460
+
461
+ if current_start is None:
462
+ # start first subtitle
463
+ current_start = w_start
464
+ current_words = [w]
465
+ current_speaker = w_speaker
466
+ continue
467
+
468
+ speaker_changed = (w_speaker != current_speaker)
469
+ duration_if_added = w_end - current_start
470
+ exceeds_max = duration_if_added > max_seconds
471
+
472
+ # If adding this word would:
473
+ # - exceed max_seconds, OR
474
+ # - cross into a different speaker,
475
+ # then we close the current subtitle and start a new one.
476
+ if (speaker_changed or exceeds_max) and current_words:
477
+ text = " ".join(x["text"] for x in current_words).strip()
478
+ sub_start = current_start
479
+ sub_end = current_words[-1]["end"]
480
+
481
+ subtitles.append(
482
+ srt.Subtitle(
483
+ index=index,
484
+ start=timedelta(seconds=sub_start),
485
+ end=timedelta(seconds=sub_end),
486
+ content=text,
487
+ )
488
+ )
489
+ index += 1
490
+
491
+ # start new subtitle from this word
492
+ current_start = w_start
493
+ current_words = [w]
494
+ current_speaker = w_speaker
495
+ else:
496
+ current_words.append(w)
497
+
498
+ # flush last subtitle
499
+ if current_words:
500
+ text = " ".join(x["text"] for x in current_words).strip()
501
+ sub_start = current_start
502
+ sub_end = current_words[-1]["end"]
503
+ subtitles.append(
504
+ srt.Subtitle(
505
+ index=index,
506
+ start=timedelta(seconds=sub_start),
507
+ end=timedelta(seconds=sub_end),
508
+ content=text,
509
+ )
510
+ )
511
+
512
+ return subtitles
513
+
514
+ def build_srt(segments: List[Dict], audio_wav: str, out_srt_path: str):
515
+ """
516
+ Generate SRT file from diarized segments and audio,
517
+ using word-level timestamps and grouping into ~10s subtitles.
518
+ """
519
+ audio = AudioSegment.from_file(audio_wav)
520
+
521
+ print(f"Loading faster-whisper model ({MODEL_SIZE})...")
522
+ whisper_model = WhisperModel(
523
+ MODEL_SIZE,
524
+ device="cuda",
525
+ compute_type="float16",
526
+ )
527
+
528
+ all_words = []
529
+
530
+ for i, seg in enumerate(segments, start=1):
531
+ start_sec = seg["start"]
532
+ end_sec = seg["end"]
533
+ speaker = seg["speaker"]
534
+
535
+ start_ms = int(start_sec * 1000)
536
+ end_ms = int(end_sec * 1000)
537
+ chunk = audio[start_ms:end_ms]
538
+
539
+ samples = chunk_to_float32(chunk)
540
+
541
+ # get words for this diar segment, with absolute times
542
+ seg_words = transcribe_segment_words(
543
+ whisper_model,
544
+ samples,
545
+ offset_sec=start_sec,
546
+ speaker=speaker,
547
+ )
548
+
549
+ all_words.extend(seg_words)
550
+ print(f"Diar segment {i} ({speaker}): {len(seg_words)} words")
551
+
552
+ # group words into ≤10s subtitles, word aligned
553
+ subtitles = words_to_subtitles(all_words, max_seconds=10.0)
554
+
555
+ # write SRT
556
+ with open(out_srt_path, "w", encoding="utf-8") as f:
557
+ f.write(srt.compose(subtitles))
558
+
559
+ def translate_video(video_file, duration):
560
+ return process_video(video_file, False, duration)
561
+
562
+ def translate_lipsync_video(video_file, duration):
563
+ return process_video(video_file, True, duration)
564
+
565
+
566
+ def run_example(video_file, allow_lipsync, duration):
567
+
568
+ with timer("processed"):
569
+ result = process_video(video_file, allow_lipsync, duration)
570
+
571
+ return result
572
+
573
+ @spaces.GPU(duration=350)
574
+ def process_video(video_file, allow_lipsync, duration):
575
+ """
576
+ Gradio callback:
577
+ - video_file: temp file object/path from Gradio
578
+ - returns path to generated SRT file (for download)
579
+ """
580
+ if video_file is None:
581
+ raise gr.Error("Please upload an MP4 video.")
582
+
583
+ session_id = uuid.uuid4().hex
584
+
585
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
586
+ os.makedirs(output_dir, exist_ok=True)
587
+
588
+ # Gradio's File/Video component gives dict or str depending on version
589
+ if isinstance(video_file, dict):
590
+ video_path = video_file.get("name") or video_file.get("path")
591
+ else:
592
+ video_path = video_file
593
+
594
+ if video_path is None or not os.path.exists(video_path):
595
+ raise gr.Error("Could not read uploaded video file.")
596
+
597
+ # Create temp directory to hold WAV + SRT
598
+ srt_path = os.path.join(output_dir, "diarized_translated.srt")
599
+
600
+ src_video_path = video_path
601
+
602
+ cropped_video_path = os.path.join(output_dir, "input_30s.mp4")
603
+
604
+ duration_s = int(duration)
605
+
606
+ print(f"duration_s:{duration_s}")
607
+
608
+ cmd = [
609
+ "ffmpeg",
610
+ "-y",
611
+ "-i", src_video_path,
612
+ "-t", f"{duration_s}",
613
+ "-c", "copy", # stream copy, no re-encode
614
+ cropped_video_path,
615
+ ]
616
+ subprocess.run(cmd, check=True)
617
+ video_path = cropped_video_path
618
+
619
+ # 1. Extract audio
620
+ audio_wav, effect_wav, background_wav, audio_16k_wav, vocal_wav = extract_audio_to_wav(video_path, output_dir)
621
+
622
+ # 2. Diarization
623
+ segments = diarize_audio(audio_16k_wav)
624
+ if not segments:
625
+ raise gr.Error("No valid speech segments found for diarization.")
626
+
627
+ # 3. Build SRT from diarized segments + whisper
628
+ with timer("Generating srt"):
629
+ build_srt(segments, audio_16k_wav, srt_path)
630
+
631
+ # ---- ORIGINAL SRT (used for TTS) ----
632
+ with open(srt_path, "r", encoding="utf-8") as f:
633
+ srt_data = f.read()
634
+
635
+ subtitles = list(srt.parse(srt_data))
636
+
637
+ # Keep this list as-is for TTS timing
638
+ tts_subtitles = subtitles
639
+
640
+ # ---- CREATE 10s-MAX SRT FOR DOWNLOAD ----
641
+ max10_subtitles = tts_subtitles
642
+ # max10_subtitles = split_subtitles_max_duration(subtitles, max_seconds=10.0)
643
+
644
+ tts_subtitles = max10_subtitles
645
+
646
+ srt_10s_path = os.path.join(output_dir, "diarized_translated_max10s.srt")
647
+ with open(srt_10s_path, "w", encoding="utf-8") as f:
648
+ f.write(srt.compose(max10_subtitles))
649
+
650
+ # ---- TTS USING ORIGINAL SRT ----
651
+ last_end_seconds = tts_subtitles[-1].end.total_seconds()
652
+ total_ms = int((last_end_seconds + 1) * 1000)
653
+
654
+ timeline = AudioSegment.silent(duration=total_ms)
655
+
656
+ original_audio = AudioSegment.from_file(audio_wav)
657
+
658
+ MAX_BATCH_MS = 300_000 # ~5 minutes of target subtitle duration per batch
659
+
660
+ with timer("Generating speech"):
661
+ num_subs = len(tts_subtitles)
662
+ idx = 0
663
+
664
+ while idx < num_subs:
665
+ spk_prompts = [] # paths to src_prompt_*.wav
666
+ texts = [] # subtitle texts for this batch
667
+ out_paths = [] # where IndexTTS2 will save generated wavs
668
+ starts_ms = [] # for overlaying later
669
+ target_ms_list = [] # per-subtitle target durations
670
+ batch_ms_sum = 0
671
+
672
+ batch_start = idx
673
+
674
+ # ---- fill one batch until we hit ~MAX_BATCH_MS ----
675
+ while idx < num_subs:
676
+ sub = tts_subtitles[idx]
677
+
678
+ start_ms = int(sub.start.total_seconds() * 1000)
679
+ end_ms = int(sub.end.total_seconds() * 1000)
680
+ target_ms = max(end_ms - start_ms, 0)
681
+
682
+ # If adding this subtitle would exceed the limit and we already
683
+ # have something in the batch, stop and process the current batch.
684
+ if batch_ms_sum + target_ms > MAX_BATCH_MS and len(target_ms_list) > 0:
685
+ break
686
+
687
+ global_idx = idx
688
+
689
+ # 1) prompt audio for this subtitle
690
+ src_chunk = original_audio[start_ms:end_ms]
691
+ src_prompt_path = os.path.join(output_dir, f"src_prompt_{global_idx}.wav")
692
+ src_chunk.export(src_prompt_path, format="wav")
693
+
694
+ # 2) text + output path
695
+ text = sub.content.replace("\n", " ")
696
+ out_path = os.path.join(output_dir, f"gen_{global_idx}.wav")
697
+
698
+ spk_prompts.append(src_prompt_path)
699
+ texts.append(text)
700
+ out_paths.append(out_path)
701
+ starts_ms.append(start_ms)
702
+ target_ms_list.append(target_ms)
703
+
704
+ batch_ms_sum += target_ms
705
+ idx += 1
706
+
707
+ print(f"batch from {batch_start} to {idx - 1}, batch_ms_sum: {batch_ms_sum}")
708
+
709
+ # --- call batched TTS once for this batch ---
710
+ do_sample = True
711
+ top_p = 0.8
712
+ top_k = 30
713
+ temperature = 0.8
714
+ length_penalty = 0.0
715
+ num_beams = 3
716
+ repetition_penalty = 10.0
717
+ max_mel_tokens = 1500
718
+
719
+ # target_length_ms is passed per subtitle below (target_ms_list); an aggregate
720
+ # such as avg or max could be used instead if the API expects a single value.
721
+ tts_outputs = tts.infer_batch(
722
+ spk_audio_prompts=spk_prompts,
723
+ texts=texts,
724
+ output_paths=out_paths,
725
+ emo_audio_prompts=None,
726
+ emo_alpha=1.0,
727
+ emo_vectors=None,
728
+ use_emo_text=False,
729
+ emo_texts=None,
730
+ use_random=False,
731
+ interval_silence=200,
732
+ verbose=False,
733
+ max_text_tokens_per_segment=120,
734
+ speed=1.0,
735
+ target_length_ms=target_ms_list,
736
+ do_sample=do_sample,
737
+ top_p=top_p,
738
+ top_k=top_k,
739
+ temperature=temperature,
740
+ length_penalty=length_penalty,
741
+ num_beams=num_beams,
742
+ repetition_penalty=repetition_penalty,
743
+ max_mel_tokens=max_mel_tokens,
744
+ )
745
+
746
+ # --- read generated wavs and overlay them ---
747
+ for local_idx, out_path in enumerate(tts_outputs):
748
+ start_ms = starts_ms[local_idx]
749
+
750
+ seg = AudioSegment.from_file(out_path, format="wav")
751
+ seg = seg - 2
752
+ timeline = timeline.overlay(seg, position=start_ms)
753
+
754
+ # cleanup
755
+ os.remove(out_path)
756
+ os.remove(spk_prompts[local_idx])
757
+
758
+ # -------------------------------------------------------
759
+ # Bring back original dialog in the *gaps* (grunts, etc.)
760
+ # -------------------------------------------------------
761
+ # Load separated dialog track
762
+ dialog = AudioSegment.from_file(vocal_wav)
763
+
764
+ # Make sure it matches the TTS timeline parameters
765
+ dialog = dialog.set_frame_rate(timeline.frame_rate).set_channels(timeline.channels)
766
+
767
+ total_len_ms = len(timeline)
768
+
769
+ # Collect speech regions from subtitles (approximate "where TTS will speak")
770
+ speech_regions = []
771
+ for sub in tts_subtitles:
772
+ start_ms = int(sub.start.total_seconds() * 1000)
773
+ end_ms = int(sub.end.total_seconds() * 1000)
774
+ # clamp to track length
775
+ start_ms = max(0, min(start_ms, total_len_ms))
776
+ end_ms = max(0, min(end_ms, total_len_ms))
777
+ if end_ms > start_ms:
778
+ speech_regions.append((start_ms, end_ms))
779
+
780
+ # Merge overlapping/adjacent regions
781
+ speech_regions.sort()
782
+ merged = []
783
+ for s, e in speech_regions:
784
+ if not merged:
785
+ merged.append([s, e])
786
+ else:
787
+ last_s, last_e = merged[-1]
788
+ if s <= last_e: # overlap or touch
789
+ merged[-1][1] = max(last_e, e)
790
+ else:
791
+ merged.append([s, e])
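+ # e.g. [(0, 500), (400, 900), (1200, 1500)] merges to [[0, 900], [1200, 1500]]
+ # because the first two regions overlap.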
792
+
793
+ # Compute the complement: regions where there's NO subtitle (gaps)
794
+ gaps = []
795
+ cursor = 0
796
+ for s, e in merged:
797
+ if cursor < s:
798
+ gaps.append((cursor, s))
799
+ cursor = max(cursor, e)
800
+ if cursor < total_len_ms:
801
+ gaps.append((cursor, total_len_ms))
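+ # e.g. with total_len_ms = 10_000 and merged = [[1000, 3000], [4000, 7000]]
+ # this yields gaps = [(0, 1000), (3000, 4000), (7000, 10000)].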
802
+
803
+ # Overlay original dialog only in those gaps
804
+ MIN_GAP_MS = 10 # ignore ultra-tiny gaps
805
+
806
+ for g_start, g_end in gaps:
807
+ if g_end - g_start < MIN_GAP_MS:
808
+ continue
809
+
810
+ # Extract that piece of the original dialog
811
+ original_chunk = dialog[g_start:g_end]
812
+ original_chunk = original_chunk + 6  # boost the original dialog chunk by 6 dB
813
+
814
+ timeline = timeline.overlay(original_chunk, position=g_start)
815
+
816
+
817
+ video_in = video_file
818
+ audio_in = output_dir + "/final_output.wav"
819
+ audio_16k_in = output_dir + "/final_16k_output.wav"
820
+
821
+ # ---------- 5. Mix background + effects + new TTS vocal ----------
822
+
823
+ if background_wav is not None:
824
+ eff = AudioSegment.from_file(effect_wav)
825
+ bg = AudioSegment.from_file(background_wav)
826
+
827
+
828
+
829
+ # If the effects or background track is shorter than the TTS timeline, loop it
830
+ if len(eff) < len(timeline):
831
+ loops = math.ceil(len(timeline) / len(eff))
832
+ eff = eff * loops
833
+
834
+ if len(bg) < len(timeline):
835
+ loops = math.ceil(len(timeline) / len(bg))
836
+ bg = bg * loops
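+ # e.g. a 15 s track under a 40 s timeline is repeated math.ceil(40 / 15) = 3
+ # times (45 s) and then trimmed back to the timeline length below.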
837
+
838
+
839
+
840
+ # Cut or match to TTS length
841
+ eff = eff[:len(timeline)]
842
+ bg = bg[:len(timeline)]
843
+
844
+
845
+ bg = bg + 6    # boost background track by 6 dB
846
+ eff = eff + 6  # boost effects track by 6 dB
847
+
848
+ eff_timeline = eff.overlay(timeline)
849
+ final_audio = bg.overlay(eff_timeline)
850
+ final_16k_audio = timeline.set_frame_rate(16000).set_channels(1)
851
+ else:
852
+ # Fallback: no background found, just use TTS
853
+ final_audio = timeline
854
+ final_16k_audio = timeline.set_frame_rate(16000).set_channels(1)  # keep the 16 kHz mono export consistent with the main path
855
+
856
+ final_audio.export(audio_in, format="wav")
857
+ final_16k_audio.export(audio_16k_in, format="wav")
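+ # final_output.wav (the full mix) is muxed back into the video further down;
+ # the 16 kHz mono vocal track is what apply_lipsync consumes when lipsync is on.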
858
+
859
+ print(f"Done! Saved to {audio_in}")
860
+
861
+ lipsynced_video = output_dir + "/output_with_lipsync_16k.mp4"
862
+
863
+ if allow_lipsync:
864
+ apply_lipsync(video_in, audio_16k_in, lipsynced_video)
865
+ else:
866
+ lipsynced_video = video_in
867
+
868
+ video_out = output_dir + "/output_with_lipsync.mp4"
869
+
870
+
871
+ cmd = [
872
+ "ffmpeg",
873
+ "-loglevel", "error",
874
+ "-y", # overwrite output file
875
+ "-i", lipsynced_video, # input video
876
+ "-i", audio_in, # new audio
877
+ "-c:v", "copy", # do not re-encode video
878
+ "-map", "0:v:0", # take video from input 0
879
+ "-map", "1:a:0", # take audio from input 1
880
+ "-shortest", # stop when either track ends
881
+ video_out,
882
+ ]
883
+
884
+ subprocess.run(cmd, check=True)
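+ # The list above is equivalent to running (paths substituted):
+ #   ffmpeg -loglevel error -y -i <lipsynced_video> -i <final_output.wav> \
+ #          -c:v copy -map 0:v:0 -map 1:a:0 -shortest <output_with_lipsync.mp4>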
885
+
886
+
887
+ # IMPORTANT: return the 10s-max SRT for download
888
+ return video_out, srt_10s_path, audio_16k_in
889
+
890
+
891
+
892
+ css = """
893
+ #col-container {
894
+ margin: 0 auto;
895
+ max-width: 1600px;
896
+ }
897
+ #modal-container {
898
+ width: 100vw; /* Take full viewport width */
899
+ height: 100vh; /* Take full viewport height (optional) */
900
+ display: flex;
901
+ justify-content: center; /* Center content horizontally */
902
+ align-items: center; /* Center content vertically if desired */
903
+ }
904
+ #modal-content {
905
+ width: 100%;
906
+ max-width: 700px; /* Limit content width */
907
+ margin: 0 auto;
908
+ border-radius: 8px;
909
+ padding: 1.5rem;
910
+ }
911
+ #step-column {
912
+ padding: 10px;
913
+ border-radius: 8px;
914
+ box-shadow: var(--card-shadow);
915
+ margin: 10px;
916
+ }
917
+ #col-showcase {
918
+ margin: 0 auto;
919
+ max-width: 1100px;
920
+ }
921
+ .button-gradient {
922
+ background: linear-gradient(45deg, rgb(255, 65, 108), rgb(255, 75, 43), rgb(255, 155, 0), rgb(255, 65, 108)) 0% 0% / 400% 400%;
923
+ border: none;
924
+ padding: 14px 28px;
925
+ font-size: 16px;
926
+ font-weight: bold;
927
+ color: white;
928
+ border-radius: 10px;
929
+ cursor: pointer;
930
+ transition: 0.3s ease-in-out;
931
+ animation: 2s linear 0s infinite normal none running gradientAnimation;
932
+ box-shadow: rgba(255, 65, 108, 0.6) 0px 4px 10px;
933
+ }
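+ /* gradientAnimation is referenced by .button-gradient above but not defined
+ in this stylesheet; a minimal assumed definition for a sweeping gradient: */
+ @keyframes gradientAnimation {
+ 0% { background-position: 0% 50%; }
+ 50% { background-position: 100% 50%; }
+ 100% { background-position: 0% 50%; }
+ }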
934
+ .toggle-container {
935
+ display: inline-flex;
936
+ background-color: #ffd6ff; /* light pink background */
937
+ border-radius: 9999px;
938
+ padding: 4px;
939
+ position: relative;
940
+ width: fit-content;
941
+ font-family: sans-serif;
942
+ }
943
+ .toggle-container input[type="radio"] {
944
+ display: none;
945
+ }
946
+ .toggle-container label {
947
+ position: relative;
948
+ z-index: 2;
949
+ flex: 1;
950
+ text-align: center;
951
+ font-weight: 700;
952
+ color: #4b2ab5; /* dark purple text for unselected */
953
+ padding: 6px 22px;
954
+ border-radius: 9999px;
955
+ cursor: pointer;
956
+ transition: color 0.25s ease;
957
+ }
958
+ /* Moving highlight */
959
+ .toggle-highlight {
960
+ position: absolute;
961
+ top: 4px;
962
+ left: 4px;
963
+ width: calc(50% - 4px);
964
+ height: calc(100% - 8px);
965
+ background-color: #4b2ab5; /* dark purple background */
966
+ border-radius: 9999px;
967
+ transition: transform 0.25s ease;
968
+ z-index: 1;
969
+ }
970
+ /* When "True" is checked */
971
+ #true:checked ~ label[for="true"] {
972
+ color: #ffd6ff; /* light pink text */
973
+ }
974
+ /* When "False" is checked */
975
+ #false:checked ~ label[for="false"] {
976
+ color: #ffd6ff; /* light pink text */
977
+ }
978
+ /* Move highlight to right side when False is checked */
979
+ #false:checked ~ .toggle-highlight {
980
+ transform: translateX(100%);
981
+ }
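+ /* The toggle rules above assume radio-group markup roughly like this
+ (illustrative sketch only; ids and classes as defined in this stylesheet):
+ <div class="toggle-container">
+   <input type="radio" id="true" name="toggle" checked>
+   <input type="radio" id="false" name="toggle">
+   <label for="true">True</label>
+   <label for="false">False</label>
+   <div class="toggle-highlight"></div>
+ </div> */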
982
+ """
983
+
984
+
985
+ with gr.Blocks(css=css) as demo:
986
+
987
+ with gr.Column(elem_id="col-container"):
988
+ gr.HTML(
989
+ """
990
+ <div style="text-align: center;">
991
+ <p style="font-size:16px; display: inline; margin: 0;">
992
+ <strong>OutofSync </strong>
993
+ </p>
994
+ <p style="font-size:16px; display: inline; margin: 0;">
995
+ -- HF Space By:
996
+ </p>
997
+ <a href="https://huggingface.co/alexnasa" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
998
+ <img src="https://img.shields.io/badge/πŸ€—-Follow Me-yellow.svg">
999
+ </a>
1000
+ <a href="https://www.buymeacoffee.com/outofai" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;" target="_blank"><img src="https://img.shields.io/badge/-buy_me_a%C2%A0coffee-red?logo=buy-me-a-coffee" alt="Buy Me A Coffee"></a>
1001
+ </div>
1002
+ """
1003
+ )
1004
+
1005
+ with gr.Row():
1006
+ with gr.Column(elem_id="step-column"):
1007
+ gr.HTML("""
1008
+ <div>
1009
+ <span style="font-size: 24px;">1. Upload a Video</span><br>
1010
+ </div>
1011
+ """)
1012
+
1013
+ video_input = gr.Video(
1014
+ label="OG Clip",
1015
+ height=512
1016
+ )
1017
+
1018
+ with gr.Column(elem_id="step-column"):
1019
+ gr.HTML("""
1020
+ <div>
1021
+ <span style="font-size: 24px;">2. Translate + πŸ’‹ </span><br>
1022
+ </div>
1023
+ """)
1024
+
1025
+ video_output = gr.Video(label="Output", height=512)
1026
+ lipsync = gr.Checkbox(label="Lipsync", value=False, visible=False)
1027
+ duration = gr.Slider(0, 30, 30, step=10)
1028
+ translate_btn = gr.Button("πŸ€Ήβ€β™‚οΈ Translate")
1029
+ translate_lipsync_btn = gr.Button("πŸ€Ήβ€β™‚οΈ Translate + πŸ’‹ Lipsync", variant='primary', elem_classes="button-gradient")
1030
+
1031
+ with gr.Column(elem_id="step-column"):
1032
+ vocal_16k_output = gr.File(label="Vocal 16k", visible=False)
1033
+ srt_output = gr.File(label="Download translated diarized SRT", visible=False)
1034
+
1035
+ cached_examples = gr.Examples(
1036
+ examples=[
1037
+
1038
+ [
1039
+ "assets/popup-2.mp4",
1040
+ False,
1041
+ 10
1042
+ ],
1043
+
1044
+ [
1045
+ "assets/popup-2.mp4",
1046
+ False,
1047
+ 20
1048
+ ],
1049
+
1050
+ [
1051
+ "assets/popup-2.mp4",
1052
+ False,
1053
+ 30
1054
+ ],
1055
+
1056
+ [
1057
+ "assets/german.mp4",
1058
+ True,
1059
+ 10
1060
+ ],
1061
+
1062
+ [
1063
+ "assets/popup-2.mp4",
1064
+ True,
1065
+ 20
1066
+ ],
1067
+
1068
+ [
1069
+ "assets/popup-2.mp4",
1070
+ True,
1071
+ 30
1072
+ ],
1073
+
1074
+ [
1075
+ "assets/popup-2.mp4",
1076
+ True,
1077
+ 10
1078
+ ],
1079
+
1080
+ [
1081
+ "assets/italian.mp4",
1082
+ True,
1083
+ 10
1084
+ ],
1085
+
1086
+ [
1087
+ "assets/french-movie.mp4",
1088
+ True,
1089
+ 10
1090
+ ],
1091
+
1092
+ ],
1093
+ label="Cached Examples",
1094
+ fn=process_video,
1095
+ inputs=[video_input, lipsync, duration],
1096
+ outputs=[video_output, srt_output, vocal_16k_output],
1097
+ cache_examples=True
1098
+ )
1099
+
1100
+
1101
+ translate_btn.click(
1102
+ fn=translate_video,
1103
+ inputs=[video_input, duration],
1104
+ outputs=[video_output, srt_output, vocal_16k_output],
1105
+ )
1106
+
1107
+ translate_lipsync_btn.click(
1108
+ fn=translate_lipsync_video,
1109
+ inputs=[video_input, duration],
1110
+ outputs=[video_output, srt_output, vocal_16k_output],
1111
+ )
1112
+
1113
+
1114
+ if __name__ == "__main__":
1115
+ demo.queue()
1116
  demo.launch()