Spaces:
Running
Running
g1
Browse files
app.py
CHANGED
@@ -95,37 +95,42 @@ def download_youtube_audio(url, method_choice):
|
|
95 |
|
96 |
def yt_dlp_method(url):
|
97 |
"""
|
98 |
-
Downloads YouTube audio using yt-dlp.
|
99 |
-
|
100 |
Args:
|
101 |
url (str): The YouTube URL.
|
102 |
-
|
103 |
Returns:
|
104 |
str: Path to the downloaded audio file.
|
105 |
"""
|
106 |
logging.info("Using yt-dlp method")
|
|
|
|
|
107 |
ydl_opts = {
|
108 |
'format': 'bestaudio/best',
|
|
|
109 |
'postprocessors': [{
|
110 |
'key': 'FFmpegExtractAudio',
|
111 |
'preferredcodec': 'mp3',
|
112 |
'preferredquality': '192',
|
113 |
}],
|
114 |
-
'
|
|
|
115 |
}
|
116 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
117 |
info = ydl.extract_info(url, download=True)
|
118 |
-
output_file =
|
|
|
119 |
logging.info(f"Downloaded YouTube audio: {output_file}")
|
120 |
return output_file
|
121 |
|
122 |
def pytube_method(url):
|
123 |
"""
|
124 |
-
Downloads audio using pytube.
|
125 |
-
|
126 |
Args:
|
127 |
url (str): The YouTube URL.
|
128 |
-
|
129 |
Returns:
|
130 |
str: Path to the downloaded audio file.
|
131 |
"""
|
@@ -133,7 +138,8 @@ def pytube_method(url):
|
|
133 |
from pytube import YouTube
|
134 |
yt = YouTube(url)
|
135 |
audio_stream = yt.streams.filter(only_audio=True).first()
|
136 |
-
|
|
|
137 |
base, ext = os.path.splitext(out_file)
|
138 |
new_file = base + '.mp3'
|
139 |
os.rename(out_file, new_file)
|
@@ -396,10 +402,10 @@ loaded_models = {}
|
|
396 |
|
397 |
def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
398 |
"""
|
399 |
-
Transcribes audio from a given
|
400 |
|
401 |
Args:
|
402 |
-
input_source (str
|
403 |
pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
|
404 |
model_id (str): The ID of the model to use.
|
405 |
dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
|
@@ -424,32 +430,22 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
424 |
if verbose:
|
425 |
yield verbose_messages, "", None
|
426 |
|
427 |
-
#
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
return
|
437 |
-
elif isinstance(input_source, str) and os.path.exists(input_source):
|
438 |
-
# Input source is a local file path
|
439 |
-
audio_path = input_source
|
440 |
-
is_temp_file = False
|
441 |
-
elif hasattr(input_source, 'name'):
|
442 |
-
# Input source is an uploaded file object
|
443 |
-
audio_path = input_source.name
|
444 |
-
is_temp_file = False
|
445 |
-
else:
|
446 |
-
yield "No valid audio source provided.", "", None
|
447 |
return
|
448 |
|
449 |
# Convert start_time and end_time to float or None
|
450 |
start_time = float(start_time) if start_time else None
|
451 |
end_time = float(end_time) if end_time else None
|
452 |
|
|
|
453 |
if start_time is not None or end_time is not None:
|
454 |
audio_path = trim_audio(audio_path, start_time, end_time)
|
455 |
is_temp_file = True # The trimmed audio is a temporary file
|
@@ -463,6 +459,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
463 |
model_or_pipeline = loaded_models[model_key]
|
464 |
logging.info("Loaded model from cache")
|
465 |
else:
|
|
|
466 |
if pipeline_type == "faster-batched":
|
467 |
model = WhisperModel(model_id, device=device, compute_type=dtype)
|
468 |
model_or_pipeline = BatchedInferencePipeline(model=model)
|
@@ -495,6 +492,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
495 |
raise ValueError("Invalid pipeline type")
|
496 |
loaded_models[model_key] = model_or_pipeline # Cache the model or pipeline
|
497 |
|
|
|
498 |
start_time_perf = time.time()
|
499 |
if pipeline_type == "faster-batched":
|
500 |
segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
|
@@ -505,6 +503,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
505 |
segments = result["chunks"]
|
506 |
end_time_perf = time.time()
|
507 |
|
|
|
508 |
transcription_time = end_time_perf - start_time_perf
|
509 |
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
510 |
|
@@ -516,6 +515,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
516 |
if verbose:
|
517 |
yield verbose_messages + metrics_output, "", None
|
518 |
|
|
|
519 |
transcription = ""
|
520 |
|
521 |
for segment in segments:
|
@@ -527,6 +527,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
|
|
527 |
if verbose:
|
528 |
yield verbose_messages + metrics_output, transcription, None
|
529 |
|
|
|
530 |
transcription_file = save_transcription(transcription)
|
531 |
yield verbose_messages + metrics_output, transcription, transcription_file
|
532 |
|
|
|
95 |
|
96 |
def yt_dlp_method(url):
    """Download the audio track of a YouTube video via yt-dlp.

    The file is written into a fresh temporary directory and converted
    to MP3 by yt-dlp's FFmpeg post-processor.

    Args:
        url (str): The YouTube URL.

    Returns:
        str: Path to the downloaded audio file.
    """
    logging.info("Using yt-dlp method")
    # Download into an isolated temp dir so concurrent calls never collide.
    download_dir = tempfile.mkdtemp()
    options = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(download_dir, '%(id)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
        'no_warnings': True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        # prepare_filename() reports the pre-conversion name; swap the
        # extension because FFmpegExtractAudio rewrote the file as .mp3.
        raw_name = downloader.prepare_filename(video_info)
    audio_path = os.path.splitext(raw_name)[0] + '.mp3'
    logging.info(f"Downloaded YouTube audio: {audio_path}")
    return audio_path
|
126 |
|
127 |
def pytube_method(url):
|
128 |
"""
|
129 |
+
Downloads audio from a YouTube URL using pytube and saves it to a temporary file.
|
130 |
+
|
131 |
Args:
|
132 |
url (str): The YouTube URL.
|
133 |
+
|
134 |
Returns:
|
135 |
str: Path to the downloaded audio file.
|
136 |
"""
|
|
|
138 |
from pytube import YouTube
|
139 |
yt = YouTube(url)
|
140 |
audio_stream = yt.streams.filter(only_audio=True).first()
|
141 |
+
temp_dir = tempfile.mkdtemp()
|
142 |
+
out_file = audio_stream.download(output_path=temp_dir)
|
143 |
base, ext = os.path.splitext(out_file)
|
144 |
new_file = base + '.mp3'
|
145 |
os.rename(out_file, new_file)
|
|
|
402 |
|
403 |
def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
|
404 |
"""
|
405 |
+
Transcribes audio from a given URL using the specified pipeline and model.
|
406 |
|
407 |
Args:
|
408 |
+
input_source (str): URL of the audio.
|
409 |
pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
|
410 |
model_id (str): The ID of the model to use.
|
411 |
dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
|
|
|
430 |
if verbose:
|
431 |
yield verbose_messages, "", None
|
432 |
|
433 |
+
# Input source is expected to be a URL
|
434 |
+
if not input_source or not input_source.strip():
|
435 |
+
yield "No audio URL provided.", "", None
|
436 |
+
return
|
437 |
+
|
438 |
+
# Download the audio from the URL
|
439 |
+
audio_path, is_temp_file = download_audio(input_source, download_method)
|
440 |
+
if not audio_path or audio_path.startswith("Error"):
|
441 |
+
yield f"Error downloading audio: {audio_path}", "", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
return
|
443 |
|
444 |
# Convert start_time and end_time to float or None
|
445 |
start_time = float(start_time) if start_time else None
|
446 |
end_time = float(end_time) if end_time else None
|
447 |
|
448 |
+
# Trim the audio if start or end times are provided
|
449 |
if start_time is not None or end_time is not None:
|
450 |
audio_path = trim_audio(audio_path, start_time, end_time)
|
451 |
is_temp_file = True # The trimmed audio is a temporary file
|
|
|
459 |
model_or_pipeline = loaded_models[model_key]
|
460 |
logging.info("Loaded model from cache")
|
461 |
else:
|
462 |
+
# Load the appropriate model or pipeline based on the pipeline type
|
463 |
if pipeline_type == "faster-batched":
|
464 |
model = WhisperModel(model_id, device=device, compute_type=dtype)
|
465 |
model_or_pipeline = BatchedInferencePipeline(model=model)
|
|
|
492 |
raise ValueError("Invalid pipeline type")
|
493 |
loaded_models[model_key] = model_or_pipeline # Cache the model or pipeline
|
494 |
|
495 |
+
# Perform the transcription
|
496 |
start_time_perf = time.time()
|
497 |
if pipeline_type == "faster-batched":
|
498 |
segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
|
|
|
503 |
segments = result["chunks"]
|
504 |
end_time_perf = time.time()
|
505 |
|
506 |
+
# Calculate metrics
|
507 |
transcription_time = end_time_perf - start_time_perf
|
508 |
audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
|
509 |
|
|
|
515 |
if verbose:
|
516 |
yield verbose_messages + metrics_output, "", None
|
517 |
|
518 |
+
# Compile the transcription text
|
519 |
transcription = ""
|
520 |
|
521 |
for segment in segments:
|
|
|
527 |
if verbose:
|
528 |
yield verbose_messages + metrics_output, transcription, None
|
529 |
|
530 |
+
# Save the transcription to a file
|
531 |
transcription_file = save_transcription(transcription)
|
532 |
yield verbose_messages + metrics_output, transcription, transcription_file
|
533 |
|