transcribe_audio

Running

App Files Files Community

cstr committed on Oct 2, 2024

Commit

516bec5

verified ·

1 Parent(s): 4b50bd3

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -67

app.py CHANGED Viewed

@@ -40,30 +40,53 @@ from faster_whisper.transcribe import BatchedInferencePipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def download_audio(url, method_choice):
     parsed_url = urlparse(url)
     logging.info(f"Downloading audio from URL: {url} using method: {method_choice}")
     try:
-        if parsed_url.netloc in ['www.youtube.com', 'youtu.be', 'youtube.com']:
             audio_file = download_youtube_audio(url, method_choice)
         else:
             audio_file = download_direct_audio(url, method_choice)
         if not audio_file or not os.path.exists(audio_file):
             raise Exception(f"Failed to download audio from {url}")
-        return audio_file
     except Exception as e:
         logging.error(f"Error downloading audio: {str(e)}")
-        return f"Error: {str(e)}"
 def download_youtube_audio(url, method_choice):
     methods = {
         'yt-dlp': youtube_dl_method,
         'pytube': pytube_method,
         'youtube-dl': youtube_dl_classic_method,
         'yt-dlp-alt': youtube_dl_alternative_method,
-        'ffmpeg': ffmpeg_method,
-        'aria2': aria2_method
     }
-    method = methods.get(method_choice, youtube_dl_method)
     try:
         logging.info(f"Attempting to download YouTube audio using {method_choice}")
         return method(url)
@@ -157,19 +180,31 @@ def aria2_method(url):
     return output_file
 def download_direct_audio(url, method_choice):
     logging.info(f"Downloading direct audio from: {url} using method: {method_choice}")
     if method_choice == 'wget':
         return wget_method(url)
     else:
         try:
-            response = requests.get(url)
             if response.status_code == 200:
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
-                    temp_file.write(response.content)
-                    logging.info(f"Downloaded direct audio to: {temp_file.name}")
-                    return temp_file.name
             else:
-                raise Exception(f"Failed to download audio from {url}")
         except Exception as e:
             logging.error(f"Error downloading direct audio: {str(e)}")
             return None
@@ -183,56 +218,108 @@ def wget_method(url):
     return output_file
 def trim_audio(audio_path, start_time, end_time):
-    logging.info(f"Trimming audio from {start_time} to {end_time}")
-    audio = AudioSegment.from_file(audio_path)
-    audio_duration = len(audio) / 1000  # Duration in seconds
-    # Default start and end times if None
-    if start_time is None:
-        start_time = 0
-    if end_time is None or end_time > audio_duration:
-        end_time = audio_duration
-    # Validate times
-    if start_time < 0 or end_time < 0:
-        raise gr.Error("Start time and end time must be non-negative.")
-    if start_time >= end_time:
-        raise gr.Error("End time must be greater than start time.")
-    if start_time > audio_duration:
-        raise gr.Error("Start time exceeds audio duration.")
-    trimmed_audio = audio[start_time * 1000:end_time * 1000]
-    trimmed_audio_path = tempfile.mktemp(suffix='.wav')
-    trimmed_audio.export(trimmed_audio_path, format="wav")
-    logging.info(f"Trimmed audio saved to: {trimmed_audio_path}")
-    return trimmed_audio_path
 def save_transcription(transcription):
-    file_path = tempfile.mktemp(suffix='.txt')
-    with open(file_path, 'w') as f:
-        f.write(transcription)
-    logging.info(f"Transcription saved to: {file_path}")
-    return file_path
 def get_model_options(pipeline_type):
     if pipeline_type == "faster-batched":
-        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
     elif pipeline_type == "faster-sequenced":
-        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
     elif pipeline_type == "transformers":
-        return ["openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "primeline/whisper-large-v3-german"]
     else:
         return []
 loaded_models = {}
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
         if verbose:
             logging.getLogger().setLevel(logging.INFO)
         else:
             logging.getLogger().setLevel(logging.WARNING)
         logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
         verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
@@ -240,21 +327,25 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             yield verbose_messages, "", None
         # Determine if input_source is a URL or file
-        if isinstance(input_source, str):
-            if input_source.startswith('http://') or input_source.startswith('https://'):
-                audio_path = download_audio(input_source, download_method)
-                if not audio_path or audio_path.startswith("Error"):
-                    yield f"Error: {audio_path}", "", None
-                    return
-            else:
-                # Assume it's a local file path
-                audio_path = input_source
-        elif input_source is not None:
-            # Uploaded file object
             audio_path = input_source.name
-            logging.info(f"Using uploaded audio file: {audio_path}")
         else:
-            yield "No audio source provided.", "", None
             return
         # Convert start_time and end_time to float or None
@@ -262,8 +353,8 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         end_time = float(end_time) if end_time else None
         if start_time is not None or end_time is not None:
-            trimmed_audio_path = trim_audio(audio_path, start_time, end_time)
-            audio_path = trimmed_audio_path
             verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
             if verbose:
                 yield verbose_messages, "", None
@@ -276,10 +367,9 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         else:
             if pipeline_type == "faster-batched":
                 model = WhisperModel(model_id, device=device, compute_type=dtype)
-                pipeline = BatchedInferencePipeline(model=model)
             elif pipeline_type == "faster-sequenced":
-                model = WhisperModel(model_id, device=device, compute_type=dtype)
-                pipeline = model.transcribe
             elif pipeline_type == "transformers":
                 torch_dtype = torch.float16 if dtype == "float16" else torch.float32
                 model = AutoModelForSpeechSeq2Seq.from_pretrained(
@@ -287,7 +377,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
                 )
                 model.to(device)
                 processor = AutoProcessor.from_pretrained(model_id)
-                pipeline = pipeline(
                    "automatic-speech-recognition",
                     model=model,
                     tokenizer=processor.tokenizer,
@@ -300,7 +390,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
                 )
             else:
                 raise ValueError("Invalid pipeline type")
-            loaded_models[model_key] = model_or_pipeline  # Cache the model
         start_time_perf = time.time()
         if pipeline_type == "faster-batched":
@@ -343,11 +433,9 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     finally:
         # Clean up temporary files
-        if audio_path and os.path.exists(audio_path):
             os.remove(audio_path)
-        if 'trimmed_audio_path' in locals() and os.path.exists(trimmed_audio_path):
-            os.remove(trimmed_audio_path)
-        if 'transcription_file' in locals() and os.path.exists(transcription_file):
             os.remove(transcription_file)
 with gr.Blocks() as iface:
@@ -390,6 +478,15 @@ with gr.Blocks() as iface:
         transcription_file = gr.File(label="Download Transcription")
     def update_model_dropdown(pipeline_type):
         try:
             model_choices = get_model_options(pipeline_type)
             logging.info(f"Model choices for {pipeline_type}: {model_choices}")

 # Inference device: the first CUDA GPU when torch reports one is available, otherwise the CPU.
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def download_audio(url, method_choice):
     """
     Downloads audio from a given URL using the specified method.

     YouTube links (youtube.com, youtu.be and their subdomains) are routed to
     download_youtube_audio; every other URL goes to download_direct_audio.

     Args:
         url (str): The URL of the audio.
         method_choice (str): The method to use for downloading audio.
     Returns:
         tuple: (path to the downloaded audio file, True) on success, where True
             marks the file as temporary; ("Error: <reason>", False) on failure.
             This function never raises for download or URL-parsing problems.
     """
     logging.info(f"Downloading audio from URL: {url} using method: {method_choice}")
     try:
         # Parse inside the try block so a malformed URL is reported through the
         # error tuple instead of escaping as an exception. `hostname` is
         # lower-cased and excludes any port or credentials.
         host = urlparse(url).hostname or ""
         # Match the exact domain or a real subdomain. A plain substring test
         # would also accept look-alikes such as "notyoutube.com" or
         # "youtube.com.evil.example".
         is_youtube = any(
             host == domain or host.endswith("." + domain)
             for domain in ("youtube.com", "youtu.be")
         )
         if is_youtube:
             # Use YouTube download methods
             audio_file = download_youtube_audio(url, method_choice)
         else:
             # Use direct download methods
             audio_file = download_direct_audio(url, method_choice)
         if not audio_file or not os.path.exists(audio_file):
             raise Exception(f"Failed to download audio from {url}")
         return audio_file, True  # The file is a temporary file
     except Exception as e:
         logging.error(f"Error downloading audio: {str(e)}")
         return f"Error: {str(e)}", False
 def download_youtube_audio(url, method_choice):
+    """
+    Downloads audio from a YouTube URL using the specified method.
+    Args:
+        url (str): The YouTube URL.
+        method_choice (str): The method to use for downloading ('yt-dlp', 'pytube', 'youtube-dl').
+    Returns:
+        str: Path to the downloaded audio file, or None if failed.
+    """
     methods = {
         'yt-dlp': youtube_dl_method,
         'pytube': pytube_method,
         'youtube-dl': youtube_dl_classic_method,
         'yt-dlp-alt': youtube_dl_alternative_method,
     }
+    method = methods.get(method_choice)
+    if method is None:
+        logging.warning(f"Invalid download method for YouTube: {method_choice}. Defaulting to 'yt-dlp'.")
+        method = youtube_dl_method
     try:
         logging.info(f"Attempting to download YouTube audio using {method_choice}")
         return method(url)
     return output_file
 def download_direct_audio(url, method_choice):
     """
     Downloads audio from a direct URL using the specified method.

     Args:
         url (str): The direct URL of the audio file.
         method_choice (str): 'wget' delegates to wget_method; any other value
             streams the file with requests.
     Returns:
         str: Path to the downloaded audio file (a temporary ".mp3" file the
             caller is responsible for deleting), or None if the requests-based
             download failed.
     """
     logging.info(f"Downloading direct audio from: {url} using method: {method_choice}")
     if method_choice == 'wget':
         return wget_method(url)
     temp_path = None
     try:
         # (connect, read) timeout: without one, an unresponsive server would
         # block this call indefinitely. The context manager releases the
         # streamed connection even when the body is not fully consumed.
         with requests.get(url, stream=True, timeout=(10, 60)) as response:
             if response.status_code != 200:
                 raise Exception(f"Failed to download audio from {url} with status code {response.status_code}")
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                 temp_path = temp_file.name
                 for chunk in response.iter_content(chunk_size=8192):
                     if chunk:
                         temp_file.write(chunk)
         logging.info(f"Downloaded direct audio to: {temp_path}")
         return temp_path
     except Exception as e:
         logging.error(f"Error downloading direct audio: {str(e)}")
         # The file was created with delete=False, so remove a partial download
         # here instead of leaving it behind.
         if temp_path and os.path.exists(temp_path):
             try:
                 os.remove(temp_path)
             except OSError:
                 logging.warning(f"Could not remove partial download: {temp_path}")
         return None
     return output_file
 def trim_audio(audio_path, start_time, end_time):
     """
     Trims an audio file to the specified start and end times.

     Args:
         audio_path (str): Path to the audio file.
         start_time (float or None): Start time in seconds; None means 0.
         end_time (float or None): End time in seconds; None, or a value past
             the end of the audio, means the end of the audio.
     Returns:
         str: Path to the trimmed audio file (a temporary ".wav" file that the
             caller is responsible for deleting).
     Raises:
         gr.Error: If the start or end times are invalid, or if loading or
             exporting the audio fails.
     """
     logging.info(f"Trimming audio from {start_time} to {end_time}")
     trimmed_path = None
     try:
         audio = AudioSegment.from_file(audio_path)
         audio_duration = len(audio) / 1000  # Duration in seconds
         # Default start time if None
         if start_time is None:
             start_time = 0
         # Validate times. A start of 0 is valid, so only a negative start is
         # rejected; an explicit end must be strictly positive.
         if start_time < 0 or (end_time is not None and end_time <= 0):
             raise gr.Error("Start time must be non-negative and end time must be positive.")
         # This check must run before end_time is clamped to the duration;
         # otherwise an out-of-range start is always reported as
         # "End time must be greater than start time".
         if start_time > audio_duration:
             raise gr.Error("Start time exceeds audio duration.")
         if end_time is None or end_time > audio_duration:
             end_time = audio_duration
         if start_time >= end_time:
             raise gr.Error("End time must be greater than start time.")
         trimmed_audio = audio[start_time * 1000:end_time * 1000]
         # Reserve a unique file name, then close the handle before exporting:
         # writing by name to a still-open NamedTemporaryFile fails on Windows.
         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
             trimmed_path = temp_audio_file.name
         trimmed_audio.export(trimmed_path, format="wav")
         logging.info(f"Trimmed audio saved to: {trimmed_path}")
         return trimmed_path
     except gr.Error as e:
         # Validation errors already carry a user-facing message; log them and
         # re-raise unchanged instead of wrapping them a second time.
         logging.error(f"Error trimming audio: {str(e)}")
         raise
     except Exception as e:
         logging.error(f"Error trimming audio: {str(e)}")
         # Do not leave a half-written temp file behind when the export fails.
         if trimmed_path and os.path.exists(trimmed_path):
             try:
                 os.remove(trimmed_path)
             except OSError:
                 logging.warning(f"Could not remove temporary file: {trimmed_path}")
         raise gr.Error(f"Error trimming audio: {str(e)}") from e
 def save_transcription(transcription):
     """
     Writes the transcription text to a new temporary ".txt" file.

     The file is UTF-8 encoded and is not deleted automatically.

     Args:
         transcription (str): The transcription text.
     Returns:
         str: The path to the transcription file.
     """
     text_file = tempfile.NamedTemporaryFile(
         mode='w',
         encoding='utf-8',
         suffix='.txt',
         delete=False,
     )
     with text_file:
         text_file.write(transcription)
         saved_path = text_file.name
         logging.info(f"Transcription saved to: {saved_path}")
     return saved_path
 def get_model_options(pipeline_type):
     """
     Looks up the model IDs offered for a pipeline type.

     Args:
         pipeline_type (str): The type of pipeline ('faster-batched', 'faster-sequenced', 'transformers').
     Returns:
         list: A new list of model IDs; empty when the pipeline type is not recognised.
     """
     catalogue = (
         ("faster-batched", (
             "cstr/whisper-large-v3-turbo-int8_float32",
             "SYSTRAN/faster-whisper-large-v1",
             "GalaktischeGurke/primeline-whisper-large-v3-german-ct2",
         )),
         ("faster-sequenced", (
             "SYSTRAN/faster-whisper-large-v1",
             "GalaktischeGurke/primeline-whisper-large-v3-german-ct2",
         )),
         ("transformers", (
             "openai/whisper-large-v3",
             "openai/whisper-large-v2",
         )),
     )
     for known_type, model_ids in catalogue:
         if pipeline_type == known_type:
             return list(model_ids)
     return []
 # Module-level cache of loaded models/pipelines; transcribe_audio stores each entry under a model key.
 loaded_models = {}
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
+    """
+    Transcribes audio from a given source using the specified pipeline and model.
+    Args:
+        input_source (str or file): URL of audio, path to local file, or uploaded file object.
+        pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
+        model_id (str): The ID of the model to use.
+        dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
+        batch_size (int): Batch size for transcription.
+        download_method (str): Method to use for downloading audio.
+        start_time (float, optional): Start time in seconds for trimming audio.
+        end_time (float, optional): End time in seconds for trimming audio.
+        verbose (bool, optional): Whether to output verbose logging.
+    Yields:
+        Tuple[str, str, str or None]: Metrics and messages, transcription text, path to transcription file.
+    """
     try:
         if verbose:
             logging.getLogger().setLevel(logging.INFO)
         else:
             logging.getLogger().setLevel(logging.WARNING)
         logging.info(f"Transcription parameters: pipeline_type={pipeline_type}, model_id={model_id}, dtype={dtype}, batch_size={batch_size}, download_method={download_method}")
         verbose_messages = f"Starting transcription with parameters:\nPipeline Type: {pipeline_type}\nModel ID: {model_id}\nData Type: {dtype}\nBatch Size: {batch_size}\nDownload Method: {download_method}\n"
             yield verbose_messages, "", None
         # Determine if input_source is a URL or file
+        audio_path = None
+        is_temp_file = False
+        if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
+            # Input source is a URL
+            audio_path, is_temp_file = download_audio(input_source, download_method)
+            if not audio_path or audio_path.startswith("Error"):
+                yield f"Error downloading audio: {audio_path}", "", None
+                return
+        elif isinstance(input_source, str) and os.path.exists(input_source):
+            # Input source is a local file path
+            audio_path = input_source
+            is_temp_file = False
+        elif hasattr(input_source, 'name'):
+            # Input source is an uploaded file object
             audio_path = input_source.name
+            is_temp_file = False
         else:
+            yield "No valid audio source provided.", "", None
             return
         # Convert start_time and end_time to float or None
         end_time = float(end_time) if end_time else None
         if start_time is not None or end_time is not None:
+            audio_path = trim_audio(audio_path, start_time, end_time)
+            is_temp_file = True  # The trimmed audio is a temporary file
             verbose_messages += f"Audio trimmed from {start_time} to {end_time}\n"
             if verbose:
                 yield verbose_messages, "", None
         else:
             if pipeline_type == "faster-batched":
                 model = WhisperModel(model_id, device=device, compute_type=dtype)
+                model_or_pipeline = BatchedInferencePipeline(model=model)
             elif pipeline_type == "faster-sequenced":
+                model_or_pipeline = WhisperModel(model_id, device=device, compute_type=dtype)
             elif pipeline_type == "transformers":
                 torch_dtype = torch.float16 if dtype == "float16" else torch.float32
                 model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 )
                 model.to(device)
                 processor = AutoProcessor.from_pretrained(model_id)
+                model_or_pipeline = pipeline(
                    "automatic-speech-recognition",
                     model=model,
                     tokenizer=processor.tokenizer,
                 )
             else:
                 raise ValueError("Invalid pipeline type")
+            loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
         start_time_perf = time.time()
         if pipeline_type == "faster-batched":
     finally:
         # Clean up temporary files
+        if audio_path and is_temp_file and os.path.exists(audio_path):
             os.remove(audio_path)
+        if 'transcription_file' in locals() and transcription_file and os.path.exists(transcription_file):
             os.remove(transcription_file)
 with gr.Blocks() as iface:
         transcription_file = gr.File(label="Download Transcription")
     def update_model_dropdown(pipeline_type):
+        """
+        Updates the model dropdown choices based on the selected pipeline type.
+        Args:
+            pipeline_type (str): The selected pipeline type.
+        Returns:
+            gr.update: Updated model dropdown component.
+        """
         try:
             model_choices = get_model_options(pipeline_type)
             logging.info(f"Model choices for {pipeline_type}: {model_choices}")