MMS_1_10

Sleeping

App Files Files Community

bomolopuu commited on Oct 1, 2024

Commit

b9b5a7c

1 Parent(s): 6643110

fixing 4

Browse files

Files changed (1) hide show

asr.py +74 -116

asr.py CHANGED Viewed

@@ -1,12 +1,15 @@
 import librosa
-from transformers import Wav2Vec2ForCTC, AutoProcessor
 import torch
 import numpy as np
-from pathlib import Path
-import os
-from huggingface_hub import hf_hub_download
-from torchaudio.models.decoder import ctc_decoder
 ASR_SAMPLING_RATE = 16_000
@@ -21,25 +24,42 @@ MODEL_ID = "facebook/mms-1b-all"
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-def transcribe(model, audio_dir, lang="eng (English)", user_transcription=None):
-    # Получить список файлов в папке
-    files = os.listdir(audio_dir)
-    # Обработать каждый файл в папке
-    for file in files:
-        # Проверить, является ли файл аудиофайлом
-        if file.endswith(".mp3") or file.endswith(".wav"):
-            # Загрузить аудиофайл
-            audio_path = os.path.join(audio_dir, file)
-            audio_samples = librosa.load(audio_path, sr=ASR_SAMPLING_RATE, mono=True)[0]
-            # Обработать аудиофайл
-            transcription = transcribe_file(model, audio_samples, lang, user_transcription)
-            # Вывести результат
-            print(f"Файл: {file}")
-            print(f"Транскрипция: {transcription}")
-            print()
 def transcribe_file(model, audio_samples, lang, user_transcription):
     if not audio_samples:
@@ -54,16 +74,7 @@ def transcribe_file(model, audio_samples, lang, user_transcription):
     )
     # set device
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    elif (
-        hasattr(torch.backends, "mps")
-        and torch.backends.mps.is_available()
-        and torch.backends.mps.is_built()
-    ):
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
     model.to(device)
     inputs = inputs.to(device)
@@ -71,95 +82,42 @@ def transcribe_file(model, audio_samples, lang, user_transcription):
     with torch.no_grad():
         outputs = model(**inputs).logits
-    if lang_code != "eng" or True:
-        ids = torch.argmax(outputs, dim=-1)[0]
-        transcription = processor.decode(ids)
-    else:
-        assert False
-        # beam_search_result = beam_search_decoder(outputs.to("cpu"))
-        # transcription = " ".join(beam_search_result[0][0].words).strip()
     # If user-provided transcription is available, use it to fine-tune the model
     if user_transcription:
-        # Update the model's weights using the user-provided transcription
         model = fine_tune_model(model, processor, user_transcription, audio_samples, lang_code)
-        print(f"Fine-tuning the model with user-provided transcription: {user_transcription}")
     return transcription
 def fine_tune_model(model, processor, user_transcription, audio_samples, lang_code):
-    # Define the device
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    elif (
-        hasattr(torch.backends, "mps")
-        and torch.backends.mps.is_available()
-        and torch.backends.mps.is_built()
-    ):
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
-    # Convert the user-provided transcription to a tensor
-    transcription_tensor = processor.tokenizer(user_transcription, return_tensors="pt")
-    # Create a new dataset with the user-provided transcription and audio samples
-    dataset = [(audio_samples, transcription_tensor)]
-    # Create a data loader for the new dataset
-    data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
-    # Set the model to training mode
-    model.train()
-    # Define the loss function and optimizer
-    criterion = torch.nn.CTCLoss()
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    # Move the model to the device
-    model.to(device)
-    # Fine-tune the model on the new dataset
-    for epoch in range(5):  # fine-tune for 5 epochs
-        for batch in data_loader:
-            audio, transcription = batch
-            audio = audio.to(device)
-            transcription = transcription.to(device)
-            # Forward pass
-            inputs = processor(audio, sampling_rate=ASR_SAMPLING_RATE, return_tensors ="pt")
-            inputs = inputs.to(device)
-            outputs = model(**inputs).logits
-            # Calculate the loss
-            loss = criterion(outputs, transcription)
-            # Backward pass
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-    # Set the model to evaluation mode
-    model.eval()
-    return model
-def beam_search_decoder(logits):
-    # Define the beam search parameters
-    beam_width = 10
-    alpha = 0.7
-    # Initialize the beam search decoder
-    decoder = ctc_decoder.CTCTokenizer(
-        logits, beam_width=beam_width, alpha=alpha, blank_index=processor.tokenizer.pad_token_id
-    )
-    # Decode the logits
-    decoded = decoder.decode()
-    return decoded
 if __name__ == "__main__":
-    audio_dir = "/path/to/audio/files"
-    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-    transcribe(model, audio_dir)

+import gradio as gr
 import librosa
+import os
+import logging
+from pathlib import Path
 import torch
+from transformers import Wav2Vec2ForCTC, AutoProcessor
 import numpy as np
+# Настройка логирования
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 ASR_SAMPLING_RATE = 16_000
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+def safe_process_file(file_obj):
+    try:
+        logger.debug(f"Processing file: {file_obj.name}")
+        # Используем Path для безопасной обработки путей
+        file_path = Path(file_obj.name)
+        logger.debug(f"Loading audio from file path: {file_path}")
+        # Используем librosa для загрузки аудио
+        audio_samples, sr = librosa.load(str(file_path), sr=ASR_SAMPLING_RATE, mono=True)
+        safe_name = f"audio_{file_path.stem}.wav"
+        logger.debug(f"File processed successfully: {safe_name}")
+        return audio_samples, sr, safe_name
+    except Exception as e:
+        logger.error(f"Error processing file {getattr(file_obj, 'name', 'unknown')}: {str(e)}")
+        raise
+def transcribe_multiple_files(audio_files, lang, transcription):
+    transcriptions = []
+    for audio_file in audio_files:
+        try:
+            audio_samples, sr, safe_name = safe_process_file(audio_file)
+            logger.debug(f"Transcribing file: {safe_name}")
+            logger.debug(f"Language selected: {lang}")
+            logger.debug(f"User-provided transcription: {transcription}")
+            result = transcribe_file(model, audio_samples, lang, transcription)
+            logger.debug(f"Transcription result: {result}")
+            transcriptions.append(f"File: {safe_name}\nTranscription: {result}\n")
+        except Exception as e:
+            logger.error(f"Error in transcription process: {str(e)}")
+            transcriptions.append(f"Error processing file: {str(e)}\n")
+    return "\n".join(transcriptions)
 def transcribe_file(model, audio_samples, lang, user_transcription):
     if not audio_samples:
     )
     # set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
     inputs = inputs.to(device)
     with torch.no_grad():
         outputs = model(**inputs).logits
+    ids = torch.argmax(outputs, dim=-1)[0]
+    transcription = processor.decode(ids)
     # If user-provided transcription is available, use it to fine-tune the model
     if user_transcription:
         model = fine_tune_model(model, processor, user_transcription, audio_samples, lang_code)
+        logger.debug(f"Fine-tuning the model with user-provided transcription: {user_transcription}")
     return transcription
 def fine_tune_model(model, processor, user_transcription, audio_samples, lang_code):
+    # Implementation of fine_tune_model remains the same
+    # ...
+# Подготовка опций языка для Dropdown
+language_options = [f"{k} ({v})" for k, v in ASR_LANGUAGES.items()]
+mms_transcribe = gr.Interface(
+    fn=transcribe_multiple_files,
+    inputs=[
+        gr.File(label="Audio Files", file_count="multiple"),
+        gr.Dropdown(
+            choices=language_options,
+            label="Language",
+            value=language_options[0] if language_options else None,
+        ),
+        gr.Textbox(label="Optional: Provide your own transcription"),
+    ],
+    outputs=gr.Textbox(label="Transcriptions", lines=10),
+    title="Speech-to-text",
+    description="Transcribe multiple audio files in your desired language.",
+    allow_flagging="never",
+)
+# Остальной код интерфейса остается без изменений
+# ...
 if __name__ == "__main__":
+    demo.launch()