Revert "tokenizeR"

Browse files

This reverts commit 78b8142b0963bfa1c8eb08e63a1dbd9de9962a12.
asr.py
CHANGED
@@ -92,32 +92,53 @@ def transcribe_file(model, audio_samples, lang, user_transcription):
|
|
92 |
|
93 |
#return transcription
|
94 |
|
95 |
-
|
96 |
-
#
|
97 |
-
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
)
|
118 |
-
|
119 |
-
#
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
#return transcription
|
94 |
|
95 |
+
def fine_tune_model(model, processor, user_transcription, audio_samples, lang_code):
    """Fine-tune a CTC ASR model on one user-corrected (audio, text) pair.

    Args:
        model: CTC-based ASR model (e.g. Wav2Vec2ForCTC); updated in place.
        processor: paired processor (feature extractor + tokenizer) for `model`.
        user_transcription: user-provided reference text used as the label.
        audio_samples: raw audio samples at ASR_SAMPLING_RATE — TODO confirm
            the caller always passes mono float samples at that rate.
        lang_code: language code; currently unused, kept for interface parity.

    Returns:
        The fine-tuned model, switched back to evaluation mode.
    """
    # Derive the device from the model's own parameters; the original body
    # referenced an undefined global `device`, which raised NameError.
    device = next(model.parameters()).device

    # Tokenize the reference text once. `processor(text=...)` routes to the
    # underlying tokenizer — the original called `processor.tokenize`, which
    # does not exist on Wav2Vec2Processor. NOTE(review): verify this matches
    # the processor class actually used elsewhere in this file.
    labels = processor(text=user_transcription, return_tensors="pt").input_ids.to(device)

    # Preprocess the audio once — it is loop-invariant, so hoist it out of the
    # training loop (the original re-ran the processor on every iteration).
    features = processor(audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt")
    features = {name: tensor.to(device) for name, tensor in features.items()}

    model.train()

    # zero_infinity guards against the inf losses CTC produces when the
    # target is longer than the downsampled input sequence.
    criterion = torch.nn.CTCLoss(zero_infinity=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for _ in range(5):  # fine-tune for 5 epochs on the single example
        logits = model(**features).logits  # (batch, time, vocab)

        # CTCLoss expects (time, batch, vocab) log-probabilities plus explicit
        # input/target lengths; the original passed only two positional
        # arguments, which raises a TypeError.
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1).transpose(0, 1)
        input_lengths = torch.full(
            (log_probs.size(1),), log_probs.size(0), dtype=torch.long, device=device
        )
        target_lengths = torch.tensor([labels.size(1)], dtype=torch.long, device=device)

        loss = criterion(log_probs, labels, input_lengths, target_lengths)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    return model
|
134 |
+
|
135 |
+
# Example inputs shown in the demo UI: [audio file path, language label].
# Non-English rows are kept but disabled pending verified sample files.
ASR_EXAMPLES = [
    ["upload/english.mp3", "eng (English)"],
    # ["upload/tamil.mp3", "tam (Tamil)"],
    # ["upload/burmese.mp3", "mya (Burmese)"],
]
|
140 |
+
|
141 |
+
ASR_NOTE = """
|
142 |
+
The above demo doesn't use beam-search decoding using a language model.
|
143 |
+
Checkout the instructions [here](https://huggingface.co/facebook/mms-1b-all) on how to run LM decoding for better accuracy.
|
144 |
+
"""
|