bomolopuu committed on
Commit
4129671
·
1 Parent(s): 581e11a

Revert "tokenizeR"

Browse files

This reverts commit 78b8142b0963bfa1c8eb08e63a1dbd9de9962a12.

Files changed (1) hide show
  1. asr.py +50 -29
asr.py CHANGED
@@ -92,32 +92,53 @@ def transcribe_file(model, audio_samples, lang, user_transcription):
92
 
93
  #return transcription
94
 
95
- #def fine_tune_model(model, processor, user_transcription, audio_samples, lang_code):
96
- # Implementation of fine_tune_model remains the same
97
- # ...
98
-
99
- # Подготовка опций языка для Dropdown
100
- language_options = [f"{k} ({v})" for k, v in ASR_LANGUAGES.items()]
101
-
102
- mms_transcribe = gr.Interface(
103
- fn=transcribe_multiple_files,
104
- inputs=[
105
- gr.File(label="Audio Files", file_count="multiple"),
106
- gr.Dropdown(
107
- choices=language_options,
108
- label="Language",
109
- value=language_options[0] if language_options else None,
110
- ),
111
- gr.Textbox(label="Optional: Provide your own transcription"),
112
- ],
113
- outputs=gr.Textbox(label="Transcriptions", lines=10),
114
- title="Speech-to-text",
115
- description="Transcribe multiple audio files in your desired language.",
116
- allow_flagging="never",
117
- )
118
-
119
- # Остальной код интерфейса остается без изменений
120
- # ...
121
-
122
- if __name__ == "__main__":
123
- mms_transcribe.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  #return transcription
94
 
95
def fine_tune_model(model, processor, user_transcription, audio_samples, lang_code):
    """Run a short CTC fine-tuning pass on one (audio, transcription) pair.

    Args:
        model: Acoustic model. It is called as ``model(**inputs)`` and the
            result must expose ``.logits``; its parameters are updated in
            place.
        processor: Callable that turns raw audio into model inputs.
            NOTE(review): also assumed to expose its text tokenizer as
            ``processor.tokenizer`` (Wav2Vec2-style processor) - confirm
            against the code that loads it.
        user_transcription: Reference text for ``audio_samples``.
        audio_samples: Raw audio, passed to ``processor`` together with
            ``ASR_SAMPLING_RATE``.
        lang_code: Not used by this function; kept so existing callers keep
            working.

    Returns:
        The same ``model`` object, switched back to evaluation mode.
    """
    # Nothing to learn from an empty reference; hand the model back untouched
    # (but in eval mode, as on the normal path).
    if not user_transcription or not user_transcription.strip():
        model.eval()
        return model

    # Target label ids, shape (1, target_len).
    labels = processor.tokenizer(user_transcription, return_tensors="pt").input_ids
    labels = labels.to(device)

    # Feature extraction does not depend on the model weights, so it is done
    # once instead of once per epoch. The features (not the raw audio) are
    # what has to live on the model's device.
    # NOTE(review): assumes the processor output supports ``.to(device)``
    # (true for a Transformers ``BatchFeature``) - confirm.
    inputs = processor(
        audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
    ).to(device)

    # NOTE(review): CTCLoss defaults to blank index 0; verify that this matches
    # the blank/pad id of the model's vocabulary.
    criterion = torch.nn.CTCLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    try:
        for _ in range(5):  # fine-tune for 5 epochs
            logits = model(**inputs).logits

            # CTCLoss wants log-probabilities laid out as (time, batch, vocab).
            # assumes logits are (batch, time, vocab) - TODO confirm
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1).transpose(0, 1)

            # A single unpadded sample: every frame and every label is valid.
            input_lengths = torch.full(
                (logits.shape[0],), logits.shape[1], dtype=torch.long
            )
            target_lengths = torch.full(
                (labels.shape[0],), labels.shape[-1], dtype=torch.long
            )

            loss = criterion(log_probs, labels, input_lengths, target_lengths)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    finally:
        # Never leave the shared model in training mode, even if a step fails.
        model.eval()

    return model
134
+
135
# Example rows, each ``[audio file path, language choice]``; the language
# choice uses the "<code> (<name>)" form. Presumably consumed by the demo UI -
# verify against the interface code. Disabled rows are kept for reference.
ASR_EXAMPLES = [
    [
        "upload/english.mp3",
        "eng (English)",
    ],
    # ["upload/tamil.mp3", "tam (Tamil)"],
    # ["upload/burmese.mp3", "mya (Burmese)"],
]
140
+
141
# Markdown note about decoding quality; the text starts and ends with a
# newline, exactly as a triple-quoted block would.
ASR_NOTE = (
    "\n"
    "The above demo doesn't use beam-search decoding using a language model.\n"
    "Checkout the instructions [here](https://huggingface.co/facebook/mms-1b-all)"
    " on how to run LM decoding for better accuracy.\n"
)