Spaces:

thak123
/

Whisper-Konkani

Sleeping

App Files Files Community

thak123 committed on Feb 7

Commit

70a53fa

verified ·

1 Parent(s): 0d95a29

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -23

app.py CHANGED Viewed

@@ -19,31 +19,74 @@ pipe = pipeline(model="thak123/gom-stt-v3", #"thak123/whisper-small-LDC-V1", #"t
 #         )
 #     )
-def transcribe_speech(filepath):
-    # waveform, sample_rate = torchaudio.load(filepath)
-    # Resample the audio signal to 16k sampling rate
-    # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-    # waveform_16k = resampler(waveform)
-    # Save the resampled audio signal to a new file
-    # torchaudio.save(filepath, waveform_16k, 16000)
-    output = pipe(
-        filepath,
-        max_new_tokens=3,
-        generate_kwargs={
-            "task": "transcribe",
-            # "language": "konkani",
-        },  # update with the language you've fine-tuned on
-        chunk_length_s=30,
-        batch_size=8,
-         # sampling_rate=16000,
-        # padding=True
-    )
-    print(output)
-    return output["text"]
 demo = gr.Blocks()
 mic_transcribe = gr.Interface(

 #         )
 #     )
+# def transcribe_speech(filepath):
+#     # waveform, sample_rate = torchaudio.load(filepath)
+#     # Resample the audio signal to 16k sampling rate
+#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+#     # waveform_16k = resampler(waveform)
+#     # Save the resampled audio signal to a new file
+#     # torchaudio.save(filepath, waveform_16k, 16000)
+#     output = pipe(
+#         filepath,
+#         max_new_tokens=3,
+#         generate_kwargs={
+#             "task": "transcribe",
+#             # "language": "konkani",
+#         },  # update with the language you've fine-tuned on
+#         chunk_length_s=30,
+#         batch_size=8,
+#          # sampling_rate=16000,
+#         # padding=True
+#     )
+#     print(output)
+#     return output["text"]
+def transcribe_speech(filepath):
+    from transformers import WhisperProcessor, WhisperForConditionalGeneration
+    import torch
+    import librosa
+    # Load model and processor
+    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
+    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")
+    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+    output = ""
+    # Load and preprocess audio
+    audio_path = filepath
+    audio, sr = librosa.load(audio_path, sr=16000)
+    input_features = processor(audio, sampling_rate=16000, return_tensors="pt",truncation=False, padding="max_length").input_features
+    # Check length and process
+    if input_features.shape[-1] > 3000:
+        print("Splitting audio required")
+        from pydub import AudioSegment
+        def split_audio(file_path, chunk_length_ms=30000):  # 30 sec chunks
+            audio = AudioSegment.from_file(file_path)
+            chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+            return chunks
+        # Split and transcribe
+        audio_chunks = split_audio(audio_path)
+        for i, chunk in enumerate(audio_chunks):
+            print(i)
+            chunk.export(f"chunk_{i}.wav", format="wav")
+            result = pipe(f"chunk_{i}.wav")
+            output += result['text'] + " "
+            print(f"Chunk {i}: {result['text']}")
+    else:
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        output = transcription
+        print(transcription)
+    return output #output["text"]
 demo = gr.Blocks()
 mic_transcribe = gr.Interface(