Spaces:

darag
/

kurdish-kurmanci-to-text-srt

Sleeping

App Files Files Community

darag committed on Sep 1, 2024

Commit

2c66d5b

verified ·

1 Parent(s): a6ced78

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -49

app.py CHANGED Viewed

@@ -1,72 +1,106 @@
-# -*- coding: utf-8 -*-
 import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import librosa
 import numpy as np
-from datetime import timedelta
 import gradio as gr
-import os
-def format_time(seconds):
-    td = timedelta(seconds=seconds)
-    hours, remainder = divmod(td.seconds, 3600)
-    minutes, seconds = divmod(remainder, 60)
-    milliseconds = td.microseconds // 1000
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-def estimate_word_timings(transcription, total_duration):
-    words = transcription.split()
-    total_chars = sum(len(word) for word in words)
-    char_duration = total_duration / total_chars
-    word_timings = []
-    current_time = 0
-    for word in words:
-        word_duration = len(word) * char_duration
-        start_time = current_time
-        end_time = current_time + word_duration
-        word_timings.append((word, start_time, end_time))
-        current_time = end_time
-    return word_timings
-model_name = "Akashpb13/xlsr_kurmanji_kurdish"
-model = Wav2Vec2ForCTC.from_pretrained(model_name)
-processor = Wav2Vec2Processor.from_pretrained(model_name)
-def transcribe_audio(file):
-    speech, rate = librosa.load(file, sr=16000)
-    input_values = processor(speech, return_tensors="pt", sampling_rate=rate).input_values
-    with torch.no_grad():
-        logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-    total_duration = len(speech) / rate
-    word_timings = estimate_word_timings(transcription, total_duration)
     srt_content = ""
-    for i, (word, start_time, end_time) in enumerate(word_timings, start=1):
-        start_time_str = format_time(start_time)
-        end_time_str = format_time(end_time)
-        srt_content += f"{i}\n{start_time_str} --> {end_time_str}\n{word}\n\n"
-    output_filename = "output_word_by_word.srt"
     with open(output_filename, "w", encoding="utf-8") as f:
         f.write(srt_content)
-    return transcription, output_filename
-interface = gr.Interface(
-    fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
-    outputs=[gr.Textbox(label="Transcription"), gr.File(label="Download SRT File")],
-    title="Deng --- Nivîsandin ::: Kurdî-Kurmancî",
-    description="Dengê xwe ji me re rêke û li Submit bixe ... û bila bêhna te fireh be .",
-    article="By Derax Elî"
 )
 if __name__ == "__main__":
-    interface.launch()

+import os
 import torch
 import librosa
 import numpy as np
 import gradio as gr
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def format_time(milliseconds):
    """Format an offset in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        milliseconds: Offset from the start of the media, in milliseconds.
            Ints and floats are accepted; floats are rounded to the nearest
            whole millisecond and negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,001"``. The hours field is
        zero-padded to two digits but grows if the offset exceeds 99 hours.
    """
    # Round rather than truncate: callers pass ``seconds * 1000`` as a float,
    # and representation error (e.g. 1000.9999999999999) would otherwise drop
    # a millisecond. Clamp at zero because a negative offset would produce a
    # malformed timestamp.
    total_ms = max(0, int(round(milliseconds)))
    seconds, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def detect_speech_activity(y, sr, frame_length=1024, hop_length=512, threshold=0.01):
    """Find contiguous stretches of audio whose RMS energy exceeds a threshold.

    Args:
        y: Audio samples, passed straight to ``librosa.feature.rms``.
        sr: Sample rate used to convert frame indices to seconds.
        frame_length: Analysis window size forwarded to ``librosa.feature.rms``.
        hop_length: Step between frames; also used for index-to-time conversion.
        threshold: A frame counts as active when its RMS value is strictly
            greater than this.

    Returns:
        A list of ``(start_seconds, end_seconds)`` tuples, one per run of
        consecutive active frames, in chronological order.
    """
    rms_per_frame = librosa.feature.rms(
        y=y, frame_length=frame_length, hop_length=hop_length
    )[0]

    regions = []
    region_start = None  # frame index where the currently open region began
    for frame_idx, is_active in enumerate(rms_per_frame > threshold):
        if region_start is None:
            if is_active:
                region_start = frame_idx
        elif not is_active:
            # First inactive frame after a run closes the region.
            regions.append(
                (region_start * hop_length / sr, frame_idx * hop_length / sr)
            )
            region_start = None

    # A region still open after the last frame runs to the end of the signal.
    if region_start is not None:
        regions.append((region_start * hop_length / sr, len(y) / sr))
    return regions
def post_process_text(text):
    """Normalize whitespace in a decoded transcription.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed. A single ``replace("  ", " ")`` pass would
    leave double spaces behind for runs of three or more.

    Args:
        text: Raw transcription string.

    Returns:
        The cleaned string; empty if ``text`` contains only whitespace.
    """
    return " ".join(text.split())
_MODEL_NAME = "Akashpb13/xlsr_kurmanji_kurdish"
_MAX_SUBTITLE_LINE_CHARS = 50
# Segments shorter than this are skipped instead of being sent to the model.
# NOTE(review): 400 samples (25 ms at 16 kHz) is the receptive field of the
# standard wav2vec 2.0 convolutional encoder; confirm for this checkpoint.
_MIN_SEGMENT_SAMPLES = 400
# (processor, model) pairs keyed by device string, so weights load only once.
_MODEL_CACHE = {}


def _load_model(device):
    """Return the cached ``(processor, model)`` pair for ``device``, loading it on first use."""
    key = str(device)
    if key not in _MODEL_CACHE:
        processor = Wav2Vec2Processor.from_pretrained(_MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(_MODEL_NAME).to(device)
        _MODEL_CACHE[key] = (processor, model)
    return _MODEL_CACHE[key]


def _wrap_subtitle_text(text, max_chars=_MAX_SUBTITLE_LINE_CHARS):
    """Greedily split ``text`` on whitespace into lines of at most ``max_chars`` characters."""
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        # Only break when there is something to flush. An over-long single
        # word stays on its own line and never emits an empty line, which
        # would terminate the SRT cue early.
        if current and len(candidate) > max_chars:
            lines.append(current)
            current = word
        else:
            current = candidate
    if current:
        lines.append(current)
    return lines


def transcribe_audio(audio_file):
    """Transcribe an audio file into SRT-formatted subtitle text.

    The audio is loaded at 16 kHz and split into voiced regions with
    ``detect_speech_activity``. Each region is decoded with the Wav2Vec2 CTC
    model, and every non-empty transcription becomes one subtitle cue.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The SRT document as a single string. Cues are numbered consecutively
        from 1, text is wrapped at 50 characters per line, and the string is
        empty when no region yields any text.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor, model = _load_model(device)

    y, sr = librosa.load(audio_file, sr=16000)
    voiced_segments = detect_speech_activity(y, sr, threshold=0.005)

    cues = []
    for start, end in voiced_segments:
        segment_audio = y[int(start * sr):int(end * sr)]
        # A region opening on the very last frame can be empty or only a few
        # samples long; the model cannot process such inputs.
        if len(segment_audio) < _MIN_SEGMENT_SAMPLES:
            continue

        input_values = processor(
            segment_audio, sampling_rate=sr, return_tensors="pt"
        ).input_values.to(device)
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = post_process_text(processor.batch_decode(predicted_ids)[0])
        if not transcription:
            continue

        # Number cues by how many were emitted, not by segment index, so that
        # skipped segments do not leave gaps in the sequence.
        cue_number = len(cues) + 1
        timing = f"{format_time(start * 1000)} --> {format_time(end * 1000)}"
        body = "\n".join(_wrap_subtitle_text(transcription))
        cues.append(f"{cue_number}\n{timing}\n{body}\n\n")

    return "".join(cues)
def save_srt(audio_file):
    """Transcribe ``audio_file`` and write the subtitles to ``output.srt``.

    Args:
        audio_file: Path of the audio file, forwarded to ``transcribe_audio``.

    Returns:
        A ``(path, content)`` tuple: the name of the UTF-8 encoded SRT file
        that was written, and the SRT text itself.
    """
    subtitles = transcribe_audio(audio_file)
    srt_path = "output.srt"
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(subtitles)
    return srt_path, subtitles
# Gradio UI wiring: one audio input (delivered to the handler as a file path)
# and two outputs matching the (path, content) tuple returned by save_srt.
_audio_input = gr.Audio(type="filepath")
_srt_download = gr.File(label="Download SRT")
_srt_preview = gr.Textbox(label="SRT Content", lines=10)

iface = gr.Interface(
    fn=save_srt,
    inputs=_audio_input,
    outputs=[_srt_download, _srt_preview],
    title="Kurdish Speech-to-Text Transcription",
    description="Upload an audio file to generate a SRT subtitle file with Kurdish transcription.",
)

# Start the web app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()