darag commited on
Commit
2c66d5b
verified
1 Parent(s): a6ced78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -49
app.py CHANGED
@@ -1,72 +1,106 @@
1
- # -*- coding: utf-8 -*-
2
  import torch
3
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
  import librosa
5
  import numpy as np
6
- from datetime import timedelta
7
  import gradio as gr
8
- import os
9
 
10
- def format_time(seconds):
11
- td = timedelta(seconds=seconds)
12
- hours, remainder = divmod(td.seconds, 3600)
13
- minutes, seconds = divmod(remainder, 60)
14
- milliseconds = td.microseconds // 1000
15
  return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
16
 
17
- def estimate_word_timings(transcription, total_duration):
18
- words = transcription.split()
19
- total_chars = sum(len(word) for word in words)
20
- char_duration = total_duration / total_chars
21
 
22
- word_timings = []
23
- current_time = 0
 
 
 
 
 
 
 
 
24
 
25
- for word in words:
26
- word_duration = len(word) * char_duration
27
- start_time = current_time
28
- end_time = current_time + word_duration
29
- word_timings.append((word, start_time, end_time))
30
- current_time = end_time
31
 
32
- return word_timings
33
 
34
- model_name = "Akashpb13/xlsr_kurmanji_kurdish"
35
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
36
- processor = Wav2Vec2Processor.from_pretrained(model_name)
 
37
 
38
- def transcribe_audio(file):
39
- speech, rate = librosa.load(file, sr=16000)
40
- input_values = processor(speech, return_tensors="pt", sampling_rate=rate).input_values
 
 
 
41
 
42
- with torch.no_grad():
43
- logits = model(input_values).logits
44
-
45
- predicted_ids = torch.argmax(logits, dim=-1)
46
- transcription = processor.batch_decode(predicted_ids)[0]
47
- total_duration = len(speech) / rate
48
- word_timings = estimate_word_timings(transcription, total_duration)
49
 
50
  srt_content = ""
51
- for i, (word, start_time, end_time) in enumerate(word_timings, start=1):
52
- start_time_str = format_time(start_time)
53
- end_time_str = format_time(end_time)
54
- srt_content += f"{i}\n{start_time_str} --> {end_time_str}\n{word}\n\n"
 
 
 
 
55
 
56
- output_filename = "output_word_by_word.srt"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  with open(output_filename, "w", encoding="utf-8") as f:
58
  f.write(srt_content)
 
59
 
60
- return transcription, output_filename
61
-
62
- interface = gr.Interface(
63
- fn=transcribe_audio,
64
  inputs=gr.Audio(type="filepath"),
65
- outputs=[gr.Textbox(label="Transcription"), gr.File(label="Download SRT File")],
66
- title="Deng --- Nivîsandin ::: Kurdî-Kurmancî",
67
- description="Dengê xwe ji me re rêke û li Submit bixe ... û bila bêhna te fireh be .",
68
- article="By Derax Elî"
 
 
69
  )
70
 
71
  if __name__ == "__main__":
72
- interface.launch()
 
1
+ import os
2
  import torch
 
3
  import librosa
4
  import numpy as np
 
5
  import gradio as gr
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
 
8
def format_time(milliseconds):
    """Convert a duration in milliseconds to an SRT timestamp.

    Returns a string of the form ``HH:MM:SS,mmm`` (comma before the
    millisecond field, as the SRT format requires).
    """
    total_ms = int(milliseconds)
    ms = total_ms % 1000
    total_seconds = total_ms // 1000
    hours = total_seconds // 3600
    minutes = (total_seconds // 60) % 60
    seconds = total_seconds % 60
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
13
 
14
def detect_speech_activity(y, sr, frame_length=1024, hop_length=512, threshold=0.01):
    """Locate voiced regions in an audio signal with a simple RMS-energy gate.

    Frames whose RMS energy exceeds *threshold* are considered speech;
    consecutive speech frames are merged into regions.

    Returns a list of ``(start_sec, end_sec)`` tuples in seconds.
    """
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    active = rms > threshold

    regions = []
    region_start = None  # frame index where the current voiced run began, or None
    for frame_idx, is_voiced in enumerate(active):
        if is_voiced:
            if region_start is None:
                region_start = frame_idx
        elif region_start is not None:
            regions.append((region_start * hop_length / sr,
                            frame_idx * hop_length / sr))
            region_start = None

    # A run still open at the last frame extends to the end of the signal.
    if region_start is not None:
        regions.append((region_start * hop_length / sr, len(y) / sr))

    return regions
33
 
34
def post_process_text(text):
    """Normalize whitespace in a decoded transcription.

    Collapses every run of whitespace to a single space and strips
    leading/trailing whitespace.

    NOTE: the previous implementation called ``text.replace(" ", " ")``,
    which replaces a space with a space — a no-op (the first argument was
    presumably once a non-breaking space that got mangled). ``split`` +
    ``join`` handles all whitespace variants robustly.
    """
    return " ".join(text.split())
38
 
39
# Lazily-initialized ASR components, shared across calls.
# Loading the checkpoint from the hub is expensive; the original code
# re-downloaded/re-loaded the model on EVERY transcription request.
_ASR_CACHE = {}


def _get_asr(device):
    """Load (once) and return the ``(processor, model)`` pair on *device*."""
    if "asr" not in _ASR_CACHE:
        model_name = "Akashpb13/xlsr_kurmanji_kurdish"
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
        _ASR_CACHE["asr"] = (processor, model)
    return _ASR_CACHE["asr"]


def _wrap_words(text, max_len=50):
    """Split *text* into subtitle lines of at most ~*max_len* characters.

    A single word longer than *max_len* gets its own line (never an empty
    line, which the original wrapping loop could emit).
    """
    lines = []
    current = ""
    for word in text.split():
        if current and len(current) + len(word) > max_len:
            lines.append(current.strip())
            current = ""
        current += word + " "
    if current:
        lines.append(current.strip())
    return lines


def transcribe_audio(audio_file):
    """Transcribe *audio_file* to SRT subtitle text (Kurmanji Kurdish).

    The audio is resampled to 16 kHz, segmented by energy-based voice
    activity detection, and each voiced segment is decoded with the
    Wav2Vec2 CTC model. Segments whose transcription is empty are skipped.

    Returns the full SRT document as a string (may be empty if no speech
    was detected).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    processor, model = _get_asr(device)

    y, sr = librosa.load(audio_file, sr=16000)
    voiced_segments = detect_speech_activity(y, sr, threshold=0.005)

    srt_content = ""
    # SRT requires consecutive cue numbers; the original numbered cues by
    # segment index, leaving gaps whenever an empty transcription was skipped.
    cue_index = 1
    for start, end in voiced_segments:
        segment_audio = y[int(start * sr):int(end * sr)]

        input_values = processor(segment_audio, sampling_rate=sr,
                                 return_tensors="pt").input_values
        input_values = input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        transcription = post_process_text(transcription)

        if not transcription:
            continue  # nothing decoded for this segment

        start_time = format_time(start * 1000)
        end_time = format_time(end * 1000)
        srt_content += f"{cue_index}\n"
        srt_content += f"{start_time} --> {end_time}\n"
        srt_content += "\n".join(_wrap_words(transcription)) + "\n\n"
        cue_index += 1

    return srt_content
86
+
87
def save_srt(audio_file):
    """Transcribe *audio_file* and persist the subtitles.

    Writes the generated SRT document to ``output.srt`` (UTF-8) in the
    working directory and returns ``(output_path, srt_text)`` — the pair
    the Gradio outputs (file download + textbox) expect.
    """
    srt_text = transcribe_audio(audio_file)
    output_path = "output.srt"
    with open(output_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(srt_text)
    return output_path, srt_text
93
 
94
# Gradio UI: the user uploads an audio file and receives a downloadable
# SRT subtitle file plus the subtitle text displayed inline.
iface = gr.Interface(
    fn=save_srt,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.File(label="Download SRT"),
        gr.Textbox(label="SRT Content", lines=10)
    ],
    title="Kurdish Speech-to-Text Transcription",
    description="Upload an audio file to generate a SRT subtitle file with Kurdish transcription."
)
104
 
105
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()