Merlintxu committed on
Commit 5653d92 · verified · 1 Parent(s): 4013e0d

Update app.py

Files changed (1)
  1. app.py +105 -84
app.py CHANGED
@@ -38,107 +38,128 @@ MODELS = {
 }
 
 def convert_audio_to_wav(audio_path):
-    wav_path = "converted_audio.wav"
-    command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
-    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    return wav_path
 
 def detect_language(audio_path):
-    speech, _ = librosa.load(audio_path, sr=16000, duration=30)
-
-    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-
-    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-    langs = detect_langs(transcription)
-
-    es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
-    pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
-
-    if abs(es_confidence - pt_confidence) < 0.2:
-        return 'es'
-
-    return max(langs, key=lambda x: x.prob).lang
 
 def diarize_audio(wav_audio):
-    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
-    diarization = pipeline(wav_audio)
-    return diarization
 
 def transcribe_audio_stream(audio, model_name):
-    wav_audio = convert_audio_to_wav(audio)
-    speech, rate = librosa.load(wav_audio, sr=16000)
-    duration = len(speech) / rate
-
-    transcriptions = []
-
-    if "whisper" in model_name:
-        processor = WhisperProcessor.from_pretrained(model_name)
-        model = WhisperForConditionalGeneration.from_pretrained(model_name)
-
-        chunk_duration = 30 # seconds
-
-        for i in range(0, int(duration), chunk_duration):
-            end = min(i + chunk_duration, duration)
-            chunk = speech[int(i * rate):int(end * rate)]
-
-            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
-            predicted_ids = model.generate(input_features)
-            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-            progress = min(100, (end / duration) * 100)
-            timestamp = i
-            transcriptions.append((timestamp, transcription, progress))
-            yield transcriptions, progress
-    else:
-        transcriber = pipeline("automatic-speech-recognition", model=model_name)
-
-        chunk_duration = 10 # seconds
-
-        for i in range(0, int(duration), chunk_duration):
-            end = min(i + chunk_duration, duration)
-            chunk = speech[int(i * rate):int(end * rate)]
-            result = transcriber(chunk)
-
-            progress = min(100, (end / duration) * 100)
-            timestamp = i
-            transcriptions.append((timestamp, result["text"], progress))
-            yield transcriptions, progress
 
 def merge_diarization_with_transcription(transcriptions, diarization, rate):
-    speaker_transcriptions = []
-    for segment in diarization.itertracks(yield_label=True):
-        start, end, speaker = segment
-        start_time = start / rate
-        end_time = end / rate
-        text_segment = ""
-        for ts, text, _ in transcriptions:
-            if start_time <= ts <= end_time:
-                text_segment += text + " "
-        speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
-    return speaker_transcriptions
 
 def detect_and_select_model(audio):
-    wav_audio = convert_audio_to_wav(audio)
-    language = detect_language(wav_audio)
-    model_options = MODELS.get(language, MODELS["en"])
-    return language, model_options
 
 def save_transcription(transcriptions, file_format):
-    if file_format == "txt":
-        file_path = "/tmp/transcription.txt"
-        with open(file_path, "w") as f:
-            for start, end, speaker, text in transcriptions:
-                f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
-        return file_path
-    elif file_format == "json":
-        file_path = "/tmp/transcription.json"
-        with open(file_path, "w") as f:
-            json.dump(transcriptions, f)
-        return file_path
 
 def combined_interface(audio):
     try:
 }
 
 def convert_audio_to_wav(audio_path):
+    try:
+        wav_path = "converted_audio.wav"
+        command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
+        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return wav_path
+    except Exception as e:
+        raise RuntimeError(f"Error converting audio to WAV: {e}")
 
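One thing worth noting about the new error handling: `subprocess.run` does not raise on a non-zero exit code unless `check=True` is passed, so a failing ffmpeg call slips past this try/except. A minimal sketch of a stricter variant (assumes ffmpeg is on the PATH; the `-y` flag and the helper name are illustrative additions, not part of this commit):

```python
import subprocess

def convert_audio_to_wav_checked(audio_path, wav_path="converted_audio.wav"):
    # -y overwrites a leftover converted_audio.wav; check=True makes
    # subprocess.run raise CalledProcessError when ffmpeg exits non-zero.
    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    try:
        subprocess.run(command, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Error converting audio to WAV: {e.stderr.decode(errors='ignore')}") from e
    return wav_path
```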
 def detect_language(audio_path):
+    try:
+        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
+
+        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+
+        input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+        langs = detect_langs(transcription)
+
+        es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
+        pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
+
+        if abs(es_confidence - pt_confidence) < 0.2:
+            return 'es'
+
+        return max(langs, key=lambda x: x.prob).lang
+    except Exception as e:
+        raise RuntimeError(f"Error detecting language: {e}")
 
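The Spanish/Portuguese tie-break above can be exercised with langdetect alone, without loading Whisper; the sample sentence below is illustrative:

```python
from langdetect import detect_langs

langs = detect_langs("Esto es una prueba de transcripción para la reunión de hoy.")
es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)

# Same rule as detect_language: close calls between es and pt default to Spanish.
language = 'es' if abs(es_confidence - pt_confidence) < 0.2 else max(langs, key=lambda x: x.prob).lang
print(langs, language)
```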
 def diarize_audio(wav_audio):
+    try:
+        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
+        diarization = pipeline(wav_audio)
+        return diarization
+    except Exception as e:
+        raise RuntimeError(f"Error in diarization: {e}")
 
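For reference, a quick way to inspect what `diarize_audio` returns (assumes `HUGGINGFACE_TOKEN` is valid and access to the gated pyannote/speaker-diarization model has been granted):

```python
diarization = diarize_audio("converted_audio.wav")
for turn, _, speaker in diarization.itertracks(yield_label=True):
    # turn is a pyannote.core.Segment; start and end are in seconds.
    print(f"{turn.start:6.1f}s - {turn.end:6.1f}s  {speaker}")
```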
 def transcribe_audio_stream(audio, model_name):
+    try:
+        wav_audio = convert_audio_to_wav(audio)
+        speech, rate = librosa.load(wav_audio, sr=16000)
+        duration = len(speech) / rate
+
+        transcriptions = []
+
+        if "whisper" in model_name:
+            processor = WhisperProcessor.from_pretrained(model_name)
+            model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+            chunk_duration = 30 # seconds
+
+            for i in range(0, int(duration), chunk_duration):
+                end = min(i + chunk_duration, duration)
+                chunk = speech[int(i * rate):int(end * rate)]
+
+                input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
+                predicted_ids = model.generate(input_features)
+                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+                progress = min(100, (end / duration) * 100)
+                timestamp = i
+                transcriptions.append((timestamp, transcription, progress))
+                yield transcriptions, progress
+        else:
+            transcriber = pipeline("automatic-speech-recognition", model=model_name)
+
+            chunk_duration = 10 # seconds
+
+            for i in range(0, int(duration), chunk_duration):
+                end = min(i + chunk_duration, duration)
+                chunk = speech[int(i * rate):int(end * rate)]
+                result = transcriber(chunk)
+
+                progress = min(100, (end / duration) * 100)
+                timestamp = i
+                transcriptions.append((timestamp, result["text"], progress))
+                yield transcriptions, progress
+    except Exception as e:
+        raise RuntimeError(f"Error in transcription: {e}")
 
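Since `transcribe_audio_stream` is a generator, a caller iterates it for incremental results; each yield is the accumulated list of `(timestamp_seconds, text, progress)` tuples plus the current progress percentage. A hypothetical consumer (the file name is made up):

```python
for transcriptions, progress in transcribe_audio_stream("meeting.mp3", "openai/whisper-base"):
    timestamp, text, _ = transcriptions[-1]
    print(f"{progress:5.1f}%  [{timestamp:>4}s]  {text}")
```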
 def merge_diarization_with_transcription(transcriptions, diarization, rate):
+    try:
+        speaker_transcriptions = []
+        for segment in diarization.itertracks(yield_label=True):
+            start, end, speaker = segment
+            start_time = start / rate
+            end_time = end / rate
+            text_segment = ""
+            for ts, text, _ in transcriptions:
+                if start_time <= ts <= end_time:
+                    text_segment += text + " "
+            speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
+        return speaker_transcriptions
+    except Exception as e:
+        raise RuntimeError(f"Error merging diarization with transcription: {e}")
 
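A reviewer-style note on this helper: as far as I can tell, `itertracks(yield_label=True)` yields `(Segment, track, label)` tuples whose `Segment.start` and `Segment.end` are already in seconds, so unpacking the tuple as `start, end, speaker` and dividing by the sample rate may not do what is intended. A sketch of the variant that reading implies (an assumption, not what this commit ships):

```python
def merge_by_seconds(transcriptions, diarization):
    speaker_transcriptions = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # turn.start / turn.end are floats in seconds; compare them directly
        # with the chunk timestamps produced by transcribe_audio_stream.
        text = " ".join(t for ts, t, _ in transcriptions if turn.start <= ts <= turn.end)
        speaker_transcriptions.append((turn.start, turn.end, speaker, text))
    return speaker_transcriptions
```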
 def detect_and_select_model(audio):
+    try:
+        wav_audio = convert_audio_to_wav(audio)
+        language = detect_language(wav_audio)
+        model_options = MODELS.get(language, MODELS["en"])
+        return language, model_options
+    except Exception as e:
+        raise RuntimeError(f"Error detecting and selecting model: {e}")
 
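Hypothetical usage, assuming `MODELS` maps a language code to a list of candidate model names (only the opening line of the dict is visible in this hunk):

```python
language, model_options = detect_and_select_model("interview.mp3")
print(language)          # e.g. 'es'
print(model_options[0])  # first candidate model for that language
```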
 def save_transcription(transcriptions, file_format):
+    try:
+        if file_format == "txt":
+            file_path = "/tmp/transcription.txt"
+            with open(file_path, "w") as f:
+                for start, end, speaker, text in transcriptions:
+                    f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
+            return file_path
+        elif file_format == "json":
+            file_path = "/tmp/transcription.json"
+            with open(file_path, "w") as f:
+                json.dump(transcriptions, f)
+            return file_path
+    except Exception as e:
+        raise RuntimeError(f"Error saving transcription: {e}")
 
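Note that an unrecognized `file_format` falls through both branches and returns `None`. Example call with made-up segments:

```python
segments = [
    (0.00, 3.25, "SPEAKER_00", "Hola a todos"),
    (3.25, 7.10, "SPEAKER_01", "Gracias por venir"),
]
print(save_transcription(segments, "txt"))   # /tmp/transcription.txt
print(save_transcription(segments, "json"))  # /tmp/transcription.json
```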
 def combined_interface(audio):
     try: