asr-inference

Running on Zero

App Files Community

ssolito committed 10 days ago

Commit

a413b59

verified ·

1 Parent(s): 9905a05

Update whisper_cs.py

Browse files

Files changed (1) hide show

whisper_cs.py +11 -9

whisper_cs.py CHANGED Viewed

@@ -25,9 +25,15 @@ def clean_text(input_text):
 def split_stereo_channels(audio_path):
-    audio = AudioSegment.from_wav(audio_path)
     channels = audio.split_to_mono()
     if len(channels) != 2:
         raise ValueError(f"Audio {audio_path} does not have 2 channels.")
@@ -127,10 +133,8 @@ def post_process_transcription(transcription, max_repeats=2):
     return cleaned_transcription
-def post_merge_consecutive_segments(input_file, output_file): #check
-    with open(input_file, "r") as f:
-        transcription_text = f.read()
     segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
     merged_transcription = ''
     current_speaker = None
@@ -153,8 +157,7 @@ def post_merge_consecutive_segments(input_file, output_file): #check
     if current_speaker is not None:
         merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
-    with open(output_file, "w") as f:
-        f.write(merged_transcription.strip())
 def cleanup_temp_files(*file_paths):
     for path in file_paths:
@@ -262,8 +265,6 @@ def generate(audio_path, use_v2):
         model = load_whisper_model(MODEL_PATH_2)
         split_stereo_channels(audio_path)
-        audio_id = os.path.splitext(os.path.basename(audio_path))[0]
         left_channel_path = "temp_mono_speaker2.wav"
         right_channel_path = "temp_mono_speaker1.wav"
@@ -309,6 +310,7 @@ def generate(audio_path, use_v2):
         clean_output = ""
         for line in aligned_text:
             clean_output += f"{line}\n"
         cleanup_temp_files(mono_audio_path,tmp_full_path)
     cleanup_temp_files(

 def split_stereo_channels(audio_path):
+    ext = os.path.splitext(audio_path)[1].lower()
+    if ext == ".wav":
+        audio = AudioSegment.from_wav(audio_path)
+    elif ext == ".mp3":
+        audio = AudioSegment.from_file(audio_path, format="mp3")
+    else:
+        raise ValueError(f"Unsupported file format: {audio_path}")
     channels = audio.split_to_mono()
     if len(channels) != 2:
         raise ValueError(f"Audio {audio_path} does not have 2 channels.")
     return cleaned_transcription
+def post_merge_consecutive_segments_from_text(transcription_text: str) -> str:
     segments = re.split(r'(\[SPEAKER_\d{2}\])', transcription_text)
     merged_transcription = ''
     current_speaker = None
     if current_speaker is not None:
         merged_transcription += f'[SPEAKER_{current_speaker}] {" ".join(current_segment)}\n'
+    return merged_transcription.strip()
 def cleanup_temp_files(*file_paths):
     for path in file_paths:
         model = load_whisper_model(MODEL_PATH_2)
         split_stereo_channels(audio_path)
         left_channel_path = "temp_mono_speaker2.wav"
         right_channel_path = "temp_mono_speaker1.wav"
         clean_output = ""
         for line in aligned_text:
             clean_output += f"{line}\n"
+        clean_output = post_merge_consecutive_segments_from_text(clean_output)
         cleanup_temp_files(mono_audio_path,tmp_full_path)
     cleanup_temp_files(