Spaces:

Justtalk
/

JusTalk

Running

App Files Files Community

A-yum1 committed on Mar 29

Commit

d8bbfec

1 Parent(s): 3966778

Update transcription.py

Browse files

Files changed (5) hide show

__pycache__/process.cpython-310.pyc +0 -0
__pycache__/transcription.cpython-310.pyc +0 -0
app.py +4 -6
process.py +31 -4
transcription.py +93 -8

__pycache__/process.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/process.cpython-310.pyc and b/__pycache__/process.cpython-310.pyc differ

__pycache__/transcription.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/transcription.cpython-310.pyc and b/__pycache__/transcription.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -123,8 +123,7 @@ def transcription():
         try:
             if not total_audio or not os.path.exists(total_audio):
                 return jsonify({"error": "No audio segments provided"}), 400
-            audio_directory = transcripter.merge_segments(total_audio, '/tmp/data/transcription_audio')
-            transcription_text = transcripter.create_transcription(audio_directory)
             print("transcription")
             print(transcription_text)
         except Exception as e:
@@ -336,10 +335,9 @@ def upload_audio():
             user_rates = {users[i]: rates[i] for i in range(len(users))}
             return jsonify({"rates": rates, "user_rates": user_rates}), 200
         else:
-            matched_time, unmatched_time, segments_dir = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
-            total_audio = transcripter.merge_segments(segments_dir)
             print("単一ユーザーの処理")
-            print(total_audio)
             total_time = matched_time + unmatched_time
             rate = (matched_time / total_time) * 100 if total_time > 0 else 0
             return jsonify({"rate": rate, "user": users[0]}), 200
@@ -382,7 +380,7 @@ def reset():
     # 一時ディレクトリのクリーンアップ
     if total_audio:
         process.delete_files_in_directory(total_audio)
-    process.delete_files_in_directory('/tmp/data/transcription_audio')
     # 書き起こしテキストの削除
     if os.path.exists(transcription_text):

         try:
             if not total_audio or not os.path.exists(total_audio):
                 return jsonify({"error": "No audio segments provided"}), 400
+            transcription_text = transcripter.create_transcription(total_audio)
             print("transcription")
             print(transcription_text)
         except Exception as e:
             user_rates = {users[i]: rates[i] for i in range(len(users))}
             return jsonify({"rates": rates, "user_rates": user_rates}), 200
         else:
+            matched_time, unmatched_time, merged_segments = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
+            total_audio = transcripter.save_marged_segments(merged_segments)
             print("単一ユーザーの処理")
             total_time = matched_time + unmatched_time
             rate = (matched_time / total_time) * 100 if total_time > 0 else 0
             return jsonify({"rate": rate, "user": users[0]}), 200
     # 一時ディレクトリのクリーンアップ
     if total_audio:
         process.delete_files_in_directory(total_audio)
+        process.delete_files_in_directory('/tmp/data/transcription_audio')
     # 書き起こしテキストの削除
     if os.path.exists(transcription_text):

process.py CHANGED Viewed

@@ -250,6 +250,10 @@ class AudioProcessor():
         入力音声からリファレンス音声に類似したセグメントを抽出する
         Parameters:
         reference_path (str): リファレンス音声のパス
         input_path (str): 入力音声のパス
         output_folder (str): 類似セグメントを保存するディレクトリ
@@ -257,8 +261,14 @@ class AudioProcessor():
         threshold (float): 類似度の閾値
         Returns:
-        tuple: (マッチした時間（ミリ秒）, マッチしなかった時間（ミリ秒）, 出力フォルダのパス)
         """
         try:
             # リファレンス音声のエンベディングを計算（長さを標準化）
             reference_embedding = self.calculate_embedding(reference_path)
@@ -294,15 +304,32 @@ class AudioProcessor():
                     if similarity > threshold:
                         shutil.copy(segment_file, output_folder)
                         matched_time_ms += len(AudioSegment.from_file(segment_file))
                 except Exception as e:
                     print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
             unmatched_time_ms = total_duration_ms - matched_time_ms
-            return matched_time_ms, unmatched_time_ms, output_folder
         except Exception as e:
             print(f"音声処理でエラーが発生しました: {e}")
-            return 0, 0, output_folder
     def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
         """

         入力音声からリファレンス音声に類似したセグメントを抽出する
         Parameters:
+        isSpeaking(bool): 現在のセグメントがリファレンス音声と類似しているか
+        wasSpeaking(bool): 1つ前のセグメントがリファレンス音声と類似しているか
+        current_segment(list): 一致している、または一致しない話者のセグメントのストック
+        merged_segments(list): 要素は(一致するか(bool), セグメントのリスト)。書き起こしに利用。
         reference_path (str): リファレンス音声のパス
         input_path (str): 入力音声のパス
         output_folder (str): 類似セグメントを保存するディレクトリ
         threshold (float): 類似度の閾値
         Returns:
+        tuple: (マッチした時間（ミリ秒）, マッチしなかった時間（ミリ秒）, 分類済みのセグメント)
         """
+        isSpeaking = None
+        wasSpeaking = None
+        current_segment=[]
+        merged_segments=[]
         try:
             # リファレンス音声のエンベディングを計算（長さを標準化）
             reference_embedding = self.calculate_embedding(reference_path)
                     if similarity > threshold:
                         shutil.copy(segment_file, output_folder)
                         matched_time_ms += len(AudioSegment.from_file(segment_file))
+                        isSpeaking = True
+                    else:
+                        isSpeaking = False
+                    # 話者が変わった場合、保存
+                    if wasSpeaking != isSpeaking:
+                        if current_segment:
+                            merged_segments.append((wasSpeaking, current_segment))
+                        wasSpeaking = isSpeaking
+                        current_segment = [segment_file]
+                    # 変わらなかった場合、結合
+                    else:
+                        current_segment.append(segment_file)
                 except Exception as e:
                     print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
+            # 余りを保存
+            if current_segment:
+                merged_segments.append((wasSpeaking, current_segment))
             unmatched_time_ms = total_duration_ms - matched_time_ms
+            return matched_time_ms, unmatched_time_ms, merged_segments
         except Exception as e:
             print(f"音声処理でエラーが発生しました: {e}")
+            return 0, 0, merged_segments
     def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
         """

transcription.py CHANGED Viewed

@@ -22,41 +22,86 @@ class TranscriptionMaker():
     #音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
     def create_transcription(self,audio_directory):
-        results = []
         #ディレクトリ内のファイルを全て取得
         if not os.path.isdir(audio_directory):
             raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
-        audio_files = os.listdir(audio_directory)
-        audio_files = sorted(os.listdir(audio_directory))
-        for audio_file in audio_files:
             if os.path.splitext(audio_file)[-1].lower() != '.wav':
                 continue
-            audio_path =  os.path.join(audio_directory, audio_file)
             try:
                 segments,info = list(self.model.transcribe(audio_path))
             except Exception as e:
                 print(f"Error transcripting file {audio_path}: {e}")
                 raise
             sorted_segments = sorted(segments, key=lambda s: s.start)
             for segment in sorted_segments:
                 results.append({
                     "start": segment.start,
                     "end": segment.end,
                     "text": segment.text
                 })
         #ファイルの書き込み。ファイル名は"transcription.txt"
         output_file=os.path.join(self.output_dir,"transcription.txt")
         try:
             with open(output_file,"w",encoding="utf-8") as f:
-                for result in results:
-                    f.write(f"{result['text']}\n")
         except OSError as e:
             print(f"Error writing transcription file: {e}")
             raise
         return output_file
     #ファイル名が連続しているならくっつける
     def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir, exist_ok=True)
@@ -97,7 +142,47 @@ class TranscriptionMaker():
             output_file = os.path.join(output_dir, self.generate_filename(3))
             combined_audio.export(output_file, format='wav')
-        return output_dir
     def generate_random_string(self,length):
         letters = string.ascii_letters + string.digits

     # Take a directory of audio files and create the transcription file.
     def create_transcription(self,audio_directory):
         """
         Transcribe the speaker turns stored in ``audio_directory``.

         The directory's audio files are ordered with
         ``sort_audio_files_in_directory``, grouped per speaker with
         ``combine_audio`` and written to ``/tmp/data/transcription_audio`` by
         ``save_marged_segments``. Every merged file is then passed to
         ``self.model.transcribe`` and contributes one ``"<speaker>: <text>"``
         line, where the speaker is the file-name part before the first ``_``.
         Files whose transcription is empty are skipped.

         Parameters:
         audio_directory (str): directory containing the audio segments

         Returns:
         str: path of ``transcription.txt`` inside ``self.output_dir``

         Raises:
         ValueError: ``audio_directory`` is not an existing directory
         OSError: the transcription file could not be written
         Exception: any error raised by the model is logged and re-raised
         """
         if not os.path.isdir(audio_directory):
             raise ValueError(f"The specified path is not a valid directory: {audio_directory}")

         # Merge consecutive segments of the same speaker before transcribing.
         audio_files = self.sort_audio_files_in_directory(audio_directory)
         merged_segments = self.combine_audio(audio_files)
         merged_audio_directory = self.save_marged_segments(merged_segments, output_directory='/tmp/data/transcription_audio')
         merged_files = self.sort_audio_files_in_directory(merged_audio_directory)

         conversation = []
         for audio_file in merged_files:
             if os.path.splitext(audio_file)[-1].lower() != '.wav':
                 continue
             audio_path = os.path.join(merged_audio_directory, audio_file)
             try:
                 # transcribe() is unpacked as a (segments, info) pair.
                 segments, _info = self.model.transcribe(audio_path)
                 # Consume the segments inside the try so that a failure raised
                 # while iterating them is reported against this file as well.
                 ordered_segments = sorted(segments, key=lambda s: s.start)
             except Exception as e:
                 print(f"Error transcripting file {audio_path}: {e}")
                 raise
             combined_text = "".join(segment.text for segment in ordered_segments)
             # Skip silent files.
             if not combined_text:
                 continue
             speaker = os.path.basename(audio_file).split("_")[0]
             conversation.append(f"{speaker}: {combined_text}")

         # Write the result; the file name is always "transcription.txt".
         output_file=os.path.join(self.output_dir,"transcription.txt")
         print(conversation)
         try:
             with open(output_file,"w",encoding="utf-8") as f:
                 # One turn per line; without the newline all turns run together.
                 f.writelines(f"{line}\n" for line in conversation)
         except OSError as e:
             print(f"Error writing transcription file: {e}")
             raise
         return output_file
+    def combine_audio(self,audio_files):
+        if not audio_files:
+            raise
+        merged_segments = []
+        current_speaker = None
+        current_segment = []
+        for segment in audio_files:
+            speaker = os.path.basename(segment).split("_")[0]
+            if speaker != current_speaker:
+                # 話者が変わった場合はセグメントを保存
+                if current_segment:
+                    merged_segments.append((current_speaker, current_segment))
+                current_speaker = speaker
+                current_segment = [segment]
+            else:
+                # 話者が同一の場合はセグメントを結合
+                current_segment.append(segment)
+        # 最後のセグメントを保存
+        if current_segment:
+            merged_segments.append((current_speaker, current_segment))
+        return merged_segments
+    # ディレクトリ内の音声ファイルを並べ替える
+    def sort_audio_files_in_directory(self, directory):
+        files = os.listdir(directory)
+        audio_files = [f for f in files if f.endswith(".wav")]
+        audio_files.sort(key=lambda x: datetime.strptime(x.split("_")[1].split(".")[0], "%Y%m%d%H%M%S"))
+        return [os.path.join(directory, f) for f in audio_files]
     #ファイル名が連続しているならくっつける
+    '''
     def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir, exist_ok=True)
             output_file = os.path.join(output_dir, self.generate_filename(3))
             combined_audio.export(output_file, format='wav')
+        return output_dir'''
+    def save_marged_segments(self,merged_segments,output_directory='/tmp/data/conversations'):
+        if not merged_segments:
+            print("merged_segmentsが見つかりませんでした。")
+            raise
+        conversation = []
+        for speaker, segments in merged_segments:
+            combined_audio = self.merge_segments(segments)
+            conversation.append((speaker,combined_audio))
+        if not os.path.exists(output_directory):
+            os.makedirs(output_directory)
+        for i, (speaker, combined_audio) in enumerate(conversation):
+            current_time = datetime.now().strftime("%Y%m%d%H%M%S")
+            filename = f"{speaker}_{current_time}.wav"
+            file_path = os.path.join(output_directory,filename)
+            combined_audio.export(file_path,format = "wav")
+            print(f"Saved: {file_path}")
+        return output_directory
+    def merge_segments(self,segments):
+        combined = AudioSegment.empty()  # 空のAudioSegmentを初期化
+        for segment in segments:
+            if isinstance(segment, str):
+                # セグメントがファイルパスの場合、読み込む
+                audio = AudioSegment.from_file(segment)
+            elif isinstance(segment, AudioSegment):
+                # セグメントがすでにAudioSegmentの場合、そのまま使用
+                audio = segment
+            else:
+                raise ValueError("Invalid segment type. Must be file path or AudioSegment.")
+            combined += audio
+        return combined
     def generate_random_string(self,length):
         letters = string.ascii_letters + string.digits