A-yum1 committed on
Commit
d8bbfec
·
1 Parent(s): 3966778

Update transcription.py

Browse files
__pycache__/process.cpython-310.pyc CHANGED
Binary files a/__pycache__/process.cpython-310.pyc and b/__pycache__/process.cpython-310.pyc differ
 
__pycache__/transcription.cpython-310.pyc CHANGED
Binary files a/__pycache__/transcription.cpython-310.pyc and b/__pycache__/transcription.cpython-310.pyc differ
 
app.py CHANGED
@@ -123,8 +123,7 @@ def transcription():
123
  try:
124
  if not total_audio or not os.path.exists(total_audio):
125
  return jsonify({"error": "No audio segments provided"}), 400
126
- audio_directory = transcripter.merge_segments(total_audio, '/tmp/data/transcription_audio')
127
- transcription_text = transcripter.create_transcription(audio_directory)
128
  print("transcription")
129
  print(transcription_text)
130
  except Exception as e:
@@ -336,10 +335,9 @@ def upload_audio():
336
  user_rates = {users[i]: rates[i] for i in range(len(users))}
337
  return jsonify({"rates": rates, "user_rates": user_rates}), 200
338
  else:
339
- matched_time, unmatched_time, segments_dir = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
340
- total_audio = transcripter.merge_segments(segments_dir)
341
  print("単一ユーザーの処理")
342
- print(total_audio)
343
  total_time = matched_time + unmatched_time
344
  rate = (matched_time / total_time) * 100 if total_time > 0 else 0
345
  return jsonify({"rate": rate, "user": users[0]}), 200
@@ -382,7 +380,7 @@ def reset():
382
  # 一時ディレクトリのクリーンアップ
383
  if total_audio:
384
  process.delete_files_in_directory(total_audio)
385
- process.delete_files_in_directory('/tmp/data/transcription_audio')
386
 
387
  # 書き起こしテキストの削除
388
  if os.path.exists(transcription_text):
 
123
  try:
124
  if not total_audio or not os.path.exists(total_audio):
125
  return jsonify({"error": "No audio segments provided"}), 400
126
+ transcription_text = transcripter.create_transcription(total_audio)
 
127
  print("transcription")
128
  print(transcription_text)
129
  except Exception as e:
 
335
  user_rates = {users[i]: rates[i] for i in range(len(users))}
336
  return jsonify({"rates": rates, "user_rates": user_rates}), 200
337
  else:
338
+ matched_time, unmatched_time, merged_segments = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
339
+ total_audio = transcripter.save_marged_segments(merged_segments)
340
  print("単一ユーザーの処理")
 
341
  total_time = matched_time + unmatched_time
342
  rate = (matched_time / total_time) * 100 if total_time > 0 else 0
343
  return jsonify({"rate": rate, "user": users[0]}), 200
 
380
  # 一時ディレクトリのクリーンアップ
381
  if total_audio:
382
  process.delete_files_in_directory(total_audio)
383
+ process.delete_files_in_directory('/tmp/data/transcription_audio')
384
 
385
  # 書き起こしテキストの削除
386
  if os.path.exists(transcription_text):
process.py CHANGED
@@ -250,6 +250,10 @@ class AudioProcessor():
250
  入力音声からリファレンス音声に類似したセグメントを抽出する
251
 
252
  Parameters:
 
 
 
 
253
  reference_path (str): リファレンス音声のパス
254
  input_path (str): 入力音声のパス
255
  output_folder (str): 類似セグメントを保存するディレクトリ
@@ -257,8 +261,14 @@ class AudioProcessor():
257
  threshold (float): 類似度の閾値
258
 
259
  Returns:
260
- tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒), 出力フォルダのパス)
261
  """
 
 
 
 
 
 
262
  try:
263
  # リファレンス音声のエンベディングを計算(長さを標準化)
264
  reference_embedding = self.calculate_embedding(reference_path)
@@ -294,15 +304,32 @@ class AudioProcessor():
294
  if similarity > threshold:
295
  shutil.copy(segment_file, output_folder)
296
  matched_time_ms += len(AudioSegment.from_file(segment_file))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  except Exception as e:
298
  print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
299
-
 
 
 
300
  unmatched_time_ms = total_duration_ms - matched_time_ms
301
- return matched_time_ms, unmatched_time_ms, output_folder
302
 
303
  except Exception as e:
304
  print(f"音声処理でエラーが発生しました: {e}")
305
- return 0, 0, output_folder
306
 
307
  def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
308
  """
 
250
  入力音声からリファレンス音声に類似したセグメントを抽出する
251
 
252
  Parameters:
253
+ isSpeaking(bool): 現在のセグメントがリファレンス音声と類似しているか
254
+ wasSpeaking(bool): 1つ前のセグメントがリファレンス音声と類似しているか
255
+ current_segment(list): 一致している、または一致しない話者のセグメントのストック
256
+ merged_segments(list): 要素は(一致するか(bool), セグメントのリスト)。書き起こしに利用。
257
  reference_path (str): リファレンス音声のパス
258
  input_path (str): 入力音声のパス
259
  output_folder (str): 類似セグメントを保存するディレクトリ
 
261
  threshold (float): 類似度の閾値
262
 
263
  Returns:
264
+ tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒), 分類済みのセグメント)
265
  """
266
+
267
+ isSpeaking = None
268
+ wasSpeaking = None
269
+ current_segment=[]
270
+ merged_segments=[]
271
+
272
  try:
273
  # リファレンス音声のエンベディングを計算(長さを標準化)
274
  reference_embedding = self.calculate_embedding(reference_path)
 
304
  if similarity > threshold:
305
  shutil.copy(segment_file, output_folder)
306
  matched_time_ms += len(AudioSegment.from_file(segment_file))
307
+ isSpeaking = True
308
+ else:
309
+ isSpeaking = False
310
+
311
+ # 話者が変わった場合、保存
312
+ if wasSpeaking != isSpeaking:
313
+ if current_segment:
314
+ merged_segments.append((wasSpeaking, current_segment))
315
+ wasSpeaking = isSpeaking
316
+ current_segment = [segment_file]
317
+ # 変わらなかった場合、結合
318
+ else:
319
+ current_segment.append(segment_file)
320
+
321
  except Exception as e:
322
  print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
323
+ # 余りを保存
324
+ if current_segment:
325
+ merged_segments.append((wasSpeaking, current_segment))
326
+
327
  unmatched_time_ms = total_duration_ms - matched_time_ms
328
+ return matched_time_ms, unmatched_time_ms, merged_segments
329
 
330
  except Exception as e:
331
  print(f"音声処理でエラーが発生しました: {e}")
332
+ return 0, 0, merged_segments
333
 
334
  def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
335
  """
transcription.py CHANGED
@@ -22,41 +22,86 @@ class TranscriptionMaker():
22
 
23
  #音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
24
  def create_transcription(self,audio_directory):
25
- results = []
26
 
27
  #ディレクトリ内のファイルを全て取得
28
  if not os.path.isdir(audio_directory):
29
  raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
30
- audio_files = os.listdir(audio_directory)
31
- audio_files = sorted(os.listdir(audio_directory))
32
- for audio_file in audio_files:
 
 
 
33
  if os.path.splitext(audio_file)[-1].lower() != '.wav':
34
  continue
35
- audio_path = os.path.join(audio_directory, audio_file)
36
  try:
37
  segments,info = list(self.model.transcribe(audio_path))
38
  except Exception as e:
39
  print(f"Error transcripting file {audio_path}: {e}")
40
  raise
41
  sorted_segments = sorted(segments, key=lambda s: s.start)
 
42
  for segment in sorted_segments:
43
  results.append({
44
  "start": segment.start,
45
  "end": segment.end,
46
  "text": segment.text
47
  })
 
 
 
 
 
 
 
48
  #ファイルの書き込み。ファイル名は"transcription.txt"
49
  output_file=os.path.join(self.output_dir,"transcription.txt")
 
50
  try:
51
  with open(output_file,"w",encoding="utf-8") as f:
52
- for result in results:
53
- f.write(f"{result['text']}\n")
54
  except OSError as e:
55
  print(f"Error writing transcription file: {e}")
56
  raise
57
  return output_file
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  #ファイル名が連続しているならくっつける
 
60
  def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
61
  if not os.path.exists(output_dir):
62
  os.makedirs(output_dir, exist_ok=True)
@@ -97,7 +142,47 @@ class TranscriptionMaker():
97
  output_file = os.path.join(output_dir, self.generate_filename(3))
98
  combined_audio.export(output_file, format='wav')
99
 
100
- return output_dir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def generate_random_string(self,length):
103
  letters = string.ascii_letters + string.digits
 
22
 
23
  #音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
24
def create_transcription(self, audio_directory):
    """Transcribe every speaker turn found in *audio_directory* and write
    the conversation to "<self.output_dir>/transcription.txt".

    The directory's .wav files (named "<speaker>_<YYYYmmddHHMMSS>.wav") are
    sorted chronologically, consecutive files of the same speaker are merged
    into one audio file under /tmp/data/transcription_audio, and each merged
    file is run through self.model.transcribe. One "speaker: text" line is
    written per non-silent turn.

    Parameters:
        audio_directory (str): directory holding the per-segment .wav files.

    Returns:
        str: path of the written transcription file.

    Raises:
        ValueError: if audio_directory is not a valid directory.
        OSError: if the transcription file cannot be written.
    """
    conversation = []

    # Get all files in the directory
    if not os.path.isdir(audio_directory):
        raise ValueError(f"The specified path is not a valid directory: {audio_directory}")

    # Chronological order, then one merged audio file per consecutive speaker turn.
    audio_files = self.sort_audio_files_in_directory(audio_directory)
    merged_segments = self.combine_audio(audio_files)
    merged_audio_directory = self.save_marged_segments(merged_segments, output_directory='/tmp/data/transcription_audio')
    merged_files = self.sort_audio_files_in_directory(merged_audio_directory)

    for audio_file in merged_files:
        if os.path.splitext(audio_file)[-1].lower() != '.wav':
            continue
        # sort_audio_files_in_directory already returns absolute paths;
        # os.path.join with an absolute second argument leaves it unchanged.
        audio_path = os.path.join(merged_audio_directory, audio_file)
        try:
            # transcribe returns (segment_iterable, info); list() keeps the pair intact.
            segments, info = list(self.model.transcribe(audio_path))
        except Exception as e:
            print(f"Error transcribing file {audio_path}: {e}")
            raise
        sorted_segments = sorted(segments, key=lambda s: s.start)
        combined_text = "".join(segment.text for segment in sorted_segments)
        # The file name encodes the speaker: "<speaker>_<timestamp>.wav".
        speaker = os.path.basename(audio_file).split("_")[0]
        # Skip silent turns
        if not combined_text:
            continue
        conversation.append(f"{speaker}: {combined_text}")

    # Write the file. The file name is "transcription.txt"
    output_file = os.path.join(self.output_dir, "transcription.txt")
    print(conversation)
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for line in conversation:
                # BUG FIX: the newline was dropped in this revision, fusing
                # every speaker turn into a single line; restore one line per turn.
                f.write(line + "\n")
    except OSError as e:
        print(f"Error writing transcription file: {e}")
        raise
    return output_file
70
 
71
def combine_audio(self, audio_files):
    """Group consecutive audio files by speaker.

    Each path's basename is expected to start with "<speaker>_"; a new group
    starts whenever the speaker prefix changes from the previous file.

    Parameters:
        audio_files (list[str]): chronologically ordered audio file paths.

    Returns:
        list[tuple[str, list[str]]]: (speaker, consecutive file paths) pairs,
        in input order.

    Raises:
        ValueError: if audio_files is empty.
    """
    if not audio_files:
        # BUG FIX: a bare `raise` outside an except block raises
        # "RuntimeError: No active exception to re-raise"; raise a clear error.
        raise ValueError("combine_audio: audio_files is empty")
    merged_segments = []
    current_speaker = None
    current_segment = []
    for segment in audio_files:
        speaker = os.path.basename(segment).split("_")[0]
        if speaker != current_speaker:
            # Speaker changed: flush the previous run of segments
            if current_segment:
                merged_segments.append((current_speaker, current_segment))
            current_speaker = speaker
            current_segment = [segment]
        else:
            # Same speaker: extend the current run
            current_segment.append(segment)
    # Flush the final run
    if current_segment:
        merged_segments.append((current_speaker, current_segment))

    return merged_segments
93
+
94
+
95
+ # ディレクトリ内の音声ファイルを並べ替える
96
def sort_audio_files_in_directory(self, directory):
    """Return the full paths of the .wav files in *directory*, ordered by
    the timestamp embedded in their names ("<speaker>_<YYYYmmddHHMMSS>.wav").
    """
    def timestamp_key(name):
        # "<speaker>_<stamp>.wav" -> datetime parsed from <stamp>
        stamp = name.split("_")[1].split(".")[0]
        return datetime.strptime(stamp, "%Y%m%d%H%M%S")

    wav_names = sorted(
        (name for name in os.listdir(directory) if name.endswith(".wav")),
        key=timestamp_key,
    )
    return [os.path.join(directory, name) for name in wav_names]
102
+
103
  #ファイル名が連続しているならくっつける
104
+ '''
105
  def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
106
  if not os.path.exists(output_dir):
107
  os.makedirs(output_dir, exist_ok=True)
 
142
  output_file = os.path.join(output_dir, self.generate_filename(3))
143
  combined_audio.export(output_file, format='wav')
144
 
145
+ return output_dir'''
146
+
147
def save_marged_segments(self, merged_segments, output_directory='/tmp/data/conversations'):
    """Export each (speaker, segments) group as one .wav file.

    Every group is concatenated via self.merge_segments and written to
    *output_directory* as "<speaker>_<YYYYmmddHHMMSS>.wav", so the timestamp
    in the file name preserves the conversation order for
    sort_audio_files_in_directory.

    Parameters:
        merged_segments (list[tuple[str, list]]): output of combine_audio /
            process_audio — (speaker label, segment file paths or AudioSegments).
        output_directory (str): destination directory, created if missing.

    Returns:
        str: the output directory path.

    Raises:
        ValueError: if merged_segments is empty.
    """
    from datetime import timedelta  # local import: only needed here

    if not merged_segments:
        print("merged_segmentsが見つかりませんでした。")
        # BUG FIX: a bare `raise` outside an except block raises
        # "RuntimeError: No active exception to re-raise"; raise a clear error.
        raise ValueError("save_marged_segments: merged_segments is empty")

    conversation = []
    for speaker, segments in merged_segments:
        combined_audio = self.merge_segments(segments)
        conversation.append((speaker, combined_audio))
    os.makedirs(output_directory, exist_ok=True)

    # BUG FIX: calling datetime.now() per file gave identical names (and thus
    # overwrites / lost ordering) for turns exported within the same second.
    # Offset one base time by the turn index instead: names stay unique and
    # strictly increasing, and still parse as "%Y%m%d%H%M%S".
    base_time = datetime.now()
    for i, (speaker, combined_audio) in enumerate(conversation):
        stamp = (base_time + timedelta(seconds=i)).strftime("%Y%m%d%H%M%S")
        filename = f"{speaker}_{stamp}.wav"
        file_path = os.path.join(output_directory, filename)
        combined_audio.export(file_path, format="wav")
        print(f"Saved: {file_path}")

    return output_directory
167
+
168
+
169
def merge_segments(self, segments):
    """Concatenate *segments* into a single AudioSegment.

    Each element may be a file path (loaded with AudioSegment.from_file) or
    an AudioSegment instance (used as-is).

    Raises:
        ValueError: if an element is neither a str nor an AudioSegment.
    """
    def as_audio(item):
        # Normalise one element to an AudioSegment.
        if isinstance(item, AudioSegment):
            return item
        if isinstance(item, str):
            return AudioSegment.from_file(item)
        raise ValueError("Invalid segment type. Must be file path or AudioSegment.")

    merged = AudioSegment.empty()
    for item in segments:
        merged = merged + as_audio(item)
    return merged
184
+
185
+
186
 
187
  def generate_random_string(self,length):
188
  letters = string.ascii_letters + string.digits