Spaces:
Running
Running
Update transcription.py
Browse files- __pycache__/process.cpython-310.pyc +0 -0
- __pycache__/transcription.cpython-310.pyc +0 -0
- app.py +4 -6
- process.py +31 -4
- transcription.py +93 -8
__pycache__/process.cpython-310.pyc
CHANGED
Binary files a/__pycache__/process.cpython-310.pyc and b/__pycache__/process.cpython-310.pyc differ
|
|
__pycache__/transcription.cpython-310.pyc
CHANGED
Binary files a/__pycache__/transcription.cpython-310.pyc and b/__pycache__/transcription.cpython-310.pyc differ
|
|
app.py
CHANGED
@@ -123,8 +123,7 @@ def transcription():
|
|
123 |
try:
|
124 |
if not total_audio or not os.path.exists(total_audio):
|
125 |
return jsonify({"error": "No audio segments provided"}), 400
|
126 |
-
|
127 |
-
transcription_text = transcripter.create_transcription(audio_directory)
|
128 |
print("transcription")
|
129 |
print(transcription_text)
|
130 |
except Exception as e:
|
@@ -336,10 +335,9 @@ def upload_audio():
|
|
336 |
user_rates = {users[i]: rates[i] for i in range(len(users))}
|
337 |
return jsonify({"rates": rates, "user_rates": user_rates}), 200
|
338 |
else:
|
339 |
-
matched_time, unmatched_time,
|
340 |
-
total_audio = transcripter.
|
341 |
print("単一ユーザーの処理")
|
342 |
-
print(total_audio)
|
343 |
total_time = matched_time + unmatched_time
|
344 |
rate = (matched_time / total_time) * 100 if total_time > 0 else 0
|
345 |
return jsonify({"rate": rate, "user": users[0]}), 200
|
@@ -382,7 +380,7 @@ def reset():
|
|
382 |
# 一時ディレクトリのクリーンアップ
|
383 |
if total_audio:
|
384 |
process.delete_files_in_directory(total_audio)
|
385 |
-
|
386 |
|
387 |
# 書き起こしテキストの削除
|
388 |
if os.path.exists(transcription_text):
|
|
|
123 |
try:
|
124 |
if not total_audio or not os.path.exists(total_audio):
|
125 |
return jsonify({"error": "No audio segments provided"}), 400
|
126 |
+
transcription_text = transcripter.create_transcription(total_audio)
|
|
|
127 |
print("transcription")
|
128 |
print(transcription_text)
|
129 |
except Exception as e:
|
|
|
335 |
user_rates = {users[i]: rates[i] for i in range(len(users))}
|
336 |
return jsonify({"rates": rates, "user_rates": user_rates}), 200
|
337 |
else:
|
338 |
+
matched_time, unmatched_time, merged_segments = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
|
339 |
+
total_audio = transcripter.save_marged_segments(merged_segments)
|
340 |
print("単一ユーザーの処理")
|
|
|
341 |
total_time = matched_time + unmatched_time
|
342 |
rate = (matched_time / total_time) * 100 if total_time > 0 else 0
|
343 |
return jsonify({"rate": rate, "user": users[0]}), 200
|
|
|
380 |
# 一時ディレクトリのクリーンアップ
|
381 |
if total_audio:
|
382 |
process.delete_files_in_directory(total_audio)
|
383 |
+
process.delete_files_in_directory('/tmp/data/transcription_audio')
|
384 |
|
385 |
# 書き起こしテキストの削除
|
386 |
if os.path.exists(transcription_text):
|
process.py
CHANGED
@@ -250,6 +250,10 @@ class AudioProcessor():
|
|
250 |
入力音声からリファレンス音声に類似したセグメントを抽出する
|
251 |
|
252 |
Parameters:
|
|
|
|
|
|
|
|
|
253 |
reference_path (str): リファレンス音声のパス
|
254 |
input_path (str): 入力音声のパス
|
255 |
output_folder (str): 類似セグメントを保存するディレクトリ
|
@@ -257,8 +261,14 @@ class AudioProcessor():
|
|
257 |
threshold (float): 類似度の閾値
|
258 |
|
259 |
Returns:
|
260 |
-
tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒),
|
261 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
try:
|
263 |
# リファレンス音声のエンベディングを計算(長さを標準化)
|
264 |
reference_embedding = self.calculate_embedding(reference_path)
|
@@ -294,15 +304,32 @@ class AudioProcessor():
|
|
294 |
if similarity > threshold:
|
295 |
shutil.copy(segment_file, output_folder)
|
296 |
matched_time_ms += len(AudioSegment.from_file(segment_file))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
except Exception as e:
|
298 |
print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
|
299 |
-
|
|
|
|
|
|
|
300 |
unmatched_time_ms = total_duration_ms - matched_time_ms
|
301 |
-
return matched_time_ms, unmatched_time_ms,
|
302 |
|
303 |
except Exception as e:
|
304 |
print(f"音声処理でエラーが発生しました: {e}")
|
305 |
-
return 0, 0,
|
306 |
|
307 |
def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
|
308 |
"""
|
|
|
250 |
入力音声からリファレンス音声に類似したセグメントを抽出する
|
251 |
|
252 |
Parameters:
|
253 |
+
isSpeaking(bool): 現在のセグメントがリファレンス音声と類似しているか
|
254 |
+
wasSpeaking(bool): 1つ前のセグメントがリファレンス音声と類似しているか
|
255 |
+
current_segment(list): 一致している、または一致しない話者のセグメントのストック
|
256 |
+
merged_segments(list): 要素は(一致するか(bool), セグメントのリスト)。書き起こしに利用。
|
257 |
reference_path (str): リファレンス音声のパス
|
258 |
input_path (str): 入力音声のパス
|
259 |
output_folder (str): 類似セグメントを保存するディレクトリ
|
|
|
261 |
threshold (float): 類似度の閾値
|
262 |
|
263 |
Returns:
|
264 |
+
tuple: (マッチした時間(ミリ秒), マッチしなかった時間(ミリ秒), 分類済みのセグメント)
|
265 |
"""
|
266 |
+
|
267 |
+
isSpeaking = None
|
268 |
+
wasSpeaking = None
|
269 |
+
current_segment=[]
|
270 |
+
merged_segments=[]
|
271 |
+
|
272 |
try:
|
273 |
# リファレンス音声のエンベディングを計算(長さを標準化)
|
274 |
reference_embedding = self.calculate_embedding(reference_path)
|
|
|
304 |
if similarity > threshold:
|
305 |
shutil.copy(segment_file, output_folder)
|
306 |
matched_time_ms += len(AudioSegment.from_file(segment_file))
|
307 |
+
isSpeaking = True
|
308 |
+
else:
|
309 |
+
isSpeaking = False
|
310 |
+
|
311 |
+
# 話者が変わった場合、保存
|
312 |
+
if wasSpeaking != isSpeaking:
|
313 |
+
if current_segment:
|
314 |
+
merged_segments.append((wasSpeaking, current_segment))
|
315 |
+
wasSpeaking = isSpeaking
|
316 |
+
current_segment = [segment_file]
|
317 |
+
# 変わらなかった場合、結合
|
318 |
+
else:
|
319 |
+
current_segment.append(segment_file)
|
320 |
+
|
321 |
except Exception as e:
|
322 |
print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")
|
323 |
+
# 余りを保存
|
324 |
+
if current_segment:
|
325 |
+
merged_segments.append((wasSpeaking, current_segment))
|
326 |
+
|
327 |
unmatched_time_ms = total_duration_ms - matched_time_ms
|
328 |
+
return matched_time_ms, unmatched_time_ms, merged_segments
|
329 |
|
330 |
except Exception as e:
|
331 |
print(f"音声処理でエラーが発生しました: {e}")
|
332 |
+
return 0, 0, merged_segments
|
333 |
|
334 |
def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
|
335 |
"""
|
transcription.py
CHANGED
@@ -22,41 +22,86 @@ class TranscriptionMaker():
|
|
22 |
|
23 |
#音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
|
24 |
def create_transcription(self,audio_directory):
|
25 |
-
|
26 |
|
27 |
#ディレクトリ内のファイルを全て取得
|
28 |
if not os.path.isdir(audio_directory):
|
29 |
raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
|
30 |
-
audio_files =
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
33 |
if os.path.splitext(audio_file)[-1].lower() != '.wav':
|
34 |
continue
|
35 |
-
audio_path = os.path.join(
|
36 |
try:
|
37 |
segments,info = list(self.model.transcribe(audio_path))
|
38 |
except Exception as e:
|
39 |
print(f"Error transcripting file {audio_path}: {e}")
|
40 |
raise
|
41 |
sorted_segments = sorted(segments, key=lambda s: s.start)
|
|
|
42 |
for segment in sorted_segments:
|
43 |
results.append({
|
44 |
"start": segment.start,
|
45 |
"end": segment.end,
|
46 |
"text": segment.text
|
47 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
#ファイルの書き込み。ファイル名は"transcription.txt"
|
49 |
output_file=os.path.join(self.output_dir,"transcription.txt")
|
|
|
50 |
try:
|
51 |
with open(output_file,"w",encoding="utf-8") as f:
|
52 |
-
for result in
|
53 |
-
f.write(
|
54 |
except OSError as e:
|
55 |
print(f"Error writing transcription file: {e}")
|
56 |
raise
|
57 |
return output_file
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
#ファイル名が連続しているならくっつける
|
|
|
60 |
def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
|
61 |
if not os.path.exists(output_dir):
|
62 |
os.makedirs(output_dir, exist_ok=True)
|
@@ -97,7 +142,47 @@ class TranscriptionMaker():
|
|
97 |
output_file = os.path.join(output_dir, self.generate_filename(3))
|
98 |
combined_audio.export(output_file, format='wav')
|
99 |
|
100 |
-
return output_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
def generate_random_string(self,length):
|
103 |
letters = string.ascii_letters + string.digits
|
|
|
22 |
|
23 |
#音声ファイルのディレクトリを受け取り、書き起こしファイルを作成する
|
24 |
def create_transcription(self,audio_directory):
    """Transcribe the speaker turns found in ``audio_directory`` into a text file.

    The ``.wav`` files in the directory are ordered by the timestamp in their
    names, consecutive files from the same speaker are merged into one clip
    (written to ``/tmp/data/transcription_audio``), and each merged clip is
    transcribed with ``self.model``. The result is written to
    ``<self.output_dir>/transcription.txt`` with one ``"<speaker>: <text>"``
    line per turn.

    Parameters:
        audio_directory (str): Directory holding ``<speaker>_<timestamp>.wav`` files.

    Returns:
        str: Path of the transcription file that was written.

    Raises:
        ValueError: If ``audio_directory`` is not an existing directory.
        OSError: If the transcription file cannot be written.
        Exception: Any error raised by the model is printed and re-raised.
    """
    if not os.path.isdir(audio_directory):
        raise ValueError(f"The specified path is not a valid directory: {audio_directory}")

    # Order the raw segments, group them into speaker turns, and save one clip per turn.
    audio_files = self.sort_audio_files_in_directory(audio_directory)
    merged_segments = self.combine_audio(audio_files)
    merged_audio_directory = self.save_marged_segments(merged_segments, output_directory='/tmp/data/transcription_audio')
    merged_files = self.sort_audio_files_in_directory(merged_audio_directory)

    conversation = []
    for audio_path in merged_files:
        if os.path.splitext(audio_path)[-1].lower() != '.wav':
            continue
        # sort_audio_files_in_directory already returns paths joined with the
        # directory; joining again would duplicate a relative directory prefix.
        try:
            segments, _info = self.model.transcribe(audio_path)
            # The segments may be a lazy iterable, so consume them inside the
            # try block to report errors raised while iterating as well.
            ordered_segments = sorted(segments, key=lambda s: s.start)
        except Exception as e:
            print(f"Error transcripting file {audio_path}: {e}")
            raise

        combined_text = "".join(segment.text for segment in ordered_segments)
        # Skip turns where nothing was recognised (silence).
        if not combined_text:
            continue
        speaker = os.path.basename(audio_path).split("_")[0]
        conversation.append(f"{speaker}: {combined_text}")

    # Write the transcript; the file name is always "transcription.txt".
    output_file = os.path.join(self.output_dir, "transcription.txt")
    print(conversation)
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            # One turn per line; without the newline all turns run together.
            f.writelines(f"{turn}\n" for turn in conversation)
    except OSError as e:
        print(f"Error writing transcription file: {e}")
        raise
    return output_file
|
70 |
|
71 |
+
def combine_audio(self,audio_files):
    """Group an ordered list of audio files into consecutive speaker turns.

    The speaker is the part of each file's base name before the first ``_``.
    Adjacent files with the same speaker are collected into one group; a
    speaker who talks again later starts a new group.

    Parameters:
        audio_files (list[str]): Audio file paths, already in playback order.

    Returns:
        list[tuple[str, list[str]]]: ``(speaker, files)`` pairs in input order.

    Raises:
        ValueError: If ``audio_files`` is empty.
    """
    if not audio_files:
        # A bare ``raise`` here has no active exception to re-raise and would
        # surface as an unrelated RuntimeError; fail with an explicit error.
        raise ValueError("No audio files were provided to combine.")

    merged_segments = []
    current_speaker = None
    current_segment = []
    for segment in audio_files:
        speaker = os.path.basename(segment).split("_")[0]
        if speaker == current_speaker:
            # Same speaker keeps talking: extend the current turn.
            current_segment.append(segment)
            continue
        # Speaker changed: close the previous turn (if any) and open a new one.
        if current_segment:
            merged_segments.append((current_speaker, current_segment))
        current_speaker = speaker
        current_segment = [segment]

    # Flush the final turn.
    if current_segment:
        merged_segments.append((current_speaker, current_segment))

    return merged_segments
|
93 |
+
|
94 |
+
|
95 |
+
# ディレクトリ内の音声ファイルを並べ替える
|
96 |
+
def sort_audio_files_in_directory(self, directory):
    """Return the ``.wav`` files in ``directory`` ordered by filename timestamp.

    File names are expected to look like ``<speaker>_<YYYYmmddHHMMSS>.wav``.
    Files sharing the same timestamp are ordered by file name so the result
    does not depend on the arbitrary order in which the directory is listed.

    Parameters:
        directory (str): Directory to scan (not recursive).

    Returns:
        list[str]: Paths (``directory`` joined with each file name), oldest first.

    Raises:
        IndexError: If a ``.wav`` file name contains no ``_``.
        ValueError: If the timestamp part does not match ``%Y%m%d%H%M%S``.
    """
    def sort_key(name):
        stamp = name.split("_")[1].split(".")[0]
        # The file name is a tie-breaker for timestamps within the same second.
        return (datetime.strptime(stamp, "%Y%m%d%H%M%S"), name)

    audio_files = [f for f in os.listdir(directory) if f.endswith(".wav")]
    audio_files.sort(key=sort_key)
    return [os.path.join(directory, f) for f in audio_files]
|
102 |
+
|
103 |
#ファイル名が連続しているならくっつける
|
104 |
+
'''
|
105 |
def merge_segments(self,segments_dir,output_dir = "/tmp/data/merged_segment"):
|
106 |
if not os.path.exists(output_dir):
|
107 |
os.makedirs(output_dir, exist_ok=True)
|
|
|
142 |
output_file = os.path.join(output_dir, self.generate_filename(3))
|
143 |
combined_audio.export(output_file, format='wav')
|
144 |
|
145 |
+
return output_dir'''
|
146 |
+
|
147 |
+
def save_marged_segments(self,merged_segments,output_directory='/tmp/data/conversations'):
    """Merge each speaker turn into one clip and save it as a ``.wav`` file.

    Each turn is written as ``<speaker>_<YYYYmmddHHMMSS>.wav``. The timestamp
    only encodes ordering: turn ``i`` is stamped with the call's start time
    plus ``i`` seconds. Stamping every file with the current wall-clock second
    let turns exported within the same second share a name, so a later turn
    overwrote an earlier turn of the same speaker and ordering was lost.

    Parameters:
        merged_segments (list[tuple]): ``(speaker, segments)`` pairs in
            conversation order; ``segments`` is passed to ``self.merge_segments``.
        output_directory (str): Directory to write into; created if missing.

    Returns:
        str: ``output_directory``.

    Raises:
        ValueError: If ``merged_segments`` is empty.
    """
    # Imported locally so this method does not rely on the module's import block.
    from datetime import timedelta

    if not merged_segments:
        print("merged_segmentsが見つかりませんでした。")
        # A bare ``raise`` has no active exception here and would surface as
        # an unrelated RuntimeError; raise an explicit error instead.
        raise ValueError("merged_segments is empty; nothing to save.")

    os.makedirs(output_directory, exist_ok=True)

    base_time = datetime.now()
    for i, (speaker, segments) in enumerate(merged_segments):
        combined_audio = self.merge_segments(segments)
        # Strictly increasing stamps keep names unique and sortable per call.
        stamp = (base_time + timedelta(seconds=i)).strftime("%Y%m%d%H%M%S")
        file_path = os.path.join(output_directory, f"{speaker}_{stamp}.wav")
        combined_audio.export(file_path, format="wav")
        print(f"Saved: {file_path}")

    return output_directory
|
167 |
+
|
168 |
+
|
169 |
+
def merge_segments(self,segments):
    """Concatenate ``segments``, in order, into a single ``AudioSegment``.

    Parameters:
        segments (iterable): Each item is either a file path (``str``), which
            is loaded with ``AudioSegment.from_file``, or an ``AudioSegment``,
            which is used as-is.

    Returns:
        AudioSegment: The concatenation; an empty segment when ``segments`` is empty.

    Raises:
        ValueError: If an item is neither a ``str`` nor an ``AudioSegment``.
    """
    def as_audio(item):
        # Normalise one entry to an AudioSegment.
        if isinstance(item, str):
            return AudioSegment.from_file(item)
        if isinstance(item, AudioSegment):
            return item
        raise ValueError("Invalid segment type. Must be file path or AudioSegment.")

    result = AudioSegment.empty()
    for item in segments:
        result += as_audio(item)
    return result
|
184 |
+
|
185 |
+
|
186 |
|
187 |
def generate_random_string(self,length):
|
188 |
letters = string.ascii_letters + string.digits
|