# JusTalk / transcription.py
# (Hugging Face web-page residue preserved as comments: avatar caption
#  "A-yum1's picture", last commit "Update talk_detail.js", revision 917a40d)
import os
import random
import string
from datetime import datetime, timedelta

from faster_whisper import WhisperModel
from pydub import AudioSegment
# Redirect Matplotlib's cache directory to a writable location (/tmp),
# e.g. for containers whose home directory is read-only.
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
# Redirect the Hugging Face cache directories so model downloads land in /tmp.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"
class TranscriptionMaker():
    """Builds a text transcription from a directory of speaker-tagged WAV files.

    Input files are expected to be named ``<speaker>_<YYYYmmddHHMMSS>.wav``.
    Consecutive files from the same speaker are merged into one utterance,
    transcribed with faster-whisper, and written to ``transcription.txt``.
    """

    # Directory that receives the final transcription file.
    def __init__(self, output_dir="/tmp/data/transcriptions"):
        # Small CPU model; cache under /tmp so it works in read-only containers.
        self.model = WhisperModel("base", device="cpu", download_root="/tmp/huggingface")
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    # Takes a directory of audio files and produces a transcription file.
    def create_transcription(self, audio_directory):
        """Transcribe every speaker turn found in *audio_directory*.

        Returns the path of the written ``transcription.txt``.
        Raises ValueError if *audio_directory* is not a directory; re-raises
        transcription and file-write errors after logging them.
        """
        conversation = []
        if not os.path.isdir(audio_directory):
            raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
        audio_files = self.sort_audio_files_in_directory(audio_directory)
        merged_segments = self.combine_audio(audio_files)
        merged_audio_directory = self.save_marged_segments(merged_segments, output_directory='/tmp/data/transcription_audio')
        merged_files = self.sort_audio_files_in_directory(merged_audio_directory)
        for audio_file in merged_files:
            if os.path.splitext(audio_file)[-1].lower() != '.wav':
                continue
            audio_path = os.path.join(merged_audio_directory, audio_file)
            try:
                # transcribe() returns (segment generator, info). The original
                # list()-ed the tuple itself, which only unpacked by accident;
                # materialize the generator explicitly instead.
                segments, info = self.model.transcribe(audio_path)
                segments = list(segments)
            except Exception as e:
                print(f"Error transcribing file {audio_path}: {e}")
                raise
            sorted_segments = sorted(segments, key=lambda s: s.start)
            combined_text = "".join(segment.text for segment in sorted_segments)
            # Basename is "<speaker>_<timestamp>.wav" -> speaker tag.
            speaker = os.path.basename(audio_file).split("_")[0]
            # Skip silent (empty or whitespace-only) transcriptions.
            if not combined_text.strip():
                continue
            conversation.append(f"{speaker}: {combined_text}<br>")
        # Write the result; the file name is always "transcription.txt".
        output_file = os.path.join(self.output_dir, "transcription.txt")
        print(conversation)
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                f.writelines(conversation)
        except OSError as e:
            print(f"Error writing transcription file: {e}")
            raise
        return output_file

    # Groups the received audio files by consecutive speaker.
    def combine_audio(self, audio_files):
        """Group consecutive files from the same speaker.

        Returns a list of ``(speaker, [file_path, ...])`` tuples in
        conversation order. Raises ValueError on empty input (the original
        bare ``raise`` produced an unrelated RuntimeError).
        """
        if not audio_files:
            raise ValueError("combine_audio received no audio files")
        merged_segments = []
        current_speaker = None
        current_segment = []
        for segment in audio_files:
            speaker = os.path.basename(segment).split("_")[0]
            if speaker != current_speaker:
                # Speaker changed: flush the accumulated run.
                if current_segment:
                    merged_segments.append((current_speaker, current_segment))
                current_speaker = speaker
                current_segment = [segment]
            else:
                # Same speaker: extend the current run.
                current_segment.append(segment)
        # Flush the final run.
        if current_segment:
            merged_segments.append((current_speaker, current_segment))
        return merged_segments

    # Sorts the WAV files in a directory by their embedded timestamp.
    def sort_audio_files_in_directory(self, directory):
        """Return full paths of the directory's ``.wav`` files, oldest first.

        Assumes the ``<speaker>_<YYYYmmddHHMMSS>.wav`` naming convention;
        files that don't match will make strptime raise ValueError.
        """
        files = os.listdir(directory)
        audio_files = [f for f in files if f.endswith(".wav")]
        audio_files.sort(key=lambda x: datetime.strptime(x.split("_")[1].split(".")[0], "%Y%m%d%H%M%S"))
        return [os.path.join(directory, f) for f in audio_files]

    def save_marged_segments(self, merged_segments, output_directory='/tmp/data/conversations'):
        """Export each merged speaker turn as a WAV file and return the directory.

        NOTE: 'marged' is a historical typo kept for caller compatibility.
        Files keep the ``<speaker>_<YYYYmmddHHMMSS>.wav`` format, but each
        file's timestamp is offset by its index (in seconds). The original
        stamped every file with the same wall-clock second, so same-speaker
        turns collided/overwrote each other and the later timestamp sort
        could not reproduce conversation order.
        Raises ValueError when there is nothing to save.
        """
        if not merged_segments:
            raise ValueError("No merged segments to save")
        conversation = [(speaker, self.merge_segments(segments))
                        for speaker, segments in merged_segments]
        os.makedirs(output_directory, exist_ok=True)
        base_time = datetime.now()
        for i, (speaker, combined_audio) in enumerate(conversation):
            # Strictly increasing, collision-free timestamps in turn order.
            stamp = (base_time + timedelta(seconds=i)).strftime("%Y%m%d%H%M%S")
            file_path = os.path.join(output_directory, f"{speaker}_{stamp}.wav")
            combined_audio.export(file_path, format="wav")
            print(f"Saved: {file_path}")
        return output_directory

    def merge_segments(self, segments):
        """Concatenate a list of file paths and/or AudioSegments into one AudioSegment.

        Raises ValueError for any element that is neither a path nor an
        AudioSegment.
        """
        combined = AudioSegment.empty()  # start from an empty AudioSegment
        for segment in segments:
            if isinstance(segment, str):
                # A file path: load it first.
                audio = AudioSegment.from_file(segment)
            elif isinstance(segment, AudioSegment):
                # Already an AudioSegment: use as-is.
                audio = segment
            else:
                raise ValueError("Invalid segment type. Must be file path or AudioSegment.")
            combined += audio
        return combined

    def generate_random_string(self, length):
        """Return a random alphanumeric string of the given length.

        Uses the non-cryptographic ``random`` module — do not use for tokens.
        """
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return a timestamp-based ``.wav`` filename.

        ``random_length`` is currently unused (kept for interface
        compatibility); the name is just the current local time.
        """
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f"{current_time}.wav"
        return filename