File size: 6,458 Bytes
10d653d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8bbfec
10d653d
 
 
 
d8bbfec
 
 
 
 
 
10d653d
 
d8bbfec
10d653d
 
 
 
 
 
d8bbfec
10d653d
 
 
 
 
 
d8bbfec
 
 
 
 
917a40d
d8bbfec
10d653d
 
d8bbfec
10d653d
 
d8bbfec
 
10d653d
 
 
 
 
0dbd483
d8bbfec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10d653d
 
 
 
 
 
 
 
3fbb133
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
from faster_whisper import WhisperModel
from pydub import AudioSegment
import string
import random
from datetime import datetime

# Redirect Matplotlib's config/cache directory to a writable location.
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"

# Redirect Hugging Face cache directories so model downloads land in /tmp.
# NOTE: these must be set before the model is loaded in TranscriptionMaker.
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface"

class TranscriptionMaker():
    """Build a text transcription from a directory of speaker-tagged WAV files.

    Input files are expected to be named ``<speaker>_<YYYYMMDDHHMMSS>.wav``.
    Consecutive files by the same speaker are merged into a single audio
    segment, each segment is transcribed with faster-whisper, and the whole
    conversation is written to ``transcription.txt`` under ``output_dir``.
    """

    def __init__(self, output_dir="/tmp/data/transcriptions"):
        """Load the Whisper model and create the transcription output directory.

        Args:
            output_dir: Directory where ``transcription.txt`` will be written.
        """
        # download_root keeps model files under /tmp, matching the HF cache
        # environment variables set at module import time.
        self.model = WhisperModel("base", device="cpu", download_root="/tmp/huggingface")
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def create_transcription(self, audio_directory):
        """Transcribe the WAV files in *audio_directory* into one text file.

        Args:
            audio_directory: Directory containing ``<speaker>_<timestamp>.wav`` files.

        Returns:
            Path of the written ``transcription.txt``.

        Raises:
            ValueError: If *audio_directory* is not a directory.
            OSError: If the transcription file cannot be written.
        """
        conversation = []

        if not os.path.isdir(audio_directory):
            raise ValueError(f"The specified path is not a valid directory: {audio_directory}")
        audio_files = self.sort_audio_files_in_directory(audio_directory)
        merged_segments = self.combine_audio(audio_files)
        merged_audio_directory = self.save_marged_segments(
            merged_segments, output_directory='/tmp/data/transcription_audio')
        merged_files = self.sort_audio_files_in_directory(merged_audio_directory)

        for audio_file in merged_files:
            if os.path.splitext(audio_file)[-1].lower() != '.wav':
                continue
            audio_path = os.path.join(merged_audio_directory, audio_file)
            try:
                # transcribe() returns (segment generator, info).  The original
                # `list(self.model.transcribe(...))` listed the 2-tuple itself
                # and only unpacked by accident; unpack first, then materialize
                # the generator so it can be sorted.
                segments, info = self.model.transcribe(audio_path)
                segments = list(segments)
            except Exception as e:
                print(f"Error transcripting file {audio_path}: {e}")
                raise
            sorted_segments = sorted(segments, key=lambda s: s.start)
            combined_text = "".join(segment.text for segment in sorted_segments)
            # Speaker name is the filename prefix before the first underscore.
            speaker = os.path.basename(audio_file).split("_")[0]
            # Skip segments with no recognized speech.
            if not combined_text:
                continue
            conversation.append(f"{speaker}: {combined_text}<br>")

        # Write the conversation; the file name is fixed to "transcription.txt".
        output_file = os.path.join(self.output_dir, "transcription.txt")
        print(conversation)
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                f.writelines(conversation)
        except OSError as e:
            print(f"Error writing transcription file: {e}")
            raise
        return output_file

    def combine_audio(self, audio_files):
        """Group consecutive files spoken by the same speaker.

        Args:
            audio_files: Ordered list of file paths named ``<speaker>_...``.

        Returns:
            List of ``(speaker, [file, ...])`` tuples in input order.

        Raises:
            ValueError: If *audio_files* is empty.  (The original bare
                ``raise`` produced an opaque RuntimeError here.)
        """
        if not audio_files:
            raise ValueError("audio_files is empty")
        merged_segments = []
        current_speaker = None
        current_segment = []
        for segment in audio_files:
            speaker = os.path.basename(segment).split("_")[0]
            if speaker != current_speaker:
                # Speaker changed: flush the accumulated run (if any).
                if current_segment:
                    merged_segments.append((current_speaker, current_segment))
                current_speaker = speaker
                current_segment = [segment]
            else:
                # Same speaker: extend the current run.
                current_segment.append(segment)
        # Flush the final run.
        if current_segment:
            merged_segments.append((current_speaker, current_segment))

        return merged_segments

    def sort_audio_files_in_directory(self, directory):
        """Return the directory's ``.wav`` paths sorted by filename timestamp.

        Assumes filenames of the form ``<speaker>_<YYYYMMDDHHMMSS>[...].wav``;
        files not matching this pattern raise (IndexError/ValueError) during
        the sort.

        Args:
            directory: Directory to scan.

        Returns:
            List of absolute/joined paths sorted by their embedded timestamp.
        """
        files = os.listdir(directory)
        audio_files = [f for f in files if f.endswith(".wav")]

        audio_files.sort(key=lambda x: datetime.strptime(x.split("_")[1].split(".")[0], "%Y%m%d%H%M%S"))
        return [os.path.join(directory, f) for f in audio_files]

    def save_marged_segments(self, merged_segments, output_directory='/tmp/data/conversations'):
        """Export each merged speaker segment as a WAV file.

        (The 'marged' typo in the name is kept for caller compatibility.)

        Args:
            merged_segments: ``(speaker, [file, ...])`` tuples from combine_audio().
            output_directory: Directory to write the merged WAV files into.

        Returns:
            The output directory path.

        Raises:
            ValueError: If *merged_segments* is empty.  (Originally a bare
                ``raise`` that produced an opaque RuntimeError.)
        """
        if not merged_segments:
            raise ValueError("merged_segments is empty")

        conversation = [(speaker, self.merge_segments(segments))
                        for speaker, segments in merged_segments]
        os.makedirs(output_directory, exist_ok=True)

        for i, (speaker, combined_audio) in enumerate(conversation):
            current_time = datetime.now().strftime("%Y%m%d%H%M%S")
            # The timestamp is second-granular, so several segments written in
            # the same second used to collide and overwrite each other; the
            # enumerate index makes each filename unique.  Downstream parsing
            # still works: speaker = split("_")[0], timestamp = split("_")[1].
            filename = f"{speaker}_{current_time}_{i}.wav"
            file_path = os.path.join(output_directory, filename)
            combined_audio.export(file_path, format="wav")
            print(f"Saved: {file_path}")

        return output_directory

    def merge_segments(self, segments):
        """Concatenate audio segments into a single AudioSegment.

        Args:
            segments: Iterable of file paths and/or AudioSegment objects.

        Returns:
            The concatenated AudioSegment.

        Raises:
            ValueError: If an element is neither a path nor an AudioSegment.
        """
        combined = AudioSegment.empty()  # start from an empty AudioSegment

        for segment in segments:
            if isinstance(segment, str):
                # A file path: load it from disk.
                audio = AudioSegment.from_file(segment)
            elif isinstance(segment, AudioSegment):
                # Already decoded audio: use as-is.
                audio = segment
            else:
                raise ValueError("Invalid segment type. Must be file path or AudioSegment.")

            combined += audio
        return combined

    def generate_random_string(self, length):
        """Return a random alphanumeric string of the given length."""
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return a timestamp-based WAV filename.

        Note: *random_length* is currently ignored (kept for interface
        compatibility with existing callers).
        """
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f"{current_time}.wav"
        return filename