Spaces:

Justtalk
/

JusTalk

Running

App Files Files Community

rein0421 committed on Mar 22

Commit

7bc1736

verified ·

1 Parent(s): d8f8a02

Upload process.py

Browse files

Files changed (1) hide show

process.py +387 -90

process.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import shutil
 import numpy as np
@@ -9,9 +8,10 @@ from pyannote.audio import Model, Inference
 from pydub import AudioSegment
 import base64
 import binascii
 class AudioProcessor():
-    def __init__(self,cache_dir = "/tmp/hf_cache"):
         hf_token = os.environ.get("HF")
         if hf_token is None:
             raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
@@ -19,14 +19,145 @@ class AudioProcessor():
         # pyannote モデルの読み込み
         model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
         self.inference = Inference(model)
-    def cosine_similarity(self,vec1, vec2):
-        vec1 = vec1 / np.linalg.norm(vec1)
-        vec2 = vec2 / np.linalg.norm(vec2)
-        return np.dot(vec1, vec2)
     def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
         # 出力先ディレクトリが存在していれば中身をクリアする
         if os.path.exists(target_path):
             for file in os.listdir(target_path):
@@ -52,91 +183,252 @@ class AudioProcessor():
         return target_path, duration_ms
-    def calculate_similarity(self,path1, path2):
-        embedding1 = self.inference(path1)
-        embedding2 = self.inference(path2)
-        return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
     def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
-        # 出力先ディレクトリの中身をクリアする
-        if os.path.exists(output_folder):
-            for file in os.listdir(output_folder):
-                file_path = os.path.join(output_folder, file)
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-        else:
-            os.makedirs(output_folder, exist_ok=True)
-        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
-        matched_time_ms = 0
-        for file in sorted(os.listdir(segmented_path)):
-            segment_file = os.path.join(segmented_path, file)
-            similarity = self.calculate_similarity(segment_file, reference_path)
-            if similarity > threshold:
-                shutil.copy(segment_file, output_folder)
-                matched_time_ms += len(AudioSegment.from_file(segment_file))
-        unmatched_time_ms = total_duration_ms - matched_time_ms
-        return matched_time_ms, unmatched_time_ms,output_folder
     def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
-        # 出力先ディレクトリの中身をクリアする
-        if os.path.exists(output_folder):
-            for file in os.listdir(output_folder):
-                file_path = os.path.join(output_folder, file)
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-        else:
-            os.makedirs(output_folder, exist_ok=True)
-        # 入力音声をセグメントに分割
-        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
-        segment_files = sorted(os.listdir(segmented_path))
-        num_segments = len(segment_files)
-        # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
-        similarity = []
-        for reference_path in reference_pathes:
-            ref_similarity = []
             for file in segment_files:
                 segment_file = os.path.join(segmented_path, file)
-                sim = self.calculate_similarity(segment_file, reference_path)
-                ref_similarity.append(sim)
-            similarity.append(ref_similarity)
-        # 転置行列を作成 (rows: segment, columns: reference)
-        similarity_transposed = []
-        for seg_idx in range(num_segments):
-            seg_sim = []
-            for ref_idx in range(len(reference_pathes)):
-                seg_sim.append(similarity[ref_idx][seg_idx])
-            similarity_transposed.append(seg_sim)
-        # 各セグメントについて、最も高い類似度のリファレンスを選択
-        best_matches = []
-        for seg_sim in similarity_transposed:
-            best_ref = np.argmax(seg_sim)  # 最も類似度の高いリファレンスのインデックス
-            # 閾値チェック (必要に応じて)
-            if seg_sim[best_ref] < threshold:
-                best_matches.append(None)  # 閾値未満の場合はマッチなしとする
-            else:
-                best_matches.append(best_ref)
-        # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
-        matched_time = [0] * len(reference_pathes)
-        for match in best_matches:
-            if match is not None:
-                matched_time[match] += seg_duration
-        return matched_time, segmented_path
-    def save_audio_from_base64(self,base64_audio,output_dir,output_filename,temp_format='webm'):
         try:
             # Base64デコードして音声バイナリを取得
             try:
@@ -145,26 +437,26 @@ class AudioProcessor():
                 raise ValueError("Invalid Base64 input data")
             # 保存するディレクトリを作成
-            os.makedirs(output_dir,exist_ok=True)
-            # 一時ファイルに保存（実際の形式は WebM などと仮定）
-            temp_audio_path = os.path.join(output_dir,"temp_audio")
             try:
-                with open(temp_audio_path,'wb') as f:
                     f.write(audio_binary)
                 # pydub を使って一時ファイルを WAV に変換
-                # ※ここでは WebM 形式と仮定していますが、実際の形式に合わせて format の指定を変更してください
                 try:
-                    audio = AudioSegment.from_file(temp_audio_path,format=temp_format)
                 except Exception as e:
-                    audio = AudioSegment.from_file(temp_audio_path) #形式が不明な場合は自動判別させる（ただし変換できない場合もあり）
                 # 音声ファイルを保存
-                wav_audio_path = os.path.join(output_dir,output_filename)
-                audio.export(wav_audio_path,format="wav")
             finally:
-                #一時ファイルを削除
                 if os.path.exists(temp_audio_path):
                     os.remove(temp_audio_path)
             return wav_audio_path
@@ -176,7 +468,13 @@ class AudioProcessor():
             print(f"Unexpected Error: {e}")
         return None
-    def delete_files_in_directory(self,directory_path):
         try:
             # ディレクトリ内のすべてのファイルを取得
             for filename in os.listdir(directory_path):
@@ -186,5 +484,4 @@ class AudioProcessor():
                     os.remove(file_path)
                     print(f"{file_path} を削除しました")
         except Exception as e:
-            print(f"エラーが発生しました: {e}")

 import os
 import shutil
 import numpy as np
 from pydub import AudioSegment
 import base64
 import binascii
+import warnings
 class AudioProcessor():
+    def __init__(self, cache_dir="/tmp/hf_cache", standard_duration=5.0):
         hf_token = os.environ.get("HF")
         if hf_token is None:
             raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
         # pyannote モデルの読み込み
         model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
         self.inference = Inference(model)
+        # 標準の音声長さ（秒）
+        self.standard_duration = standard_duration
+    def normalize_audio_duration(self, input_path, target_duration_seconds=None, output_path=None):
+        """
+        音声ファイルの長さを指定された時間（秒）にそろえる関数
+        短すぎる場合は無音を追加し、長すぎる場合は切り詰める
+        Parameters:
+        input_path (str): 入力音声ファイルのパス
+        target_duration_seconds (float, optional): 目標となる音声の長さ（秒）。Noneの場合はself.standard_durationを使用
+        output_path (str, optional): 出力先のパス。Noneの場合は一時ファイルを生成
+        Returns:
+        str: 処理された音声ファイルのパス
+        """
+        try:
+            # デフォルト値の設定
+            if target_duration_seconds is None:
+                target_duration_seconds = self.standard_duration
+            # 音声ファイルを読み込む
+            audio = AudioSegment.from_file(input_path)
+            # 現在の長さ（ミリ秒）
+            current_duration_ms = len(audio)
+            target_duration_ms = int(target_duration_seconds * 1000)
+            # 出力パスが指定されていない場合は一時ファイルを生成
+            if output_path is None:
+                random_str = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
+                timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+                output_dir = os.path.dirname(input_path) if os.path.dirname(input_path) else '/tmp'
+                output_path = os.path.join(output_dir, f"normalized_{timestamp}_{random_str}.wav")
+            # 長さの調整
+            if current_duration_ms < target_duration_ms:
+                # 短い場合は無音を追加
+                silence_duration = target_duration_ms - current_duration_ms
+                silence = AudioSegment.silent(duration=silence_duration)
+                normalized_audio = audio + silence
+            else:
+                # 長い場合は切り詰め
+                normalized_audio = audio[:target_duration_ms]
+            # ファイルに保存
+            normalized_audio.export(output_path, format="wav")
+            return output_path
+        except Exception as e:
+            print(f"音声の長さをそろえる処理でエラーが発生しました: {e}")
+            return None
+    def batch_normalize_audio_duration(self, input_directory, target_duration_seconds=None, output_directory=None):
+        """
+        ディレクトリ内の全音声ファイルの長さをそろえる関数
+        Parameters:
+        input_directory (str): 入力音声ファイルが格納されているディレクトリ
+        target_duration_seconds (float, optional): 目標となる音声の長さ（秒）。Noneの場合はself.standard_durationを使用
+        output_directory (str, optional): 出力先のディレクトリ。Noneの場合は入力と同じディレクトリに処理結果を保存
+        Returns:
+        list: 処理された音声ファイルのパスのリスト
+        """
+        try:
+            # デフォルト値の設定
+            if target_duration_seconds is None:
+                target_duration_seconds = self.standard_duration
+            # 出力ディレクトリが指定されていない場合は入力ディレクトリを使用
+            if output_directory is None:
+                output_directory = input_directory
+            else:
+                os.makedirs(output_directory, exist_ok=True)
+            output_files = []
+            # ディレクトリ内の全ファイルを処理
+            for filename in os.listdir(input_directory):
+                if filename.lower().endswith(('.wav', '.mp3', '.webm', '.ogg', '.flac')):
+                    input_path = os.path.join(input_directory, filename)
+                    output_filename = f"normalized_{filename}"
+                    output_path = os.path.join(output_directory, output_filename)
+                    # 音声の長さをそろえる
+                    processed_file = self.normalize_audio_duration(
+                        input_path,
+                        target_duration_seconds,
+                        output_path
+                    )
+                    if processed_file:
+                        output_files.append(processed_file)
+            return output_files
+        except Exception as e:
+            print(f"バッチ処理でエラーが発生しました: {e}")
+            return []
+    def cosine_similarity(self, vec1, vec2):
+        """
+        2つのベクトル間のコサイン類似度を計算する
+        次元数が異なる場合はエラーを発生させる
+        Parameters:
+        vec1, vec2: 比較する2つのベクトル
+        Returns:
+        float: コサイン類似度 (-1 から 1 の範囲)
+        """
+        try:
+            # 次元数チェック
+            if vec1.shape != vec2.shape:
+                raise ValueError(f"ベクトルの次元数が一致しません: {vec1.shape} vs {vec2.shape}")
+            # 正規化
+            vec1 = vec1 / np.linalg.norm(vec1)
+            vec2 = vec2 / np.linalg.norm(vec2)
+            return np.dot(vec1, vec2)
+        except Exception as e:
+            print(f"コサイン類似度計算でエラーが発生しました: {e}")
+            return None
     def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
+        """
+        音声ファイルを一定の長さのセグメントに分割する
+        Parameters:
+        path (str): 入力音声ファイルのパス
+        target_path (str): 分割されたセグメントを保存するディレクトリ
+        seg_duration (float): 各セグメントの長さ（秒）
+        Returns:
+        tuple: (セグメントが保存されたディレクトリのパス, 元の音声の総時間（ミリ秒）)
+        """
         # 出力先ディレクトリが存在していれば中身をクリアする
         if os.path.exists(target_path):
             for file in os.listdir(target_path):
         return target_path, duration_ms
+    def calculate_embedding(self, audio_path):
+        """
+        音声ファイルからエンベディングを計算する
+        必要に応じて音声の長さを標準化する
+        Parameters:
+        audio_path (str): 音声ファイルのパス
+        Returns:
+        numpy.ndarray: 計算されたエンベディング
+        """
+        try:
+            # 一時的に長さを標準化した音声ファイルを作成
+            normalized_path = self.normalize_audio_duration(audio_path)
+            if normalized_path is None:
+                raise ValueError("音声の長さの標準化に失敗しました")
+            # エンベディングを計算
+            embedding = self.inference(normalized_path)
+            # 一時ファイルを削除（必要に応じて）
+            if normalized_path != audio_path:
+                try:
+                    os.remove(normalized_path)
+                except Exception as e:
+                    warnings.warn(f"一時ファイルの削除に失敗しました: {e}")
+            return embedding.data.flatten()
+        except Exception as e:
+            print(f"エンベディング計算でエラーが発生しました: {e}")
+            return None
+    def calculate_similarity(self, path1, path2):
+        """
+        2つの音声ファイル間の類似度を計算する
+        音声の長さを標準化してからエンベディングを計算
+        Parameters:
+        path1, path2 (str): 比較する2つの音声ファイルのパス
+        Returns:
+        float: コサイン類似度 (-1 から 1 の範囲)、エラー時はNone
+        """
+        try:
+            # エンベディングを計算
+            embedding1 = self.calculate_embedding(path1)
+            embedding2 = self.calculate_embedding(path2)
+            if embedding1 is None or embedding2 is None:
+                raise ValueError("エンベディングの計算に失���しました")
+            # 次元数チェック（念のため）
+            if embedding1.shape != embedding2.shape:
+                raise ValueError(f"エンベディングの次元数が一致しません: {embedding1.shape} vs {embedding2.shape}")
+            # 類似度を計算
+            return float(self.cosine_similarity(embedding1, embedding2))
+        except Exception as e:
+            print(f"類似度計算でエラーが発生しました: {e}")
+            return None
     def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
         """
         Extract the segments of an input recording that resemble a reference.

         The input is split with segment_audio; every segment whose embedding
         has a cosine similarity above threshold with the reference embedding
         is copied into output_folder and its length is added to the matched
         total.

         Parameters:
         reference_path (str): Path of the reference audio.
         input_path (str): Path of the audio to analyse.
         output_folder (str): Directory that receives matching segments; its
             existing files are removed first, or it is created if missing.
         seg_duration (float): Segment length in seconds.
         threshold (float): Similarity a segment must exceed to count.

         Returns:
         tuple: (matched time in ms, unmatched time in ms, output folder).
             On an unexpected error this is (0, 0, output_folder) and the
             error is printed.
         """
         try:
             ref_vector = self.calculate_embedding(reference_path)
             if ref_vector is None:
                 raise ValueError("リファレンス音声のエンベディング計算に失敗しました")

             # Start from an empty output folder.
             if not os.path.exists(output_folder):
                 os.makedirs(output_folder, exist_ok=True)
             else:
                 for stale_name in os.listdir(output_folder):
                     stale_path = os.path.join(output_folder, stale_name)
                     if os.path.isfile(stale_path):
                         os.remove(stale_path)

             segments_dir, total_ms = self.segment_audio(input_path, seg_duration=seg_duration)

             matched_ms = 0
             for file in sorted(os.listdir(segments_dir)):
                 segment_path = os.path.join(segments_dir, file)

                 seg_vector = self.calculate_embedding(segment_path)
                 if seg_vector is None:
                     print(f"警告: セグメント {file} のエンベディング計算に失敗しました。スキップします。")
                     continue

                 try:
                     score = float(self.cosine_similarity(seg_vector, ref_vector))
                     if score > threshold:
                         shutil.copy(segment_path, output_folder)
                         matched_ms += len(AudioSegment.from_file(segment_path))
                 except Exception as e:
                     print(f"セグメント {file} の類似度計算でエラーが発生しました: {e}")

             return matched_ms, total_ms - matched_ms, output_folder
         except Exception as e:
             print(f"音声処理でエラーが発生しました: {e}")
             return 0, 0, output_folder
     def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
         """
         Attribute each segment of an input recording to its best-matching reference.

         The input is split with segment_audio. For every segment the
         reference with the highest cosine similarity is chosen (the lowest
         index wins ties); if that similarity is at least threshold,
         seg_duration is added to that reference's total.

         A reference or segment whose embedding could not be computed, a
         shape mismatch, or a similarity that is None/NaN is treated as
         "no similarity available" and can never be selected as a match,
         whatever the threshold.

         Parameters:
         reference_pathes (list): Paths of the reference audio files.
         input_path (str): Path of the audio to analyse.
         output_folder (str): Directory whose existing files are removed (or
             which is created if missing). This method does not write to it.
         seg_duration (float): Segment length in seconds.
         threshold (float): Minimum similarity for a segment to count.

         Returns:
         tuple: (list of matched time per reference, in the same unit as
             seg_duration; directory holding the segments). On an unexpected
             error this is (list of zeros, None) and the error is printed.
         """
         # Materialise once so len() is safe everywhere, including the error
         # path, and None or a one-shot iterable is tolerated.
         reference_pathes = list(reference_pathes or [])
         try:
             # Start from an empty output folder.
             if os.path.exists(output_folder):
                 for file in os.listdir(output_folder):
                     file_path = os.path.join(output_folder, file)
                     if os.path.isfile(file_path):
                         os.remove(file_path)
             else:
                 os.makedirs(output_folder, exist_ok=True)

             # Pre-compute reference embeddings; None marks a failed reference.
             reference_embeddings = []
             for ref_path in reference_pathes:
                 embedding = self.calculate_embedding(ref_path)
                 if embedding is None:
                     print(f"警告: リファレンス {ref_path} のエンベディング計算に失敗しました")
                 reference_embeddings.append(embedding)

             segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
             segment_files = sorted(os.listdir(segmented_path))

             # Embed every segment; None marks a failed segment.
             segment_embeddings = []
             for file in segment_files:
                 segment_file = os.path.join(segmented_path, file)
                 embedding = self.calculate_embedding(segment_file)
                 if embedding is None:
                     print(f"警告: セグメント {file} のエンベディング計算に失敗しました")
                 segment_embeddings.append(embedding)

             matched_time = [0] * len(reference_pathes)
             for seg_embedding in segment_embeddings:
                 if seg_embedding is None:
                     continue

                 best_ref = None
                 best_sim = None
                 for ref_idx, ref_embedding in enumerate(reference_embeddings):
                     if ref_embedding is None:
                         continue
                     if ref_embedding.shape != seg_embedding.shape:
                         print(f"警告: エンベディングの次元数が一致しません: {ref_embedding.shape} vs {seg_embedding.shape}")
                         continue
                     try:
                         sim = self.cosine_similarity(seg_embedding, ref_embedding)
                         sim = None if sim is None else float(sim)
                     except Exception as e:
                         print(f"類似度計算でエラーが発生しました: {e}")
                         sim = None
                     # NaN compares False against everything, so it must be
                     # rejected explicitly or it would slip past the threshold.
                     if sim is None or np.isnan(sim):
                         continue
                     # Strict '>' keeps the first (lowest-index) reference on ties.
                     if best_sim is None or sim > best_sim:
                         best_ref, best_sim = ref_idx, sim

                 if best_ref is not None and best_sim >= threshold:
                     matched_time[best_ref] += seg_duration

             return matched_time, segmented_path
         except Exception as e:
             print(f"マルチ音声処理でエラーが発生しました: {e}")
             return [0] * len(reference_pathes), None
+    def save_audio_from_base64(self, base64_audio, output_dir, output_filename, temp_format='webm'):
+        """
+        Base64エンコードされた音声データをデコードして保存する
+        Parameters:
+        base64_audio (str): Base64エンコードされた音声データ
+        output_dir (str): 出力先ディレクトリ
+        output_filename (str): 出力ファイル名
+        temp_format (str): 一時ファイルのフォーマット
+        Returns:
+        str: 保存された音声ファイルのパス、エラー時はNone
+        """
         try:
             # Base64デコードして音声バイナリを取得
             try:
                 raise ValueError("Invalid Base64 input data")
             # 保存するディレクトリを作成
+            os.makedirs(output_dir, exist_ok=True)
+            # 一時ファイルに保存
+            temp_audio_path = os.path.join(output_dir, "temp_audio")
             try:
+                with open(temp_audio_path, 'wb') as f:
                     f.write(audio_binary)
                 # pydub を使って一時ファイルを WAV に変換
                 try:
+                    audio = AudioSegment.from_file(temp_audio_path, format=temp_format)
                 except Exception as e:
+                    # 形式が不明な場合は自動判別
+                    audio = AudioSegment.from_file(temp_audio_path)
                 # 音声ファイルを保存
+                wav_audio_path = os.path.join(output_dir, output_filename)
+                audio.export(wav_audio_path, format="wav")
             finally:
+                # 一時ファイルを削除
                 if os.path.exists(temp_audio_path):
                     os.remove(temp_audio_path)
             return wav_audio_path
             print(f"Unexpected Error: {e}")
         return None
+    def delete_files_in_directory(self, directory_path):
+        """
+        ディレクトリ内のすべてのファイルを削除する
+        Parameters:
+        directory_path (str): 削除対象のディレクトリパス
+        """
         try:
             # ディレクトリ内のすべてのファイルを取得
             for filename in os.listdir(directory_path):
                     os.remove(file_path)
                     print(f"{file_path} を削除しました")
         except Exception as e:
+            print(f"ファイル削除でエラーが発生しました: {e}")