Spaces:

Justtalk
/

JusTalk

Running

App Files Files Community

buletomato25 committed on Mar 18

Commit

4e9c42f

2 Parent(s): dd43f38 1792d9b

Merge branch 'main' into suwabe/docker

Browse files

Files changed (1) hide show

process.py +65 -8

process.py CHANGED Viewed

@@ -7,6 +7,7 @@ import random
 from datetime import datetime
 from pyannote.audio import Model, Inference
 from pydub import AudioSegment
 class AudioProcessor():
     def __init__(self,cache_dir = "/tmp/hf_cache"):
         hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
@@ -17,6 +18,7 @@ class AudioProcessor():
         model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
         self.inference = Inference(model)
     def cosine_similarity(self,vec1, vec2):
         vec1 = vec1 / np.linalg.norm(vec1)
         vec2 = vec2 / np.linalg.norm(vec2)
@@ -53,7 +55,17 @@ class AudioProcessor():
         embedding1 = self.inference(path1)
         embedding2 = self.inference(path2)
         return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
     def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
         # 出力先ディレクトリの中身をクリアする
         if os.path.exists(output_folder):
@@ -76,13 +88,58 @@ class AudioProcessor():
         unmatched_time_ms = total_duration_ms - matched_time_ms
         return matched_time_ms, unmatched_time_ms
-    def generate_random_string(self,length):
-        letters = string.ascii_letters + string.digits
-        return ''.join(random.choice(letters) for i in range(length))
-    def generate_filename(self,random_length):
-        random_string = self.generate_random_string(random_length)
-        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
-        filename = f"{current_time}_{random_string}.wav"
-        return filename

 from datetime import datetime
 from pyannote.audio import Model, Inference
 from pydub import AudioSegment
 class AudioProcessor():
     def __init__(self,cache_dir = "/tmp/hf_cache"):
         hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
         model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
         self.inference = Inference(model)
     def cosine_similarity(self,vec1, vec2):
         vec1 = vec1 / np.linalg.norm(vec1)
         vec2 = vec2 / np.linalg.norm(vec2)
         embedding1 = self.inference(path1)
         embedding2 = self.inference(path2)
         return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
+    def generate_random_string(self,length):
+        letters = string.ascii_letters + string.digits
+        return ''.join(random.choice(letters) for i in range(length))
+    def generate_filename(self,random_length):
+        random_string = self.generate_random_string(random_length)
+        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
+        filename = f"{current_time}_{random_string}.wav"
+        return filename
     def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
         # 出力先ディレクトリの中身をクリアする
         if os.path.exists(output_folder):
         unmatched_time_ms = total_duration_ms - matched_time_ms
         return matched_time_ms, unmatched_time_ms
+    def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
+        # 出力先ディレクトリの中身をクリアする
+        if os.path.exists(output_folder):
+            for file in os.listdir(output_folder):
+                file_path = os.path.join(output_folder, file)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+        else:
+            os.makedirs(output_folder, exist_ok=True)
+        # 入力音声をセグメントに分割
+        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
+        segment_files = sorted(os.listdir(segmented_path))
+        num_segments = len(segment_files)
+        # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
+        similarity = []
+        for reference_path in reference_pathes:
+            ref_similarity = []
+            for file in segment_files:
+                segment_file = os.path.join(segmented_path, file)
+                sim = self.calculate_similarity(segment_file, reference_path)
+                ref_similarity.append(sim)
+            similarity.append(ref_similarity)
+        # 転置行列を作成 (rows: segment, columns: reference)
+        similarity_transposed = []
+        for seg_idx in range(num_segments):
+            seg_sim = []
+            for ref_idx in range(len(reference_pathes)):
+                seg_sim.append(similarity[ref_idx][seg_idx])
+            similarity_transposed.append(seg_sim)
+        # 各セグメントについて、最も高い類似度のリファレンスを選択
+        best_matches = []
+        for seg_sim in similarity_transposed:
+            best_ref = np.argmax(seg_sim)  # 最も類似度の高いリファレンスのインデックス
+            # 閾値チェック (必要に応じて)
+            if seg_sim[best_ref] < threshold:
+                best_matches.append(None)  # 閾値未満の場合はマッチなしとする
+            else:
+                best_matches.append(best_ref)
+        # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
+        matched_time = [0] * len(reference_pathes)
+        for match in best_matches:
+            if match is not None:
+                matched_time[match] += seg_duration
+        return matched_time