Spaces:

Justtalk
/

JusTalk

Running

App Files Community

rein0421 committed on Mar 16

Commit

99daaaf

verified ·

1 Parent(s): 3b67cd9

Upload 5 files

Browse files

Files changed (3) hide show

Dockerfile +3 -7
app.py +4 -77
process.py +81 -0

Dockerfile CHANGED Viewed

@@ -12,11 +12,7 @@ RUN apt-get update && \
 RUN python3 -m pip install --upgrade pip
 WORKDIR /app
-COPY requirements.txt .
-# requirements.txtのインストール
-RUN python3 -m pip install --no-cache-dir -r requirements.txt
-COPY . .
-CMD ["python3", "app.py"]

 RUN python3 -m pip install --upgrade pip
 WORKDIR /app
+# requirements.txt をコンテナ内にコピーして、必要なパッケージをインストール
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt

app.py CHANGED Viewed

@@ -1,84 +1,11 @@
 from flask import Flask, request, jsonify, render_template, send_from_directory
 import base64
 import os
 import shutil
-import numpy as np
-import string
-import random
-from datetime import datetime
-from pyannote.audio import Model, Inference
-from pydub import AudioSegment
-# Hugging Face のトークン取得（環境変数 HF に設定）
-hf_token = os.environ.get("HF")
-if hf_token is None:
-    raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
-# キャッシュディレクトリの作成（書き込み可能な /tmp を利用）
-cache_dir = "/tmp/hf_cache"
-os.makedirs(cache_dir, exist_ok=True)
-# pyannote モデルの読み込み
-model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
-inference = Inference(model)
-def cosine_similarity(vec1, vec2):
-    vec1 = vec1 / np.linalg.norm(vec1)
-    vec2 = vec2 / np.linalg.norm(vec2)
-    return np.dot(vec1, vec2)
-def segment_audio(path, target_path='/tmp/setup_voice', seg_duration=1.0):
-    """
-    音声を指定秒数ごとに分割する。
-    target_path に分割したファイルを保存し、元の音声の総長（ミリ秒）を返す。
-    """
-    os.makedirs(target_path, exist_ok=True)
-    base_sound = AudioSegment.from_file(path)
-    duration_ms = len(base_sound)
-    seg_duration_ms = int(seg_duration * 1000)
-    for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
-        end = min(start + seg_duration_ms, duration_ms)
-        segment = base_sound[start:end]
-        segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
-    return target_path, duration_ms
-def calculate_similarity(path1, path2):
-    embedding1 = inference(path1)
-    embedding2 = inference(path2)
-    return float(cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
-def process_audio(reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
-    """
-    入力音声ファイルを seg_duration 秒ごとに分割し、各セグメントと参照音声の類似度を計算。
-    類似度が threshold を超えたセグメントを output_folder にコピーし、マッチした時間（ms）と
-    マッチしなかった時間（ms）を返す。
-    """
-    os.makedirs(output_folder, exist_ok=True)
-    segmented_path, total_duration_ms = segment_audio(input_path, seg_duration=seg_duration)
-    matched_time_ms = 0
-    for file in sorted(os.listdir(segmented_path)):
-        segment_file = os.path.join(segmented_path, file)
-        similarity = calculate_similarity(segment_file, reference_path)
-        if similarity > threshold:
-            shutil.copy(segment_file, output_folder)
-            matched_time_ms += len(AudioSegment.from_file(segment_file))
-    unmatched_time_ms = total_duration_ms - matched_time_ms
-    return matched_time_ms, unmatched_time_ms
-def generate_random_string(length):
-    letters = string.ascii_letters + string.digits
-    return ''.join(random.choice(letters) for i in range(length))
-def generate_filename(random_length):
-    random_string = generate_random_string(random_length)
-    current_time = datetime.now().strftime("%Y%m%d%H%M%S")
-    filename = f"{current_time}_{random_string}.wav"
-    return filename
 app = Flask(__name__)
 # トップページ（テンプレート: index.html）
@@ -121,7 +48,7 @@ def upload_audio():
         # 音声解析：参照音声とアップロードされた音声との類似度をセグメント毎に計算
         # threshold の値は調整可能です（例: 0.1）
-        matched_time, unmatched_time = process_audio(reference_audio, audio_path, threshold=0.1)
         total_time = matched_time + unmatched_time
         rate = (matched_time / total_time) * 100 if total_time > 0 else 0

 from flask import Flask, request, jsonify, render_template, send_from_directory
 import base64
 import os
 import shutil
+from process import AudioProcessor
+process=AudioProcessor()
 app = Flask(__name__)
 # トップページ（テンプレート: index.html）
         # 音声解析：参照音声とアップロードされた音声との類似度をセグメント毎に計算
         # threshold の値は調整可能です（例: 0.1）
+        matched_time, unmatched_time = process.process_audio(reference_audio, audio_path, threshold=0.1)
         total_time = matched_time + unmatched_time
         rate = (matched_time / total_time) * 100 if total_time > 0 else 0

process.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import os
+import shutil
+import numpy as np
+import string
+import random
+from datetime import datetime
+from pyannote.audio import Model, Inference
+from pydub import AudioSegment
+class AudioProcessor():
+    def __init__(self,cache_dir = "/tmp/hf_cache"):
+        hf_token = os.environ.get("HF")
+        if hf_token is None:
+            print('3')
+            raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
+        os.makedirs(cache_dir, exist_ok=True)
+        # pyannote モデルの読み込み
+        model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
+        self.inference = Inference(model)
+    def cosine_similarity(self,vec1, vec2):
+        vec1 = vec1 / np.linalg.norm(vec1)
+        vec2 = vec2 / np.linalg.norm(vec2)
+        return np.dot(vec1, vec2)
+    def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
+        """
+        音声を指定秒数ごとに分割し、短いセグメントには無音をパディングする。
+        """
+        os.makedirs(target_path, exist_ok=True)
+        base_sound = AudioSegment.from_file(path)
+        duration_ms = len(base_sound)
+        seg_duration_ms = int(seg_duration * 1000)
+        for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
+            end = min(start + seg_duration_ms, duration_ms)
+            segment = base_sound[start:end]
+            # セグメントが指定長さに満たない場合、無音でパディングする
+            if len(segment) < seg_duration_ms:
+                silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment)))
+                segment = segment + silence
+            segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
+        return target_path, duration_ms
+    def calculate_similarity(self,path1, path2):
+        embedding1 = self.inference(path1)
+        embedding2 = self.inference(path2)
+        return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
+    def process_audio(self,reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
+        """
+        入力音声ファイルを seg_duration 秒ごとに分割し、各セグメントと参照音声の類似度を計算。
+        類似度が threshold を超えたセグメントを output_folder にコピーし、マッチした時間（ms）と
+        マッチしなかった時間（ms）を返す。
+        """
+        os.makedirs(output_folder, exist_ok=True)
+        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
+        matched_time_ms = 0
+        for file in sorted(os.listdir(segmented_path)):
+            segment_file = os.path.join(segmented_path, file)
+            similarity = self.calculate_similarity(segment_file, reference_path)
+            if similarity > threshold:
+                shutil.copy(segment_file, output_folder)
+                matched_time_ms += len(AudioSegment.from_file(segment_file))
+        unmatched_time_ms = total_duration_ms - matched_time_ms
+        return matched_time_ms, unmatched_time_ms
+    def generate_random_string(self,length):
+        letters = string.ascii_letters + string.digits
+        return ''.join(random.choice(letters) for i in range(length))
+    def generate_filename(self,random_length):
+        random_string = self.generate_random_string(random_length)
+        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
+        filename = f"{current_time}_{random_string}.wav"
+        return filename