Spaces:

Justtalk
/

JusTalk

Running

File size: 5,711 Bytes

from flask import Flask, request, jsonify, render_template, send_from_directory
import base64
import os
import shutil
import numpy as np
import string
import random
from datetime import datetime
from pyannote.audio import Model, Inference
from pydub import AudioSegment

# Read the Hugging Face access token from the HF environment variable.
hf_token = os.environ.get("HF")
if not hf_token:
    # The message names the variable that is actually read (HF) so the
    # operator knows what to set; an empty value is treated as missing.
    raise ValueError("環境変数 HF（Hugging Face のトークン）が設定されていません。")

# Create the model cache directory under /tmp (chosen as a writable location).
cache_dir = "/tmp/hf_cache"
os.makedirs(cache_dir, exist_ok=True)

# Load the pyannote embedding model and wrap it in an Inference object that
# the similarity helpers call with audio file paths.
model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
inference = Inference(model)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    Each vector is scaled to unit length and the dot product of the two
    unit vectors is returned.  A vector whose norm is zero has no direction,
    so the similarity is defined as 0.0 in that case instead of dividing by
    zero and propagating NaN to the caller.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, with the same number of elements as ``vec1``.

    Returns:
        The dot product of the normalized vectors, or 0.0 when either
        vector has zero norm.
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(vec1 / norm1, vec2 / norm2)

def segment_audio(path, target_path='/tmp/setup_voice', seg_duration=1.0):
    """Split an audio file into consecutive fixed-length WAV segments.

    The segments are written to ``target_path`` as ``0.wav``, ``1.wav``, ...
    in playback order; the last segment may be shorter than ``seg_duration``.
    Numbered ``.wav`` files already present in ``target_path`` are removed
    first, so the directory only ever holds the segments of the most recent
    call (callers list the directory to find the segments).

    Args:
        path: Path of the audio file to split.
        target_path: Directory that receives the segment files; created if
            it does not exist.
        seg_duration: Segment length in seconds.

    Returns:
        A tuple ``(target_path, duration_ms)`` where ``duration_ms`` is the
        total length of the source audio in milliseconds.

    Raises:
        ValueError: If ``seg_duration`` is shorter than one millisecond
            (including zero or negative values).
    """
    seg_duration_ms = int(seg_duration * 1000)
    if seg_duration_ms <= 0:
        # A zero step would make range() fail with an unrelated message and
        # a negative one would silently produce no segments.
        raise ValueError("seg_duration must be at least 0.001 seconds")

    # Load the source before touching the target directory so a bad input
    # file does not wipe the previous segments.
    base_sound = AudioSegment.from_file(path)
    duration_ms = len(base_sound)

    os.makedirs(target_path, exist_ok=True)
    # Drop numbered segments left over from an earlier call; otherwise a
    # shorter recording would be analysed together with stale segments of a
    # longer one.
    for name in os.listdir(target_path):
        stem, ext = os.path.splitext(name)
        if ext == '.wav' and stem.isdigit():
            os.remove(os.path.join(target_path, name))

    for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
        end = min(start + seg_duration_ms, duration_ms)
        segment = base_sound[start:end]
        segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")

    return target_path, duration_ms

def calculate_similarity(path1, path2):
    """Return the cosine similarity between the embeddings of two audio files.

    Each path is passed to the module-level ``inference`` object, the
    ``data`` of each result is flattened to one dimension, and the two
    flattened vectors are compared with ``cosine_similarity``.

    Args:
        path1: Path of the first audio file.
        path2: Path of the second audio file.

    Returns:
        The similarity as a built-in ``float``.
    """
    # NOTE(review): flattening assumes both results hold the same number of
    # elements — confirm this holds for inputs of different duration.
    first_result = inference(path1)
    second_result = inference(path2)
    flat_first = first_result.data.flatten()
    flat_second = second_result.data.flatten()
    score = cosine_similarity(flat_first, flat_second)
    return float(score)

def process_audio(reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
    """Measure how much of an input recording matches a reference voice.

    The input file is split into ``seg_duration``-second segments with
    ``segment_audio``.  Every file found in the segment directory is embedded
    and compared with the reference embedding; segments whose similarity is
    strictly greater than ``threshold`` are copied into ``output_folder`` and
    their lengths are summed.  Files already present in ``output_folder``
    are not removed.

    Args:
        reference_path: Path of the reference audio file.
        input_path: Path of the audio file to analyse.
        output_folder: Directory that receives copies of matching segments;
            created if it does not exist.
        seg_duration: Segment length in seconds.
        threshold: Similarity a segment must exceed to count as a match.

    Returns:
        A tuple ``(matched_time_ms, unmatched_time_ms)``; the second value is
        the total input length minus the matched time.
    """
    os.makedirs(output_folder, exist_ok=True)
    segmented_path, total_duration_ms = segment_audio(input_path, seg_duration=seg_duration)

    # The reference embedding is identical for every segment, so compute it
    # once instead of re-running the model on the reference per segment.
    reference_vector = inference(reference_path).data.flatten()

    matched_time_ms = 0
    for file in sorted(os.listdir(segmented_path)):
        segment_file = os.path.join(segmented_path, file)
        segment_vector = inference(segment_file).data.flatten()
        similarity = float(cosine_similarity(segment_vector, reference_vector))
        if similarity > threshold:
            shutil.copy(segment_file, output_folder)
            matched_time_ms += len(AudioSegment.from_file(segment_file))

    unmatched_time_ms = total_duration_ms - matched_time_ms
    return matched_time_ms, unmatched_time_ms

def generate_random_string(length):
    """Return ``length`` random characters drawn from ASCII letters and digits.

    Characters are picked one at a time with ``random.choice``; a ``length``
    of zero or less yields an empty string.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def generate_filename(random_length):
    """Build a ``.wav`` file name from the current time and a random suffix.

    The name has the form ``<YYYYmmddHHMMSS>_<random>.wav`` where the random
    part comes from ``generate_random_string(random_length)`` and the
    timestamp from ``datetime.now()``.
    """
    suffix = generate_random_string(random_length)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    return f"{timestamp}_{suffix}.wav"

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)

# Top page (template: index.html)
@app.route('/')
@app.route('/index', methods=['GET', 'POST'])
def index():
    """Render ``index.html`` for ``/`` and for GET/POST requests to ``/index``."""
    page = render_template('index.html')
    return page

# Feedback page (template: feedback.html)
@app.route('/feedback', methods=['GET', 'POST'])
def feedback():
    """Render ``feedback.html`` for GET/POST requests to ``/feedback``."""
    page = render_template('feedback.html')
    return page

# Conversation detail page (template: talkDetail.html)
@app.route('/talk_detail', methods=['GET', 'POST'])
def talk_detail():
    """Render ``talkDetail.html`` for GET/POST requests to ``/talk_detail``."""
    page = render_template('talkDetail.html')
    return page

# Audio upload & analysis endpoint
@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Accept a Base64-encoded recording and report how much of it matches.

    Expects a JSON object with an ``audio_data`` key holding the Base64
    audio.  The decoded bytes are written to ``/tmp/data/recorded_audio.wav``
    and compared segment by segment with ``./sample.wav`` via
    ``process_audio`` (threshold 0.1).

    Returns:
        ``({"rate": <percentage of matched time>}, 200)`` on success;
        ``400`` with an ``error`` message when the body is not a JSON object,
        lacks ``audio_data`` or the value is not valid Base64;
        ``500`` when the reference file is missing or processing fails.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None, so it is answered with 400 below instead of being caught by
        # the blanket handler and reported as a server error.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'audio_data' not in data:
            return jsonify({"error": "音声データがありません"}), 400

        # Decode the Base64 payload into the raw audio bytes; an undecodable
        # payload is the client's fault, so answer 400 rather than 500.
        try:
            audio_binary = base64.b64decode(data['audio_data'])
        except (ValueError, TypeError):
            return jsonify({"error": "音声データの形式が不正です"}), 400

        audio_dir = "/tmp/data"
        os.makedirs(audio_dir, exist_ok=True)
        # Fixed file name (generate_filename() can be used instead if a
        # unique name per upload is needed).
        audio_path = os.path.join(audio_dir, "recorded_audio.wav")
        with open(audio_path, 'wb') as f:
            f.write(audio_binary)

        # Path of the reference recording (sample.wav must be placed in the
        # working directory).
        reference_audio = os.path.abspath('./sample.wav')
        if not os.path.exists(reference_audio):
            return jsonify({"error": "参照音声ファイルが見つかりません", "details": reference_audio}), 500

        # Compare every segment of the upload with the reference voice.
        # The threshold is tunable (0.1 here).
        matched_time, unmatched_time = process_audio(reference_audio, audio_path, threshold=0.1)
        total_time = matched_time + unmatched_time
        rate = (matched_time / total_time) * 100 if total_time > 0 else 0

        return jsonify({"rate": rate}), 200
    except Exception as e:
        # Top-level boundary: log the failure and return it as a JSON error.
        print("Error in /upload_audio:", str(e))
        return jsonify({"error": "サーバーエラー", "details": str(e)}), 500
if __name__ == '__main__':
    # Listen on the port given by the PORT environment variable (default 7860).
    port = int(os.environ.get("PORT", 7860))
    # The interactive debugger lets anyone who can reach the server execute
    # code, so it must not be on by default while binding to 0.0.0.0.
    # Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(debug=debug, host="0.0.0.0", port=port)