# JusTalk / process.py
# buletomato25
# docker-setup
# 98c4a68
# raw
# history blame
# 3.95 kB
import os
import shutil
import numpy as np
import string
import random
from datetime import datetime
from pyannote.audio import Model, Inference
from pydub import AudioSegment
class AudioProcessor:
    """Split audio into fixed-length segments and compare them to a reference voice.

    Embeddings are produced by a pyannote ``Inference`` object wrapping the
    ``pyannote/embedding`` model; segments are compared by cosine similarity.
    """

    def __init__(self, cache_dir="/tmp/hf_cache"):
        """Load the pyannote embedding model.

        Args:
            cache_dir: Directory passed to ``Model.from_pretrained`` as the
                model cache; created if it does not exist.

        Raises:
            ValueError: If the ``HUGGINGFACE_HUB_TOKEN`` environment variable
                is not set.
        """
        hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
        if hf_token is None:
            raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
        os.makedirs(cache_dir, exist_ok=True)
        # Load the pyannote model.
        model = Model.from_pretrained(
            "pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir
        )
        self.inference = Inference(model)

    @staticmethod
    def _reset_directory(directory):
        """Create *directory* if missing, else delete the regular files directly in it.

        Sub-directories are left untouched.
        """
        if os.path.exists(directory):
            for name in os.listdir(directory):
                entry = os.path.join(directory, name)
                if os.path.isfile(entry):
                    os.remove(entry)
        else:
            os.makedirs(directory, exist_ok=True)

    @staticmethod
    def _list_numbered_segments(directory):
        """Return ``(index, filename)`` pairs for ``<index>.wav`` files, by ascending index.

        Sorting on the integer index avoids the lexicographic order of plain
        ``sorted(os.listdir(...))`` (where "10.wav" precedes "2.wav"), and
        anything that is not a numbered WAV file is skipped.
        """
        entries = []
        for name in os.listdir(directory):
            stem, ext = os.path.splitext(name)
            is_numbered_wav = ext == ".wav" and stem.isascii() and stem.isdigit()
            if is_numbered_wav and os.path.isfile(os.path.join(directory, name)):
                entries.append((int(stem), name))
        return sorted(entries)

    @staticmethod
    def _unpadded_duration_ms(index, seg_duration_ms, total_duration_ms):
        """Return how many ms of real (non-padding) audio segment *index* holds.

        Every segment but the last holds ``seg_duration_ms`` of audio; the last
        one holds whatever remains of ``total_duration_ms``.
        """
        remaining = total_duration_ms - index * seg_duration_ms
        return max(0, min(seg_duration_ms, remaining))

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two 1-D vectors.

        Args:
            vec1: First vector (array-like of numbers).
            vec2: Second vector, same length as ``vec1``.

        Returns:
            The dot product of the two L2-normalised vectors, or ``0.0`` when
            either vector has zero norm (instead of NaN from a 0/0 division).
        """
        vec1 = np.asarray(vec1, dtype=float)
        vec2 = np.asarray(vec2, dtype=float)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return np.dot(vec1 / norm1, vec2 / norm2)

    def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
        """Split an audio file into equal-length WAV segments.

        Segments are written to ``target_path`` as ``0.wav``, ``1.wav``, ...
        Existing regular files in ``target_path`` are removed first.

        Args:
            path: Audio file to split.
            target_path: Output directory for the segments.
            seg_duration: Segment length in seconds.

        Returns:
            Tuple ``(target_path, duration_ms)`` where ``duration_ms`` is the
            length of the input audio in milliseconds.

        Raises:
            ValueError: If ``seg_duration`` is shorter than one millisecond.
        """
        seg_duration_ms = int(seg_duration * 1000)
        if seg_duration_ms <= 0:
            # A zero step makes range() fail with an unrelated message and a
            # negative one silently yields no segments; reject both up front.
            raise ValueError("seg_duration must be at least 0.001 seconds")

        # Clear the output directory if it exists, otherwise create it.
        self._reset_directory(target_path)

        base_sound = AudioSegment.from_file(path)
        duration_ms = len(base_sound)

        for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
            end = min(start + seg_duration_ms, duration_ms)
            segment = base_sound[start:end]
            # Pad a short (final) segment with silence up to the requested length.
            shortfall_ms = seg_duration_ms - len(segment)
            if shortfall_ms > 0:
                # Build the silence at the source frame rate so that appending
                # it does not change the sample rate of the real audio.
                silence = AudioSegment.silent(
                    duration=shortfall_ms, frame_rate=base_sound.frame_rate
                )
                segment = segment + silence
            segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")

        return target_path, duration_ms

    def calculate_similarity(self, path1, path2):
        """Return the cosine similarity between the embeddings of two audio files.

        NOTE(review): assumes ``self.inference(path)`` returns an object whose
        ``.data`` is an array and that both flattened embeddings have the same
        length - confirm against the pyannote ``Inference`` window settings.
        """
        embedding1 = self.inference(path1)
        embedding2 = self.inference(path2)
        return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))

    def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
        """Measure how much of an input recording matches a reference voice.

        The input is split with ``segment_audio`` (using its default target
        directory). Each segment is compared to ``reference_path``, and segments
        whose similarity exceeds ``threshold`` are copied to ``output_folder``.

        Args:
            reference_path: Audio file of the reference voice.
            input_path: Audio file to analyse.
            output_folder: Directory receiving the matched segments; existing
                regular files in it are removed first.
            seg_duration: Segment length in seconds.
            threshold: Similarity a segment must exceed to count as matched.

        Returns:
            Tuple ``(matched_time_ms, unmatched_time_ms)``. Only real audio is
            counted, not the silence padding on the final segment, so the two
            values always sum to the input duration.
        """
        # Clear the output directory if it exists, otherwise create it.
        self._reset_directory(output_folder)

        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
        seg_duration_ms = int(seg_duration * 1000)

        matched_time_ms = 0
        for index, name in self._list_numbered_segments(segmented_path):
            segment_file = os.path.join(segmented_path, name)
            similarity = self.calculate_similarity(segment_file, reference_path)
            if similarity > threshold:
                shutil.copy(segment_file, output_folder)
                # Count the segment's real audio only; the exported file may
                # include padding that is not part of the input.
                matched_time_ms += self._unpadded_duration_ms(
                    index, seg_duration_ms, total_duration_ms
                )

        unmatched_time_ms = total_duration_ms - matched_time_ms
        return matched_time_ms, unmatched_time_ms

    def generate_random_string(self, length):
        """Return a random string of ``length`` ASCII letters and digits."""
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return ``<YYYYmmddHHMMSS>_<random>.wav`` using the current local time.

        Args:
            random_length: Number of random characters in the suffix.
        """
        random_string = self.generate_random_string(random_length)
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        return f"{current_time}_{random_string}.wav"