Spaces:
Running
Running
import os | |
import shutil | |
import numpy as np | |
import string | |
import random | |
from datetime import datetime | |
from pyannote.audio import Model, Inference | |
from pydub import AudioSegment | |
class AudioProcessor(): | |
def __init__(self,cache_dir = "/tmp/hf_cache"): | |
hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN") | |
if hf_token is None: | |
raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。") | |
os.makedirs(cache_dir, exist_ok=True) | |
# pyannote モデルの読み込み | |
model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir) | |
self.inference = Inference(model) | |
def cosine_similarity(self,vec1, vec2): | |
vec1 = vec1 / np.linalg.norm(vec1) | |
vec2 = vec2 / np.linalg.norm(vec2) | |
return np.dot(vec1, vec2) | |
def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0): | |
# 出力先ディレクトリが存在していれば中身をクリアする | |
if os.path.exists(target_path): | |
for file in os.listdir(target_path): | |
file_path = os.path.join(target_path, file) | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
else: | |
os.makedirs(target_path, exist_ok=True) | |
base_sound = AudioSegment.from_file(path) | |
duration_ms = len(base_sound) | |
seg_duration_ms = int(seg_duration * 1000) | |
for i, start in enumerate(range(0, duration_ms, seg_duration_ms)): | |
end = min(start + seg_duration_ms, duration_ms) | |
segment = base_sound[start:end] | |
# セグメントが指定長さに満たない場合、無音でパディングする | |
if len(segment) < seg_duration_ms: | |
silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment))) | |
segment = segment + silence | |
segment.export(os.path.join(target_path, f'{i}.wav'), format="wav") | |
return target_path, duration_ms | |
def calculate_similarity(self,path1, path2): | |
embedding1 = self.inference(path1) | |
embedding2 = self.inference(path2) | |
return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten())) | |
def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5): | |
# 出力先ディレクトリの中身をクリアする | |
if os.path.exists(output_folder): | |
for file in os.listdir(output_folder): | |
file_path = os.path.join(output_folder, file) | |
if os.path.isfile(file_path): | |
os.remove(file_path) | |
else: | |
os.makedirs(output_folder, exist_ok=True) | |
segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration) | |
matched_time_ms = 0 | |
for file in sorted(os.listdir(segmented_path)): | |
segment_file = os.path.join(segmented_path, file) | |
similarity = self.calculate_similarity(segment_file, reference_path) | |
if similarity > threshold: | |
shutil.copy(segment_file, output_folder) | |
matched_time_ms += len(AudioSegment.from_file(segment_file)) | |
unmatched_time_ms = total_duration_ms - matched_time_ms | |
return matched_time_ms, unmatched_time_ms | |
def generate_random_string(self,length): | |
letters = string.ascii_letters + string.digits | |
return ''.join(random.choice(letters) for i in range(length)) | |
def generate_filename(self,random_length): | |
random_string = self.generate_random_string(random_length) | |
current_time = datetime.now().strftime("%Y%m%d%H%M%S") | |
filename = f"{current_time}_{random_string}.wav" | |
return filename | |