Spaces:
Running
Running
File size: 3,954 Bytes
99daaaf 98c4a68 99daaaf db2374b 99daaaf db2374b 99daaaf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import os
import shutil
import numpy as np
import string
import random
from datetime import datetime
from pyannote.audio import Model, Inference
from pydub import AudioSegment
class AudioProcessor:
    """Split audio into fixed-length segments and match them to a reference voice.

    Embeddings come from the ``pyannote/embedding`` model; two recordings are
    compared through the cosine similarity of their flattened embeddings.
    """

    def __init__(self, cache_dir="/tmp/hf_cache"):
        """Load the pyannote embedding model.

        Args:
            cache_dir: Directory passed to ``Model.from_pretrained`` as its
                cache location. Created if it does not exist.

        Raises:
            ValueError: If the ``HUGGINGFACE_HUB_TOKEN`` environment variable
                is not set.
        """
        hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
        if hf_token is None:
            raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
        os.makedirs(cache_dir, exist_ok=True)
        # Load the pyannote model.
        model = Model.from_pretrained(
            "pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir
        )
        self.inference = Inference(model)

    @staticmethod
    def _reset_directory(path):
        """Ensure *path* exists and contains no regular files.

        Only plain files directly inside *path* are removed; sub-directories
        are left untouched. A missing directory is created.
        """
        if os.path.exists(path):
            for entry in os.listdir(path):
                entry_path = os.path.join(path, entry)
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
        else:
            os.makedirs(path, exist_ok=True)

    @staticmethod
    def _to_segment_ms(seg_duration):
        """Convert a segment length in seconds to whole milliseconds.

        Raises:
            ValueError: If the length truncates to less than one millisecond.
        """
        seg_duration_ms = int(seg_duration * 1000)
        if seg_duration_ms <= 0:
            raise ValueError(
                f"seg_duration must be at least 1 ms, got {seg_duration!r}"
            )
        return seg_duration_ms

    @staticmethod
    def _segment_spans(total_duration_ms, seg_duration_ms):
        """Return ``(index, start_ms, end_ms)`` for every segment of the audio.

        ``end_ms`` is clamped to ``total_duration_ms``, so the final span
        reflects the real (unpadded) amount of audio it covers.

        Raises:
            ValueError: If ``seg_duration_ms`` is not positive.
        """
        if seg_duration_ms <= 0:
            raise ValueError(
                f"seg_duration_ms must be positive, got {seg_duration_ms!r}"
            )
        return [
            (index, start, min(start + seg_duration_ms, total_duration_ms))
            for index, start in enumerate(
                range(0, total_duration_ms, seg_duration_ms)
            )
        ]

    def _embed(self, path):
        """Return the flattened embedding that ``self.inference`` yields for *path*."""
        return self.inference(path).data.flatten()

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors.

        Returns ``0.0`` when either vector has zero norm, instead of the NaN
        that a division by zero would produce.
        """
        vec1 = np.asarray(vec1)
        vec2 = np.asarray(vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return np.dot(vec1 / norm1, vec2 / norm2)

    def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
        """Split the audio at *path* into equal-length WAV segments.

        Segments are written to *target_path* as ``0.wav``, ``1.wav``, ... after
        existing files in that directory are removed. A final segment shorter
        than the segment length is padded with silence.

        Args:
            path: Audio file to split.
            target_path: Output directory for the segment files.
            seg_duration: Segment length in seconds.

        Returns:
            A ``(target_path, duration_ms)`` tuple, where ``duration_ms`` is the
            length of the source audio in milliseconds (without padding).

        Raises:
            ValueError: If *seg_duration* is shorter than one millisecond.
        """
        seg_duration_ms = self._to_segment_ms(seg_duration)
        # Clear the output directory if it already exists.
        self._reset_directory(target_path)

        base_sound = AudioSegment.from_file(path)
        duration_ms = len(base_sound)
        for index, start, end in self._segment_spans(duration_ms, seg_duration_ms):
            segment = base_sound[start:end]
            # Pad with silence when the segment is shorter than requested.
            shortfall_ms = seg_duration_ms - len(segment)
            if shortfall_ms > 0:
                segment = segment + AudioSegment.silent(duration=shortfall_ms)
            segment.export(os.path.join(target_path, f'{index}.wav'), format="wav")
        return target_path, duration_ms

    def calculate_similarity(self, path1, path2):
        """Return the cosine similarity (as ``float``) of two audio files' embeddings."""
        return float(self.cosine_similarity(self._embed(path1), self._embed(path2)))

    def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
        """Measure how much of *input_path* matches the reference voice.

        The input is segmented with :meth:`segment_audio`; every segment whose
        similarity to *reference_path* exceeds *threshold* is copied into
        *output_folder* (which is cleared of files first).

        Args:
            reference_path: Audio file of the reference voice.
            input_path: Audio file to analyse.
            output_folder: Directory receiving the matching segment files.
            seg_duration: Segment length in seconds.
            threshold: Similarity above which a segment counts as a match.

        Returns:
            A ``(matched_time_ms, unmatched_time_ms)`` tuple. Times are based on
            the real audio covered by each segment, so silence padding on the
            last segment is not counted and the two values always sum to the
            input's duration.

        Raises:
            ValueError: If *seg_duration* is shorter than one millisecond.
        """
        seg_duration_ms = self._to_segment_ms(seg_duration)
        # Clear the contents of the output directory.
        self._reset_directory(output_folder)

        segmented_path, total_duration_ms = self.segment_audio(
            input_path, seg_duration=seg_duration
        )
        # The reference embedding does not change between segments: compute once.
        reference_embedding = self._embed(reference_path)

        matched_time_ms = 0
        for index, start, end in self._segment_spans(total_duration_ms, seg_duration_ms):
            segment_file = os.path.join(segmented_path, f'{index}.wav')
            similarity = float(
                self.cosine_similarity(self._embed(segment_file), reference_embedding)
            )
            if similarity > threshold:
                shutil.copy(segment_file, output_folder)
                # Count real audio only; the exported file may include padding.
                matched_time_ms += end - start
        unmatched_time_ms = total_duration_ms - matched_time_ms
        return matched_time_ms, unmatched_time_ms

    def generate_random_string(self, length):
        """Return *length* random ASCII letters and digits."""
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return ``<YYYYmmddHHMMSS>_<random>.wav`` using the current local time."""
        random_string = self.generate_random_string(random_length)
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        return f"{current_time}_{random_string}.wav"
|