buletomato25 commited on
Commit
f009a0b
·
2 Parent(s): 4fe2183 1e4ae9f
Files changed (4) hide show
  1. app.py +2 -2
  2. process.py +145 -145
  3. requirements.txt +1 -0
  4. transcription.py +47 -0
app.py CHANGED
@@ -19,7 +19,7 @@ from google.auth.transport import requests as google_requests
19
  # Hugging Face のトークン取得(環境変数 HF に設定)
20
  #hf_token = os.environ.get("HF")
21
  load_dotenv()
22
- hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
23
  if hf_token is None:
24
  raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
25
 
@@ -39,7 +39,7 @@ app.config['SECRET_KEY'] = os.urandom(24)
39
  os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
40
  GOOGLE_CLIENT_ID = "228160683186-6u7986qsfhcv3kd9iqtv08iphpl4gdk2.apps.googleusercontent.com"
41
  GOOGLE_CLIENT_SECRET = "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"
42
- REDIRECT_URI = "https://huggingface.co/spaces/Justtalk/JusTalk/callbacck"
43
 
44
  flow = Flow.from_client_secrets_file(
45
  'client_secret.json',
 
19
  # Hugging Face のトークン取得(環境変数 HF に設定)
20
  #hf_token = os.environ.get("HF")
21
  load_dotenv()
22
+ hf_token = os.getenv("HF")
23
  if hf_token is None:
24
  raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
25
 
 
39
  os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
40
  GOOGLE_CLIENT_ID = "228160683186-6u7986qsfhcv3kd9iqtv08iphpl4gdk2.apps.googleusercontent.com"
41
  GOOGLE_CLIENT_SECRET = "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"
42
+ REDIRECT_URI = "https://huggingface.co/spaces/Justtalk/JusTalk/callback"
43
 
44
  flow = Flow.from_client_secrets_file(
45
  'client_secret.json',
process.py CHANGED
@@ -1,145 +1,145 @@
1
-
2
- import os
3
- import shutil
4
- import numpy as np
5
- import string
6
- import random
7
- from datetime import datetime
8
- from pyannote.audio import Model, Inference
9
- from pydub import AudioSegment
10
-
11
- class AudioProcessor():
12
- def __init__(self,cache_dir = "/tmp/hf_cache"):
13
- hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
14
- if hf_token is None:
15
- raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
16
- os.makedirs(cache_dir, exist_ok=True)
17
- # pyannote モデルの読み込み
18
- model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
19
- self.inference = Inference(model)
20
-
21
-
22
- def cosine_similarity(self,vec1, vec2):
23
- vec1 = vec1 / np.linalg.norm(vec1)
24
- vec2 = vec2 / np.linalg.norm(vec2)
25
- return np.dot(vec1, vec2)
26
-
27
- def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
28
- # 出力先ディレクトリが存在していれば中身をクリアする
29
- if os.path.exists(target_path):
30
- for file in os.listdir(target_path):
31
- file_path = os.path.join(target_path, file)
32
- if os.path.isfile(file_path):
33
- os.remove(file_path)
34
- else:
35
- os.makedirs(target_path, exist_ok=True)
36
-
37
- base_sound = AudioSegment.from_file(path)
38
- duration_ms = len(base_sound)
39
- seg_duration_ms = int(seg_duration * 1000)
40
-
41
- for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
42
- end = min(start + seg_duration_ms, duration_ms)
43
- segment = base_sound[start:end]
44
- # セグメントが指定長さに満たない場合、無音でパディングする
45
- if len(segment) < seg_duration_ms:
46
- silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment)))
47
- segment = segment + silence
48
-
49
- segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
50
-
51
- return target_path, duration_ms
52
-
53
-
54
- def calculate_similarity(self,path1, path2):
55
- embedding1 = self.inference(path1)
56
- embedding2 = self.inference(path2)
57
- return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
58
-
59
- def generate_random_string(self,length):
60
- letters = string.ascii_letters + string.digits
61
- return ''.join(random.choice(letters) for i in range(length))
62
-
63
- def generate_filename(self,random_length):
64
- random_string = self.generate_random_string(random_length)
65
- current_time = datetime.now().strftime("%Y%m%d%H%M%S")
66
- filename = f"{current_time}_{random_string}.wav"
67
- return filename
68
-
69
- def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
70
- # 出力先ディレクトリの中身をクリアする
71
- if os.path.exists(output_folder):
72
- for file in os.listdir(output_folder):
73
- file_path = os.path.join(output_folder, file)
74
- if os.path.isfile(file_path):
75
- os.remove(file_path)
76
- else:
77
- os.makedirs(output_folder, exist_ok=True)
78
-
79
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
80
-
81
- matched_time_ms = 0
82
- for file in sorted(os.listdir(segmented_path)):
83
- segment_file = os.path.join(segmented_path, file)
84
- similarity = self.calculate_similarity(segment_file, reference_path)
85
- if similarity > threshold:
86
- shutil.copy(segment_file, output_folder)
87
- matched_time_ms += len(AudioSegment.from_file(segment_file))
88
-
89
- unmatched_time_ms = total_duration_ms - matched_time_ms
90
- return matched_time_ms, unmatched_time_ms
91
-
92
-
93
- def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
94
- # 出力先ディレクトリの中身をクリアする
95
- if os.path.exists(output_folder):
96
- for file in os.listdir(output_folder):
97
- file_path = os.path.join(output_folder, file)
98
- if os.path.isfile(file_path):
99
- os.remove(file_path)
100
- else:
101
- os.makedirs(output_folder, exist_ok=True)
102
-
103
- # 入力音声をセグメントに分割
104
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
105
- segment_files = sorted(os.listdir(segmented_path))
106
- num_segments = len(segment_files)
107
-
108
- # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
109
- similarity = []
110
- for reference_path in reference_pathes:
111
- ref_similarity = []
112
- for file in segment_files:
113
- segment_file = os.path.join(segmented_path, file)
114
- sim = self.calculate_similarity(segment_file, reference_path)
115
- ref_similarity.append(sim)
116
- similarity.append(ref_similarity)
117
-
118
- # 転置行列を作成 (rows: segment, columns: reference)
119
- similarity_transposed = []
120
- for seg_idx in range(num_segments):
121
- seg_sim = []
122
- for ref_idx in range(len(reference_pathes)):
123
- seg_sim.append(similarity[ref_idx][seg_idx])
124
- similarity_transposed.append(seg_sim)
125
-
126
- # 各セグメントについて、最も高い類似度のリファレンスを選択
127
- best_matches = []
128
- for seg_sim in similarity_transposed:
129
- best_ref = np.argmax(seg_sim) # 最も類似度の高いリファレンスのインデックス
130
- # 閾値チェック (必要に応じて)
131
- if seg_sim[best_ref] < threshold:
132
- best_matches.append(None) # 閾値未満の場合はマッチなしとする
133
- else:
134
- best_matches.append(best_ref)
135
-
136
- # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
137
- matched_time = [0] * len(reference_pathes)
138
- for match in best_matches:
139
- if match is not None:
140
- matched_time[match] += seg_duration
141
-
142
- return matched_time
143
-
144
-
145
-
 
import os
import shutil
import numpy as np
import string
import random
from datetime import datetime
from pyannote.audio import Model, Inference
from pydub import AudioSegment


class AudioProcessor():
    """Speaker-similarity utilities built on a pyannote embedding model.

    Splits audio into fixed-length segments, embeds each segment, and
    measures cosine similarity against one or more reference recordings.
    """

    def __init__(self, cache_dir="/tmp/hf_cache"):
        # The Hugging Face token is read from the HF environment variable.
        hf_token = os.environ.get("HF")
        if hf_token is None:
            # Fix: the old message named HUGGINGFACE_HUB_TOKEN although the
            # code reads HF, which pointed users at the wrong variable.
            raise ValueError("HF が設定されていません。")
        os.makedirs(cache_dir, exist_ok=True)
        # Load the pyannote embedding model once; reused by every method.
        model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
        self.inference = Inference(model)

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors (each normalized first)."""
        vec1 = vec1 / np.linalg.norm(vec1)
        vec2 = vec2 / np.linalg.norm(vec2)
        return np.dot(vec1, vec2)

    def _reset_dir(self, path):
        """Delete every regular file in *path*, creating the directory if absent.

        Shared by segment_audio / process_audio / process_multi_audio, which
        previously each carried a copy of this loop.
        """
        if os.path.exists(path):
            for file in os.listdir(path):
                file_path = os.path.join(path, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        else:
            os.makedirs(path, exist_ok=True)

    def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
        """Split *path* into seg_duration-second WAV files under *target_path*.

        The trailing segment is padded with silence to the full length.
        Returns (target_path, total_duration_ms).
        """
        self._reset_dir(target_path)

        base_sound = AudioSegment.from_file(path)
        duration_ms = len(base_sound)
        seg_duration_ms = int(seg_duration * 1000)

        for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
            end = min(start + seg_duration_ms, duration_ms)
            segment = base_sound[start:end]
            # Pad with silence so every exported file has identical length.
            if len(segment) < seg_duration_ms:
                segment = segment + AudioSegment.silent(duration=seg_duration_ms - len(segment))
            segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")

        return target_path, duration_ms

    def calculate_similarity(self, path1, path2):
        """Embed both audio files and return their cosine similarity as a float."""
        embedding1 = self.inference(path1)
        embedding2 = self.inference(path2)
        return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))

    def generate_random_string(self, length):
        """Return a random alphanumeric string of the given length."""
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return a unique WAV filename of the form <timestamp>_<random>.wav."""
        random_string = self.generate_random_string(random_length)
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        return f"{current_time}_{random_string}.wav"

    def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
        """Copy segments of *input_path* that resemble *reference_path* into *output_folder*.

        Returns (matched_time_ms, unmatched_time_ms).
        """
        self._reset_dir(output_folder)

        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)

        matched_time_ms = 0
        for file in sorted(os.listdir(segmented_path)):
            segment_file = os.path.join(segmented_path, file)
            if self.calculate_similarity(segment_file, reference_path) > threshold:
                shutil.copy(segment_file, output_folder)
                matched_time_ms += len(AudioSegment.from_file(segment_file))

        return matched_time_ms, total_duration_ms - matched_time_ms

    def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
        """Assign each segment to its most similar reference; return per-reference matched time.

        NOTE: the returned times are in seconds (seg_duration per matched
        segment), unlike process_audio which reports milliseconds.
        """
        self._reset_dir(output_folder)

        # Split the input audio into fixed-length segments.
        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
        segment_files = sorted(os.listdir(segmented_path))

        # similarity[ref][seg] = similarity of segment seg to reference ref.
        similarity = []
        for reference_path in reference_pathes:
            ref_similarity = []
            for file in segment_files:
                segment_file = os.path.join(segmented_path, file)
                ref_similarity.append(self.calculate_similarity(segment_file, reference_path))
            similarity.append(ref_similarity)

        # Transpose to (rows: segment, columns: reference) — replaces the
        # previous hand-rolled double loop.
        similarity_transposed = [list(seg_sim) for seg_sim in zip(*similarity)]

        # For each segment pick the best reference, or None below threshold.
        best_matches = []
        for seg_sim in similarity_transposed:
            best_ref = int(np.argmax(seg_sim))
            best_matches.append(None if seg_sim[best_ref] < threshold else best_ref)

        # Accumulate matched time (seconds) per reference.
        matched_time = [0] * len(reference_pathes)
        for match in best_matches:
            if match is not None:
                matched_time[match] += seg_duration

        return matched_time
requirements.txt CHANGED
@@ -12,4 +12,5 @@ requests==2.32.3
12
  google-auth==2.38.0
13
  google-auth-oauthlib==1.2.1
14
  google-auth-httplib2==0.2.0
 
15
 
 
12
  google-auth==2.38.0
13
  google-auth-oauthlib==1.2.1
14
  google-auth-httplib2==0.2.0
15
+ faster-whisper
16
 
transcription.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

from faster_whisper import WhisperModel


class TranscriptionMaker():
    """Transcribe audio files with faster-whisper.

    Writes one text file per input audio file, named
    <audio filename>_transcription.txt, into *output_dir*.
    """

    def __init__(self, output_dir=os.path.abspath("/tmp/data/transcriptions")):
        # CPU-only "base" Whisper model, loaded once per instance.
        self.model = WhisperModel("base", device="cpu")
        self.output_dir = output_dir
        try:
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)
        except OSError as e:
            print(f"Error creating directory {self.output_dir}: {e}")
            raise

    def create_transcription(self, audio_path):
        """Transcribe *audio_path* and return the path of the written text file.

        Raises FileNotFoundError when the audio file does not exist; any
        other failure is logged and re-raised.
        """
        try:
            if not os.path.isfile(audio_path):
                raise FileNotFoundError(f"The specified audio file does not exist: {audio_path}")

            segments, info = self.model.transcribe(audio_path)
            # Collect each recognized segment as a start/end/text record.
            results = [
                {"start": seg.start, "end": seg.end, "text": seg.text}
                for seg in segments
            ]

            # Emit one "[start - end] text" line per segment.
            output_file = os.path.join(self.output_dir, os.path.basename(audio_path) + "_transcription.txt")
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    for result in results:
                        f.write(f"[{result['start']:.2f}s - {result['end']:.2f}s] {result['text']}\n")
            except OSError as e:
                print(f"Error writing transcription file: {e}")
                raise
            return output_file
        except FileNotFoundError as e:
            print(f"Error: {e}")
            raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise