buletomato25 commited on
Commit
f009a0b
·
2 Parent(s): 4fe2183 1e4ae9f
Files changed (4) hide show
  1. app.py +2 -2
  2. process.py +145 -145
  3. requirements.txt +1 -0
  4. transcription.py +47 -0
app.py CHANGED
@@ -19,7 +19,7 @@ from google.auth.transport import requests as google_requests
19
  # Hugging Face のトークン取得(環境変数 HF に設定)
20
  #hf_token = os.environ.get("HF")
21
  load_dotenv()
22
- hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
23
  if hf_token is None:
24
  raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
25
 
@@ -39,7 +39,7 @@ app.config['SECRET_KEY'] = os.urandom(24)
39
  os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
40
  GOOGLE_CLIENT_ID = "228160683186-6u7986qsfhcv3kd9iqtv08iphpl4gdk2.apps.googleusercontent.com"
41
  GOOGLE_CLIENT_SECRET = "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"
42
- REDIRECT_URI = "https://huggingface.co/spaces/Justtalk/JusTalk/callbacck"
43
 
44
  flow = Flow.from_client_secrets_file(
45
  'client_secret.json',
 
19
  # Hugging Face のトークン取得(環境変数 HF に設定)
20
  #hf_token = os.environ.get("HF")
21
  load_dotenv()
22
+ hf_token = os.getenv("HF")
23
  if hf_token is None:
24
  raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
25
 
 
39
  os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
40
  GOOGLE_CLIENT_ID = "228160683186-6u7986qsfhcv3kd9iqtv08iphpl4gdk2.apps.googleusercontent.com"
41
  GOOGLE_CLIENT_SECRET = "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"
42
+ REDIRECT_URI = "https://huggingface.co/spaces/Justtalk/JusTalk/callback"
43
 
44
  flow = Flow.from_client_secrets_file(
45
  'client_secret.json',
process.py CHANGED
@@ -1,145 +1,145 @@
1
-
2
- import os
3
- import shutil
4
- import numpy as np
5
- import string
6
- import random
7
- from datetime import datetime
8
- from pyannote.audio import Model, Inference
9
- from pydub import AudioSegment
10
-
11
- class AudioProcessor():
12
- def __init__(self,cache_dir = "/tmp/hf_cache"):
13
- hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
14
- if hf_token is None:
15
- raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
16
- os.makedirs(cache_dir, exist_ok=True)
17
- # pyannote モデルの読み込み
18
- model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
19
- self.inference = Inference(model)
20
-
21
-
22
- def cosine_similarity(self,vec1, vec2):
23
- vec1 = vec1 / np.linalg.norm(vec1)
24
- vec2 = vec2 / np.linalg.norm(vec2)
25
- return np.dot(vec1, vec2)
26
-
27
- def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
28
- # 出力先ディレクトリが存在していれば中身をクリアする
29
- if os.path.exists(target_path):
30
- for file in os.listdir(target_path):
31
- file_path = os.path.join(target_path, file)
32
- if os.path.isfile(file_path):
33
- os.remove(file_path)
34
- else:
35
- os.makedirs(target_path, exist_ok=True)
36
-
37
- base_sound = AudioSegment.from_file(path)
38
- duration_ms = len(base_sound)
39
- seg_duration_ms = int(seg_duration * 1000)
40
-
41
- for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
42
- end = min(start + seg_duration_ms, duration_ms)
43
- segment = base_sound[start:end]
44
- # セグメントが指定長さに満たない場合、無音でパディングする
45
- if len(segment) < seg_duration_ms:
46
- silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment)))
47
- segment = segment + silence
48
-
49
- segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
50
-
51
- return target_path, duration_ms
52
-
53
-
54
- def calculate_similarity(self,path1, path2):
55
- embedding1 = self.inference(path1)
56
- embedding2 = self.inference(path2)
57
- return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
58
-
59
- def generate_random_string(self,length):
60
- letters = string.ascii_letters + string.digits
61
- return ''.join(random.choice(letters) for i in range(length))
62
-
63
- def generate_filename(self,random_length):
64
- random_string = self.generate_random_string(random_length)
65
- current_time = datetime.now().strftime("%Y%m%d%H%M%S")
66
- filename = f"{current_time}_{random_string}.wav"
67
- return filename
68
-
69
- def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
70
- # 出力先ディレクトリの中身をクリアする
71
- if os.path.exists(output_folder):
72
- for file in os.listdir(output_folder):
73
- file_path = os.path.join(output_folder, file)
74
- if os.path.isfile(file_path):
75
- os.remove(file_path)
76
- else:
77
- os.makedirs(output_folder, exist_ok=True)
78
-
79
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
80
-
81
- matched_time_ms = 0
82
- for file in sorted(os.listdir(segmented_path)):
83
- segment_file = os.path.join(segmented_path, file)
84
- similarity = self.calculate_similarity(segment_file, reference_path)
85
- if similarity > threshold:
86
- shutil.copy(segment_file, output_folder)
87
- matched_time_ms += len(AudioSegment.from_file(segment_file))
88
-
89
- unmatched_time_ms = total_duration_ms - matched_time_ms
90
- return matched_time_ms, unmatched_time_ms
91
-
92
-
93
- def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
94
- # 出力先ディレクトリの中身をクリアする
95
- if os.path.exists(output_folder):
96
- for file in os.listdir(output_folder):
97
- file_path = os.path.join(output_folder, file)
98
- if os.path.isfile(file_path):
99
- os.remove(file_path)
100
- else:
101
- os.makedirs(output_folder, exist_ok=True)
102
-
103
- # 入力音声をセグメントに分割
104
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
105
- segment_files = sorted(os.listdir(segmented_path))
106
- num_segments = len(segment_files)
107
-
108
- # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
109
- similarity = []
110
- for reference_path in reference_pathes:
111
- ref_similarity = []
112
- for file in segment_files:
113
- segment_file = os.path.join(segmented_path, file)
114
- sim = self.calculate_similarity(segment_file, reference_path)
115
- ref_similarity.append(sim)
116
- similarity.append(ref_similarity)
117
-
118
- # 転置行列を作成 (rows: segment, columns: reference)
119
- similarity_transposed = []
120
- for seg_idx in range(num_segments):
121
- seg_sim = []
122
- for ref_idx in range(len(reference_pathes)):
123
- seg_sim.append(similarity[ref_idx][seg_idx])
124
- similarity_transposed.append(seg_sim)
125
-
126
- # 各セグメントについて、最も高い類似度のリファレンスを選択
127
- best_matches = []
128
- for seg_sim in similarity_transposed:
129
- best_ref = np.argmax(seg_sim) # 最も類似度の高いリファレンスのインデックス
130
- # 閾値チェック (必要に応じて)
131
- if seg_sim[best_ref] < threshold:
132
- best_matches.append(None) # 閾値未満の場合はマッチなしとする
133
- else:
134
- best_matches.append(best_ref)
135
-
136
- # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
137
- matched_time = [0] * len(reference_pathes)
138
- for match in best_matches:
139
- if match is not None:
140
- matched_time[match] += seg_duration
141
-
142
- return matched_time
143
-
144
-
145
-
 
import os
import shutil
import numpy as np
import string
import random
from datetime import datetime
from pyannote.audio import Model, Inference
from pydub import AudioSegment


class AudioProcessor():
    """Speaker-similarity utilities built on a pyannote embedding model.

    Splits audio into fixed-length segments, embeds each segment, and
    measures cosine similarity against one or more reference recordings.
    """

    def __init__(self, cache_dir="/tmp/hf_cache"):
        # The Hugging Face token is read from the HF environment variable.
        hf_token = os.environ.get("HF")
        if hf_token is None:
            # Fix: the old message named HUGGINGFACE_HUB_TOKEN although the
            # code reads HF, which pointed users at the wrong variable.
            raise ValueError("HF が設定されていません。")
        os.makedirs(cache_dir, exist_ok=True)
        # Load the pyannote embedding model once; reused by every method.
        model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
        self.inference = Inference(model)

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors (each normalized first)."""
        vec1 = vec1 / np.linalg.norm(vec1)
        vec2 = vec2 / np.linalg.norm(vec2)
        return np.dot(vec1, vec2)

    def _reset_dir(self, path):
        """Delete every regular file in *path*, creating the directory if absent.

        Shared by segment_audio / process_audio / process_multi_audio, which
        previously each carried a copy of this loop.
        """
        if os.path.exists(path):
            for file in os.listdir(path):
                file_path = os.path.join(path, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        else:
            os.makedirs(path, exist_ok=True)

    def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
        """Split *path* into seg_duration-second WAV files under *target_path*.

        The trailing segment is padded with silence to the full length.
        Returns (target_path, total_duration_ms).
        """
        self._reset_dir(target_path)

        base_sound = AudioSegment.from_file(path)
        duration_ms = len(base_sound)
        seg_duration_ms = int(seg_duration * 1000)

        for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
            end = min(start + seg_duration_ms, duration_ms)
            segment = base_sound[start:end]
            # Pad with silence so every exported file has identical length.
            if len(segment) < seg_duration_ms:
                segment = segment + AudioSegment.silent(duration=seg_duration_ms - len(segment))
            segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")

        return target_path, duration_ms

    def calculate_similarity(self, path1, path2):
        """Embed both audio files and return their cosine similarity as a float."""
        embedding1 = self.inference(path1)
        embedding2 = self.inference(path2)
        return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))

    def generate_random_string(self, length):
        """Return a random alphanumeric string of the given length."""
        letters = string.ascii_letters + string.digits
        return ''.join(random.choice(letters) for _ in range(length))

    def generate_filename(self, random_length):
        """Return a unique WAV filename of the form <timestamp>_<random>.wav."""
        random_string = self.generate_random_string(random_length)
        current_time = datetime.now().strftime("%Y%m%d%H%M%S")
        return f"{current_time}_{random_string}.wav"

    def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
        """Copy segments of *input_path* that resemble *reference_path* into *output_folder*.

        Returns (matched_time_ms, unmatched_time_ms).
        """
        self._reset_dir(output_folder)

        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)

        matched_time_ms = 0
        for file in sorted(os.listdir(segmented_path)):
            segment_file = os.path.join(segmented_path, file)
            if self.calculate_similarity(segment_file, reference_path) > threshold:
                shutil.copy(segment_file, output_folder)
                matched_time_ms += len(AudioSegment.from_file(segment_file))

        return matched_time_ms, total_duration_ms - matched_time_ms

    def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
        """Assign each segment to its most similar reference; return per-reference matched time.

        NOTE: the returned times are in seconds (seg_duration per matched
        segment), unlike process_audio which reports milliseconds.
        """
        self._reset_dir(output_folder)

        # Split the input audio into fixed-length segments.
        segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
        segment_files = sorted(os.listdir(segmented_path))

        # similarity[ref][seg] = similarity of segment seg to reference ref.
        similarity = []
        for reference_path in reference_pathes:
            ref_similarity = []
            for file in segment_files:
                segment_file = os.path.join(segmented_path, file)
                ref_similarity.append(self.calculate_similarity(segment_file, reference_path))
            similarity.append(ref_similarity)

        # Transpose to (rows: segment, columns: reference) — replaces the
        # previous hand-rolled double loop.
        similarity_transposed = [list(seg_sim) for seg_sim in zip(*similarity)]

        # For each segment pick the best reference, or None below threshold.
        best_matches = []
        for seg_sim in similarity_transposed:
            best_ref = int(np.argmax(seg_sim))
            best_matches.append(None if seg_sim[best_ref] < threshold else best_ref)

        # Accumulate matched time (seconds) per reference.
        matched_time = [0] * len(reference_pathes)
        for match in best_matches:
            if match is not None:
                matched_time[match] += seg_duration

        return matched_time
requirements.txt CHANGED
@@ -12,4 +12,5 @@ requests==2.32.3
12
  google-auth==2.38.0
13
  google-auth-oauthlib==1.2.1
14
  google-auth-httplib2==0.2.0
 
15
 
 
12
  google-auth==2.38.0
13
  google-auth-oauthlib==1.2.1
14
  google-auth-httplib2==0.2.0
15
+ faster-whisper
16
 
transcription.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

from faster_whisper import WhisperModel


class TranscriptionMaker():
    """Transcribe audio files with faster-whisper.

    Writes one text file per input audio file, named
    <audio filename>_transcription.txt, into *output_dir*.
    """

    def __init__(self, output_dir=os.path.abspath("/tmp/data/transcriptions")):
        # CPU-only "base" Whisper model, loaded once per instance.
        self.model = WhisperModel("base", device="cpu")
        self.output_dir = output_dir
        try:
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)
        except OSError as e:
            print(f"Error creating directory {self.output_dir}: {e}")
            raise

    def create_transcription(self, audio_path):
        """Transcribe *audio_path* and return the path of the written text file.

        Raises FileNotFoundError when the audio file does not exist; any
        other failure is logged and re-raised.
        """
        try:
            if not os.path.isfile(audio_path):
                raise FileNotFoundError(f"The specified audio file does not exist: {audio_path}")

            segments, info = self.model.transcribe(audio_path)
            # Collect each recognized segment as a start/end/text record.
            results = [
                {"start": seg.start, "end": seg.end, "text": seg.text}
                for seg in segments
            ]

            # Emit one "[start - end] text" line per segment.
            output_file = os.path.join(self.output_dir, os.path.basename(audio_path) + "_transcription.txt")
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    for result in results:
                        f.write(f"[{result['start']:.2f}s - {result['end']:.2f}s] {result['text']}\n")
            except OSError as e:
                print(f"Error writing transcription file: {e}")
                raise
            return output_file
        except FileNotFoundError as e:
            print(f"Error: {e}")
            raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise