rein0421 committed on
Commit
77591a2
·
verified ·
1 Parent(s): 6cfbac7

Upload 10 files

Browse files
Files changed (6) hide show
  1. .gitattributes +37 -37
  2. Dockerfile +22 -22
  3. README.md +10 -10
  4. app.py +116 -94
  5. process.py +184 -184
  6. requirements.txt +19 -19
.gitattributes CHANGED
@@ -1,37 +1,37 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- segment_0[[:space:]](2).wav filter=lfs diff=lfs merge=lfs -text
37
- sample.wav filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ segment_0[[:space:]](2).wav filter=lfs diff=lfs merge=lfs -text
37
+ sample.wav filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,23 +1,23 @@
1
- FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
2
-
3
- # タイムゾーン設定
4
- RUN ln -sf /usr/share/zoneinfo/Asia/Tokyo /etc/localtime
5
-
6
- # Python3、pip、ffmpegをインストール
7
- RUN apt-get update && \
8
- apt-get install -y python3 python3-pip ffmpeg && \
9
- rm -rf /var/lib/apt/lists/*
10
-
11
- # pipを最新版にアップグレード
12
- RUN python3 -m pip install --upgrade pip
13
-
14
- WORKDIR /app
15
-
16
- # requirements.txt をコンテナ内にコピーして、必要なパッケージをインストール
17
- COPY requirements.txt /app/
18
-
19
- RUN python3 -m pip install --no-cache-dir -r requirements.txt
20
-
21
- COPY . .
22
-
23
  CMD ["python3", "app.py"]
 
1
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
2
+
3
+ # タイムゾーン設定
4
+ RUN ln -sf /usr/share/zoneinfo/Asia/Tokyo /etc/localtime
5
+
6
+ # Python3、pip、ffmpegをインストール
7
+ RUN apt-get update && \
8
+ apt-get install -y python3 python3-pip ffmpeg && \
9
+ rm -rf /var/lib/apt/lists/*
10
+
11
+ # pipを最新版にアップグレード
12
+ RUN python3 -m pip install --upgrade pip
13
+
14
+ WORKDIR /app
15
+
16
+ # requirements.txt をコンテナ内にコピーして、必要なパッケージをインストール
17
+ COPY requirements.txt /app/
18
+
19
+ RUN python3 -m pip install --no-cache-dir -r requirements.txt
20
+
21
+ COPY . .
22
+
23
  CMD ["python3", "app.py"]
README.md CHANGED
@@ -1,10 +1,10 @@
1
- ---
2
- title: JusTalk
3
- emoji: ⚡
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: JusTalk
3
+ emoji: ⚡
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,95 +1,117 @@
1
- from flask import Flask, request, jsonify, render_template, send_from_directory
2
- import base64
3
- from pydub import AudioSegment # 変換用にpydubをインポート
4
- import os
5
- import shutil
6
- from process import AudioProcessor
7
-
8
- process=AudioProcessor()
9
- app = Flask(__name__)
10
-
11
- users = ["ccc"]
12
-
13
- # トップページ(テンプレート: index.html)
14
- @app.route('/')
15
- @app.route('/index', methods=['GET', 'POST'])
16
- def index():
17
- return render_template('index.html', users = users)
18
-
19
- # フィードバック画面(テンプレート: feedback.html)
20
- @app.route('/feedback', methods=['GET', 'POST'])
21
- def feedback():
22
- return render_template('feedback.html')
23
-
24
-
25
- # 会話詳細画面(テンプレート: talkDetail.html)
26
- @app.route('/talk_detail', methods=['GET', 'POST'])
27
- def talk_detail():
28
- return render_template('talkDetail.html')
29
-
30
- # 音声登録画面(テンプレート: userRegister.html)
31
- @app.route('/userregister', methods=['GET', 'POST'])
32
- def userregister():
33
- return render_template('userRegister.html')
34
-
35
- # 音声アップロード&解析エンドポイント
36
- @app.route('/upload_audio', methods=['POST'])
37
- def upload_audio():
38
- try:
39
- data = request.get_json()
40
- if not data or 'audio_data' not in data or 'name' not in data:
41
- return jsonify({"error": "音声データまたは名前がありません"}), 400
42
-
43
- # Base64デコードして音声バイナリを取得
44
- audio_binary = base64.b64decode(data['audio_data'])
45
- name = data['name'] # 名前を取得
46
- audio_dir = "/tmp/data"
47
- os.makedirs(audio_dir, exist_ok=True)
48
- # 固定ファイル名(必要に応じて generate_filename() で一意のファイル名に変更可能)
49
- audio_path = os.path.join(audio_dir, f"{name}.wav")
50
-
51
- with open(audio_path, 'wb') as f:
52
- f.write(audio_binary)
53
-
54
- # 参照音声ファイルのパスを指定(sample.wav を正しい場所に配置すること)
55
- reference_audio = os.path.abspath("/tmp/data/base_audio/", f"{name}.wav")
56
-
57
- if not os.path.exists(reference_audio):
58
- return jsonify({"error": "参照音声ファイルが見つかりません", "details": reference_audio}), 500
59
-
60
- # 音声解析:参照音声とアップロードされた音声との類似度をセグメント毎に計算
61
- # threshold の値は調整可能です(例: 0.1)
62
- if(users.length > 2):
63
- print("複数人の場合の処理")
64
- else:
65
- matched_time, unmatched_time = process.process_audio(reference_audio, audio_path, threshold=0.05)
66
- total_time = matched_time + unmatched_time
67
- rate = (matched_time / total_time) * 100 if total_time > 0 else 0
68
-
69
-
70
- return jsonify({"rate": rate}), 200
71
- except Exception as e:
72
- print("Error in /upload_audio:", str(e))
73
- return jsonify({"error": "サーバーエラー", "details": str(e)}), 500
74
- @app.route('/upload_base_audio', methods=['POST'])
75
- def upload_base_audio():
76
- try:
77
- data = request.get_json()
78
- if not data or 'audio_data' not in data or 'name' not in data:
79
- return jsonify({"error": "音声データまたは名前がありません"}), 400
80
- name = data['name'] # 名前を取得
81
- users.append(name)
82
-
83
- audio_path=process.save_audio_from_base64(
84
- base64_audio=data['audio_data'], # 音声データ
85
- output_dir= "/tmp/data", #保存先
86
- output_filename=f"{name}.wav" # 固定ファイル名(必要に応じて generate_filename() で一意のファイル名に変更可能)
87
- )
88
- return jsonify({"state": "Registration Success!", "path": audio_path}), 200
89
- except Exception as e:
90
- print("Error in /upload_base_audio:", str(e))
91
- return jsonify({"error": "サーバーエラー", "details": str(e)}), 500
92
-
93
- if __name__ == '__main__':
94
- port = int(os.environ.get("PORT", 7860))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  app.run(debug=True, host="0.0.0.0", port=port)
 
1
+ from flask import Flask, request, jsonify, render_template, send_from_directory
2
+ import base64
3
+ from pydub import AudioSegment # 変換用にpydubをインポート
4
+ import os
5
+ import shutil
6
+ from process import AudioProcessor
7
+
8
+ process=AudioProcessor()
9
+ app = Flask(__name__)
10
+
11
+ users = []
12
+
13
+ # トップページ(テンプレート: index.html)
14
+ @app.route('/')
15
+ @app.route('/index', methods=['GET', 'POST'])
16
+ def index():
17
+ return render_template('index.html', users = users)
18
+
19
+ # フィードバック画面(テンプレート: feedback.html)
20
+ @app.route('/feedback', methods=['GET', 'POST'])
21
+ def feedback():
22
+ return render_template('feedback.html')
23
+
24
+
25
+ # 会話詳細画面(テンプレート: talkDetail.html)
26
+ @app.route('/talk_detail', methods=['GET', 'POST'])
27
+ def talk_detail():
28
+ return render_template('talkDetail.html')
29
+
30
+ # 音声登録画面(テンプレート: userRegister.html)
31
+ @app.route('/userregister', methods=['GET', 'POST'])
32
+ def userregister():
33
+ return render_template('userRegister.html')
34
+ #人数確認
35
+ @app.route('/confirm', methods=['GET']) # 基本的にGETで取得する想定なので、GETのみに変更
36
+ def confirm():
37
+ return jsonify({'members': users}), 200
38
+
39
+
40
+
41
+ # 音声アップロード&解析エンドポイント
42
+ @app.route('/upload_audio', methods=['POST'])
43
+ def upload_audio():
44
+ try:
45
+ data = request.get_json()
46
+ # name か users のいずれかが必須。どちらも無い場合はエラー
47
+ if not data or 'audio_data' not in data or ('name' not in data and 'users' not in data):
48
+ return jsonify({"error": "音声データまたは名前がありません"}), 400
49
+
50
+ # Base64デコードして音声バイナリを取得
51
+ audio_binary = base64.b64decode(data['audio_data'])
52
+
53
+
54
+
55
+ upload_name = 'tmp'
56
+ audio_dir = "/tmp/data"
57
+ os.makedirs(audio_dir, exist_ok=True)
58
+ audio_path = os.path.join(audio_dir, f"{upload_name}.wav")
59
+ with open(audio_path, 'wb') as f:
60
+ f.write(audio_binary)
61
+ print(users)
62
+ # 各ユーザーの参照音声ファイルのパスをリストに格納
63
+ reference_paths = []
64
+ base_audio_dir = "/tmp/data/base_audio"
65
+ for user in users:
66
+ ref_path = os.path.abspath(os.path.join(base_audio_dir, f"{user}.wav"))
67
+ if not os.path.exists(ref_path):
68
+ return jsonify({"error": "参照音声ファイルが見つかりません", "details": ref_path}), 500
69
+ reference_paths.append(ref_path)
70
+
71
+ # 複数人の場合は参照パスのリストを、1人の場合は単一のパスを渡す
72
+ if len(users) > 1:
73
+ print("複数人の場合の処理")
74
+ matched_time, unmatched_time = process.process_multi_audio(reference_paths, audio_path, threshold=0.05)
75
+ else:
76
+ matched_time, unmatched_time = process.process_audio(reference_paths[0], audio_path, threshold=0.05)
77
+
78
+ total_time = matched_time + unmatched_time
79
+ rate = (matched_time / total_time) * 100 if total_time > 0 else 0
80
+ return jsonify({"rate": rate}), 200
81
+ except Exception as e:
82
+ print("Error in /upload_audio:", str(e))
83
+ return jsonify({"error": "サーバーエラー", "details": str(e)}), 500
84
+ @app.route('/reset', methods=['GET'])
85
+ def reset():
86
+ global users
87
+ users=[]
88
+ return 200
89
+ @app.route('/upload_base_audio', methods=['POST'])
90
+ def upload_base_audio():
91
+ global users#グローバル変数を編集できるようにする
92
+ try:
93
+ data = request.get_json()
94
+ if not data or 'audio_data' not in data or 'name' not in data:
95
+ return jsonify({"error": "音声データまたは名前がありません"}), 400
96
+ name = data['name'] # 名前を取得
97
+ print(name)
98
+
99
+
100
+ users.append(name)
101
+ users=list(set(users))#重複排除
102
+ print(users)
103
+
104
+
105
+ audio_path=process.save_audio_from_base64(
106
+ base64_audio=data['audio_data'], # 音声データ
107
+ output_dir= "/tmp/data/base_audio", #保存先
108
+ output_filename=f"{name}.wav" # 固定ファイル名(必要に応じて generate_filename() で一意のファイル名に変更可能)
109
+ )
110
+ return jsonify({"state": "Registration Success!", "path": audio_path}), 200
111
+ except Exception as e:
112
+ print("Error in /upload_base_audio:", str(e))
113
+ return jsonify({"error": "サーバーエラー", "details": str(e)}), 500
114
+
115
+ if __name__ == '__main__':
116
+ port = int(os.environ.get("PORT", 7860))
117
  app.run(debug=True, host="0.0.0.0", port=port)
process.py CHANGED
@@ -1,185 +1,185 @@
1
-
2
- import os
3
- import shutil
4
- import numpy as np
5
- import string
6
- import random
7
- from datetime import datetime
8
- from pyannote.audio import Model, Inference
9
- from pydub import AudioSegment
10
- import base64
11
- import binascii
12
-
13
- class AudioProcessor():
14
- def __init__(self,cache_dir = "/tmp/hf_cache"):
15
- hf_token = os.environ.get("HF")
16
- if hf_token is None:
17
- raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
18
- os.makedirs(cache_dir, exist_ok=True)
19
- # pyannote モデルの読み込み
20
- model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
21
- self.inference = Inference(model)
22
-
23
-
24
- def cosine_similarity(self,vec1, vec2):
25
- vec1 = vec1 / np.linalg.norm(vec1)
26
- vec2 = vec2 / np.linalg.norm(vec2)
27
- return np.dot(vec1, vec2)
28
-
29
- def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
30
- # 出力先ディレクトリが存在していれば中身をクリアする
31
- if os.path.exists(target_path):
32
- for file in os.listdir(target_path):
33
- file_path = os.path.join(target_path, file)
34
- if os.path.isfile(file_path):
35
- os.remove(file_path)
36
- else:
37
- os.makedirs(target_path, exist_ok=True)
38
-
39
- base_sound = AudioSegment.from_file(path)
40
- duration_ms = len(base_sound)
41
- seg_duration_ms = int(seg_duration * 1000)
42
-
43
- for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
44
- end = min(start + seg_duration_ms, duration_ms)
45
- segment = base_sound[start:end]
46
- # セグメントが指定長さに満たない場合、無音でパディングする
47
- if len(segment) < seg_duration_ms:
48
- silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment)))
49
- segment = segment + silence
50
-
51
- segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
52
-
53
- return target_path, duration_ms
54
-
55
-
56
- def calculate_similarity(self,path1, path2):
57
- embedding1 = self.inference(path1)
58
- embedding2 = self.inference(path2)
59
- return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
60
-
61
- def generate_random_string(self,length):
62
- letters = string.ascii_letters + string.digits
63
- return ''.join(random.choice(letters) for i in range(length))
64
-
65
- def generate_filename(self,random_length):
66
- random_string = self.generate_random_string(random_length)
67
- current_time = datetime.now().strftime("%Y%m%d%H%M%S")
68
- filename = f"{current_time}_{random_string}.wav"
69
- return filename
70
-
71
- def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
72
- # 出力先ディレクトリの中身をクリアする
73
- if os.path.exists(output_folder):
74
- for file in os.listdir(output_folder):
75
- file_path = os.path.join(output_folder, file)
76
- if os.path.isfile(file_path):
77
- os.remove(file_path)
78
- else:
79
- os.makedirs(output_folder, exist_ok=True)
80
-
81
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
82
-
83
- matched_time_ms = 0
84
- for file in sorted(os.listdir(segmented_path)):
85
- segment_file = os.path.join(segmented_path, file)
86
- similarity = self.calculate_similarity(segment_file, reference_path)
87
- if similarity > threshold:
88
- shutil.copy(segment_file, output_folder)
89
- matched_time_ms += len(AudioSegment.from_file(segment_file))
90
-
91
- unmatched_time_ms = total_duration_ms - matched_time_ms
92
- return matched_time_ms, unmatched_time_ms
93
-
94
-
95
- def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
96
- # 出力先ディレクトリの中身をクリアする
97
- if os.path.exists(output_folder):
98
- for file in os.listdir(output_folder):
99
- file_path = os.path.join(output_folder, file)
100
- if os.path.isfile(file_path):
101
- os.remove(file_path)
102
- else:
103
- os.makedirs(output_folder, exist_ok=True)
104
-
105
- # 入力音声をセグメントに分割
106
- segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
107
- segment_files = sorted(os.listdir(segmented_path))
108
- num_segments = len(segment_files)
109
-
110
- # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
111
- similarity = []
112
- for reference_path in reference_pathes:
113
- ref_similarity = []
114
- for file in segment_files:
115
- segment_file = os.path.join(segmented_path, file)
116
- sim = self.calculate_similarity(segment_file, reference_path)
117
- ref_similarity.append(sim)
118
- similarity.append(ref_similarity)
119
-
120
- # 転置行列を作成 (rows: segment, columns: reference)
121
- similarity_transposed = []
122
- for seg_idx in range(num_segments):
123
- seg_sim = []
124
- for ref_idx in range(len(reference_pathes)):
125
- seg_sim.append(similarity[ref_idx][seg_idx])
126
- similarity_transposed.append(seg_sim)
127
-
128
- # 各セグメントについて、最も高い類似度のリファレンスを選択
129
- best_matches = []
130
- for seg_sim in similarity_transposed:
131
- best_ref = np.argmax(seg_sim) # 最も類似度の高いリファレンスのインデックス
132
- # 閾値チェック (必要に応じて)
133
- if seg_sim[best_ref] < threshold:
134
- best_matches.append(None) # 閾値未満の場合はマッチなしとする
135
- else:
136
- best_matches.append(best_ref)
137
-
138
- # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
139
- matched_time = [0] * len(reference_pathes)
140
- for match in best_matches:
141
- if match is not None:
142
- matched_time[match] += seg_duration
143
-
144
- return matched_time
145
-
146
-
147
- def save_audio_from_base64(self,base64_audio,output_dir,output_filename,temp_format='webm'):
148
- try:
149
- # Base64デコードして音声バイナリを取得
150
- try:
151
- audio_binary = base64.b64decode(base64_audio)
152
- except binascii.Error:
153
- raise ValueError("Invalid Base64 input data")
154
-
155
- # 保存するディレクトリを作成
156
- os.makedirs(output_dir,exist_ok=True)
157
-
158
- # 一時ファイルに保存(実際の形式は WebM などと仮定)
159
- temp_audio_path = os.path.join(output_dir,"temp_audio")
160
- try:
161
- with open(temp_audio_path,'wb') as f:
162
- f.write(audio_binary)
163
-
164
- # pydub を使って一時ファイルを WAV に変換
165
- # ※ここでは WebM 形式と仮定していますが、実際の形式に合わせて format の指定を変更してください
166
- try:
167
- audio = AudioSegment.from_file(temp_audio_path,format=temp_format)
168
- except Exception as e:
169
- audio = AudioSegment.from_file(temp_audio_path) #形式が不明な場合は自動判別させる(ただし変換できない場合もあり)
170
-
171
- # 音声ファイルを保存
172
- wav_audio_path = os.path.join(output_dir,output_filename)
173
- audio.export(wav_audio_path,format="wav")
174
- finally:
175
- #一時ファイルを削除
176
- if os.path.exists(temp_audio_path):
177
- os.remove(temp_audio_path)
178
- return wav_audio_path
179
- except ValueError as e:
180
- print(f"Value Error: {e}")
181
- except FileNotFoundError as e:
182
- print(f"File Not Found Error: {e}")
183
- except Exception as e:
184
- print(f"Unexpected Error: {e}")
185
  return None
 
1
+
2
+ import os
3
+ import shutil
4
+ import numpy as np
5
+ import string
6
+ import random
7
+ from datetime import datetime
8
+ from pyannote.audio import Model, Inference
9
+ from pydub import AudioSegment
10
+ import base64
11
+ import binascii
12
+
13
+ class AudioProcessor():
14
+ def __init__(self,cache_dir = "/tmp/hf_cache"):
15
+ hf_token = os.environ.get("HF")
16
+ if hf_token is None:
17
+ raise ValueError("HUGGINGFACE_HUB_TOKEN が設定されていません。")
18
+ os.makedirs(cache_dir, exist_ok=True)
19
+ # pyannote モデルの読み込み
20
+ model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token, cache_dir=cache_dir)
21
+ self.inference = Inference(model)
22
+
23
+
24
+ def cosine_similarity(self,vec1, vec2):
25
+ vec1 = vec1 / np.linalg.norm(vec1)
26
+ vec2 = vec2 / np.linalg.norm(vec2)
27
+ return np.dot(vec1, vec2)
28
+
29
+ def segment_audio(self, path, target_path='/tmp/setup_voice', seg_duration=1.0):
30
+ # 出力先ディレクトリが存在していれば中身をクリアする
31
+ if os.path.exists(target_path):
32
+ for file in os.listdir(target_path):
33
+ file_path = os.path.join(target_path, file)
34
+ if os.path.isfile(file_path):
35
+ os.remove(file_path)
36
+ else:
37
+ os.makedirs(target_path, exist_ok=True)
38
+
39
+ base_sound = AudioSegment.from_file(path)
40
+ duration_ms = len(base_sound)
41
+ seg_duration_ms = int(seg_duration * 1000)
42
+
43
+ for i, start in enumerate(range(0, duration_ms, seg_duration_ms)):
44
+ end = min(start + seg_duration_ms, duration_ms)
45
+ segment = base_sound[start:end]
46
+ # セグメントが指定長さに満たない場合、無音でパディングする
47
+ if len(segment) < seg_duration_ms:
48
+ silence = AudioSegment.silent(duration=(seg_duration_ms - len(segment)))
49
+ segment = segment + silence
50
+
51
+ segment.export(os.path.join(target_path, f'{i}.wav'), format="wav")
52
+
53
+ return target_path, duration_ms
54
+
55
+
56
+ def calculate_similarity(self,path1, path2):
57
+ embedding1 = self.inference(path1)
58
+ embedding2 = self.inference(path2)
59
+ return float(self.cosine_similarity(embedding1.data.flatten(), embedding2.data.flatten()))
60
+
61
+ def generate_random_string(self,length):
62
+ letters = string.ascii_letters + string.digits
63
+ return ''.join(random.choice(letters) for i in range(length))
64
+
65
+ def generate_filename(self,random_length):
66
+ random_string = self.generate_random_string(random_length)
67
+ current_time = datetime.now().strftime("%Y%m%d%H%M%S")
68
+ filename = f"{current_time}_{random_string}.wav"
69
+ return filename
70
+
71
+ def process_audio(self, reference_path, input_path, output_folder='/tmp/data/matched_segments', seg_duration=1.0, threshold=0.5):
72
+ # 出力先ディレクトリの中身をクリアする
73
+ if os.path.exists(output_folder):
74
+ for file in os.listdir(output_folder):
75
+ file_path = os.path.join(output_folder, file)
76
+ if os.path.isfile(file_path):
77
+ os.remove(file_path)
78
+ else:
79
+ os.makedirs(output_folder, exist_ok=True)
80
+
81
+ segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
82
+
83
+ matched_time_ms = 0
84
+ for file in sorted(os.listdir(segmented_path)):
85
+ segment_file = os.path.join(segmented_path, file)
86
+ similarity = self.calculate_similarity(segment_file, reference_path)
87
+ if similarity > threshold:
88
+ shutil.copy(segment_file, output_folder)
89
+ matched_time_ms += len(AudioSegment.from_file(segment_file))
90
+
91
+ unmatched_time_ms = total_duration_ms - matched_time_ms
92
+ return matched_time_ms, unmatched_time_ms
93
+
94
+
95
+ def process_multi_audio(self, reference_pathes, input_path, output_folder='/tmp/data/matched_multi_segments', seg_duration=1.0, threshold=0.5):
96
+ # 出力先ディレクトリの中身をクリアする
97
+ if os.path.exists(output_folder):
98
+ for file in os.listdir(output_folder):
99
+ file_path = os.path.join(output_folder, file)
100
+ if os.path.isfile(file_path):
101
+ os.remove(file_path)
102
+ else:
103
+ os.makedirs(output_folder, exist_ok=True)
104
+
105
+ # 入力音声をセグメントに分割
106
+ segmented_path, total_duration_ms = self.segment_audio(input_path, seg_duration=seg_duration)
107
+ segment_files = sorted(os.listdir(segmented_path))
108
+ num_segments = len(segment_files)
109
+
110
+ # 各リファレンスごとにセグメントとの類似度を計算し、行列 (rows: reference, columns: segment) を作成
111
+ similarity = []
112
+ for reference_path in reference_pathes:
113
+ ref_similarity = []
114
+ for file in segment_files:
115
+ segment_file = os.path.join(segmented_path, file)
116
+ sim = self.calculate_similarity(segment_file, reference_path)
117
+ ref_similarity.append(sim)
118
+ similarity.append(ref_similarity)
119
+
120
+ # 転置行列を作成 (rows: segment, columns: reference)
121
+ similarity_transposed = []
122
+ for seg_idx in range(num_segments):
123
+ seg_sim = []
124
+ for ref_idx in range(len(reference_pathes)):
125
+ seg_sim.append(similarity[ref_idx][seg_idx])
126
+ similarity_transposed.append(seg_sim)
127
+
128
+ # 各セグメントについて、最も高い類似度のリファレンスを選択
129
+ best_matches = []
130
+ for seg_sim in similarity_transposed:
131
+ best_ref = np.argmax(seg_sim) # 最も類似度の高いリファレンスのインデックス
132
+ # 閾値チェック (必要に応じて)
133
+ if seg_sim[best_ref] < threshold:
134
+ best_matches.append(None) # 閾値未満の場合はマッチなしとする
135
+ else:
136
+ best_matches.append(best_ref)
137
+
138
+ # 各リファレンスごとに一致時間を集計 (セグメントごとの長さ seg_duration を加算)
139
+ matched_time = [0] * len(reference_pathes)
140
+ for match in best_matches:
141
+ if match is not None:
142
+ matched_time[match] += seg_duration
143
+
144
+ return matched_time
145
+
146
+
147
+ def save_audio_from_base64(self,base64_audio,output_dir,output_filename,temp_format='webm'):
148
+ try:
149
+ # Base64デコードして音声バイナリを取得
150
+ try:
151
+ audio_binary = base64.b64decode(base64_audio)
152
+ except binascii.Error:
153
+ raise ValueError("Invalid Base64 input data")
154
+
155
+ # 保存するディレクトリを作成
156
+ os.makedirs(output_dir,exist_ok=True)
157
+
158
+ # 一時ファイルに保存(実際の形式は WebM などと仮定)
159
+ temp_audio_path = os.path.join(output_dir,"temp_audio")
160
+ try:
161
+ with open(temp_audio_path,'wb') as f:
162
+ f.write(audio_binary)
163
+
164
+ # pydub を使って一時ファイルを WAV に変換
165
+ # ※ここでは WebM 形式と仮定していますが、実際の形式に合わせて format の指定を変更してください
166
+ try:
167
+ audio = AudioSegment.from_file(temp_audio_path,format=temp_format)
168
+ except Exception as e:
169
+ audio = AudioSegment.from_file(temp_audio_path) #形式が不明な場合は自動判別させる(ただし変換できない場合もあり)
170
+
171
+ # 音声ファイルを保存
172
+ wav_audio_path = os.path.join(output_dir,output_filename)
173
+ audio.export(wav_audio_path,format="wav")
174
+ finally:
175
+ #一時ファイルを削除
176
+ if os.path.exists(temp_audio_path):
177
+ os.remove(temp_audio_path)
178
+ return wav_audio_path
179
+ except ValueError as e:
180
+ print(f"Value Error: {e}")
181
+ except FileNotFoundError as e:
182
+ print(f"File Not Found Error: {e}")
183
+ except Exception as e:
184
+ print(f"Unexpected Error: {e}")
185
  return None
requirements.txt CHANGED
@@ -1,19 +1,19 @@
1
- Flask==2.2.5
2
- Flask-WTF
3
- pyannote.audio==2.1.1
4
- numpy==1.23.5
5
- pydub==0.25.1
6
- matplotlib==3.6.3
7
- python-dotenv
8
- uwsgi
9
- Flask-SQLAlchemy==3.0.5
10
- PyMySQL
11
- Flask-Login==0.6.3
12
- requests==2.32.3
13
- google-auth==2.38.0
14
- google-auth-oauthlib==1.2.1
15
- google-auth-httplib2==0.2.0
16
- faster-whisper
17
- Flask-Migrate
18
- requests
19
-
 
1
+ Flask==2.2.5
2
+ Flask-WTF
3
+ pyannote.audio==2.1.1
4
+ numpy==1.23.5
5
+ pydub==0.25.1
6
+ matplotlib==3.6.3
7
+ python-dotenv
8
+ uwsgi
9
+ Flask-SQLAlchemy==3.0.5
10
+ PyMySQL
11
+ Flask-Login==0.6.3
12
+ requests==2.32.3
13
+ google-auth==2.38.0
14
+ google-auth-oauthlib==1.2.1
15
+ google-auth-httplib2==0.2.0
16
+ faster-whisper
17
+ Flask-Migrate
18
+ requests
19
+