Tweak
- app.py +44 -64
- lang_id.py +48 -0
- whisper.py +22 -0
app.py
CHANGED
@@ -3,57 +3,24 @@ import numpy as np
 import pandas as pd
 import torch
 import torchaudio
-import time
-from transformers import pipeline
-from speechbrain.inference.classifiers import EncoderClassifier
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-# Load the Whisper model and processor
-model_name = "openai/whisper-tiny"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-# Device setup (use the GPU when one is available)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
+from lang_id import identify_languages
+from whisper import transcribe
 
-
-# Load the speechbrain language-classification model
-language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+# # Load the Whisper model and processor
+# model_name = "openai/whisper-tiny"
+# processor = WhisperProcessor.from_pretrained(model_name)
+# model = WhisperForConditionalGeneration.from_pretrained(model_name)
+# # Device setup (use the GPU when one is available)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.to(device)
 
 # Variables holding the application state
 data = []
 current_chunk = []
 
-index_to_lang = {
-    0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
-    5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
-    10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
-    15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
-    20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
-    25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
-    30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
-    35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
-    40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
-    45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
-    50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
-    55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
-    60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
-    65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
-    70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
-    75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
-    80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
-    85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
-    90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
-    95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
-    100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
-    105: 'Yoruba', 106: 'Chinese'
-}
-lang_index_JA_EN = {
-    'ja': 45,
-    'en': 20,
-}
 SAMPLING_RATE = 16000
-CHUNK_DURATION = 5
+CHUNK_DURATION = 5  # chunks of 5 seconds
 
 
 def normalize_audio(audio):
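The 5-second chunking that feeds process_audio happens outside the hunks shown in this diff. As a rough sketch of how a sample buffer can be cut into CHUNK_DURATION-sized pieces, with the function name and slicing being illustrative assumptions rather than the Space's actual code:

import numpy as np

SAMPLING_RATE = 16000
CHUNK_DURATION = 5  # seconds per chunk, as in the commit

def split_into_chunks(buffer: np.ndarray) -> tuple[list, np.ndarray]:
    # Hypothetical helper: cut full 5-second chunks off the front and return
    # them plus the remainder, mirroring the `current_chunk = [total_chunk]`
    # carry-over inside process_audio.
    chunk_size = SAMPLING_RATE * CHUNK_DURATION
    n_full = len(buffer) // chunk_size
    chunks = [buffer[i * chunk_size:(i + 1) * chunk_size] for i in range(n_full)]
    return chunks, buffer[n_full * chunk_size:]

# 12 s of silence yields two 5 s chunks and a 2 s remainder.
chunks, rest = split_into_chunks(np.zeros(12 * SAMPLING_RATE, dtype=np.float32))
print(len(chunks), len(rest) / SAMPLING_RATE)  # -> 2 2.0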
@@ -77,7 +44,6 @@ def process_audio(audio):
     print(audio)
     sr, audio_data = audio
 
-
     print(audio_data.shape, audio_data.dtype)
     # Align the sampling rate first thing
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
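resample_audio itself is defined outside this hunk. A plausible torchaudio-based implementation matching the call signature used above, offered as an assumption rather than the code from this commit:

import numpy as np
import torch
import torchaudio.functional as F

def resample_audio(audio_data: np.ndarray, sr: int, target_sr: int = 16000) -> np.ndarray:
    # Hypothetical helper: convert to a float32 tensor, resample, return NumPy.
    tensor = torch.from_numpy(audio_data.astype(np.float32))
    return F.resample(tensor, orig_freq=sr, new_freq=target_sr).numpy()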
@@ -98,22 +64,19 @@ def process_audio(audio):
     print(f"Processing audio chunk of length {len(chunk)}")
     volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
     length = len(chunk) / SAMPLING_RATE  # length of the audio data in seconds
-    lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+    selected_scores, all_scores = identify_languages(chunk)
 
     # Get the probability scores for Japanese and English
-    ja_prob = lang_scores[0][lang_index_JA_EN['ja']]
-    en_prob = lang_scores[0][lang_index_JA_EN['en']]
+    ja_prob = selected_scores['Japanese']
+    en_prob = selected_scores['English']
+
     ja_en = 'ja' if ja_prob > en_prob else 'en'
 
     # Get the top 3 languages
-    top3_values, top3_indices = torch.topk(lang_scores[0], 3)
-    top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]
+    top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
 
-    input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    # transcript = transcribe_audio(chunk, SAMPLING_RATE)
-    print(transcription)
+    # Transcribe the text
+    transcription = transcribe(chunk)
 
     data.append({
         # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
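The new top3_languages line packs the ranking into a single display string. With toy floats standing in for the tensor scores that identify_languages returns, it behaves like this:

all_scores = {"Japanese": 0.71, "English": 0.18, "Korean": 0.06}  # toy values

top3_languages = ", ".join(
    f"{lang} ({all_scores[lang]:.2f})"
    for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]
)
print(top3_languages)  # Japanese (0.71), English (0.18), Korean (0.06)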
@@ -131,18 +94,35 @@ def process_audio(audio):
     # Keep the unprocessed remaining data
     current_chunk = [total_chunk]
 
-inputs = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
-
+
+inputs_file = gr.Audio(sources=["upload"], type="numpy")
+inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
 outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
 
-demo = gr.Interface(
-    fn=process_audio,
-    inputs=inputs,
-    outputs=outputs,
-    live=True,
-    title="Real-time Audio Processing",
-    description="Speak into the microphone and see real-time audio processing results."
-)
+with gr.Blocks() as demo:
+    with gr.TabItem("Upload"):
+        inputs_file = gr.Audio(sources=["upload"], type="numpy")
+        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+        gr.Interface(
+            fn=process_audio,
+            inputs=inputs_file,
+            outputs=outputs,
+            live=False,
+            title="File Audio Processing",
+            description="Upload an audio file to see the processing results."
+        )
+
+    with gr.TabItem("Microphone"):
+        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
+        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+        gr.Interface(
+            fn=process_audio,
+            inputs=inputs_stream,
+            outputs=outputs,
+            live=True,
+            title="Real-time Audio Processing",
+            description="Speak into the microphone and see real-time audio processing results."
+        )
 
 
 if __name__ == "__main__":
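Taken together, the refactor reduces each chunk to two calls into the new modules. A minimal offline driver, assuming lang_id.py and whisper.py from this commit are importable and using silence as a stand-in for microphone audio:

import numpy as np
from lang_id import identify_languages
from whisper import transcribe, SAMPLING_RATE, CHUNK_DURATION

# One 5-second chunk of silence at 16 kHz, shaped like a real stream chunk.
chunk = np.zeros(SAMPLING_RATE * CHUNK_DURATION, dtype=np.float32)

selected_scores, all_scores = identify_languages(chunk)
ja_en = 'ja' if selected_scores['Japanese'] > selected_scores['English'] else 'en'
print(ja_en, transcribe(chunk))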
lang_id.py
ADDED
@@ -0,0 +1,48 @@
+from speechbrain.inference.classifiers import EncoderClassifier
+import numpy as np
+import torch
+
+
+INDEX_TO_LANG = {
+    0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
+    5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
+    10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
+    15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
+    20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
+    25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
+    30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
+    35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
+    40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
+    45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
+    50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
+    55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
+    60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
+    65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
+    70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
+    75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
+    80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
+    85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
+    90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
+    95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
+    100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
+    105: 'Yoruba', 106: 'Chinese'
+}
+LANG_TO_INDEX = {v: k for k, v in INDEX_TO_LANG.items()}
+
+# Load the speechbrain language-classification model
+language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+
+
+def identify_languages(chunk: np.ndarray, languages: list[str] = ["Japanese", "English"]) -> tuple[dict, dict]:
+    """
+    Classify the language of an audio chunk.
+    """
+    # Language classification
+    # outputs = language_id.encode_batch([chunk])
+    lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+
+    # Format the results
+    all_scores = {INDEX_TO_LANG[i]: score for i, score in enumerate(lang_scores[0])}
+    selected_scores = {lang: float(all_scores[lang]) for lang in languages}
+
+    return selected_scores, all_scores
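A quick way to exercise identify_languages on its own; random noise is only a shape test, and real speech would give meaningful scores. Note that all_scores keeps the raw tensor scores while selected_scores is converted to plain floats:

import numpy as np
from lang_id import identify_languages

chunk = np.random.randn(16000 * 5).astype(np.float32)  # 5 s at 16 kHz
selected, all_scores = identify_languages(chunk, languages=["Japanese", "English", "French"])
print(selected)         # e.g. {'Japanese': ..., 'English': ..., 'French': ...}
print(len(all_scores))  # 107 VoxLingua107 languages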
whisper.py
ADDED
@@ -0,0 +1,22 @@
+import numpy as np
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+# Load the Whisper model and processor
+model_name = "openai/whisper-tiny"
+processor = WhisperProcessor.from_pretrained(model_name)
+model = WhisperForConditionalGeneration.from_pretrained(model_name)
+# Device setup (use the GPU when one is available)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+
+SAMPLING_RATE = 16000
+CHUNK_DURATION = 5  # chunks of 5 seconds
+
+
+def transcribe(chunk: np.ndarray) -> str:
+    input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
+    predicted_ids = model.generate(input_features)
+    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    print(transcriptions)
+    return "\n".join(transcriptions)
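Example use of transcribe on a local file; "sample.wav" is a placeholder name, and torchaudio handles loading plus resampling to the 16 kHz the model expects:

import torchaudio
from whisper import transcribe, SAMPLING_RATE

waveform, sr = torchaudio.load("sample.wav")  # (channels, samples)
waveform = torchaudio.functional.resample(waveform, sr, SAMPLING_RATE)
print(transcribe(waveform.mean(dim=0).numpy()))  # mono float32 chunk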