tan-z-tan committed on
Commit 5e7654d · 1 Parent(s): 385ef96
Files changed (3)
  1. app.py +44 -64
  2. lang_id.py +48 -0
  3. whisper.py +22 -0
app.py CHANGED
@@ -3,57 +3,24 @@ import numpy as np
  import pandas as pd
  import torch
  import torchaudio
- import time
- from transformers import pipeline
- from speechbrain.inference.classifiers import EncoderClassifier
- from transformers import WhisperProcessor, WhisperForConditionalGeneration

- # Load the Whisper model and processor
- model_name = "openai/whisper-tiny"
- processor = WhisperProcessor.from_pretrained(model_name)
- model = WhisperForConditionalGeneration.from_pretrained(model_name)
- # Device setup (use the GPU if available)
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)
+ from lang_id import identify_languages
+ from whisper import transcribe

-
- # Load the SpeechBrain language-identification model
- language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+ # # Load the Whisper model and processor
+ # model_name = "openai/whisper-tiny"
+ # processor = WhisperProcessor.from_pretrained(model_name)
+ # model = WhisperForConditionalGeneration.from_pretrained(model_name)
+ # # Device setup (use the GPU if available)
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
+ # model.to(device)

  # Variables holding the application state
  data = []
  current_chunk = []

- index_to_lang = {
-     0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
-     5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
-     10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
-     15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
-     20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
-     25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
-     30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
-     35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
-     40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
-     45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
-     50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
-     55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
-     60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
-     65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
-     70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
-     75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
-     80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
-     85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
-     90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
-     95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
-     100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
-     105: 'Yoruba', 106: 'Chinese'
- }
- lang_index_JA_EN = {
-     'ja': 45,
-     'en': 20,
- }
  SAMPLING_RATE = 16000
- CHUNK_DURATION = 5  # 5-second chunks
+ CHUNK_DURATION = 5  # 5-second chunks


  def normalize_audio(audio):
@@ -77,7 +44,6 @@ def process_audio(audio):
      print(audio)
      sr, audio_data = audio

-
      print(audio_data.shape, audio_data.dtype)
      # Align the sampling rate up front
      audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
@@ -98,22 +64,19 @@ def process_audio(audio):
      print(f"Processing audio chunk of length {len(chunk)}")
      volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
      length = len(chunk) / SAMPLING_RATE  # length of the audio data in seconds
-     lang_guess = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+     selected_scores, all_scores = identify_languages(chunk)

      # Get the probability scores for Japanese and English
-     ja_prob = lang_guess[0][0][lang_index_JA_EN['ja']].item()
-     en_prob = lang_guess[0][0][lang_index_JA_EN['en']].item()
+     ja_prob = selected_scores['Japanese']
+     en_prob = selected_scores['English']
+
      ja_en = 'ja' if ja_prob > en_prob else 'en'

      # Get the top-3 languages
-     top3_indices = torch.topk(lang_guess[0], 3, dim=1, largest=True).indices[0]
-     top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]
+     top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])

-     input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
-     predicted_ids = model.generate(input_features)
-     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-     # transcript = transcribe_audio(chunk, SAMPLING_RATE)
-     print(transcription)
+     # Transcribe the text
+     transcription = transcribe(chunk)

      data.append({
          # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -131,18 +94,35 @@ def process_audio(audio):
      # Keep the unprocessed remainder of the data
      current_chunk = [total_chunk]

- # inputs = gr.Audio(sources=["microphone", "upload"], type="numpy", streaming=True)
- inputs = gr.Audio(sources=["microphone", "upload"], type="numpy")
+
+ inputs_file = gr.Audio(sources=["upload"], type="numpy")
+ inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
  outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]

- demo = gr.Interface(
-     fn=process_audio,
-     inputs=inputs,
-     outputs=outputs,
-     live=True,
-     title="Real-time Audio Processing",
-     description="Speak into the microphone and see real-time audio processing results."
- )
+ with gr.Blocks() as demo:
+     with gr.TabItem("Upload"):
+         inputs_file = gr.Audio(sources=["upload"], type="numpy")
+         outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+         gr.Interface(
+             fn=process_audio,
+             inputs=inputs_file,
+             outputs=outputs,
+             live=False,
+             title="File Audio Processing",
+             description="Upload an audio file to see the processing results."
+         )
+
+     with gr.TabItem("Microphone"):
+         inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
+         outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+         gr.Interface(
+             fn=process_audio,
+             inputs=inputs_stream,
+             outputs=outputs,
+             live=True,
+             title="Real-time Audio Processing",
+             description="Speak into the microphone and see real-time audio processing results."
+         )


  if __name__ == "__main__":
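For a quick local check of process_audio outside Gradio, a minimal sketch (assumptions not in the diff: the function returns the processed audio plus the results table, as the outputs list suggests, and the sine tone merely stands in for real speech):

import numpy as np
# run inside app.py, or: from app import process_audio

# Build the same (sample_rate, ndarray) tuple that a gr.Audio component with type="numpy" delivers.
sr = 44100
t = np.linspace(0, 6, 6 * sr, endpoint=False)
audio = (sr, (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32))

out_audio, table = process_audio(audio)  # resampled to 16 kHz and split into 5-second chunks internally
print(table)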
lang_id.py ADDED
@@ -0,0 +1,48 @@
+ from speechbrain.inference.classifiers import EncoderClassifier
+ import numpy as np
+ import torch
+
+
+ INDEX_TO_LANG = {
+     0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
+     5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
+     10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
+     15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
+     20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
+     25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
+     30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
+     35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
+     40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
+     45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
+     50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
+     55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
+     60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
+     65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
+     70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
+     75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
+     80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
+     85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
+     90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
+     95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
+     100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
+     105: 'Yoruba', 106: 'Chinese'
+ }
+ LANG_TO_INDEX = {v: k for k, v in INDEX_TO_LANG.items()}
+
+ # Load the SpeechBrain language-identification model
+ language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+
+
+ def identify_languages(chunk: np.ndarray, languages: list[str] = ["Japanese", "English"]) -> tuple[dict, dict]:
+     """
+     Run language identification on an audio chunk.
+     """
+     # Language classification
+     # outputs = language_id.encode_batch([chunk])
+     lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+
+     # Shape the results
+     all_scores = {INDEX_TO_LANG[i]: score for i, score in enumerate(lang_scores[0])}
+     selected_scores = {lang: float(all_scores[lang]) for lang in languages}
+
+     return selected_scores, all_scores
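A usage sketch for identify_languages (assumptions: a 16 kHz mono float32 chunk; the noise below is only a placeholder signal and the printed values are illustrative, since classify_batch returns raw classifier scores rather than calibrated probabilities):

import numpy as np
from lang_id import identify_languages

# One 5-second, 16 kHz chunk of noise as a stand-in for real audio.
chunk = np.random.randn(16000 * 5).astype(np.float32)

selected, all_scores = identify_languages(chunk, languages=["Japanese", "English"])
print(selected)                              # e.g. {'Japanese': -4.1, 'English': -3.7} (illustrative)
print(max(all_scores, key=all_scores.get))   # most likely of the 107 VoxLingua107 languages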
whisper.py ADDED
@@ -0,0 +1,22 @@
+ import numpy as np
+ import torch
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+ # Load the Whisper model and processor
+ model_name = "openai/whisper-tiny"
+ processor = WhisperProcessor.from_pretrained(model_name)
+ model = WhisperForConditionalGeneration.from_pretrained(model_name)
+ # Device setup (use the GPU if available)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ SAMPLING_RATE = 16000
+ CHUNK_DURATION = 5  # 5-second chunks
+
+
+ def transcribe(chunk: np.ndarray) -> str:
+     input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
+     predicted_ids = model.generate(input_features)
+     transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+     print(transcriptions)
+     return "\n".join(transcriptions)
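A usage sketch for transcribe (assumptions: "sample.wav" is a placeholder path and the audio is downmixed to mono at 16 kHz; note that naming the module whisper.py shadows the openai-whisper package inside this Space, so the import below refers to the file above):

import torchaudio
from whisper import transcribe  # the local whisper.py, not the openai-whisper package

# Load a file and resample it to the 16 kHz the processor expects; "sample.wav" is a placeholder.
waveform, sr = torchaudio.load("sample.wav")
waveform = torchaudio.functional.resample(waveform, sr, 16000)
chunk = waveform.mean(dim=0).numpy()  # downmix to mono float32

print(transcribe(chunk))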