Tweak
- app.py +44 -64
- lang_id.py +48 -0
- whisper.py +22 -0
app.py
CHANGED
@@ -3,57 +3,24 @@ import numpy as np
 import pandas as pd
 import torch
 import torchaudio
-import time
-from transformers import pipeline
-from speechbrain.inference.classifiers import EncoderClassifier
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-# Load the Whisper model and processor
-model_name = "openai/whisper-tiny"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name)
-# Device setup (use the GPU when one is available)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
+from lang_id import identify_languages
+from whisper import transcribe
 
-
-# Load the speechbrain language-classification model
-language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+# # Load the Whisper model and processor
+# model_name = "openai/whisper-tiny"
+# processor = WhisperProcessor.from_pretrained(model_name)
+# model = WhisperForConditionalGeneration.from_pretrained(model_name)
+# # Device setup (use the GPU when one is available)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.to(device)
 
 # Variables holding the application state
 data = []
 current_chunk = []
 
-index_to_lang = {
-    0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
-    5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
-    10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
-    15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
-    20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
-    25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
-    30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
-    35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
-    40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
-    45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
-    50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
-    55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
-    60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
-    65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
-    70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
-    75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
-    80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
-    85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
-    90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
-    95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
-    100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
-    105: 'Yoruba', 106: 'Chinese'
-}
-lang_index_JA_EN = {
-    'ja': 45,
-    'en': 20,
-}
 SAMPLING_RATE = 16000
-CHUNK_DURATION = 5
+CHUNK_DURATION = 5  # chunks of 5 seconds
 
 
 def normalize_audio(audio):
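The 5-second chunking that feeds process_audio happens outside the hunks shown in this diff. As a rough sketch of how a sample buffer can be cut into CHUNK_DURATION-sized pieces, with the function name and slicing being illustrative assumptions rather than the Space's actual code:

import numpy as np

SAMPLING_RATE = 16000
CHUNK_DURATION = 5  # seconds per chunk, as in the commit

def split_into_chunks(buffer: np.ndarray) -> tuple[list, np.ndarray]:
    # Hypothetical helper: cut full 5-second chunks off the front and return
    # them plus the remainder, mirroring the `current_chunk = [total_chunk]`
    # carry-over inside process_audio.
    chunk_size = SAMPLING_RATE * CHUNK_DURATION
    n_full = len(buffer) // chunk_size
    chunks = [buffer[i * chunk_size:(i + 1) * chunk_size] for i in range(n_full)]
    return chunks, buffer[n_full * chunk_size:]

# 12 s of silence yields two 5 s chunks and a 2 s remainder.
chunks, rest = split_into_chunks(np.zeros(12 * SAMPLING_RATE, dtype=np.float32))
print(len(chunks), len(rest) / SAMPLING_RATE)  # -> 2 2.0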
@@ -77,7 +44,6 @@ def process_audio(audio):
     print(audio)
     sr, audio_data = audio
 
-
     print(audio_data.shape, audio_data.dtype)
     # Align the sampling rate first thing
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
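resample_audio itself is defined outside this hunk. A plausible torchaudio-based implementation matching the call signature used above, offered as an assumption rather than the code from this commit:

import numpy as np
import torch
import torchaudio.functional as F

def resample_audio(audio_data: np.ndarray, sr: int, target_sr: int = 16000) -> np.ndarray:
    # Hypothetical helper: convert to a float32 tensor, resample, return NumPy.
    tensor = torch.from_numpy(audio_data.astype(np.float32))
    return F.resample(tensor, orig_freq=sr, new_freq=target_sr).numpy()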
@@ -98,22 +64,19 @@ def process_audio(audio):
     print(f"Processing audio chunk of length {len(chunk)}")
     volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
     length = len(chunk) / SAMPLING_RATE  # length of the audio data in seconds
-    lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+    selected_scores, all_scores = identify_languages(chunk)
 
     # Get the probability scores for Japanese and English
-    ja_prob = lang_scores[0][lang_index_JA_EN['ja']]
-    en_prob = lang_scores[0][lang_index_JA_EN['en']]
+    ja_prob = selected_scores['Japanese']
+    en_prob = selected_scores['English']
+
     ja_en = 'ja' if ja_prob > en_prob else 'en'
 
     # Get the top 3 languages
-    top3_values, top3_indices = torch.topk(lang_scores[0], 3)
-    top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]
+    top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
 
-    input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    # transcript = transcribe_audio(chunk, SAMPLING_RATE)
-    print(transcription)
+    # Transcribe the text
+    transcription = transcribe(chunk)
 
     data.append({
         # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
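The new top3_languages line packs the ranking into a single display string. With toy floats standing in for the tensor scores that identify_languages returns, it behaves like this:

all_scores = {"Japanese": 0.71, "English": 0.18, "Korean": 0.06}  # toy values

top3_languages = ", ".join(
    f"{lang} ({all_scores[lang]:.2f})"
    for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]
)
print(top3_languages)  # Japanese (0.71), English (0.18), Korean (0.06)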
@@ -131,18 +94,35 @@ def process_audio(audio):
     # Keep the unprocessed remaining data
     current_chunk = [total_chunk]
 
-inputs = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
-
+
+inputs_file = gr.Audio(sources=["upload"], type="numpy")
+inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
 outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
 
-demo = gr.Interface(
-    fn=process_audio,
-    inputs=inputs,
-    outputs=outputs,
-    live=True,
-    title="Real-time Audio Processing",
-    description="Speak into the microphone and see real-time audio processing results."
-)
+with gr.Blocks() as demo:
+    with gr.TabItem("Upload"):
+        inputs_file = gr.Audio(sources=["upload"], type="numpy")
+        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+        gr.Interface(
+            fn=process_audio,
+            inputs=inputs_file,
+            outputs=outputs,
+            live=False,
+            title="File Audio Processing",
+            description="Upload an audio file to see the processing results."
+        )
+
+    with gr.TabItem("Microphone"):
+        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
+        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
+        gr.Interface(
+            fn=process_audio,
+            inputs=inputs_stream,
+            outputs=outputs,
+            live=True,
+            title="Real-time Audio Processing",
+            description="Speak into the microphone and see real-time audio processing results."
+        )
 
 
 if __name__ == "__main__":
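Taken together, the refactor reduces each chunk to two calls into the new modules. A minimal offline driver, assuming lang_id.py and whisper.py from this commit are importable and using silence as a stand-in for microphone audio:

import numpy as np
from lang_id import identify_languages
from whisper import transcribe, SAMPLING_RATE, CHUNK_DURATION

# One 5-second chunk of silence at 16 kHz, shaped like a real stream chunk.
chunk = np.zeros(SAMPLING_RATE * CHUNK_DURATION, dtype=np.float32)

selected_scores, all_scores = identify_languages(chunk)
ja_en = 'ja' if selected_scores['Japanese'] > selected_scores['English'] else 'en'
print(ja_en, transcribe(chunk))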
lang_id.py
ADDED
@@ -0,0 +1,48 @@
+from speechbrain.inference.classifiers import EncoderClassifier
+import numpy as np
+import torch
+
+
+INDEX_TO_LANG = {
+    0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
+    5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
+    10: 'Tibetan', 11: 'Breton', 12: 'Bosnian', 13: 'Catalan', 14: 'Cebuano',
+    15: 'Czech', 16: 'Welsh', 17: 'Danish', 18: 'German', 19: 'Greek',
+    20: 'English', 21: 'Esperanto', 22: 'Spanish', 23: 'Estonian', 24: 'Basque',
+    25: 'Persian', 26: 'Finnish', 27: 'Faroese', 28: 'French', 29: 'Galician',
+    30: 'Guarani', 31: 'Gujarati', 32: 'Manx', 33: 'Hausa', 34: 'Hawaiian',
+    35: 'Hindi', 36: 'Croatian', 37: 'Haitian', 38: 'Hungarian', 39: 'Armenian',
+    40: 'Interlingua', 41: 'Indonesian', 42: 'Icelandic', 43: 'Italian', 44: 'Hebrew',
+    45: 'Japanese', 46: 'Javanese', 47: 'Georgian', 48: 'Kazakh', 49: 'Central Khmer',
+    50: 'Kannada', 51: 'Korean', 52: 'Latin', 53: 'Luxembourgish', 54: 'Lingala',
+    55: 'Lao', 56: 'Lithuanian', 57: 'Latvian', 58: 'Malagasy', 59: 'Maori',
+    60: 'Macedonian', 61: 'Malayalam', 62: 'Mongolian', 63: 'Marathi', 64: 'Malay',
+    65: 'Maltese', 66: 'Burmese', 67: 'Nepali', 68: 'Dutch', 69: 'Norwegian Nynorsk',
+    70: 'Norwegian', 71: 'Occitan', 72: 'Panjabi', 73: 'Polish', 74: 'Pushto',
+    75: 'Portuguese', 76: 'Romanian', 77: 'Russian', 78: 'Sanskrit', 79: 'Scots',
+    80: 'Sindhi', 81: 'Sinhala', 82: 'Slovak', 83: 'Slovenian', 84: 'Shona',
+    85: 'Somali', 86: 'Albanian', 87: 'Serbian', 88: 'Sundanese', 89: 'Swedish',
+    90: 'Swahili', 91: 'Tamil', 92: 'Telugu', 93: 'Tajik', 94: 'Thai',
+    95: 'Turkmen', 96: 'Tagalog', 97: 'Turkish', 98: 'Tatar', 99: 'Ukrainian',
+    100: 'Urdu', 101: 'Uzbek', 102: 'Vietnamese', 103: 'Waray', 104: 'Yiddish',
+    105: 'Yoruba', 106: 'Chinese'
+}
+LANG_TO_INDEX = {v: k for k, v in INDEX_TO_LANG.items()}
+
+# Load the speechbrain language-classification model
+language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
+
+
+def identify_languages(chunk: np.ndarray, languages: list[str] = ["Japanese", "English"]) -> tuple[dict, dict]:
+    """
+    Classify the language of an audio chunk.
+    """
+    # Language classification
+    # outputs = language_id.encode_batch([chunk])
+    lang_scores, _, _, _ = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))
+
+    # Format the results
+    all_scores = {INDEX_TO_LANG[i]: score for i, score in enumerate(lang_scores[0])}
+    selected_scores = {lang: float(all_scores[lang]) for lang in languages}
+
+    return selected_scores, all_scores
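A quick way to exercise identify_languages on its own; random noise is only a shape test, and real speech would give meaningful scores. Note that all_scores keeps the raw tensor scores while selected_scores is converted to plain floats:

import numpy as np
from lang_id import identify_languages

chunk = np.random.randn(16000 * 5).astype(np.float32)  # 5 s at 16 kHz
selected, all_scores = identify_languages(chunk, languages=["Japanese", "English", "French"])
print(selected)         # e.g. {'Japanese': ..., 'English': ..., 'French': ...}
print(len(all_scores))  # 107 VoxLingua107 languages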
whisper.py
ADDED
@@ -0,0 +1,22 @@
+import numpy as np
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+# Load the Whisper model and processor
+model_name = "openai/whisper-tiny"
+processor = WhisperProcessor.from_pretrained(model_name)
+model = WhisperForConditionalGeneration.from_pretrained(model_name)
+# Device setup (use the GPU when one is available)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+
+SAMPLING_RATE = 16000
+CHUNK_DURATION = 5  # chunks of 5 seconds
+
+
+def transcribe(chunk: np.ndarray) -> str:
+    input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
+    predicted_ids = model.generate(input_features)
+    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    print(transcriptions)
+    return "\n".join(transcriptions)
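Example use of transcribe on a local file; "sample.wav" is a placeholder name, and torchaudio handles loading plus resampling to the 16 kHz the model expects:

import torchaudio
from whisper import transcribe, SAMPLING_RATE

waveform, sr = torchaudio.load("sample.wav")  # (channels, samples)
waveform = torchaudio.functional.resample(waveform, sr, SAMPLING_RATE)
print(transcribe(waveform.mean(dim=0).numpy()))  # mono float32 chunk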