import gradio as gr
import numpy as np
import pandas as pd
import torch
import torchaudio

from lang_id import identify_languages
from whisper import transcribe

# # Load the Whisper model and processor
# model_name = "openai/whisper-tiny"
# processor = WhisperProcessor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)

# # Device setup (use the GPU if one is available)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# Variables holding the application state
data = []
current_chunk = []

SAMPLING_RATE = 16000
CHUNK_DURATION = 5  # process the audio in 5-second chunks


def normalize_audio(audio):
    # Normalize the volume (scale so that the maximum amplitude is 1)
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    return audio


def resample_audio(audio, orig_sr, target_sr=16000):
    if orig_sr != target_sr:
        print(f"Resampling audio from {orig_sr} to {target_sr}")
        audio = audio.astype(np.float32)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        audio = resampler(torch.from_numpy(audio).unsqueeze(0)).squeeze(0).numpy()
    return audio


def process_audio(audio):
    global data, current_chunk
    print("Process_audio")
    print(audio)
    sr, audio_data = audio
    print(audio_data.shape, audio_data.dtype)

    # Align the sampling rate before anything else
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)

    audio_sec = 0

    # Normalize the volume
    audio_data = normalize_audio(audio_data)

    # Append the new data to the current chunk buffer
    current_chunk.append(audio_data)
    total_chunk = np.concatenate(current_chunk)

    while len(total_chunk) >= SAMPLING_RATE * CHUNK_DURATION:
        chunk = total_chunk[:SAMPLING_RATE * CHUNK_DURATION]
        total_chunk = total_chunk[SAMPLING_RATE * CHUNK_DURATION:]  # drop the processed part
        audio_sec += CHUNK_DURATION

        print(f"Processing audio chunk of length {len(chunk)}")
        volume_norm = float(np.sqrt(np.mean(chunk ** 2)))  # RMS volume of the chunk
        length = len(chunk) / SAMPLING_RATE  # chunk length in seconds
        selected_scores, all_scores = identify_languages(chunk)

        # Probabilities for Japanese and English
        ja_prob = selected_scores['Japanese']
        en_prob = selected_scores['English']
        ja_en = 'ja' if ja_prob > en_prob else 'en'

        # Top-3 languages by score
        top3_languages = ", ".join(
            f"{lang} ({all_scores[lang]:.2f})"
            for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]
        )

        # Speech-to-text transcription
        transcription = transcribe(chunk)

        data.append({
            # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            "Time": audio_sec,
            "Length (s)": length,
            "Volume": volume_norm,
            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
            "Language": top3_languages,
            "Text": transcription,
        })

        df = pd.DataFrame(data)
        yield (SAMPLING_RATE, chunk), df

    # Keep the unprocessed remainder for the next call
    current_chunk = [total_chunk]


with gr.Blocks() as demo:
    with gr.TabItem("Upload"):
        inputs_file = gr.Audio(sources=["upload"], type="numpy")
        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
        gr.Interface(
            fn=process_audio,
            inputs=inputs_file,
            outputs=outputs,
            live=False,
            title="File Audio Processing",
            description="Upload an audio file to see the processing results."
        )

    with gr.TabItem("Microphone"):
        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
        gr.Interface(
            fn=process_audio,
            inputs=inputs_stream,
            outputs=outputs,
            live=True,
            title="Real-time Audio Processing",
            description="Speak into the microphone and see real-time audio processing results."
        )

if __name__ == "__main__":
    demo.launch()