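"""Gradio demo: chunked audio processing with language identification and transcription.

Audio from an uploaded file or a live microphone stream is resampled to 16 kHz,
normalized, split into 5-second chunks, and each chunk is scored for its language and
transcribed. Results are streamed back as an audio preview plus a growing results table.
"""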
import gradio as gr
import numpy as np
import pandas as pd
import torch
import torchaudio

from lang_id import identify_languages
from whisper import transcribe
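# NOTE: `lang_id` and `whisper` are assumed to be local helper modules bundled with this
# Space (not the PyPI `whisper` package): `identify_languages(chunk)` is expected to
# return (selected_scores, all_scores) dictionaries of language probabilities, and
# `transcribe(chunk)` a text string.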
# # Load the Whisper model and processor
# model_name = "openai/whisper-tiny"
# processor = WhisperProcessor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)

# # Device setup (use the GPU if one is available)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
# Variables holding the application state
data = []
current_chunk = []

SAMPLING_RATE = 16000
CHUNK_DURATION = 5  # process audio in 5-second chunks

def normalize_audio(audio):
    # Normalize the volume (scale so that the maximum amplitude is 1);
    # guard against division by zero for silent input.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    return audio

def resample_audio(audio, orig_sr, target_sr=16000):
    if orig_sr != target_sr:
        print(f"Resampling audio from {orig_sr} to {target_sr}")
        audio = audio.astype(np.float32)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        audio = resampler(torch.from_numpy(audio).unsqueeze(0)).squeeze(0).numpy()
    return audio
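
# `process_audio` is a generator: it buffers incoming audio in the module-level
# `current_chunk`, and each time at least CHUNK_DURATION seconds have accumulated it
# yields that chunk (for playback) together with the updated results DataFrame.
# With the streaming microphone input, Gradio calls it repeatedly, so leftover samples
# are carried over between calls.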

def process_audio(audio):
    global data, current_chunk
    print("Process_audio")
    print(audio)

    sr, audio_data = audio
    print(audio_data.shape, audio_data.dtype)

    # Mix down to mono if the input is stereo (gr.Audio can return a (samples, channels) array)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Align the sampling rate first
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
    audio_sec = 0

    # Normalize the volume
    audio_data = normalize_audio(audio_data)

    # Append the new data to the current chunk
    current_chunk.append(audio_data)
    total_chunk = np.concatenate(current_chunk)

    while len(total_chunk) >= SAMPLING_RATE * CHUNK_DURATION:
        chunk = total_chunk[:SAMPLING_RATE * CHUNK_DURATION]
        total_chunk = total_chunk[SAMPLING_RATE * CHUNK_DURATION:]  # drop the processed part
        audio_sec += CHUNK_DURATION

        print(f"Processing audio chunk of length {len(chunk)}")
        volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
        length = len(chunk) / SAMPLING_RATE  # length of the audio chunk in seconds

        selected_scores, all_scores = identify_languages(chunk)

        # Get the probabilities for Japanese and English
        ja_prob = selected_scores['Japanese']
        en_prob = selected_scores['English']
        ja_en = 'ja' if ja_prob > en_prob else 'en'

        # Get the top-3 languages
        top3_languages = ", ".join(
            f"{lang} ({all_scores[lang]:.2f})"
            for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]
        )

        # Speech recognition
        transcription = transcribe(chunk)

        data.append({
            # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            "Time": audio_sec,
            "Length (s)": length,
            "Volume": volume_norm,
            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
            "Language": top3_languages,
            "Text": transcription,
        })

        df = pd.DataFrame(data)
        yield (SAMPLING_RATE, chunk), df

    # Keep the unprocessed remainder for the next call
    current_chunk = [total_chunk]
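
# Gradio UI: two tabs wrap the same `process_audio` function. The Upload tab processes a
# whole file at once, while the Microphone tab uses a streaming input with live=True so
# the audio preview and results table update while you speak.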

with gr.Blocks() as demo:
    with gr.TabItem("Upload"):
        inputs_file = gr.Audio(sources=["upload"], type="numpy")
        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
        gr.Interface(
            fn=process_audio,
            inputs=inputs_file,
            outputs=outputs,
            live=False,
            title="File Audio Processing",
            description="Upload an audio file to see the processing results.",
        )
    with gr.TabItem("Microphone"):
        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
        gr.Interface(
            fn=process_audio,
            inputs=inputs_stream,
            outputs=outputs,
            live=True,
            title="Real-time Audio Processing",
            description="Speak into the microphone and see real-time audio processing results.",
        )

if __name__ == "__main__":
    demo.launch()