File size: 6,545 Bytes
3fbd296 1ecc4f1 5e7654d 385ef96 3fbd296 3d444ab 3fbd296 385ef96 9996005 385ef96 3fbd296 3d444ab 9b5cb27 3d444ab 9b5cb27 3d444ab 9b5cb27 3d444ab 9b5cb27 3d444ab 9996005 3d444ab 6274b4a 3d444ab 3fbd296 9996005 3fbd296 9996005 385ef96 3fbd296 9b5cb27 385ef96 3fbd296 9996005 3fbd296 3d444ab 3fbd296 5e7654d 9996005 3fbd296 5e7654d 3d444ab 5e7654d 3fbd296 4244a83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import gradio as gr
import numpy as np
import pandas as pd
import torch
import torchaudio
from datetime import datetime
from lang_id import identify_languages
from whisper import transcribe
# Module-level state shared by the Gradio callbacks.
data = []
data_df = pd.DataFrame()  # per-chunk result rows accumulated for the UI table
current_chunk = []  # buffered audio arrays that do not yet fill a whole chunk
CHUNK_DURATION = 5  # default chunk length in seconds
# Sampling rate (Hz) that the callbacks resample all audio to. They read this
# global, so it must exist at module level; the value matches the default
# target rate of resample_audio.
SAMPLING_RATE = 16000
def normalize_audio(audio):
    """Scale ``audio`` so that its largest absolute sample equals 1.

    Args:
        audio: NumPy array (or array-like) of audio samples.

    Returns:
        The rescaled array. Empty input is returned unchanged, and all-zero
        (silent) input is returned as zeros rather than NaN.
    """
    samples = np.asarray(audio)
    if samples.size == 0:
        # np.max raises on an empty array; there is nothing to scale anyway.
        return samples
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: convert first so abs() of the most negative value
        # cannot overflow. True division would yield floats regardless.
        samples = samples.astype(np.float64)
    peak = np.max(np.abs(samples))
    if peak == 0:
        # Silent input: dividing by zero would fill the result with NaN.
        return samples
    return samples / peak
def resample_audio(audio, orig_sr, target_sr=16000):
    """Return ``audio`` converted from ``orig_sr`` to ``target_sr``.

    When the two rates already match, the input array is handed back
    untouched. Otherwise the samples are cast to float32 and run through
    ``torchaudio.transforms.Resample``.
    """
    if orig_sr == target_sr:
        return audio

    print(f"Resampling audio from {orig_sr} to {target_sr}")
    waveform = torch.from_numpy(audio.astype(np.float32)).unsqueeze(0)
    resample = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    return resample(waveform).squeeze(0).numpy()
def process_chunk(chunk, language_set) -> pd.DataFrame:
    """Run language identification and transcription on one audio chunk.

    Args:
        chunk: NumPy array of audio samples at ``SAMPLING_RATE``.
        language_set: Language names forwarded to ``identify_languages``.

    Returns:
        A single-row DataFrame with the chunk length in seconds, its dB
        level, the Japanese-vs-English decision, the three highest-scoring
        languages, both stage timings in seconds, and the transcription.

    Raises:
        KeyError: If the selected scores returned by ``identify_languages``
            lack a 'Japanese' or 'English' entry.
    """
    print(f"Processing audio chunk of length {len(chunk)}")

    # Loudness is measured before normalization. Squaring happens in float64
    # so integer PCM samples cannot overflow.
    samples = np.asarray(chunk, dtype=np.float64)
    rms = np.sqrt(np.mean(samples**2))
    db_level = 20 * np.log10(rms + 1e-9)  # small offset avoids -inf on silence

    # Peak-normalize the audio handed to the models.
    chunk = normalize_audio(chunk)
    length = len(chunk) / SAMPLING_RATE  # chunk duration in seconds

    started = datetime.now()
    selected_scores, all_scores = identify_languages(chunk, language_set)
    lang_id_time = (datetime.now() - started).total_seconds()

    # Probabilities for Japanese and English decide the transcription language.
    ja_prob = selected_scores['Japanese']
    en_prob = selected_scores['English']
    ja_en = 'ja' if ja_prob > en_prob else 'en'

    # Three highest-scoring languages across all candidates.
    ranked = sorted(all_scores, key=all_scores.get, reverse=True)[:3]
    top3_languages = ", ".join(f"{lang} ({all_scores[lang]:.2f})" for lang in ranked)

    # Speech recognition in the chosen language.
    started = datetime.now()
    transcription = transcribe(chunk, language=ja_en)
    transcribe_time = (datetime.now() - started).total_seconds()

    return pd.DataFrame({
        "Length (s)": [length],
        "db_level": [db_level],
        "Japanese_English": [f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})"] if db_level > 50 else ["Silent"],
        "Language": [top3_languages],
        "Lang ID Time": [lang_id_time],
        "Transcribe Time": [transcribe_time],
        "Text": [transcription],
    })
def process_audio_stream(audio, chunk_duration, language_set):
    """Buffer streamed audio and process it once a full chunk is available.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the audio component, or
            ``None`` when no audio was delivered.
        chunk_duration: Chunk length in seconds (may arrive as a float).
        language_set: Comma-separated language names.

    Returns:
        ``(audio_out, data_df)``. ``audio_out`` is ``None`` when no audio was
        given, ``(SAMPLING_RATE, chunk)`` when a chunk was processed, and
        ``(SAMPLING_RATE, buffered_audio)`` while the buffer is still filling.

    Raises:
        ValueError: If ``chunk_duration`` does not yield at least one sample.
    """
    global data_df, current_chunk
    if audio is None:
        return None, data_df
    sr, audio_data = audio

    languages = [lang.strip() for lang in language_set.split(",")]
    print(audio_data.shape, audio_data.dtype)

    # Bring everything to the common sampling rate before buffering.
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)

    # Slice bounds must be integers even when chunk_duration is a float.
    chunk_len = int(SAMPLING_RATE * chunk_duration)
    if chunk_len <= 0:
        raise ValueError("chunk_duration must be positive")

    # Append the new audio to whatever is left over from earlier calls.
    current_chunk.append(audio_data)
    total_chunk = np.concatenate(current_chunk)

    if len(total_chunk) < chunk_len:
        # Not enough audio yet: keep buffering (stored as one array).
        current_chunk = [total_chunk]
        return (SAMPLING_RATE, total_chunk), data_df

    chunk = total_chunk[:chunk_len]
    remainder = total_chunk[chunk_len:]
    df = process_chunk(chunk, languages)
    data_df = pd.concat([data_df, df], ignore_index=True)
    current_chunk = [remainder]  # carry the unprocessed tail to the next call
    return (SAMPLING_RATE, chunk), data_df
def process_audio(audio, chunk_duration, language_set):
    """Process an uploaded recording chunk by chunk, yielding after each chunk.

    Module state (``data``, ``data_df``, ``current_chunk``) is reset at the
    start of every run.

    Args:
        audio: ``(sample_rate, samples)`` tuple from the audio component, or
            ``None`` when nothing was uploaded.
        chunk_duration: Chunk length in seconds (may arrive as a float).
        language_set: Comma-separated language names.

    Yields:
        ``((SAMPLING_RATE, chunk), data_df)`` after each full chunk, or a
        single ``(None, data_df)`` when ``audio`` is ``None``.

    Raises:
        ValueError: If ``chunk_duration`` does not yield at least one sample.
    """
    global data, data_df, current_chunk
    # Reset state so results from a previous run do not leak into this one.
    data = []
    data_df = pd.DataFrame()
    current_chunk = []

    if audio is None:
        # Mirror the streaming handler: no audio out, (empty) table unchanged.
        yield None, data_df
        return
    sr, audio_data = audio

    languages = [lang.strip() for lang in language_set.split(",")]
    print(audio_data.shape, audio_data.dtype)

    # Bring everything to the common sampling rate first.
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)

    # Slice bounds must be integers even when chunk_duration is a float, and
    # a zero-length chunk would make the loop below spin forever.
    chunk_len = int(SAMPLING_RATE * chunk_duration)
    if chunk_len <= 0:
        raise ValueError("chunk_duration must be positive")

    # The recording is deliberately NOT peak-normalized here. process_chunk
    # measures the dB level on the incoming samples and normalizes each chunk
    # itself; normalizing up front would force every level to <= 0 dB, and
    # every chunk would then be labelled "Silent".
    total_chunk = audio_data
    while len(total_chunk) >= chunk_len:
        chunk = total_chunk[:chunk_len]
        total_chunk = total_chunk[chunk_len:]  # drop the processed part
        df = process_chunk(chunk, languages)
        data_df = pd.concat([data_df, df], ignore_index=True)
        yield (SAMPLING_RATE, chunk), data_df

    # Keep the unprocessed remainder (shorter than one chunk).
    current_chunk = [total_chunk]
# Input components for the parameters; the same two instances are placed in
# both the upload and the microphone input lists.
chunk_duration_input = gr.Number(
    value=5,
    label="Chunk Duration (seconds)",
)
language_set_input = gr.Textbox(
    value="Japanese,English",
    label="Language Set (comma-separated)",
)
inputs_file = [
    gr.Audio(sources=["upload"], type="numpy"),
    chunk_duration_input,
    language_set_input,
]
inputs_stream = [
    gr.Audio(sources=["microphone"], type="numpy", streaming=True),
    chunk_duration_input,
    language_set_input,
]
outputs = [
    gr.Audio(type="numpy"),
    gr.DataFrame(headers=["Time", "Volume", "Length (s)"]),
]
# Two-tab UI: file upload and live microphone input.
# NOTE(review): the tab bodies previously held only dangling
# `title=...,` / `description=...` fragments, so neither handler was wired to
# the UI. The gr.Interface calls below are reconstructed from those fragments
# and from the otherwise unused inputs_file / inputs_stream / outputs lists;
# confirm against the intended layout.
with gr.Blocks() as demo:
    with gr.TabItem("Upload"):
        gr.Interface(
            fn=process_audio,
            inputs=inputs_file,
            outputs=outputs,
            title="File Audio Processing",
            description="Upload an audio file to see the processing results.",
        )
    with gr.TabItem("Microphone"):
        gr.Interface(
            fn=process_audio_stream,
            inputs=inputs_stream,
            outputs=outputs,
            live=True,  # presumably required for the streaming audio input to fire; verify
            title="Real-time Audio Processing",
            description="Speak into the microphone and see real-time audio processing results.",
        )
if __name__ == "__main__":