tan-z-tan committed on
Commit 385ef96 · 1 Parent(s): 4244a83

Normalize audio

Files changed (1):
  1. app.py (+33 / -9)
app.py CHANGED
@@ -6,12 +6,24 @@ import torchaudio
 import time
 from transformers import pipeline
 from speechbrain.inference.classifiers import EncoderClassifier
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+# Load the Whisper model and processor
+model_name = "openai/whisper-tiny"
+processor = WhisperProcessor.from_pretrained(model_name)
+model = WhisperForConditionalGeneration.from_pretrained(model_name)
+# Device setup (use the GPU if one is available)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+
+
+# Load the speechbrain language-identification model
 language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa")
 
+# Variables that hold the application state
 data = []
 current_chunk = []
+
 index_to_lang = {
     0: 'Abkhazian', 1: 'Afrikaans', 2: 'Amharic', 3: 'Arabic', 4: 'Assamese',
     5: 'Azerbaijani', 6: 'Bashkir', 7: 'Belarusian', 8: 'Bulgarian', 9: 'Bengali',
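The hunk above replaces the removed pipeline("automatic-speech-recognition", ...) one-liner with an explicit processor/model pair pinned to a device. A minimal standalone sketch of how that pair is driven end to end (the silent dummy array and the 16 kHz rate are placeholders, not data from the app):

import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# One second of silence stands in for a real audio chunk.
dummy = np.zeros(16000, dtype=np.float32)
features = processor(dummy, sampling_rate=16000, return_tensors="pt").input_features.to(device)
ids = model.generate(features)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])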
@@ -40,6 +52,15 @@ lang_index_JA_EN = {
     'ja': 45,
     'en': 20,
 }
+SAMPLING_RATE = 16000
+CHUNK_DURATION = 5  # 5-second chunks
+
+
+def normalize_audio(audio):
+    # Normalize the volume (scale so that the maximum amplitude is 1)
+    audio = audio / np.max(np.abs(audio))
+    return audio
+
 
 def resample_audio(audio, orig_sr, target_sr=16000):
     if orig_sr != target_sr:
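One caveat about the new normalize_audio above: np.max(np.abs(audio)) is 0 for an all-silent buffer, so the division yields NaNs (or a divide-by-zero warning), and the same division is also what casts integer input to float. A guarded variant, shown only as a sketch (the _safe name is made up, not part of this commit):

import numpy as np

def normalize_audio_safe(audio):
    # Peak-normalize to the [-1, 1] range; return an all-zero (silent)
    # buffer unchanged instead of dividing by zero.
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    return audio / peak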
@@ -50,20 +71,21 @@ def resample_audio(audio, orig_sr, target_sr=16000):
     return audio
 
 
-SAMPLING_RATE = 16000
-CHUNK_DURATION = 5  # 5-second chunks
-
 def process_audio(audio):
     global data, current_chunk
     print("Process_audio")
     print(audio)
     sr, audio_data = audio
 
-    print(audio_data.shape)
+
+    print(audio_data.shape, audio_data.dtype)
     # Align the sampling rate first
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
     audio_sec = 0
 
+    # Normalize the volume
+    audio_data = normalize_audio(audio_data)
+
     # Append the new data to the current chunk
     current_chunk.append(audio_data)
     total_chunk = np.concatenate(current_chunk)
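The added print of audio_data.dtype above suggests the incoming array's type is being checked; Gradio's numpy audio often arrives as int16, while the Whisper processor downstream expects float input, and in this commit that conversion happens implicitly through the float division inside normalize_audio. Purely as an illustration (not code from the file), an explicit cast before normalizing would look like:

# Hypothetical explicit conversion before normalization.
if audio_data.dtype == np.int16:
    audio_data = audio_data.astype(np.float32) / 32768.0  # map the int16 range to [-1.0, 1.0)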
@@ -87,9 +109,11 @@ def process_audio(audio):
     top3_indices = torch.topk(lang_guess[0], 3, dim=1, largest=True).indices[0]
     top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]
 
-    # transcript
-    transcript = transcriber(chunk)
-    print(transcript)
+    input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    # transcript = transcribe_audio(chunk, SAMPLING_RATE)
+    print(transcription)
 
     data.append({
         # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -98,7 +122,7 @@
         "Volume": volume_norm,
         "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
         "Language": top3_languages,
-        "Text": transcript['text'],
+        "Text": transcription,
     })
 
     df = pd.DataFrame(data)
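The commented-out transcribe_audio(chunk, SAMPLING_RATE) call added above hints that the three Whisper calls may later be factored into a helper. A sketch of what such a helper could look like, assuming it reuses the file's processor, model, and device and wraps exactly those calls; the optional language argument and the get_decoder_prompt_ids usage are extras that are not in the commit:

def transcribe_audio(chunk, sampling_rate, language=None):
    # Wrap the processor -> generate -> batch_decode sequence used above.
    features = processor(chunk, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)
    generate_kwargs = {}
    if language is not None:
        # Optionally pin the decoding language (e.g. "ja" or "en") instead of
        # letting Whisper auto-detect, mirroring the JA/EN guess made earlier.
        generate_kwargs["forced_decoder_ids"] = processor.get_decoder_prompt_ids(
            language=language, task="transcribe"
        )
    predicted_ids = model.generate(features, **generate_kwargs)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]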
 