Deepakkori45 committed
Commit eb035cc · verified · 1 Parent(s): 8efb751

Update app.py

Files changed (1): app.py (+30 -22)
app.py CHANGED
@@ -1,27 +1,30 @@
 import streamlit as st
 import os
+import librosa
+import torch
 from pydub import AudioSegment
 from pydub.silence import split_on_silence
 from dotenv import load_dotenv
 from tempfile import NamedTemporaryFile
 import math
 from docx import Document
-import whisper
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-# Load environment variables from .env file (if needed for other configurations)
+# Load environment variables from .env file (if needed)
 load_dotenv()
 
 @st.cache_resource
 def load_whisper_model():
     """
-    Load the Whisper model once and cache it for future use.
-    You can choose the model size: "tiny", "base", "small", "medium", or "large".
+    Load the Whisper model and processor from Hugging Face.
+    You can change the model variant ("openai/whisper-base" is used here).
     """
-    model = whisper.load_model("base")
-    return model
+    model_name = "openai/whisper-base"  # Options: "tiny", "base", "small", "medium", "large"
+    processor = WhisperProcessor.from_pretrained(model_name)
+    model = WhisperForConditionalGeneration.from_pretrained(model_name)
+    return processor, model
 
-# Load the Whisper model globally so it's only loaded once.
-model = load_whisper_model()
+processor, model = load_whisper_model()
 
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
@@ -47,7 +50,7 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
 
 def transcribe(audio_file):
     """
-    Transcribe an audio file using the locally loaded Whisper model.
+    Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
 
     Args:
         audio_file (str): Path to the audio file.
@@ -55,12 +58,17 @@ def transcribe(audio_file):
     Returns:
         str: Transcribed text.
     """
-    result = model.transcribe(audio_file, language="en")
-    return result["text"]
+    # Load audio using librosa, resampling to 16000 Hz as required by Whisper
+    speech, sr = librosa.load(audio_file, sr=16000)
+    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
+    # Generate transcription
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
 
 def process_audio_chunks(audio_chunks):
     """
-    Process and transcribe each audio chunk in sequence.
+    Process and transcribe each audio chunk.
 
     Args:
         audio_chunks (list): List of AudioSegment chunks.
@@ -69,22 +77,22 @@ def process_audio_chunks(audio_chunks):
         str: Combined transcription from all chunks.
     """
     transcriptions = []
-    min_length_ms = 100  # Minimum length required for processing
+    min_length_ms = 100  # Minimum length required (0.1 seconds)
+
     for i, chunk in enumerate(audio_chunks):
         if len(chunk) < min_length_ms:
             st.warning(f"Chunk {i} is too short to be processed.")
             continue
 
-        # Save the chunk temporarily as a WAV file
         with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
             chunk.export(temp_audio_file.name, format="wav")
             temp_audio_file_path = temp_audio_file.name
-
+
         transcription = transcribe(temp_audio_file_path)
         if transcription:
             transcriptions.append(transcription)
             st.write(f"Transcription for chunk {i}: {transcription}")
-
+
         os.remove(temp_audio_file_path)
     return " ".join(transcriptions)
 
@@ -106,7 +114,7 @@ def save_transcription_to_docx(transcription, audio_file_path):
     doc.save(output_file_name)
     return output_file_name
 
-st.title("Audio Transcription with Whisper (Local)")
+st.title("Audio Transcription with Whisper (Local via Hugging Face)")
 
 # Allow uploading of audio or video files
 uploaded_file = st.file_uploader("Upload an audio or video file", type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"])
@@ -122,14 +130,14 @@ if uploaded_file is not None and st.session_state.transcription is None:
     temp_audio_file = f"temp_audio_file.{file_extension}"
    with open(temp_audio_file, "wb") as f:
        f.write(uploaded_file.getbuffer())
-
+
    # Split and process audio using silence detection
    with st.spinner('Transcribing...'):
        audio_chunks = split_audio_on_silence(
            temp_audio_file,
-            min_silence_len=500,  # adjust based on your audio
-            silence_thresh=-40,   # adjust based on ambient noise level
-            keep_silence=250      # retains a bit of silence at the edges
+            min_silence_len=500,
+            silence_thresh=-40,
+            keep_silence=250
        )
        transcription = process_audio_chunks(audio_chunks)
        if transcription:
@@ -137,7 +145,7 @@ if uploaded_file is not None and st.session_state.transcription is None:
            st.success('Transcription complete!')
            output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
            st.session_state.output_docx_file = output_docx_file
-
+
    if os.path.exists(temp_audio_file):
        os.remove(temp_audio_file)
151