Deepakkori45 committed · Commit ba9b23b · verified · 1 Parent(s): eb035cc

Update app.py

Files changed (1):
  1. app.py +51 -60

app.py CHANGED
@@ -8,37 +8,38 @@ from dotenv import load_dotenv
 from tempfile import NamedTemporaryFile
 import math
 from docx import Document
+import time
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-# Load environment variables from .env file (if needed)
+# Load environment variables from .env file (if needed for other config)
 load_dotenv()
 
+# Create a placeholder for status messages
+status_placeholder = st.empty()
+
+# Display status while loading the model
+status_placeholder.info("Loading Whisper model from Hugging Face...")
+
 @st.cache_resource
 def load_whisper_model():
     """
     Load the Whisper model and processor from Hugging Face.
-    You can change the model variant ("openai/whisper-base" is used here).
+    Change 'openai/whisper-base' to another variant if needed.
     """
-    model_name = "openai/whisper-base"  # Options: "tiny", "base", "small", "medium", "large"
+    model_name = "openai/whisper-base"
     processor = WhisperProcessor.from_pretrained(model_name)
     model = WhisperForConditionalGeneration.from_pretrained(model_name)
     return processor, model
 
 processor, model = load_whisper_model()
 
+status_placeholder.info("Whisper model loaded successfully!")
+
 def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
     """
     Split an audio file into chunks using silence detection.
-
-    Args:
-        audio_file_path (str): Path to the audio file.
-        min_silence_len (int): Minimum length of silence (in ms) required for a split.
-        silence_thresh (int): The volume (in dBFS) below which is considered silence.
-        keep_silence (int): Amount of silence (in ms) to retain at the beginning and end of each chunk.
-
-    Returns:
-        list: List of AudioSegment chunks.
     """
+    status_placeholder.info("Splitting audio on silence...")
     audio = AudioSegment.from_file(audio_file_path)
     chunks = split_on_silence(
         audio,
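
Note on the caching above: Streamlit re-executes the entire script on every widget interaction, so without @st.cache_resource the Whisper weights would be reloaded on each rerun. A minimal sketch of the pattern (load_expensive_resource and the sleep are hypothetical stand-ins for the real model load, not part of this commit):

import time
import streamlit as st

@st.cache_resource
def load_expensive_resource():
    time.sleep(5)  # stands in for WhisperForConditionalGeneration.from_pretrained(...)
    return object()

t0 = time.time()
resource = load_expensive_resource()
st.write(f"Load took {time.time() - t0:.2f}s (near zero after the first rerun)")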
@@ -46,72 +47,66 @@ def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=
         silence_thresh=silence_thresh,
         keep_silence=keep_silence
     )
+    status_placeholder.info(f"Audio split into {len(chunks)} chunks.")
     return chunks
 
 def transcribe(audio_file):
     """
     Transcribe an audio file using the locally loaded Whisper model from Hugging Face.
-
-    Args:
-        audio_file (str): Path to the audio file.
-
-    Returns:
-        str: Transcribed text.
+    This uses librosa to load and resample the audio as required.
     """
-    # Load audio using librosa, resampling to 16000 Hz as required by Whisper
+    # Load audio with librosa at 16kHz (as required by Whisper)
     speech, sr = librosa.load(audio_file, sr=16000)
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
-    # Generate transcription
     predicted_ids = model.generate(input_features)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
+def transcribe_chunk(chunk, index, min_length_ms=100):
+    """
+    Transcribe an individual audio chunk.
+    """
+    if len(chunk) < min_length_ms:
+        st.warning(f"Chunk {index} is too short to be processed.")
+        return (index, "")
+    # Save chunk temporarily as a WAV file
+    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+        chunk.export(temp_audio_file.name, format="wav")
+        temp_audio_file_path = temp_audio_file.name
+    status_placeholder.info(f"Transcribing chunk {index}...")
+    transcription = transcribe(temp_audio_file_path)
+    os.remove(temp_audio_file_path)
+    st.write(f"Transcription for chunk {index}: {transcription}")
+    return (index, transcription)
+
 def process_audio_chunks(audio_chunks):
     """
-    Process and transcribe each audio chunk.
-
-    Args:
-        audio_chunks (list): List of AudioSegment chunks.
-
-    Returns:
-        str: Combined transcription from all chunks.
+    Process and transcribe each audio chunk in sequence.
+    Reports the total time taken.
     """
     transcriptions = []
-    min_length_ms = 100  # Minimum length required (0.1 seconds)
-
+    min_length_ms = 100  # minimum duration for processing
+    start_transcription = time.time()
     for i, chunk in enumerate(audio_chunks):
-        if len(chunk) < min_length_ms:
-            st.warning(f"Chunk {i} is too short to be processed.")
-            continue
-
-        with NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
-            chunk.export(temp_audio_file.name, format="wav")
-            temp_audio_file_path = temp_audio_file.name
-
-        transcription = transcribe(temp_audio_file_path)
-        if transcription:
-            transcriptions.append(transcription)
-            st.write(f"Transcription for chunk {i}: {transcription}")
-
-        os.remove(temp_audio_file_path)
-    return " ".join(transcriptions)
+        index, text = transcribe_chunk(chunk, i, min_length_ms)
+        transcriptions.append((index, text))
+    transcriptions.sort(key=lambda x: x[0])
+    total_time = time.time() - start_transcription
+    status_placeholder.info(f"All chunks transcribed in {total_time:.2f} seconds.")
+    combined = " ".join([text for idx, text in transcriptions])
+    return combined
 
 def save_transcription_to_docx(transcription, audio_file_path):
     """
     Save the transcription as a .docx file.
-
-    Args:
-        transcription (str): Transcribed text.
-        audio_file_path (str): Path to the original audio file for naming purposes.
-
-    Returns:
-        str: Path to the saved .docx file.
     """
     base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
     output_file_name = f"{base_name}_full_transcription.docx"
+    status_placeholder.info("Saving transcription to DOCX...")
     doc = Document()
     doc.add_paragraph(transcription)
     doc.save(output_file_name)
+    status_placeholder.info("Transcription saved as DOCX.")
     return output_file_name
 
 st.title("Audio Transcription with Whisper (Local via Hugging Face)")
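
Because transcribe_chunk() returns (index, text) tuples and process_audio_chunks() sorts by index before joining, the sequential loop could later be swapped for a thread pool without scrambling chunk order. A hedged sketch of that variant (process_audio_chunks_parallel is hypothetical, not part of this commit; note that st.warning/st.write calls from worker threads run outside Streamlit's script-run context, so their output may be dropped):

from concurrent.futures import ThreadPoolExecutor

def process_audio_chunks_parallel(audio_chunks, max_workers=4):
    # Fan chunks out to worker threads; each call returns (index, text).
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        results = list(pool.map(
            lambda pair: transcribe_chunk(pair[1], pair[0]),
            enumerate(audio_chunks),
        ))
    results.sort(key=lambda x: x[0])  # restore original chunk order
    return " ".join(text for _, text in results if text)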
@@ -131,21 +126,17 @@ if uploaded_file is not None and st.session_state.transcription is None:
     with open(temp_audio_file, "wb") as f:
         f.write(uploaded_file.getbuffer())
 
-    # Split and process audio using silence detection
-    with st.spinner('Transcribing...'):
-        audio_chunks = split_audio_on_silence(
-            temp_audio_file,
-            min_silence_len=500,
-            silence_thresh=-40,
-            keep_silence=250
-        )
+    processing_start = time.time()
+    with st.spinner('Processing audio...'):
+        audio_chunks = split_audio_on_silence(temp_audio_file)
         transcription = process_audio_chunks(audio_chunks)
         if transcription:
            st.session_state.transcription = transcription
            st.success('Transcription complete!')
            output_docx_file = save_transcription_to_docx(transcription, uploaded_file.name)
            st.session_state.output_docx_file = output_docx_file
-
+        processing_duration = time.time() - processing_start
+        status_placeholder.info(f"Total processing time: {processing_duration:.2f} seconds.")
     if os.path.exists(temp_audio_file):
         os.remove(temp_audio_file)
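
The upload handler now calls split_audio_on_silence(temp_audio_file) with its defaults (min_silence_len=500, silence_thresh=-40, keep_silence=250). A fixed -40 dBFS threshold can misfire on unusually quiet or noisy recordings; a common pydub idiom (an assumption, not something this commit does) keys the threshold to the clip's own average loudness instead:

from pydub import AudioSegment
from pydub.silence import split_on_silence

audio = AudioSegment.from_file("sample.wav")  # hypothetical input file
chunks = split_on_silence(
    audio,
    min_silence_len=500,             # a pause must last at least 500 ms to split
    silence_thresh=audio.dBFS - 16,  # "silence" = 16 dB below the clip's average level
    keep_silence=250,                # keep 250 ms of padding around each chunk
)
print(f"split into {len(chunks)} chunks")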
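
For completeness, the naming convention in save_transcription_to_docx() can be checked with a quick python-docx round trip (file names here are hypothetical):

import os
from docx import Document

base_name = os.path.splitext(os.path.basename("meeting.wav"))[0]
output_file_name = f"{base_name}_full_transcription.docx"  # -> meeting_full_transcription.docx

doc = Document()
doc.add_paragraph("transcribed text goes here")
doc.save(output_file_name)

print(Document(output_file_name).paragraphs[0].text)  # -> transcribed text goes here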