oceansweep commited on
Commit
341be53
·
verified ·
1 Parent(s): 83c8d2b

Upload Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,247 +1,254 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- import configparser
27
- # DEBUG Imports
28
- #from memory_profiler import profile
29
- #import pyaudio
30
- # Import Local
31
- #
32
- #######################################################################################################################
33
- # Function Definitions
34
- #
35
-
36
- # Convert video .m4a into .wav using ffmpeg
37
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
38
- # https://www.gyan.dev/ffmpeg/builds/
39
- #
40
-
41
-
42
- whisper_model_instance = None
43
- # Retrieve processing choice from the configuration file
44
- config = configparser.ConfigParser()
45
- config.read('config.txt')
46
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
47
-
48
-
49
- # FIXME: This is a temporary solution.
50
- # This doesn't clear older models, which means potentially a lot of memory is being used...
51
- def get_whisper_model(model_name, device):
52
- global whisper_model_instance
53
- if whisper_model_instance is None:
54
- from faster_whisper import WhisperModel
55
- logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
56
- whisper_model_instance = WhisperModel(model_name, device=device)
57
- return whisper_model_instance
58
-
59
-
60
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
61
- #DEBUG
62
- #@profile
63
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
64
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
65
-
66
- if os.path.exists(out_path) and not overwrite:
67
- print(f"File '{out_path}' already exists. Skipping conversion.")
68
- logging.info(f"Skipping conversion as file already exists: {out_path}")
69
- return out_path
70
- print("Starting conversion process of .m4a to .WAV")
71
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
72
-
73
- try:
74
- if os.name == "nt":
75
- logging.debug("ffmpeg being ran on windows")
76
-
77
- if sys.platform.startswith('win'):
78
- ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
79
- logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
80
- else:
81
- ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
82
-
83
- command = [
84
- ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
85
- "-ss", "00:00:00", # Start at the beginning of the video
86
- "-i", video_file_path,
87
- "-ar", "16000", # Audio sample rate
88
- "-ac", "1", # Number of audio channels
89
- "-c:a", "pcm_s16le", # Audio codec
90
- out_path
91
- ]
92
- try:
93
- # Redirect stdin from null device to prevent ffmpeg from waiting for input
94
- with open(os.devnull, 'rb') as null_file:
95
- result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
96
- if result.returncode == 0:
97
- logging.info("FFmpeg executed successfully")
98
- logging.debug("FFmpeg output: %s", result.stdout)
99
- else:
100
- logging.error("Error in running FFmpeg")
101
- logging.error("FFmpeg stderr: %s", result.stderr)
102
- raise RuntimeError(f"FFmpeg error: {result.stderr}")
103
- except Exception as e:
104
- logging.error("Error occurred - ffmpeg doesn't like windows")
105
- raise RuntimeError("ffmpeg failed")
106
- elif os.name == "posix":
107
- os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
108
- else:
109
- raise RuntimeError("Unsupported operating system")
110
- logging.info("Conversion to WAV completed: %s", out_path)
111
- except subprocess.CalledProcessError as e:
112
- logging.error("Error executing FFmpeg command: %s", str(e))
113
- raise RuntimeError("Error converting video file to WAV")
114
- except Exception as e:
115
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
116
- return {"error": str(e)}
117
- gc.collect()
118
- return out_path
119
-
120
-
121
- # Transcribe .wav into .segments.json
122
- #DEBUG
123
- #@profile
124
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
125
- global whisper_model_instance, processing_choice
126
- logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
127
-
128
- time_start = time.time()
129
- if audio_file_path is None:
130
- raise ValueError("speech-to-text: No audio file provided")
131
- logging.info("speech-to-text: Audio file path: %s", audio_file_path)
132
-
133
- try:
134
- _, file_ending = os.path.splitext(audio_file_path)
135
- out_file = audio_file_path.replace(file_ending, ".segments.json")
136
- prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
137
- if os.path.exists(out_file):
138
- logging.info("speech-to-text: Segments file already exists: %s", out_file)
139
- with open(out_file) as f:
140
- global segments
141
- segments = json.load(f)
142
- return segments
143
-
144
- logging.info('speech-to-text: Starting transcription...')
145
- options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
146
- transcribe_options = dict(task="transcribe", **options)
147
- # use function and config at top of file
148
- whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
149
- segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
150
-
151
- segments = []
152
- for segment_chunk in segments_raw:
153
- chunk = {
154
- "Time_Start": segment_chunk.start,
155
- "Time_End": segment_chunk.end,
156
- "Text": segment_chunk.text
157
- }
158
- logging.debug("Segment: %s", chunk)
159
- segments.append(chunk)
160
-
161
- if segments:
162
- segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
163
-
164
- if not segments:
165
- raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
166
- logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
167
-
168
- # Save the segments to a JSON file - prettified and non-prettified
169
- # FIXME so this is an optional flag to save either the prettified json file or the normal one
170
- save_json = True
171
- if save_json:
172
- logging.info("speech-to-text: Saving segments to JSON file")
173
- output_data = {'segments': segments}
174
-
175
- logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
176
- with open(prettified_out_file, 'w') as f:
177
- json.dump(output_data, f, indent=2)
178
-
179
- logging.info("speech-to-text: Saving JSON to %s", out_file)
180
- with open(out_file, 'w') as f:
181
- json.dump(output_data, f)
182
-
183
- logging.debug(f"speech-to-text: returning {segments[:500]}")
184
- gc.collect()
185
- return segments
186
-
187
- except Exception as e:
188
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
189
- raise RuntimeError("speech-to-text: Error transcribing audio")
190
-
191
-
192
- #def record_audio(duration, sample_rate=16000, chunk_size=1024):
193
- # p = pyaudio.PyAudio()
194
- # stream = p.open(format=pyaudio.paInt16,
195
- # channels=1,
196
- # rate=sample_rate,
197
- # input=True,
198
- # frames_per_buffer=chunk_size)
199
-
200
- # print("Recording...")
201
- # frames = []
202
- # stop_recording = threading.Event()
203
- # audio_queue = queue.Queue()
204
-
205
- def audio_callback():
206
- for _ in range(0, int(sample_rate / chunk_size * duration)):
207
- if stop_recording.is_set():
208
- break
209
- data = stream.read(chunk_size)
210
- audio_queue.put(data)
211
-
212
- audio_thread = threading.Thread(target=audio_callback)
213
- audio_thread.start()
214
-
215
- return p, stream, audio_queue, stop_recording, audio_thread
216
-
217
-
218
- def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
219
- stop_recording_event.set()
220
- audio_thread.join()
221
-
222
- frames = []
223
- while not audio_queue.empty():
224
- frames.append(audio_queue.get())
225
-
226
- print("Recording finished.")
227
-
228
- stream.stop_stream()
229
- stream.close()
230
- p.terminate()
231
-
232
- return b''.join(frames)
233
-
234
- def save_audio_temp(audio_data, sample_rate=16000):
235
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
236
- import wave
237
- wf = wave.open(temp_file.name, 'wb')
238
- wf.setnchannels(1)
239
- wf.setsampwidth(2)
240
- wf.setframerate(sample_rate)
241
- wf.writeframes(audio_data)
242
- wf.close()
243
- return temp_file.name
244
-
245
- #
246
- #
 
 
 
 
 
 
 
247
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ import configparser
27
+ # DEBUG Imports
28
+ #from memory_profiler import profile
29
+ import pyaudio
30
+
31
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
32
+
33
+ # Import Local
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
+ # Convert video .m4a into .wav using ffmpeg
40
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
+ # https://www.gyan.dev/ffmpeg/builds/
42
+ #
43
+
44
+
45
# Shared, lazily-initialized faster_whisper model (populated by get_whisper_model()).
whisper_model_instance = None
# Load the application config and pick the processing device; falls back to 'cpu'
# when the 'Processing' section has no explicit choice.
config = load_comprehensive_config()
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
+
49
+
50
# FIXME: This is a temporary solution.
# Only one model is kept resident; replacing it drops the reference so the old
# model can be garbage-collected, but peak memory during a swap is still high.
def get_whisper_model(model_name, device):
    """Return the shared faster_whisper WhisperModel, (re)loading it on demand.

    Fix: the previous check was only ``is None``, so asking for a *different*
    model or device silently returned the previously loaded instance. The cache
    is now keyed on (model_name, device) and reloads when the request changes.

    Args:
        model_name: WhisperModel size/name, e.g. "medium.en".
        device: Compute device string (the configured processing_choice).

    Returns:
        The cached WhisperModel matching (model_name, device).
    """
    global whisper_model_instance
    requested = (model_name, device)
    # Reload when nothing is cached yet or a different model/device is requested.
    if whisper_model_instance is None or getattr(get_whisper_model, "_loaded_key", None) != requested:
        from faster_whisper import WhisperModel
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        get_whisper_model._loaded_key = requested
    return whisper_model_instance
59
+
60
+
61
# Equivalent manual command:
# ffmpeg -ss 00:00:00 -i "input" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
#DEBUG
#@profile
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz mono 16-bit PCM .wav next to the input.

    Args:
        video_file_path: Path to the source media file.
        offset: Unused; kept for interface compatibility with existing callers.
        overwrite: When False, an existing .wav at the target path is reused
            and no conversion runs.

    Returns:
        The output .wav path on success, or {"error": str} on failure (the
        error-dict return is preserved for backward compatibility).
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")
    # Fix: removed a duplicate out_path assignment that re-ran splitext here.

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")

        if sys.platform.startswith('win'):
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"  # bundled binary shipped with the app
            logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
        elif os.name == "posix":
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
        else:
            raise RuntimeError("Unsupported operating system")

        command = [
            ffmpeg_cmd,
            "-ss", "00:00:00",    # Start at the beginning of the input
            "-i", video_file_path,
            "-ar", "16000",       # Audio sample rate
            "-ac", "1",           # Number of audio channels
            "-c:a", "pcm_s16le",  # Audio codec
            out_path
        ]
        # Fix: POSIX previously went through os.system() with an interpolated
        # shell string (quoting/injection-prone, exit status ignored). Both
        # platforms now use the same argv-list subprocess call, with stdin
        # redirected to the null device so ffmpeg never blocks on input.
        with open(os.devnull, 'rb') as null_file:
            result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
        if result.returncode == 0:
            logging.info("FFmpeg executed successfully")
            logging.debug("FFmpeg output: %s", result.stdout)
        else:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("Conversion to WAV completed: %s", out_path)
    except subprocess.CalledProcessError as e:
        logging.error("Error executing FFmpeg command: %s", str(e))
        raise RuntimeError("Error converting video file to WAV") from e
    except Exception as e:
        # Fix: the old message here said "speech-to-text: Error transcribing
        # audio" — a copy-paste from another function.
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
120
+
121
+
122
# Transcribe .wav into .segments.json
#DEBUG
#@profile
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file into a list of timed segment dicts.

    Uses the shared faster_whisper model from get_whisper_model(). Results are
    cached next to the audio file as "<stem>.segments.json" (compact) and
    "<stem>.segments_pretty.json" (indented); when the compact cache exists it
    is loaded and returned instead of re-transcribing.

    Args:
        audio_file_path: Path to the audio file to transcribe.
        selected_source_lang: Source language code handed to the model.
        whisper_model: faster_whisper model size/name to load.
        vad_filter: Enable voice-activity-detection filtering.
        diarize: Accepted for interface compatibility; not used here.

    Returns:
        List of {"Time_Start", "Time_End", "Text"} dicts.

    Raises:
        ValueError: If audio_file_path is None.
        RuntimeError: If transcription fails for any other reason.
    """
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        _, file_ending = os.path.splitext(audio_file_path)
        out_file = audio_file_path.replace(file_ending, ".segments.json")
        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                cached = json.load(f)
            # Fix: this file is written below as {"segments": [...]}, but the
            # old code returned json.load() raw, so cache hits came back as a
            # dict while fresh runs returned the list. Accept both shapes
            # (older files may hold a bare list) and always return the list.
            # Also dropped the needless `global segments` here.
            if isinstance(cached, dict) and 'segments' in cached:
                return cached['segments']
            return cached

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # Reuse (or lazily create) the module-wide model on the configured device.
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            # Print to verify its working
            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

            # Log it as well.
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
        # Tag the first segment with the model used, for provenance in saved files.
        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w') as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        # Chain the original exception so the root cause isn't lost.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
197
+
198
+
199
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Start recording mono 16-bit audio from the default input device.

    Recording runs on a background thread that pushes raw chunks into a queue;
    callers stop it and collect the bytes via the module-level stop_recording()
    helper, passing back everything returned here.

    Args:
        duration: Maximum recording length in seconds.
        sample_rate: Samples per second.
        chunk_size: Frames read from the stream per iteration.

    Returns:
        Tuple (pyaudio_instance, stream, audio_queue, stop_event, audio_thread).
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size)

    print("Recording...")
    # Fix: removed an unused local `frames = []` — chunks travel via the queue.
    # NOTE: this local Event shadows the module-level stop_recording() function
    # inside this scope; it is returned so the caller can signal the thread.
    stop_recording = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        # Read until the requested duration elapses or the caller signals stop.
        for _ in range(0, int(sample_rate / chunk_size * duration)):
            if stop_recording.is_set():
                break
            data = stream.read(chunk_size)
            audio_queue.put(data)

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_recording, audio_thread
223
+
224
+
225
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop the recorder thread, drain captured audio, and release resources.

    Args:
        p: The pyaudio.PyAudio instance returned by record_audio().
        stream: The open input stream to stop and close.
        audio_queue: Queue holding the raw chunks captured so far.
        stop_recording_event: Event used to tell the recorder thread to exit.
        audio_thread: The background thread performing the reads.

    Returns:
        All captured chunks joined into a single bytes object.
    """
    # Ask the worker to finish, then wait for it to exit.
    stop_recording_event.set()
    audio_thread.join()

    # Drain everything the worker queued before it stopped.
    chunks = []
    try:
        while True:
            chunks.append(audio_queue.get_nowait())
    except queue.Empty:
        pass

    print("Recording finished.")

    # Tear down PortAudio objects: stream first, then the instance.
    stream.stop_stream()
    stream.close()
    p.terminate()

    return b''.join(chunks)
240
+
241
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw 16-bit mono PCM bytes to a temporary .wav file.

    Args:
        audio_data: Raw little-endian 16-bit PCM frames.
        sample_rate: Sample rate recorded in the WAV header.

    Returns:
        Path of the temporary file. It is NOT auto-deleted (delete=False);
        the caller is responsible for removing it.
    """
    import wave  # local import kept: only this helper needs it

    # Fix: the old code called wave.open() on the path while the
    # NamedTemporaryFile handle was still open, which fails on Windows
    # (the OS forbids a second open of an O_TEMPORARY file). Reserve the
    # name first, close the handle, then write.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name
    with wave.open(temp_path, 'wb') as wf:
        wf.setnchannels(1)   # mono
        wf.setsampwidth(2)   # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)
    return temp_path
251
+
252
+ #
253
+ #
254
  #######################################################################################################################