EwoutLagendijk
committed on
Commit
•
b9710dc
1
Parent(s):
a8d0349
Update app.py
Browse files
app.py
CHANGED
@@ -37,17 +37,36 @@ def format_timestamp(seconds: float, always_include_hours: bool = False, decimal
|
|
37 |
return seconds
|
38 |
|
39 |
|
40 |
-
def
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
|
53 |
demo = gr.Blocks()
|
|
|
37 |
return seconds
|
38 |
|
39 |
|
40 |
+
def transcribe_speech(filepath):
    """Transcribe an audio file by decoding it in consecutive 30-second windows.

    The file is loaded with ``librosa.load`` requesting a 16 kHz sample rate,
    sliced into back-to-back, non-overlapping windows, and every window is
    passed through ``processor`` and ``model`` (both expected to be defined
    elsewhere in this module). Decoding is forced to ``language="id"`` with
    ``task="transcribe"``.

    Args:
        filepath: Path of the audio file handed to ``librosa.load``.

    Returns:
        The decoded text of every window, joined with single spaces. When the
        loaded audio has no samples, no window is processed and the result is
        an empty string.
    """
    waveform, sample_rate = librosa.load(filepath, sr=16000)

    # Window length expressed in samples (30 seconds of audio).
    window_seconds = 30
    window_size = window_seconds * sample_rate

    def _decode_window(start):
        """Return the decoded text for the window beginning at sample ``start``."""
        window = waveform[start:start + window_size]

        # Turn the raw samples into the input features the model consumes.
        features = processor(
            audio=window,
            sampling_rate=16000,
            return_tensors="pt",
        ).input_features

        prompt_ids = processor.get_decoder_prompt_ids(language="id", task="transcribe")
        token_ids = model.generate(
            features,
            max_new_tokens=444,  # Max allowed by Whisper
            forced_decoder_ids=prompt_ids,
        )

        # A single window is decoded per call, so only the first entry is used.
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0]

    pieces = []
    for start in range(0, len(waveform), window_size):
        pieces.append(_decode_window(start))

    # Stitch the per-window texts back together into one string.
    return " ".join(pieces)
|
69 |
+
|
70 |
|
71 |
|
72 |
demo = gr.Blocks()
|