Spaces:
Sleeping
Sleeping
File size: 5,252 Bytes
5543320 763f8fd 5543320 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os
import tempfile
import zipfile
from datetime import datetime, timezone

import firebase_admin
import gradio as gr
import librosa
import torch
from firebase_admin import credentials, firestore
from transformers import Wav2Vec2Processor, AutoModelForCTC
# --- Firebase initialisation -------------------------------------------------
# Service-account key is expected next to this script; Firestore persists the
# human-corrected transcriptions (see store_correction below).
cred = credentials.Certificate('firebase_credentials.json')  # Firebase service-account JSON key
firebase_admin.initialize_app(cred)
db = firestore.client()
# --- ASR model ---------------------------------------------------------------
# wav2vec2 XLSR-53 fine-tuned for the Amis language; loaded once at startup
# so every request reuses the same processor/model pair.
MODEL_NAME = "eleferrand/xlsr53_Amis"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = AutoModelForCTC.from_pretrained(MODEL_NAME)
def transcribe(audio_file):
    """Run the Amis wav2vec2 ASR model on a single audio file.

    Parameters
    ----------
    audio_file : str
        Path to any audio file librosa can read.

    Returns
    -------
    str
        Decoded transcription with "[UNK]" tokens stripped, or an
        error-message string if loading/inference fails.
    """
    try:
        # Resample to the 16 kHz rate the model was trained on.
        waveform, _ = librosa.load(audio_file, sr=16000)
        features = processor(waveform, sampling_rate=16000, return_tensors="pt")
        # Inference only — no gradients needed.
        with torch.no_grad():
            logits = model(features.input_values).logits
        token_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(token_ids)[0]
        return text.replace("[UNK]", "")
    except Exception as e:
        # Surface the failure as text so the UI shows something useful.
        return f"Error processing file: {e}"
def transcribe_both(audio_file):
    """Transcribe once and mirror the result into two UI fields.

    Returns the same text twice: once for the read-only "original" box
    and once to pre-fill the editable "corrected" box.
    """
    text = transcribe(audio_file)
    return text, text
def store_correction(original_transcription, corrected_transcription):
    """Persist an original/corrected transcription pair to Firestore.

    Parameters
    ----------
    original_transcription : str
        Raw model output shown in the read-only textbox.
    corrected_transcription : str
        Human-edited version of that output.

    Returns
    -------
    str
        User-facing status message (success or error).
    """
    try:
        correction_data = {
            'original_text': original_transcription,
            'corrected_text': corrected_transcription,
            # Timezone-aware UTC timestamp: a naive datetime.now() records
            # ambiguous local time, which breaks ordering/comparison once
            # records come from machines in different timezones.
            'timestamp': datetime.now(timezone.utc).isoformat(),
        }
        db.collection('transcription_corrections').add(correction_data)
        return "✅ Correction saved successfully!"
    except Exception as e:
        return f"⚠️ Error saving correction: {e}"
def prepare_download(audio_file, original_transcription, corrected_transcription):
    """Bundle the audio file and both transcriptions into a downloadable ZIP.

    Parameters
    ----------
    audio_file : str | None
        Path to the uploaded/recorded audio (Gradio "filepath" value).
    original_transcription, corrected_transcription : str | None
        Texts to include; None is written as an empty file instead of crashing.

    Returns
    -------
    str | None
        Path to "results.zip" inside a fresh temporary directory, or
        None when no audio file was provided.
    """
    if audio_file is None:
        return None
    # A unique temp directory per call prevents concurrent users (or rapid
    # repeat clicks) from clobbering a shared "results.zip" in the CWD.
    zip_path = os.path.join(tempfile.mkdtemp(prefix="asr_results_"), "results.zip")
    with zipfile.ZipFile(zip_path, "w") as zf:
        if os.path.exists(audio_file):
            zf.write(audio_file, arcname="audio.wav")
        else:
            print("Audio file not found:", audio_file)
        # writestr writes the text straight into the archive — no on-disk
        # temp files to create/delete, and nothing stale left behind on error.
        zf.writestr("original_transcription.txt", original_transcription or "")
        zf.writestr("corrected_transcription.txt", corrected_transcription or "")
    return zip_path
# ---------------------------------------------------------------------------
# Gradio UI: record/upload -> transcribe -> edit -> save to Firestore / ZIP.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# ASR Demo with Editable Transcription, Firestore Storage, and Download")

    with gr.Row():
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
        transcribe_button = gr.Button("Transcribe Audio")

    with gr.Row():
        # Read-only model output next to an editable copy for corrections.
        original_text = gr.Textbox(label="Original Transcription", interactive=False)
        corrected_text = gr.Textbox(label="Corrected Transcription", interactive=True)

    save_button = gr.Button("Save Correction to Database")
    save_status = gr.Textbox(label="Save Status", interactive=False)
    download_button = gr.Button("Download Results (ZIP)")
    download_output = gr.File(label="Download ZIP")

    # One transcription pass fills both text boxes.
    transcribe_button.click(fn=transcribe_both, inputs=audio_input,
                            outputs=[original_text, corrected_text])
    # Persist the human correction alongside the raw model output.
    save_button.click(fn=store_correction, inputs=[original_text, corrected_text],
                      outputs=save_status)
    # Package the audio plus both transcriptions for offline use.
    download_button.click(fn=prepare_download,
                          inputs=[audio_input, original_text, corrected_text],
                          outputs=download_output)

# Launch with a public share link.
demo.launch(share=True)