# NOTE: page-header residue captured with this file ("Spaces: Building") -- not part of the program.
from flask import Flask, request, jsonify, render_template | |
import librosa | |
import torch | |
import Levenshtein | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
from io import BytesIO | |
from flask_cors import CORS | |
from pydub import AudioSegment # NEW | |
# Identifier of the pretrained Wav2Vec2 checkpoint passed to ``from_pretrained``.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"

# Flask application object; CORS is applied to it with the extension's defaults.
app = Flask(__name__)
CORS(app)

# Load the processor and the CTC model once at import time so that every
# request reuses the same instances.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
def convert_to_wav(audio_bytes):
    """Re-encode arbitrary audio bytes as WAV using pydub.

    Args:
        audio_bytes: Raw bytes of an encoded audio file.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the WAV data, or
        ``None`` if any step of the conversion raised (the error is printed).
    """
    try:
        source = BytesIO(audio_bytes)
        # No explicit format is passed, so pydub detects it from the data.
        segment = AudioSegment.from_file(source)
        wav_buffer = BytesIO()
        segment.export(wav_buffer, format="wav")
        # Rewind so the caller reads from the start of the WAV data.
        wav_buffer.seek(0)
    except Exception as exc:
        print(f"Error converting audio: {exc}")
        return None
    return wav_buffer
def transcribe_audio_hf(audio_bytes):
    """Transcribe encoded audio bytes with the module-level Wav2Vec2 model.

    Args:
        audio_bytes: Raw bytes of an encoded audio file; they are first
            converted to WAV via ``convert_to_wav``.

    Returns:
        The first decoded transcription, with surrounding whitespace stripped.

    Raises:
        Exception: If ``convert_to_wav`` returns ``None``.
    """
    wav_buffer = convert_to_wav(audio_bytes)
    if wav_buffer is None:
        raise Exception("Could not convert audio to WAV format")

    # Load the waveform at a 16 kHz sampling rate.
    waveform, sample_rate = librosa.load(wav_buffer, sr=16000)
    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(features.input_values).logits

    # Pick the highest-scoring token id along the last axis, then decode.
    best_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(best_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1, transcription2):
    """Return a normalized similarity score between two transcriptions.

    The score is ``1 - distance / max_len``, where ``distance`` is the
    Levenshtein edit distance and ``max_len`` is the length of the longer
    string.

    Args:
        transcription1: First transcription text.
        transcription2: Second transcription text.

    Returns:
        A float between 0.0 and 1.0; 1.0 for identical strings, including
        the case where both strings are empty.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both strings are empty: they are identical, and dividing by
        # max_len below would raise ZeroDivisionError.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len
# NOTE(review): no route registration was visible for this view, which would
# leave it unreachable; "/" is assumed for the landing page -- confirm.
@app.route("/")
def index():
    """Render and return the ``index.html`` template."""
    return render_template("index.html")
# NOTE(review): no route registration was visible for this view, which would
# leave it unreachable; the "/transcribe" path is assumed -- confirm against
# the front end. POST is required because the view reads uploaded files.
@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Transcribe two uploaded recordings and score how similar they are.

    Expects the file fields ``original_audio`` and ``user_audio`` in the
    request.

    Returns:
        On success, a JSON object with ``transcription_original``,
        ``transcription_user`` and ``similarity_score``.
        A JSON ``{"error": ...}`` with status 400 if a file field is missing,
        or with status 500 if transcription or scoring fails.
    """
    uploads = {
        "original_audio": request.files.get("original_audio"),
        "user_audio": request.files.get("user_audio"),
    }
    missing = [field for field, upload in uploads.items() if upload is None]
    if missing:
        # Keep the 400 status that a failed request.files[...] lookup would
        # produce, but answer in JSON like the other error path.
        return jsonify({"error": f"Missing file field(s): {', '.join(missing)}"}), 400

    original_audio_bytes = uploads["original_audio"].read()
    user_audio_bytes = uploads["user_audio"].read()

    try:
        transcription_original = transcribe_audio_hf(original_audio_bytes)
        transcription_user = transcribe_audio_hf(user_audio_bytes)
        # Scored inside the try so a scoring failure is also reported as JSON.
        similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score,
    })
# Start Flask's built-in server only when this file is executed directly.
# NOTE(review): debug=True is hard-coded; confirm this entry point is not the
# one used when the app is deployed.
if __name__ == "__main__":
    app.run(debug=True)