# Header residue from the hosting site's file viewer (not part of the program):
# muzammil-eds's picture
# Files added
# af71291
# raw
# history blame
# 2.6 kB
from flask import Flask, request, jsonify, render_template
import librosa
import torch
import Levenshtein
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from io import BytesIO
from flask_cors import CORS
from pydub import AudioSegment # NEW
# Flask application object; the route decorators further down register
# their handlers on it.
app = Flask(__name__)
# Enable cross-origin requests for the whole app (no options are passed, so
# the flask-cors defaults apply).
CORS(app)
# Hugging Face model identifier shared by the processor and the CTC model.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
# Both objects are created once, at import time, and reused by every call to
# transcribe_audio_hf.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
def convert_to_wav(audio_bytes):
    """Re-encode an in-memory audio payload as WAV using pydub.

    Args:
        audio_bytes: Raw bytes of the uploaded audio; the container format
            is left for pydub to detect.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the WAV data, or
        ``None`` when any step of the conversion raises (the error is
        printed, not propagated).
    """
    try:
        source = BytesIO(audio_bytes)
        segment = AudioSegment.from_file(source)  # format is auto-detected
        wav_buffer = BytesIO()
        segment.export(wav_buffer, format="wav")
        # Rewind so the caller reads from the start of the WAV data.
        wav_buffer.seek(0)
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error converting audio: {e}")
        return None
    else:
        return wav_buffer
def transcribe_audio_hf(audio_bytes):
    """Return the text the module-level Wav2Vec2 model predicts for the audio.

    Args:
        audio_bytes: Raw bytes of an audio file; they are converted to WAV
            by ``convert_to_wav`` before being loaded.

    Returns:
        The first decoded transcription with surrounding whitespace removed.

    Raises:
        Exception: If the payload could not be converted to WAV.
    """
    wav_buffer = convert_to_wav(audio_bytes)
    if wav_buffer is None:
        raise Exception("Could not convert audio to WAV format")

    # librosa is asked for 16 kHz audio and returns the rate it produced.
    waveform, rate = librosa.load(wav_buffer, sr=16000)
    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(features.input_values).logits

    # Pick the highest-scoring token at every position, then decode.
    token_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1: str, transcription2: str) -> float:
    """Return a normalized similarity score derived from the edit distance.

    The score is ``1 - distance / max(len(a), len(b))``: 1.0 for identical
    strings, 0.0 when the edit distance equals the longer length.

    Args:
        transcription1: First string to compare.
        transcription2: Second string to compare.

    Returns:
        The similarity score. Two empty strings are treated as identical
        and score 1.0.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Both inputs are empty (e.g. two silent recordings); without this
        # guard the normalization below divides by zero.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe two uploaded recordings and score how closely they match.

    Expects a multipart POST carrying the file fields ``original_audio``
    and ``user_audio``.

    Returns:
        On success, a JSON object with ``transcription_original``,
        ``transcription_user`` and ``similarity_score``.
        On failure, a JSON object ``{"error": ...}`` with status 400 when an
        upload field is missing, or 500 when transcription or scoring raises.
    """
    required_fields = ('original_audio', 'user_audio')
    missing = [field for field in required_fields if field not in request.files]
    if missing:
        # Report missing uploads in the same JSON error shape the rest of
        # this endpoint uses, rather than the framework's default response
        # for a failed request.files[...] lookup.
        return jsonify({"error": f"Missing file field(s): {', '.join(missing)}"}), 400

    original_audio_bytes = request.files['original_audio'].read()
    user_audio_bytes = request.files['user_audio'].read()

    try:
        transcription_original = transcribe_audio_hf(original_audio_bytes)
        transcription_user = transcribe_audio_hf(user_audio_bytes)
        # Scoring stays inside the guarded region so that a failure there is
        # also reported as a JSON error instead of an unhandled exception.
        similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    except Exception as e:
        # Top-level request boundary: surface any failure to the client.
        return jsonify({"error": str(e)}), 500

    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score,
    })
# Script entry point: start Flask's built-in server when run directly.
# NOTE(review): debug=True is meant for local development — confirm it is
# not enabled where the server is reachable by untrusted clients.
if __name__ == "__main__":
    app.run(debug=True)