"""Flask service that transcribes two audio clips and scores their similarity.

The ``/transcribe`` endpoint accepts two uploaded audio files, transcribes
each with a pretrained Wav2Vec2 CTC model, and returns both transcriptions
together with a normalized Levenshtein similarity score.
"""
import logging
import os
from io import BytesIO
from typing import Optional

# HF_HOME has to be set *before* the Hugging Face libraries are imported:
# they resolve their cache directory from the environment at import time,
# so assigning it afterwards has no effect on where models are cached.
os.environ['HF_HOME'] = '/tmp/.cache'

import Levenshtein  # noqa: E402
import librosa  # noqa: E402
import torch  # noqa: E402
from flask import Flask, jsonify, render_template, request  # noqa: E402
from flask_cors import CORS  # noqa: E402
from pydub import AudioSegment  # noqa: E402
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor  # noqa: E402

logger = logging.getLogger(__name__)

app = Flask(__name__)
CORS(app)

# Hugging Face model identifier; processor and model are loaded once at
# import time and shared by every request.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Sampling rate the audio is resampled to before it is fed to the processor.
TARGET_SAMPLE_RATE = 16000


def convert_to_wav(audio_bytes: bytes) -> Optional[BytesIO]:
    """Convert raw audio bytes to an in-memory WAV file using pydub.

    Args:
        audio_bytes: Encoded audio in any container pydub can auto-detect.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the WAV data, or
        ``None`` if the input could not be decoded or exported.
    """
    try:
        # No explicit format is given, so pydub auto-detects the container.
        audio = AudioSegment.from_file(BytesIO(audio_bytes))
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
    except Exception:
        # Deliberately best-effort: the caller decides how to report the
        # failure. Log with traceback instead of printing so the cause is
        # not lost.
        logger.exception("Error converting audio")
        return None
    wav_io.seek(0)
    return wav_io


def transcribe_audio_hf(audio_bytes: bytes) -> str:
    """Transcribe audio bytes with the pretrained Wav2Vec2 model.

    Args:
        audio_bytes: Encoded audio as uploaded by the client.

    Returns:
        The decoded transcription with surrounding whitespace stripped.

    Raises:
        ValueError: If the audio cannot be converted to WAV or contains
            no samples.
    """
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise ValueError("Could not convert audio to WAV format")

    speech_array, sampling_rate = librosa.load(wav_io, sr=TARGET_SAMPLE_RATE)
    if speech_array.size == 0:
        # Fail with a clear message rather than an opaque model error.
        raise ValueError("Audio contains no samples")

    input_values = processor(
        speech_array,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    ).input_values

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring token id for every frame, then let the
    # processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0].strip()


def levenshtein_similarity(transcription1: str, transcription2: str) -> float:
    """Return a similarity score in ``[0, 1]`` based on edit distance.

    The score is ``1 - distance / max(len(a), len(b))``. Two empty strings
    are treated as identical and score ``1.0`` (the unguarded formula would
    divide by zero).
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len


@app.route('/')
def index():
    """Serve the front-end page."""
    return render_template('index.html')


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe the two uploaded clips and report their similarity.

    Expects multipart form fields ``original_audio`` and ``user_audio``.

    Returns:
        JSON with ``transcription_original``, ``transcription_user`` and
        ``similarity_score`` on success; a JSON ``error`` body with status
        400 when a file field is missing, or 500 when transcription fails.
    """
    uploads = {
        field: request.files.get(field)
        for field in ('original_audio', 'user_audio')
    }
    missing = [field for field, upload in uploads.items() if upload is None]
    if missing:
        # Same 400 status Flask would produce for a missing key, but with a
        # JSON body consistent with the other error responses.
        return jsonify({
            "error": f"Missing required file field(s): {', '.join(missing)}"
        }), 400

    original_audio_bytes = uploads['original_audio'].read()
    user_audio_bytes = uploads['user_audio'].read()

    try:
        transcription_original = transcribe_audio_hf(original_audio_bytes)
        transcription_user = transcribe_audio_hf(user_audio_bytes)
    except Exception as e:
        # Request boundary: log the traceback and report the message.
        logger.exception("Transcription failed")
        return jsonify({"error": str(e)}), 500

    similarity_score = levenshtein_similarity(
        transcription_original, transcription_user
    )

    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score,
    })


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows code execution from the browser. Keep this for local
    # development only; do not expose it on a public interface.
    app.run(debug=True)