# NOTE: the following is page-header text captured together with the file; it is not code.
# Spaces: Building Building
import os

# Set the Hugging Face cache directory to avoid permission issues.
# This must happen BEFORE `transformers` is imported: the Hugging Face
# libraries resolve HF_HOME when they are first imported, so assigning it
# afterwards does not change where models are cached.
os.environ['HF_HOME'] = '/tmp/.cache'

from io import BytesIO  # noqa: E402

import Levenshtein  # noqa: E402
from flask import Flask, request, jsonify, render_template  # noqa: E402
from flask_cors import CORS  # noqa: E402
from pydub import AudioSegment  # noqa: E402
from transformers import pipeline  # noqa: E402

# Set the FFmpeg paths explicitly so pydub does not depend on PATH lookup.
AudioSegment.converter = "/usr/bin/ffmpeg"
AudioSegment.ffprobe = "/usr/bin/ffprobe"

app = Flask(__name__)
CORS(app)

# Use the Hugging Face ASR pipeline for automatic speech recognition.
# The pipeline is built once at import time and shared by all requests.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
)
def convert_to_wav(audio_bytes):
    """Convert audio bytes to WAV format using pydub.

    Args:
        audio_bytes: Raw contents of an audio file; pydub auto-detects the
            container/codec.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the WAV-encoded
        audio, or ``None`` if the conversion failed for any reason (the
        error is printed rather than raised).
    """
    try:
        audio = AudioSegment.from_file(BytesIO(audio_bytes))  # Auto-detect format
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        # Rewind so callers can read the WAV data from the beginning.
        wav_io.seek(0)
        return wav_io
    except Exception as e:
        # Best-effort conversion: report the failure and let the caller
        # decide how to react to the ``None`` result.
        print(f"Error converting audio: {e}")
        return None
def transcribe_audio(audio_bytes):
    """Transcribe audio using the Hugging Face ASR pipeline.

    Args:
        audio_bytes: Raw contents of an audio file in any format that
            ``convert_to_wav`` can decode.

    Returns:
        The recognized text with leading/trailing whitespace removed.

    Raises:
        ValueError: If the audio could not be converted to WAV.
    """
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise ValueError("Could not convert audio to WAV format")
    # The ASR pipeline accepts raw file bytes (not a file-like object), so
    # hand it the full WAV payload instead of the BytesIO wrapper.
    wav_bytes = wav_io.getvalue()
    transcription = asr_pipeline(wav_bytes)["text"]
    return transcription.strip()
def levenshtein_similarity(transcription1, transcription2):
    """Return a normalized Levenshtein similarity between two strings.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0.

    Args:
        transcription1: First string to compare.
        transcription2: Second string to compare.

    Returns:
        ``1 - distance / max(len(transcription1), len(transcription2))``,
        or ``1.0`` when both strings are empty.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Two empty strings are identical; also avoids dividing by zero.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len
# NOTE(review): the route decorator was absent from the captured source, which
# left this view unregistered; '/' is assumed for the landing page - confirm.
@app.route('/')
def index():
    """Render the ``index.html`` template."""
    return render_template('index.html')
# NOTE(review): the route decorator was absent from the captured source, which
# left this view unregistered; the '/transcribe' path and POST method are
# assumed - confirm against the URL used by index.html.
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe two uploaded recordings and score how similar they are.

    Reads the ``original_audio`` and ``user_audio`` file fields from the
    request, transcribes each one, and compares the two transcriptions.

    Returns:
        A JSON response with ``transcription_original``,
        ``transcription_user`` and ``similarity_score``; a JSON ``error``
        with status 400 when an upload field is missing, or with status 500
        when transcription fails.
    """
    missing = [
        field
        for field in ('original_audio', 'user_audio')
        if field not in request.files
    ]
    if missing:
        # Answer in JSON like the rest of this endpoint instead of relying on
        # the framework's default error page for a missing form key.
        return jsonify({"error": f"Missing file field(s): {', '.join(missing)}"}), 400

    original_audio_bytes = request.files['original_audio'].read()
    user_audio_bytes = request.files['user_audio'].read()

    try:
        transcription_original = transcribe_audio(original_audio_bytes)
        transcription_user = transcribe_audio(user_audio_bytes)
    except Exception as e:
        # Request boundary: report any conversion/ASR failure to the client.
        return jsonify({"error": str(e)}), 500

    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score
    })
if __name__ == '__main__':
    # Serve on all interfaces at port 7860 with the debugger disabled.
    # NOTE(review): 7860 is presumably the port the hosting platform expects - confirm.
    app.run(debug=False, port=7860, host='0.0.0.0')