muzammil-eds's picture
Files added
1bead67
raw
history blame
2.54 kB
import os

# Set Hugging Face cache directory to avoid permission issues.
# This must happen BEFORE `transformers` is imported: the Hugging Face
# libraries read HF_HOME when they are first imported, so assigning it
# afterwards leaves the default cache location in effect.
os.environ['HF_HOME'] = '/tmp/.cache'

from flask import Flask, request, jsonify, render_template  # noqa: E402
from transformers import pipeline  # noqa: E402
from flask_cors import CORS  # noqa: E402
from pydub import AudioSegment  # noqa: E402
from io import BytesIO  # noqa: E402
import Levenshtein  # noqa: E402

# Set the FFmpeg paths explicitly
AudioSegment.converter = "/usr/bin/ffmpeg"
AudioSegment.ffprobe = "/usr/bin/ffprobe"

app = Flask(__name__)
CORS(app)

# Use Hugging Face ASR pipeline for automatic speech recognition.
# The model is loaded once at import time and shared by every request.
asr_pipeline = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
def convert_to_wav(audio_bytes):
    """Re-encode raw audio bytes as WAV using pydub.

    The input format is not specified; pydub is left to detect it.

    Returns:
        An in-memory ``BytesIO`` rewound to the start of the WAV data, or
        ``None`` if anything goes wrong (the error is printed to stdout).
    """
    try:
        source = BytesIO(audio_bytes)
        segment = AudioSegment.from_file(source)  # Auto-detect format
        wav_buffer = BytesIO()
        segment.export(wav_buffer, format="wav")
        # Rewind so the caller reads from the first byte.
        wav_buffer.seek(0)
    except Exception as err:
        print(f"Error converting audio: {err}")
        return None
    return wav_buffer
def transcribe_audio(audio_bytes):
    """Transcribe an audio clip with the module-level ASR pipeline.

    The clip is first converted to WAV with ``convert_to_wav`` and the
    resulting WAV payload is handed to ``asr_pipeline``.

    Args:
        audio_bytes: Raw bytes of the uploaded audio.

    Returns:
        The recognised text with leading/trailing whitespace removed.

    Raises:
        Exception: If the audio could not be converted to WAV.
    """
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise Exception("Could not convert audio to WAV format")
    # The ASR pipeline accepts a path/URL string, raw bytes or a numpy
    # array -- not a file-like object -- so pass the WAV content as bytes.
    # getvalue() returns the whole buffer regardless of the read position.
    wav_bytes = wav_io.getvalue()
    transcription = asr_pipeline(wav_bytes)["text"]
    return transcription.strip()
def levenshtein_similarity(transcription1, transcription2):
    """Return a normalised Levenshtein similarity between two strings.

    The score is ``1 - distance / max(len(a), len(b))``: 1.0 means the
    inputs are identical and lower values mean more edits are needed.

    Args:
        transcription1: First string to compare.
        transcription2: Second string to compare.

    Returns:
        The similarity score as a float. Identical inputs -- including two
        empty strings -- score 1.0.
    """
    # Identical inputs have distance 0. Returning early also covers the case
    # of two empty strings, where the normalising length would be 0 and the
    # division below would raise ZeroDivisionError.
    if transcription1 == transcription2:
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    # At least one input is non-empty here, so max_len >= 1.
    max_len = max(len(transcription1), len(transcription2))
    return 1 - distance / max_len
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe two uploaded audio clips and score how similar they are.

    Expects a multipart POST with the file fields ``original_audio`` and
    ``user_audio``.

    Returns:
        On success, a JSON object with ``transcription_original``,
        ``transcription_user`` and ``similarity_score``.
        If either file field is missing, a JSON ``{"error": ...}`` body with
        status 400.
        If transcription fails, a JSON ``{"error": ...}`` body with
        status 500.
    """
    # Use .get() so a missing field yields a JSON error like the rest of the
    # endpoint, instead of Flask's default non-JSON 400 response.
    uploads = {
        'original_audio': request.files.get('original_audio'),
        'user_audio': request.files.get('user_audio'),
    }
    missing = [field for field, upload in uploads.items() if upload is None]
    if missing:
        return jsonify({"error": f"Missing file field(s): {', '.join(missing)}"}), 400

    original_audio_bytes = uploads['original_audio'].read()
    user_audio_bytes = uploads['user_audio'].read()

    try:
        transcription_original = transcribe_audio(original_audio_bytes)
        transcription_user = transcribe_audio(user_audio_bytes)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    similarity_score = levenshtein_similarity(transcription_original, transcription_user)
    return jsonify({
        "transcription_original": transcription_original,
        "transcription_user": transcription_user,
        "similarity_score": similarity_score
    })
if __name__ == '__main__':
app.run(debug=False, port=7860, host='0.0.0.0')