muzammil-eds committed on
Commit
1bead67
·
1 Parent(s): bb0a58d

Files added

Browse files
Files changed (1) hide show
  1. app.py +17 -26
app.py CHANGED
@@ -1,29 +1,23 @@
1
  import os
2
- # Append /usr/bin to PATH
3
- os.environ["PATH"] += os.pathsep + "/usr/bin"
4
-
5
-
6
-
7
  from flask import Flask, request, jsonify, render_template
8
- import librosa
9
- import torch
10
- import Levenshtein
11
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
12
- from io import BytesIO
13
  from flask_cors import CORS
14
- from pydub import AudioSegment # NEW
 
 
15
 
 
16
  AudioSegment.converter = "/usr/bin/ffmpeg"
17
  AudioSegment.ffprobe = "/usr/bin/ffprobe"
18
 
 
19
  os.environ['HF_HOME'] = '/tmp/.cache'
20
 
21
  app = Flask(__name__)
22
  CORS(app)
23
 
24
- MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
25
- processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
26
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
27
 
28
 
29
  def convert_to_wav(audio_bytes):
@@ -39,19 +33,16 @@ def convert_to_wav(audio_bytes):
39
  return None
40
 
41
 
42
def transcribe_audio_hf(audio_bytes):
    """Transcribes the audio using a pretrained Wav2Vec2 model."""
    # Normalize the incoming audio to WAV before feature extraction.
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise Exception("Could not convert audio to WAV format")

    # Wav2Vec2 XLSR checkpoints expect 16 kHz mono input, so resample on load.
    waveform, rate = librosa.load(wav_io, sr=16000)
    features = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(features.input_values).logits
    # Greedy CTC decoding: pick the highest-scoring token at each frame.
    best_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(best_ids)[0].strip()
55
 
56
 
57
  def levenshtein_similarity(transcription1, transcription2):
@@ -74,8 +65,8 @@ def transcribe():
74
  user_audio_bytes = user_audio.read()
75
 
76
  try:
77
- transcription_original = transcribe_audio_hf(original_audio_bytes)
78
- transcription_user = transcribe_audio_hf(user_audio_bytes)
79
  except Exception as e:
80
  return jsonify({"error": str(e)}), 500
81
 
 
1
  import os
 
 
 
 
 
2
  from flask import Flask, request, jsonify, render_template
3
+ from transformers import pipeline
 
 
 
 
4
  from flask_cors import CORS
5
+ from pydub import AudioSegment
6
+ from io import BytesIO
7
+ import Levenshtein
8
 
9
+ # Set the FFmpeg paths explicitly
10
  AudioSegment.converter = "/usr/bin/ffmpeg"
11
  AudioSegment.ffprobe = "/usr/bin/ffprobe"
12
 
13
+ # Set Hugging Face cache directory to avoid permission issues
14
  os.environ['HF_HOME'] = '/tmp/.cache'
15
 
16
  app = Flask(__name__)
17
  CORS(app)
18
 
19
+ # Use Hugging Face ASR pipeline for automatic speech recognition
20
+ asr_pipeline = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
 
21
 
22
 
23
  def convert_to_wav(audio_bytes):
 
33
  return None
34
 
35
 
36
def transcribe_audio(audio_bytes):
    """Transcribe audio using the Hugging Face ASR pipeline.

    Args:
        audio_bytes: Raw audio file content in any format that
            pydub/ffmpeg can decode.

    Returns:
        The transcribed text, stripped of surrounding whitespace.

    Raises:
        Exception: If the audio could not be converted to WAV format.
    """
    wav_io = convert_to_wav(audio_bytes)
    if wav_io is None:
        raise Exception("Could not convert audio to WAV format")

    # The ASR pipeline accepts a filename, raw bytes, or a numpy array --
    # not a file-like object. Rewind the buffer and pass its bytes, instead
    # of the BytesIO object itself, which the pipeline would reject.
    wav_io.seek(0)
    transcription = asr_pipeline(wav_io.read())["text"]
    return transcription.strip()
 
 
 
46
 
47
 
48
  def levenshtein_similarity(transcription1, transcription2):
 
65
  user_audio_bytes = user_audio.read()
66
 
67
  try:
68
+ transcription_original = transcribe_audio(original_audio_bytes)
69
+ transcription_user = transcribe_audio(user_audio_bytes)
70
  except Exception as e:
71
  return jsonify({"error": str(e)}), 500
72