viditk committed on
Commit
31ae891
·
verified ·
1 Parent(s): f7494f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -14
app.py CHANGED
@@ -3,11 +3,15 @@ import torch
3
  from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
4
  from IndicTransToolkit import IndicProcessor
5
  import speech_recognition as sr
 
 
 
6
 
7
# ---- Runtime configuration ----
# Batch size constant; the code that consumes it is outside the visible hunk.
BATCH_SIZE = 4

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Forwarded to initialize_model_and_tokenizer(); None is the value used here.
quantization = None
 
11
 
12
  # ---- IndicTrans2 Model Initialization ----
13
  def initialize_model_and_tokenizer(ckpt_dir, quantization):
@@ -83,36 +87,103 @@ indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
83
# Load the checkpoint named by indic_en_ckpt_dir; the helper's result is
# unpacked as (tokenizer, model), in that order.
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(
    ckpt_dir=indic_en_ckpt_dir,
    quantization=quantization,
)

# Processor handed to batch_translate() alongside the model and tokenizer.
ip = IndicProcessor(inference=True)
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  # ---- Gradio Function ----
87
def transcribe_and_translate(audio):
    """Transcribe a Malayalam recording and translate the transcript to English.

    Args:
        audio: Value handed straight to ``sr.AudioFile``; the Gradio input
            wired to this function is declared with ``type="filepath"``.

    Returns:
        A 2-tuple ``(malayalam_text, english_translation)``. When recognition
        fails, the first element carries the error message and the second is
        an empty string.
    """
    speech_engine = sr.Recognizer()

    # Read the whole recording into memory; the file is only needed for this.
    with sr.AudioFile(audio) as wav_source:
        recording = speech_engine.record(wav_source)

    # Malayalam transcription using Google API
    try:
        malayalam_text = speech_engine.recognize_google(recording, language="ml-IN")
    except sr.UnknownValueError:
        return "Could not understand audio", ""
    except sr.RequestError as api_error:
        return f"Google API Error: {api_error}", ""

    # Translation
    source_lang = "mal_Mlym"
    target_lang = "eng_Latn"
    english = batch_translate(
        [malayalam_text],
        source_lang,
        target_lang,
        indic_en_model,
        indic_en_tokenizer,
        ip,
    )
    return malayalam_text, english[0]
 
 
 
105
 
106
  # ---- Gradio Interface ----
107
# Components are built first (input, then outputs) and wired into the
# interface afterwards.
_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
_text_outputs = [
    gr.Textbox(label="Malayalam Transcription"),
    gr.Textbox(label="English Translation"),
]

iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=_audio_input,
    outputs=_text_outputs,
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Google Speech Recognition → Translate to English using IndicTrans2.",
)

# Start the app with debug output and a public share link enabled.
iface.launch(debug=True, share=True)
 
3
  from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
4
  from IndicTransToolkit import IndicProcessor
5
  import speech_recognition as sr
6
+ from pydub import AudioSegment
7
+ import os
8
+ from sentence_transformers import SentenceTransformer, util #Multilingual Similarity
9
 
10
# ---- Runtime configuration ----
# Batch size constant; the code that consumes it is outside the visible hunk.
BATCH_SIZE = 4

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Forwarded to initialize_model_and_tokenizer(); None is the value used here.
quantization = None

# Longest stretch of audio that gets transcribed: 10 minutes, in seconds.
MAX_AUDIO_DURATION = 10 * 60
15
 
16
  # ---- IndicTrans2 Model Initialization ----
17
  def initialize_model_and_tokenizer(ckpt_dir, quantization):
 
87
# Load the checkpoint named by indic_en_ckpt_dir; the helper's result is
# unpacked as (tokenizer, model), in that order.
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(
    ckpt_dir=indic_en_ckpt_dir,
    quantization=quantization,
)

# Processor handed to batch_translate() alongside the model and tokenizer.
ip = IndicProcessor(inference=True)

# LaBSE sentence encoder, used by compute_similarity() to embed both texts.
similarity_model = SentenceTransformer("sentence-transformers/LaBSE")
92
+
93
+ # ---- Audio Processing Function ----
94
def convert_audio_to_wav(file_path):
    """Convert an audio file to WAV for compatibility with SpeechRecognition.

    The WAV file is written next to the input, using the same base name with
    a ``.wav`` extension.

    Args:
        file_path: Path of the audio file to convert; decoded with
            ``AudioSegment.from_file``.

    Returns:
        Path of the exported WAV file.
    """
    audio = AudioSegment.from_file(file_path)

    # Swap only the trailing extension. A plain str.replace() on the
    # extension text would also rewrite matching text earlier in the path
    # (e.g. a directory called "mp3s") and would mangle extension-less names.
    base_path, _ = os.path.splitext(file_path)
    wav_path = base_path + ".wav"

    audio.export(wav_path, format="wav")
    return wav_path
100
+
101
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Transcribe a long WAV file in chunks of `chunk_duration` seconds.

    Only the first ``MAX_AUDIO_DURATION`` seconds are transcribed. Each chunk
    is sent to ``recognize_google`` with ``language="ml-IN"``.

    Args:
        audio_path: Path of a WAV file (read with ``AudioSegment.from_wav``).
        chunk_duration: Length of each chunk in seconds; must be positive.

    Returns:
        The per-chunk transcripts joined with single spaces. A chunk that
        cannot be recognised contributes ``"[Unrecognized Audio]"`` and a
        chunk whose request fails contributes ``"[Speech Error: ...]"``, so
        one bad chunk does not discard the rest.

    Raises:
        ValueError: If `chunk_duration` is not positive.
    """
    import io  # function-scope so the module's import block is left untouched

    # pydub measures length and slices in milliseconds; range() needs an int.
    chunk_ms = int(chunk_duration * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_duration must be a positive number of seconds")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    # Limit audio duration to MAX_AUDIO_DURATION
    max_ms = MAX_AUDIO_DURATION * 1000
    if len(audio) > max_ms:
        audio = audio[:max_ms]

    full_text = []
    for start_ms in range(0, len(audio), chunk_ms):
        chunk = audio[start_ms : start_ms + chunk_ms]

        # Export to an in-memory buffer rather than a fixed "temp_chunk.wav":
        # a shared file name lets concurrent requests overwrite each other's
        # chunks and leaves a stray file behind after the call.
        buffer = io.BytesIO()
        chunk.export(buffer, format="wav")
        buffer.seek(0)

        with sr.AudioFile(buffer) as source:
            audio_data = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio_data, language="ml-IN")
        except sr.UnknownValueError:
            full_text.append("[Unrecognized Audio]")
        except sr.RequestError as e:
            full_text.append(f"[Speech Error: {e}]")
        else:
            full_text.append(text)

    return " ".join(full_text)
127
+
128
+ # Multilingual Semantic Similarity Function (Auto-Reference)
129
def compute_similarity(malayalam_text, english_translation):
    """Score how closely a back-translation matches the Malayalam transcript.

    The English text is translated back to Malayalam with ``batch_translate``,
    both Malayalam strings are embedded with ``similarity_model``, and their
    cosine similarity is reported as a percentage.

    Args:
        malayalam_text: The Malayalam transcription.
        english_translation: Its English translation.

    Returns:
        The similarity as a percentage rounded to two decimals, or the string
        ``"N/A"`` when either input is blank or the computation fails.
    """
    # Blank inputs cannot be scored; checked in the same order as reported.
    required = (
        (malayalam_text, "⚠️ Malayalam transcription is empty!"),
        (english_translation, "⚠️ English translation is empty!"),
    )
    for value, warning in required:
        if not value.strip():
            print(warning)
            return "N/A"

    try:
        # Translate English back to Malayalam for comparison.
        # NOTE(review): indic_en_model is loaded from the indic-en checkpoint
        # elsewhere in this file, yet it is asked for eng_Latn -> mal_Mlym
        # here; confirm that direction is actually supported by that model.
        round_trip = batch_translate(
            [english_translation],
            "eng_Latn",
            "mal_Mlym",
            indic_en_model,
            indic_en_tokenizer,
            ip,
        )
        back_translated = round_trip[0]

        # Embed the transcription and its back-translation together.
        vectors = similarity_model.encode([malayalam_text, back_translated])

        # Cosine similarity, scaled to a percentage.
        cosine = util.cos_sim(vectors[0], vectors[1]).item()
        return round(cosine * 100, 2)
    except Exception as e:
        print(f"Error in similarity computation: {e}")
        return "N/A"
153
+
154
  # ---- Gradio Function ----
155
def transcribe_and_translate(audio):
    """Transcribe Malayalam speech, translate it to English and score the result.

    Args:
        audio: Path of the recorded or uploaded audio file (the Gradio input
            is declared with ``type="filepath"``), or ``None``/empty when the
            user submitted nothing.

    Returns:
        A 3-tuple ``(malayalam_text, english_translation, similarity)``.
        ``similarity`` is formatted as ``"<score>%"``, or ``"N/A"`` when no
        score could be computed. When there is no audio, or nothing was
        transcribed, the first element carries a message instead and the
        translation is an empty string.
    """
    # Nothing was recorded or uploaded: report it instead of failing on
    # None.endswith().
    if not audio:
        return "No audio provided", "", "N/A"

    # Convert to WAV if necessary. The check ignores case so a ".WAV" upload
    # is not re-exported onto itself on case-insensitive filesystems.
    converted_path = None
    if not audio.lower().endswith(".wav"):
        converted_path = convert_audio_to_wav(audio)

    try:
        # Transcribe audio in chunks
        malayalam_text = transcribe_audio_in_chunks(converted_path or audio)
    finally:
        # The converted WAV is only an intermediate; remove it so repeated
        # requests do not pile up files. Cleanup is best-effort: a failure
        # to delete must not hide the transcription result or error.
        if converted_path and converted_path != audio:
            try:
                os.remove(converted_path)
            except OSError:
                pass

    # Zero-length audio yields no chunks and therefore no text to translate.
    if not malayalam_text.strip():
        return "Could not understand audio", "", "N/A"

    # Translation
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translations = batch_translate(
        [malayalam_text], src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip
    )
    english_text = translations[0]

    # Compute Multilingual Semantic Similarity (Malayalam → English → Malayalam)
    similarity_score = compute_similarity(malayalam_text, english_text)

    # compute_similarity() returns the string "N/A" when it has no score;
    # only a numeric score gets the percent sign (avoids showing "N/A%").
    if isinstance(similarity_score, str):
        similarity_display = similarity_score
    else:
        similarity_display = f"{similarity_score}%"

    return malayalam_text, english_text, similarity_display
172
 
173
  # ---- Gradio Interface ----
174
# Components are built first (input, then outputs) and wired into the
# interface afterwards.
_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")  # Only audio input
_result_boxes = [
    gr.Textbox(label="Malayalam Transcription"),
    gr.Textbox(label="English Translation"),
    gr.Textbox(label="Semantic Similarity (%)"),  # Automatically computed
]

iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[_audio_input],
    outputs=_result_boxes,
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Speech Recognition → Translate to English & Measure Accuracy.",
    allow_flagging="never",
)

# Start the app with debug output and a public share link enabled.
iface.launch(debug=True, share=True)