# Spaces: Running
import os
import tempfile

import gradio as gr
import speech_recognition as sr
import torch
from IndicTransToolkit import IndicProcessor
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer, util  # Multilingual Similarity
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
# Constants
BATCH_SIZE = 4  # sentences sent through the translation model per generate() call
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is visible
quantization = None  # "4-bit", "8-bit", or None to load unquantized weights
MAX_AUDIO_DURATION = 600  # 10 minutes in seconds
# ---- IndicTrans2 Model Initialization ----
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load an IndicTrans2 checkpoint and its tokenizer for inference.

    Args:
        ckpt_dir: Hugging Face model id or local checkpoint directory.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the weights through
            bitsandbytes; any other value (e.g. ``None``) loads them
            unquantized.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is in eval mode; when it is
        not quantized it has been moved to ``DEVICE`` and, on CUDA, cast to
        half precision.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # BitsAndBytesConfig only defines double-quant / compute-dtype options
        # for 4-bit loading; the former bnb_8bit_* keyword arguments were not
        # recognised options, so they are no longer passed.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig is None:
        # Only unquantized weights are moved/cast by hand; a quantized model is
        # left exactly as from_pretrained() returned it.
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate sentences from ``src_lang`` to ``tgt_lang`` in mini-batches.

    Args:
        input_sentences: List of source-language strings.
        src_lang: Source language tag passed to the processor (e.g. ``"mal_Mlym"``).
        tgt_lang: Target language tag passed to the processor (e.g. ``"eng_Latn"``).
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        List of translated strings, in the same order as ``input_sentences``.
        An empty input yields an empty list.
    """
    if not input_sentences:
        return []

    translations = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[start : start + BATCH_SIZE]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Inputs longer than the tokenizer's limit are truncated, not split.
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode inside the tokenizer's target-side context
        # (presumably selects the target-language vocabulary - confirm
        # against the checkpoint's tokenizer implementation).
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations.extend(ip.postprocess_batch(generated_tokens, lang=tgt_lang))

        # Drop this batch's tensors before the next iteration to keep peak
        # GPU memory down.
        del inputs
        torch.cuda.empty_cache()

    return translations
# Initialize IndicTrans2 (loaded once at import time and shared by every request)
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)
ip = IndicProcessor(inference=True)

# Load LaBSE for Multilingual Similarity
similarity_model = SentenceTransformer("sentence-transformers/LaBSE")
# ---- Audio Processing Function ----
def _wav_output_path(file_path):
    """Return ``file_path`` with only its final extension replaced by ``.wav``."""
    root, _ext = os.path.splitext(file_path)
    return root + ".wav"


def convert_audio_to_wav(file_path):
    """Convert audio to WAV format for compatibility with SpeechRecognition.

    The WAV file is written next to the input, using the same base name.

    Args:
        file_path: Path to an audio file in any format pydub can decode.

    Returns:
        Path of the exported ``.wav`` file.
    """
    audio = AudioSegment.from_file(file_path)
    # Only the trailing extension is swapped. A plain str.replace() on the
    # extension text would also rewrite matching directory names and would
    # collapse an extension-less path to just "wav".
    wav_path = _wav_output_path(file_path)
    # export() hands back the open output file; close it rather than leak it.
    audio.export(wav_path, format="wav").close()
    return wav_path
def transcribe_audio_in_chunks(audio_path, chunk_duration=30, language="ml-IN"):
    """Transcribe long audio files in chunks of `chunk_duration` seconds.

    Only the first ``MAX_AUDIO_DURATION`` seconds of the file are transcribed.
    Each chunk is written to a private temporary directory (so concurrent
    calls cannot overwrite each other's chunk file) and sent to Google Speech
    Recognition.

    Args:
        audio_path: Path to a WAV file.
        chunk_duration: Chunk length in seconds; must be positive.
        language: Language tag passed to ``recognize_google``.

    Returns:
        The chunk transcriptions joined by single spaces. A chunk that could
        not be recognised contributes ``"[Unrecognized Audio]"``; a failed
        request contributes ``"[Speech Error: ...]"``.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    chunk_ms = int(chunk_duration * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_duration must be a positive number of seconds")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    # Limit audio duration to MAX_AUDIO_DURATION (pydub lengths are in milliseconds)
    max_ms = MAX_AUDIO_DURATION * 1000
    if len(audio) > max_ms:
        audio = audio[:max_ms]

    full_text = []
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_path = os.path.join(tmp_dir, "chunk.wav")
        for start_ms in range(0, len(audio), chunk_ms):
            chunk = audio[start_ms : start_ms + chunk_ms]
            # export() returns the open output file; close it before re-reading.
            chunk.export(chunk_path, format="wav").close()

            with sr.AudioFile(chunk_path) as source:
                audio_data = recognizer.record(source)

            try:
                text = recognizer.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                full_text.append(f"[Speech Error: {e}]")
            else:
                full_text.append(text)

    return " ".join(full_text)
# Multilingual Semantic Similarity Function (Auto-Reference)
def compute_similarity(malayalam_text, english_translation):
    """Score how closely the English translation matches the Malayalam text.

    Both texts are embedded with the module-level LaBSE ``similarity_model``
    and compared by cosine similarity. The two languages are compared
    directly: the only translation model this file loads is the
    Indic -> English checkpoint, so it cannot be used to back-translate the
    English text into Malayalam.

    Args:
        malayalam_text: Malayalam transcription.
        english_translation: English translation of that transcription.

    Returns:
        The similarity as a percentage rounded to two decimals, or the string
        ``"N/A"`` when either text is missing/blank or the computation fails.
    """
    if not malayalam_text or not malayalam_text.strip():
        print("⚠️ Malayalam transcription is empty!")
        return "N/A"
    if not english_translation or not english_translation.strip():
        print("⚠️ English translation is empty!")
        return "N/A"

    try:
        # Encode the Malayalam transcription and its English translation
        embeddings = similarity_model.encode([malayalam_text, english_translation])
        # Compute cosine similarity
        similarity_score = util.cos_sim(embeddings[0], embeddings[1]).item()
        return round(similarity_score * 100, 2)  # Convert to percentage
    except Exception as e:
        # Best-effort metric: report the failure and fall back to "N/A" so the
        # transcription and translation are still shown.
        print(f"Error in similarity computation: {e}")
        return "N/A"
# ---- Gradio Function ----
def transcribe_and_translate(audio):
    """Transcribe Malayalam speech, translate it to English and score the result.

    Args:
        audio: Path to the recorded/uploaded audio file, or ``None``/empty
            when nothing was provided.

    Returns:
        A ``(malayalam_text, english_translation, similarity)`` tuple of
        strings. ``similarity`` is a percentage such as ``"87.5%"``, or
        ``"N/A"`` when no score is available.
    """
    if not audio:
        # Nothing was recorded or uploaded; avoid crashing on a missing path.
        return "", "", "N/A"

    # Convert to WAV if necessary (extension check is case-insensitive)
    if not audio.lower().endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    # Transcribe audio in chunks
    malayalam_text = transcribe_audio_in_chunks(audio)
    if not malayalam_text.strip():
        # No speech was transcribed, so there is nothing to translate or score.
        return malayalam_text, "", "N/A"

    # Translation
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translations = batch_translate(
        [malayalam_text], src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip
    )
    english_text = translations[0]

    # Score how well the translation preserves the transcription's meaning
    similarity_score = compute_similarity(malayalam_text, english_text)
    if isinstance(similarity_score, str):
        # compute_similarity returns the string "N/A" on failure; show it as-is
        # instead of rendering "N/A%".
        similarity_text = similarity_score
    else:
        similarity_text = f"{similarity_score}%"  # Similarity as %

    return malayalam_text, english_text, similarity_text
# ---- Gradio Interface ----
# One audio input (microphone or upload, delivered as a file path) mapped to
# the three text outputs returned by transcribe_and_translate.
iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Only audio input
    ],
    outputs=[
        gr.Textbox(label="Malayalam Transcription"),
        gr.Textbox(label="English Translation"),
        gr.Textbox(label="Semantic Similarity (%)"),  # Automatically computed
    ],
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Speech Recognition → Translate to English & Measure Accuracy.",
    allow_flagging="never",
)

iface.launch(debug=True, share=True)