import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit import IndicProcessor
import speech_recognition as sr
from pydub import AudioSegment
import os
from sentence_transformers import SentenceTransformer, util  # Multilingual similarity
# Constants
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None
MAX_AUDIO_DURATION = 600 # 10 minutes in seconds
# ---- IndicTrans2 Model Initialization ----
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
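# The IndicTrans2 checkpoints ship custom modeling code, hence trust_remote_code=True
# above; fp16 halving is applied only when no bitsandbytes quantization is active.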
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        del inputs
        torch.cuda.empty_cache()
    return translations
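# Illustrative call (hypothetical input; the exact output depends on the model):
#   batch_translate(["നന്ദി"], "mal_Mlym", "eng_Latn", indic_en_model, indic_en_tokenizer, ip)
#   -> ["Thank you."]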
# Initialize IndicTrans2
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)
ip = IndicProcessor(inference=True)
# Load LaBSE for Multilingual Similarity
similarity_model = SentenceTransformer("sentence-transformers/LaBSE")
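# LaBSE embeds sentences from 100+ languages into a shared vector space, so the
# cosine similarity between a sentence and its back-translation is comparable
# across languages.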
# ---- Audio Processing Functions ----
def convert_audio_to_wav(file_path):
    """Convert audio to WAV format for compatibility with SpeechRecognition."""
    audio = AudioSegment.from_file(file_path)
    # Swap the extension via os.path.splitext; replacing the raw extension string
    # could corrupt paths that contain it elsewhere (e.g. "mp3file.mp3").
    wav_path = os.path.splitext(file_path)[0] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Transcribe long audio files in chunks of `chunk_duration` seconds."""
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)

    # Limit audio duration to MAX_AUDIO_DURATION (pydub slices in milliseconds)
    if len(audio) > MAX_AUDIO_DURATION * 1000:
        audio = audio[:MAX_AUDIO_DURATION * 1000]

    full_text = []
    chunk_path = "temp_chunk.wav"
    for i in range(0, len(audio), chunk_duration * 1000):
        chunk = audio[i : i + chunk_duration * 1000]
        chunk.export(chunk_path, format="wav")
        with sr.AudioFile(chunk_path) as source:
            audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data, language="ml-IN")
                full_text.append(text)
            except sr.UnknownValueError:
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                full_text.append(f"[Speech Error: {e}]")
    if os.path.exists(chunk_path):
        os.remove(chunk_path)  # Clean up the temporary chunk file
    return " ".join(full_text)
# ---- Multilingual Semantic Similarity (Auto-Reference) ----
def compute_similarity(malayalam_text, english_translation):
    """Compare the original Malayalam transcription against back-translated Malayalam text."""
    if not malayalam_text.strip():
        print("⚠️ Malayalam transcription is empty!")
        return "N/A"
    if not english_translation.strip():
        print("⚠️ English translation is empty!")
        return "N/A"
    try:
        # Translate the English output back to Malayalam for comparison
        back_translated = batch_translate(
            [english_translation], "eng_Latn", "mal_Mlym",
            indic_en_model, indic_en_tokenizer, ip,
        )[0]
        # Encode the Malayalam transcription and the back-translation
        embeddings = similarity_model.encode([malayalam_text, back_translated])
        # Compute cosine similarity
        similarity_score = util.cos_sim(embeddings[0], embeddings[1]).item()
        return round(similarity_score * 100, 2)  # Convert to a percentage
    except Exception as e:
        print(f"Error in similarity computation: {e}")
        return "N/A"
# ---- Gradio Function ----
def transcribe_and_translate(audio):
    # Convert to WAV if necessary
    if not audio.endswith(".wav"):
        audio = convert_audio_to_wav(audio)

    # Transcribe audio in chunks
    malayalam_text = transcribe_audio_in_chunks(audio)

    # Translate Malayalam → English
    src_sents = [malayalam_text]
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translations = batch_translate(src_sents, src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip)

    # Compute multilingual semantic similarity (Malayalam → English → Malayalam)
    similarity_score = compute_similarity(malayalam_text, translations[0])

    return malayalam_text, translations[0], f"{similarity_score}%"  # Similarity as %
# ---- Gradio Interface ----
iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Only audio input
    ],
    outputs=[
        gr.Textbox(label="Malayalam Transcription"),
        gr.Textbox(label="English Translation"),
        gr.Textbox(label="Semantic Similarity (%)"),  # Automatically computed
    ],
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Speech Recognition → Translate to English & Measure Accuracy.",
    allow_flagging="never",
)
iface.launch(debug=True, share=True)