# Streamlit app: transcribe Hindi speech with Whisper, translate it to English
# with IndicTrans2, and progressively generate images with Stable Diffusion.

import os
import subprocess
import sys


# Clone required repositories
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))


# Clone repositories before importing
clone_repositories()

import streamlit as st
import torch
import librosa
import matplotlib.pyplot as plt
from PIL import Image
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
import stanza
import numpy as np
from IndicTransToolkit import IndicProcessor


class TransGen:
    """Translates Hindi text to English and generates images from the translations."""

    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        # 4-bit quantized IndicTrans2 translation model
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model,
            trust_remote_code=True,
            quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Text-to-image and image-to-image Stable Diffusion pipelines
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(
            stable_diff_model, scheduler=scheduler, torch_dtype=torch.bfloat16
        )
        self.pipe = self.pipe.to("cuda")
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            stable_diff_model, torch_dtype=torch.float16
        )
        self.img2img_pipe = self.img2img_pipe.to('cuda')

    def translate(self, input_sentences):
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # keep inputs on the same device as the quantized model

        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations

    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))

        # Condition on the previous image when available, otherwise generate from scratch
        if prev_image is not None:
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image

        image = self.pipe(prompt)
        return image.images[0]

    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image


def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "hi"}  # force Hindi transcription
    )

    # Whisper expects 16 kHz audio
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
    return result["text"]


# Download Stanza resources for Hindi POS tagging
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')


def POS_policy(input_text):
    # Return the index of the last NOUN/VERB in the latest sentence, or 0 if none is found
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    i = len(words) - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0


def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')

    cur_sent = ''
    prev_idx = 0
    prev_image = None
    generated_images = []
    transgen = TransGen()

    # Grow the sentence word by word; generate a new image whenever the position of the
    # last NOUN/VERB changes, conditioning on the previously generated image.
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, image = transgen.run([cur_sent], base_strength, base_guidance_scale, prev_image)
            prev_image = image
            generated_images.append({'sentence': cur_sent, 'image': image})

    return generated_images


def main():
    st.title("Audio to Image Generation App")

    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")

    # Strength and Guidance Scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)

    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())

        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)

        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])


if __name__ == "__main__":
    main()