# Streamlit app: transcribe Hindi speech with Whisper, translate it to English
# with IndicTrans2, and progressively generate images with Stable Diffusion.

import os
import subprocess
import sys


# Clone required repositories
def clone_repositories():
    repos = [
        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
    ]
    for repo_url, repo_dir in repos:
        if not os.path.exists(repo_dir):
            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
        sys.path.append(os.path.abspath(repo_dir))


# Clone repositories before importing
clone_repositories()

import streamlit as st
import torch
import librosa
import matplotlib.pyplot as plt
from PIL import Image
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
import stanza
import numpy as np
from IndicTransToolkit import IndicProcessor


class TransGen:
    """Translates Hindi text to English and generates images from the translations."""

    def __init__(
        self,
        translation_model="ai4bharat/indictrans2-indic-en-1B",
        stable_diff_model="stabilityai/stable-diffusion-2-base",
        src_lang='hin_Deva',
        tgt_lang='eng_Latn'
    ):
        # 4-bit quantized IndicTrans2 translation model
        self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            translation_model,
            trust_remote_code=True,
            quantization_config=self.bnb_config
        )
        self.ip = IndicProcessor(inference=True)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Text-to-image and image-to-image Stable Diffusion pipelines
        scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
        self.pipe = StableDiffusionPipeline.from_pretrained(
            stable_diff_model, scheduler=scheduler, torch_dtype=torch.bfloat16
        )
        self.pipe = self.pipe.to("cuda")
        self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            stable_diff_model, torch_dtype=torch.float16
        )
        self.img2img_pipe = self.img2img_pipe.to('cuda')

    def translate(self, input_sentences):
        batch = self.ip.preprocess_batch(
            input_sentences,
            src_lang=self.src_lang,
            tgt_lang=self.tgt_lang,
        )
        inputs = self.tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(self.model.device)  # keep inputs on the same device as the quantized model

        with torch.no_grad():
            generated_tokens = self.model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        with self.tokenizer.as_target_tokenizer():
            generated_tokens = self.tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
        return translations

    def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
        strength = float(strength) if strength is not None else 1.0
        guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
        strength = max(0.0, min(1.0, strength))

        # Condition on the previous image when available, otherwise generate from scratch
        if prev_image is not None:
            image = self.img2img_pipe(
                prompt,
                image=prev_image,
                strength=strength,
                guidance_scale=guidance_scale,
                negative_prompt='generate text in image'
            ).images[0]
            return image

        image = self.pipe(prompt)
        return image.images[0]

    def run(self, input_sentences, strength, guidance_scale, prev_image=None):
        translations = self.translate(input_sentences)
        sentence = translations[0]
        image = self.generate_image(sentence, prev_image, strength, guidance_scale)
        return sentence, image


def transcribe_audio_to_hindi(audio_path: str) -> str:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "hi"}  # force Hindi transcription
    )

    # Whisper expects 16 kHz audio
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
    return result["text"]


# Download Stanza resources for Hindi POS tagging
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')


def POS_policy(input_text):
    # Return the index of the last NOUN/VERB in the latest sentence, or 0 if none is found
    doc = nlp(input_text)
    words = doc.sentences[-1].words
    i = len(words) - 1
    while i >= 0:
        if words[i].upos in ['NOUN', 'VERB']:
            return i
        i -= 1
    return 0


def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
    text_tot = transcribe_audio_to_hindi(audio_path)
    st.write(f'Transcribed sentence: {text_tot}')

    cur_sent = ''
    prev_idx = 0
    prev_image = None
    generated_images = []
    transgen = TransGen()

    # Grow the sentence word by word; generate a new image whenever the position of the
    # last NOUN/VERB changes, conditioning on the previously generated image.
    for word in text_tot.split():
        cur_sent += word + ' '
        str_idx = POS_policy(cur_sent)
        if str_idx != 0 and str_idx != prev_idx:
            prev_idx = str_idx
            sent, image = transgen.run([cur_sent], base_strength, base_guidance_scale, prev_image)
            prev_image = image
            generated_images.append({'sentence': cur_sent, 'image': image})

    return generated_images


def main():
    st.title("Audio to Image Generation App")

    # File uploader
    uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")

    # Strength and Guidance Scale sliders
    base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
    base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)

    if uploaded_file is not None:
        # Save the uploaded file temporarily
        with open("temp_audio.wav", "wb") as f:
            f.write(uploaded_file.getvalue())

        # Generate images
        st.write("Generating Images...")
        generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)

        # Display generated images
        st.write("Generated Images:")
        for img_data in generated_images:
            st.image(img_data['image'], caption=img_data['sentence'])


if __name__ == "__main__":
    main()