from transformers import GPT2LMHeadModel, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import gradio as gr


# Load a fine-tuned GPT-2 model and its tokenizer, padding with the EOS token
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    return model, tokenizer


# Load the three generation models
deleuze, tokenizer1 = load_model('genaforvena/the_soft_delerizome_machine_a_thousand_guattaris_fourth_of_plateaus_once')
gospel, tokenizer2 = load_model('genaforvena/the_soft_scum_gospel_delerizome_machine_a_thousand_guattaris')
scum, tokenizer3 = load_model('genaforvena/the_soft_scum_delerizome_machine_a_thousand_guattaris')


# Generate text one token at a time, yielding the cumulative output after each step
def generate_text_stream(model, tokenizer, prompt, max_new_tokens, temperature, top_k, top_p,
                         repetition_penalty, no_repeat_ngram_size):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    generated_tokens = 0
    output_text = ""
    print(f"Starting text generation for model: {model.config._name_or_path}")
    while generated_tokens < max_new_tokens:
        try:
            outputs = model.generate(
                inputs,
                max_new_tokens=1,
                do_sample=True,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Decode only the newly sampled token and append it to the running text
            new_token = tokenizer.decode(outputs[0, -1], skip_special_tokens=True)
            print(f"Decoded token: {new_token}")
            output_text += new_token
            generated_tokens += 1
            print(f"Generated {generated_tokens} tokens so far")
            yield output_text
            # Feed the extended sequence back in as the prompt for the next token
            inputs = outputs
        except Exception as e:
            print(f"Error occurred during text generation: {e}")
            break
    print(f"Final output for {model.config._name_or_path}:\n{output_text}\n")


# Load BART model and tokenizer for summarization (on GPU if available)
summarization_model_name = "facebook/bart-large-cnn"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_model_name).to(device)
summarizer_tokenizer = AutoTokenizer.from_pretrained(summarization_model_name)


# Generate the summary in one pass, then yield it in chunks to simulate streaming
def generate_summary_stream(model, tokenizer, text, max_length, min_length):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
    summary_text = ""
    print(f"Starting summary generation for text: {text[:100]}...")
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=False,
    )
    # Decode the generated summary in chunks of token ids to simulate streaming
    chunk_size = 50
    for i in range(0, len(output[0]), chunk_size):
        chunk = output[0][i:i + chunk_size]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        summary_text += chunk_text
        print(f"Generated summary chunk: {chunk_text}")
        yield summary_text
    print(f"Final summary:\n{summary_text}\n")


def reply(prompt):
    yield "Morosia sees...."
    if len(prompt) == 0:
        prompt = "class "

    # --- Phase 1: Generate and Stream Combined Output ---
    combined_output = ""

    # Stream deleuze output
    print("Starting Deleuze model generation...")
    for text in generate_text_stream(
        deleuze, tokenizer1, prompt,
        max_new_tokens=300, temperature=1.5, top_k=180, top_p=0.95,
        repetition_penalty=1.2, no_repeat_ngram_size=2,
    ):
        # generate_text_stream yields the cumulative output, so replace rather than append
        combined_output = text
        print(f"Deleuze output: {text}")
        yield combined_output

    # Stream scum output (appended to the existing combined output)
    print("Starting Scum model generation...")
    deleuze_output = combined_output
    scum_output = ""
    for text in generate_text_stream(
        scum, tokenizer3, combined_output,
        max_new_tokens=100, temperature=1.7, top_k=320, top_p=0.9,
        repetition_penalty=1.3, no_repeat_ngram_size=2,
    ):
        # text is again cumulative, so keep only the latest value to avoid duplicating chunks
        scum_output = text
        print(f"Scum output: {text}")
        yield deleuze_output + scum_output
    combined_output = deleuze_output + scum_output

    # Stream gospel output (appended to the existing combined output)
    print("Starting Gospel model generation...")
    pre_gospel_output = combined_output
    gospel_output = ""
    for text in generate_text_stream(
        gospel, tokenizer2, scum_output,
        max_new_tokens=100, temperature=1.0, top_k=190, top_p=0.95,
        repetition_penalty=1.2, no_repeat_ngram_size=2,
    ):
        gospel_output = text
        print(f"Gospel output: {text}")
        yield pre_gospel_output + gospel_output
    combined_output = pre_gospel_output + gospel_output

    # --- Phase 2: Hand off the combined output for summarization ---
    print(f"Streaming final_output_for_summary: {combined_output[:100]}...")  # first 100 chars
    yield "Morosia is analysing her visions... Wait..."

    # --- Phase 3: Generate and Stream Summary (replacing the combined output) ---
    print(f"Starting summary generation for final output: {combined_output[:100]}...")
    for text in generate_summary_stream(summarizer_model, summarizer_tokenizer, combined_output,
                                        max_length=100, min_length=30):
        print(f"Streaming summary: {text}")
        yield text


# Gradio interface
iface = gr.Interface(fn=reply, inputs="text", outputs="text")
iface.launch()
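
# Note: reply() is a generator, so its intermediate yields only appear incrementally in the
# UI when Gradio's queue is enabled. On Gradio 3.x that is not the default; if the output
# does not stream, try `iface.queue().launch()` instead of `iface.launch()` above.
# (This depends on the Gradio version in use, which the script does not pin.)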