from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
import csv
import functools
import tempfile
import re
import torch
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
import logging

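# Prefer the GPU when available; M2M100-1.2B translation is impractically slow on CPU.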
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
    logging.warning("GPU not found, using CPU, translation will be very slow.")

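# Remote text-generation endpoint: Mixtral-8x7B-Instruct served via the Hugging Face Inference API.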
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

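# Display name -> M2M100 language code, a subset of the 100 languages the model supports.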
lang_id = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id"
}

# Cache the tokenizer/model pair so the 1.2B checkpoint is only loaded once per process.
@functools.lru_cache(maxsize=None)
def load_model(pretrained_model: str = "facebook/m2m100_1.2B", cache_dir: str = "models/"):
    tokenizer = M2M100Tokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)
    model = M2M100ForConditionalGeneration.from_pretrained(pretrained_model, cache_dir=cache_dir).to(device)
    model.eval()
    return tokenizer, model
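
# Minimal usage sketch for load_model (the French example sentence is illustrative):
#   tokenizer, model = load_model()
#   tokenizer.src_lang = "fr"
#   batch = tokenizer("Bonjour le monde", return_tensors="pt").to(device)
#   tokens = model.generate(**batch, forced_bos_token_id=tokenizer.get_lang_id("en"))
#   tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]  # -> English text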

def extract_text_from_excel(file):
    # The source sheet is expected to keep its text in the second, header-less
    # column, which pandas labels "Unnamed: 1" when no header row is present.
    df = pd.read_excel(file)
    text = ' '.join(df['Unnamed: 1'].dropna().astype(str))
    return text

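# Standalone helper that appends one (sentence, output) pair; the Gradio pipeline
# below writes its rows through csv.DictWriter instead.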
def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([sentence, output])

def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences, language):
    text = extract_text_from_excel(file)
    sentences = text.split('.')
    random.shuffle(sentences)  # Shuffle sentences

    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False, suffix='.csv', encoding='utf-8') as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

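            # Sampling parameters forwarded to the Inference API; the fixed seed
            # makes repeated runs reproducible.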
            generate_kwargs = {
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
                "do_sample": True,
                "seed": 42,
            }

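            # Stream tokens from the endpoint and accumulate them into a single string.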
            try:
                stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
                output = ""
                for response in stream:
                    output += response.token.text

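                # Split the model output into candidate sentences on ., !, ? or : boundaries.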
                generated_sentences = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
                generated_sentences = [s.strip() for s in generated_sentences if s.strip() and s != '.']

                for _ in range(num_similar_sentences):
                    if not generated_sentences:
                        break
                    generated_sentence = generated_sentences.pop(random.randrange(len(generated_sentences)))

                    # Translate the generated sentence from the input language to English
                    tokenizer, model = load_model()  # cached; cheap after the first call
                    src_lang = lang_id[language]
                    trg_lang = lang_id["English"]
                    tokenizer.src_lang = src_lang
                    with torch.no_grad():
                        encoded_input = tokenizer(generated_sentence, return_tensors="pt").to(device)
                        generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
                        translated_sentence = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

                    # Translate the original sentence from the input language to Azerbaijani
                    tokenizer, model = load_model()
                    src_lang = lang_id[language]
                    trg_lang = lang_id["Azerbaijani"]
                    tokenizer.src_lang = src_lang
                    with torch.no_grad():
                        encoded_input = tokenizer(sentence, return_tensors="pt").to(device)
                        generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.get_lang_id(trg_lang))
                        translated_sentence_az = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

                    writer.writerow({'Original Sentence': translated_sentence_az, 'Generated Sentence': translated_sentence})

            except Exception as e:
                logging.error(f"Error generating data for sentence '{sentence}': {e}")

        tmp_path = tmp.name

    return tmp_path

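# Build the UI. Gradio passes inputs to generate() positionally, so the component
# order here must match the function signature exactly.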
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Slider(label="Temperature", value=0.9, minimum=0.05, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=64, maximum=5120, step=64, interactive=True, info="The maximum number of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
        gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
        gr.Dropdown(label="Language of the input data", choices=list(lang_id.keys()), value="English")
    ],
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description="Generates similar sentences for each sentence in an uploaded Excel file, translates the pairs with M2M100, and returns them as a CSV. AYE QABIL.",
    allow_flagging="never",
).launch()