import transformers
import re
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
import torch
import gradio as gr
import json
import os
import shutil
import requests
import pandas as pd

# Define the device
device = "cuda" if torch.cuda.is_available() else "cpu"

editorial_model = "PleIAs/Estienne"
token_classifier = pipeline(
    "token-classification", model=editorial_model, aggregation_strategy="simple", device=device
)

tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)

# Preprocess the 'word' column
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Replace newlines with spaces
    text = re.sub(r'\n', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    return text.strip()
    
def split_text(text, max_tokens=500):
    # Split the text by newline characters
    parts = text.split("\n")
    chunks = []
    current_chunk = ""

    for part in parts:
        # Add part to current chunk
        if current_chunk:
            temp_chunk = current_chunk + "\n" + part
        else:
            temp_chunk = part

        # Tokenize the temporary chunk
        num_tokens = len(tokenizer.tokenize(temp_chunk))

        if num_tokens <= max_tokens:
            current_chunk = temp_chunk
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = part

    if current_chunk:
        chunks.append(current_chunk)

    # If no newlines were found and still exceeding max_tokens, split further
    if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
        long_text = chunks[0]
        chunks = []
        while len(tokenizer.tokenize(long_text)) > max_tokens:
            split_point = len(long_text) // 2
            while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
                split_point += 1
            # Ensure split_point does not go out of range
            if split_point >= len(long_text):
                split_point = len(long_text) - 1
            chunks.append(long_text[:split_point].strip())
            long_text = long_text[split_point:].strip()
        if long_text:
            chunks.append(long_text)

    return chunks

def transform_chunks(marianne_segmentation):

    # Filter out separators
    marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
    
    # Replace '¶' with '\n' and convert to string
    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
    
    #A bit of lceaning.
    marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
    
    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != 'nan']
    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != '']
    marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != ' ']
    
    # Add entity_group as a header to each word
    marianne_segmentation['word'] = '### ' + marianne_segmentation['entity_group'] + ' ###\n' + marianne_segmentation['word']
    
    # Group by text_id, identifier, and date, then concatenate words
    marianne_segmentation = marianne_segmentation.agg({
        'word': lambda x: '\n\n'.join(x.dropna())
    }).reset_index()

    final_text = marianne_segmentation['word'].tolist()[0]

    return final_text


# Class to encapsulate the Falcon chatbot
class MistralChatBot:
    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        self.system_prompt = system_prompt

    def predict(self, user_message):
        #We drop the newlines.
        editorial_text =  re.sub("\n", " ¶ ", user_message)
    
        # Tokenize the prompt and check if it exceeds 500 tokens
        num_tokens = len(tokenizer.tokenize(editorial_text))
    
        if num_tokens > 500:
            # Split the prompt into chunks
            batch_prompts = split_text(editorial_text, max_tokens=500)
        else:
            batch_prompts = [editorial_text]

        out = token_classifier(batch_prompts)
        out = transform_chunks(out)
        print(out)
        generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
        return generated_text

# Create the Falcon chatbot instance
mistral_bot = MistralChatBot()

# Define the Gradio interface
title = "Éditorialisation"
description = "Un outil expérimental d'identification de la structure du texte à partir d'un encoder (Deberta)"
examples = [
    [
        "Qui peut bénéficier de l'AIP?",  # user_message
        0.7  # temperature
    ]
]

demo = gr.Blocks()

with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
    gr.HTML("""<h1 style="text-align:center">Correction d'OCR</h1>""")
    text_input = gr.Textbox(label="Votre texte.", type="text", lines=1)
    text_button = gr.Button("Identifier les structures éditoriales")
    text_output = gr.HTML(label="Le texte corrigé")
    text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output])

if __name__ == "__main__":
    demo.queue().launch()