import json
from io import BytesIO

import gradio as gr
import pytesseract
import spacy
from PIL import Image
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer, pipeline

# Install the Tesseract OCR binary that pytesseract shells out to.
# (On Hugging Face Spaces this is normally declared in packages.txt instead.)
import subprocess
update_process = subprocess.run(['sudo', 'apt', 'update'], capture_output=True, text=True)
install_process = subprocess.run(['sudo', 'apt', 'install', '-y', 'tesseract-ocr'], capture_output=True, text=True)

# Model registry shown in the UI dropdown.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

# Load the transformer pipelines, the SpaCy model, and the tokenizer once at start-up.
ner_model = pipeline('token-classification', model='dslim/bert-large-NER')
ner_model_bio = pipeline('token-classification', model='d4data/biomedical-ner-all')
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): Extracted text from the PDF.
    """
    text = ''
    pdf_reader = PdfReader(BytesIO(pdf_bytes))
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer.
        text += page.extract_text() or ''
    return text


def extract_text_from_image_or_pdf(file_bytes):
    """
    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): Extracted text from the file, or an error message if extraction fails.
    """
    try:
        if file_bytes.startswith(b'%PDF'):
            # PDF file
            text = extract_text_from_pdf(file_bytes)
        else:
            # Assume image file and run OCR on it.
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error extracting text: {str(e)}"


def perform_ner(text, model_name):
    """
    Performs Named Entity Recognition (NER) on the given text using the specified NER model.

    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): The NER model to use ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    - extracted_entities (list): A list of dictionaries describing the recognized entities.
      Each dictionary has the keys 'text', 'type', 'start_index', and 'end_index'.
    - error_message (str): If an error occurs during the NER process, an error message is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            extracted_entities = [{'text': ent.text, 'type': ent.label_,
                                   'start_index': ent.start_char, 'end_index': ent.end_char}
                                  for ent in doc.ents]
        elif model_name == 'bert-large-NER':
            entities = ner_model(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']}
                                  for entity in entities]
        else:
            entities = ner_model_bio(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']}
                                  for entity in entities]
        return extracted_entities
    except Exception as e:
        return f"Error performing NER: {str(e)}"
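# Illustrative sketch of the return shape (an assumption for a toy input, not output
# produced by actually running the models): for the text "Barack Obama visited Paris."
# the SpaCy backend would return span-level labels along the lines of
#
#   [{'text': 'Barack Obama', 'type': 'PERSON', 'start_index': 0, 'end_index': 12},
#    {'text': 'Paris', 'type': 'GPE', 'start_index': 21, 'end_index': 26}]
#
# while the transformer backends return one dictionary per word piece, with IOB tags
# such as 'B-PER' / 'I-PER' in the 'type' field.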
def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Takes a raw text input, a list of entities with their start and end indices, a color
    mapping for entity labels, and a tokenizer. It highlights each entity with its color
    and label and returns the text with HTML-style markup.

    Parameters:
    - text (str): The raw input text.
    - entities (list): A list of dictionaries, each containing the start index ('start_index'),
      end index ('end_index'), and type ('type') of an entity.
    - color_mapping (dict): A dictionary mapping entity labels to background colors for highlighting.
    - tokenizer (transformers.AutoTokenizer): The tokenizer used to encode the entity text.

    Returns:
    - highlighted_text (str): The text with entities highlighted using HTML-style markup.
    """
    highlighted_text = ""
    current_pos = 0
    for ent in entities:
        start, end, label = ent.get('start_index', 0), ent.get('end_index', 0), ent.get('type', 'O')
        entity_text = text[start:end]

        # Tokenize the entity text (kept for inspection; the token count is not used below).
        encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)
        tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)
        tokenized_entity_length = len(tokenized_entity_text)

        # Add the non-entity text preceding this entity.
        highlighted_text += text[current_pos:start]

        # Add the entity wrapped in an inline-styled span so the color and label render in gr.HTML.
        color = color_mapping.get(label, '#4D94FF')
        highlighted_text += f"<span style='background-color: {color};'>{entity_text} ({label})</span>"

        # Update current position.
        current_pos = end

    # Add any remaining non-entity text.
    highlighted_text += text[current_pos:]
    return highlighted_text


def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries describing the recognized entities.
      Each dictionary has the keys 'text', 'type', 'start_index', and 'end_index'.
    - model_name (str): The name of the NER model used.

    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process, an error message is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            color_mapping = {
                "DATE": "#4D94FF",         # Blue
                "PERSON": "#4CAF50",       # Green
                "EVENT": "#FF6666",        # Salmon
                "FAC": "#66B2FF",          # Sky Blue
                "GPE": "#FFCC99",          # Light Apricot
                "LANGUAGE": "#FF80BF",     # Pink
                "LAW": "#66FF99",          # Mint
                "LOC": "#809FFF",          # Lavender Blue
                "MONEY": "#FFFF99",        # Light Yellow
                "NORP": "#808000",         # Olive Green
                "ORDINAL": "#FF9999",      # Misty Rose
                "ORG": "#FFB366",          # Light Peach
                "PERCENT": "#FF99FF",      # Orchid
                "PRODUCT": "#FF6666",      # Salmon
                "QUANTITY": "#CC99FF",     # Pastel Purple
                "TIME": "#FFD54F",         # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",     # Teal
            }
            options = {"ents": [entity['type'] for entity in entities], "colors": color_mapping}
            colored_text = displacy.render(doc, style="ent", options=options, page=True)
            return colored_text
        else:
            # IOB-tagged labels from the transformer pipelines (B- marks the start of an entity, I- its continuation).
            color_mapping = {
                'O': 'pink',
                'B-MIS': 'red',
                'I-MIS': 'brown',
                'B-PER': 'green',
                'I-PER': '#FFD54F',
                'B-ORG': 'orange',
                'I-ORG': '#FF6666',
                'B-LOC': 'purple',
                'I-LOC': '#FFCC99',
            }
            return highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer)
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"
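# Illustrative usage sketch (an assumption, not part of the app's control flow): the two
# helpers above can be exercised directly from a Python shell, e.g.
#
#   sample = "Angela Merkel met the delegation in Berlin."
#   ents = perform_ner(sample, 'bert-large-NER')
#   html = highlight_entities(sample, ents, 'bert-large-NER')
#
# For the SpaCy backend the HTML comes from displacy.render; for the transformer backends
# each entity is wrapped in an inline-styled <span> with its label appended.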
def summarize_text(input_text):
    """
    Summarizes the given input text using the Hugging Face Transformers summarization pipeline.

    Parameters:
    - input_text (str): The input text that needs to be summarized.

    Returns:
    - summarized_text (str): The summary generated by the pipeline, controlled by the
      max_length, min_length, length_penalty, num_beams, and early_stopping parameters.
    """
    summarized_text = summarization_pipeline(input_text, max_length=150, min_length=50,
                                              length_penalty=2.0, num_beams=4, early_stopping=True)
    return summarized_text[0]['summary_text']


def image_ner_tool(file, model_name):
    """
    Performs Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on the recognized entities.

    Parameters:
    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
    - model_name (str): The NER model to use ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string describing the recognized entities.
    - summary (str): Summary of the extracted text.
    """
    reformatted_ner_output = ""
    try:
        if isinstance(file, str):
            # The input is a file path.
            with open(file, 'rb') as file_stream:
                file_bytes = file_stream.read()
        else:
            # The input is a byte stream.
            file_bytes = file.getvalue()
        text = extract_text_from_image_or_pdf(file_bytes)
        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)
        reformatted_ner_output = json.dumps(entities, indent=2)
        summary = summarize_text(text)
        return text, highlighted_text, reformatted_ner_output, summary
    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Return one value per output component so the UI still updates on failure.
        return error_message, "", reformatted_ner_output, ""


import pandas as pd


def store_data_to_csv(file, model_name):
    """Appends the extracted text and recognized entities for the uploaded file to log.csv."""
    print(file)
    print(model_name)
    if isinstance(file, str):
        # The input is a file path.
        with open(file, 'rb') as file_stream:
            file_bytes = file_stream.read()
    else:
        # The input is a byte stream.
        file_bytes = file.getvalue()
    extracted_text = extract_text_from_image_or_pdf(file_bytes)
    named_entities = perform_ner(extracted_text, model_name)
    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
    df.to_csv("log.csv", mode='a', index=False, header=False)
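# Illustrative usage sketch (an assumption; "sample.pdf" is a hypothetical local file):
# image_ner_tool is also callable outside the Gradio UI, e.g.
#
#   text, highlighted, entities_json, summary = image_ner_tool("sample.pdf", "SpaCy English NER")
#
# The Submit button below wires this same function to the uploaded file and the dropdown selection.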

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Intelligent Document Processing

        Upload a PDF or an image file to extract text and identify named entities.
        """
    )
    with gr.Row():
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("Submit")
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized Text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
            store_button = gr.Button("Store Data to CSV")

    gr.Examples(
        [
            [
                "The year is 2043.pdf",  # Path to the example file
                "SpaCy English NER",     # Pre-selected value for the dropdown menu
            ]
        ],
        [text1, model],
    )

    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )

demo.launch()