# Hugging Face Space: keshva / Intelligent-Document-Processing
# Space status when captured: Runtime error
# File size: 12,725 Bytes

import gradio as gr
import PyPDF2
from PyPDF2 import PdfReader
from io import BytesIO
import pytesseract
from PIL import Image
import spacy
import json

from transformers import pipeline
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer

# Registry of the selectable NER back-ends: UI label -> model identifier.
# Its keys populate the model dropdown in the UI.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

# Every model is instantiated at import time, so the objects below are
# module-level singletons shared by all requests.
ner_model = pipeline('token-classification', model=ner_models['bert-large-NER'])
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
ner_model_bio = pipeline('token-classification', model=ner_models['bioNER'])

# Tokenizer handed to the HTML highlighter for the transformer-based models.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): Text of every page, in page order, concatenated without a
      separator. Pages that yield no text contribute an empty string.
    """
    pdf_reader = PdfReader(BytesIO(pdf_bytes))

    # Iterate the pages directly and join once instead of indexing by page
    # number and growing the string with `+=`. The `or ''` is defensive: a
    # page whose extraction produces nothing must not abort the whole file.
    return ''.join((page.extract_text() or '') for page in pdf_reader.pages)


def extract_text_from_image_or_pdf(file_bytes):
    """
    Pull the text out of an uploaded document, whether PDF or image.

    Content starting with the ``%PDF`` magic bytes is handed to
    `extract_text_from_pdf`; everything else is opened as an image and run
    through Tesseract OCR.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): Extracted text from the file. If anything goes wrong, a
      string of the form "Error extracting text: <reason>" is returned
      instead of raising.
    """
    try:
        looks_like_pdf = file_bytes.startswith(b'%PDF')
        if looks_like_pdf:
            return extract_text_from_pdf(file_bytes)

        # No PDF header: treat the payload as an image and OCR it.
        picture = Image.open(BytesIO(file_bytes))
        return pytesseract.image_to_string(picture)
    except Exception as exc:
        return f"Error extracting text: {str(exc)}"



def perform_ner(text, model_name):
    """
    Run Named Entity Recognition over `text` with the selected model.

    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): 'SpaCy English NER' selects the spaCy pipeline and
      'bert-large-NER' the BERT pipeline; any other value falls through to
      the biomedical pipeline.

    Returns:
    - extracted_entities (list): One dictionary per recognized entity with
      the keys 'text', 'type', 'start_index' and 'end_index'.
    - error_message (str): If anything raises, the string
      "Error performing NER: <reason>" is returned instead of the list.
    """
    try:
        if model_name == 'SpaCy English NER':
            return [
                {
                    'text': span.text,
                    'type': span.label_,
                    'start_index': span.start_char,
                    'end_index': span.end_char,
                }
                for span in spacy_ner_model(text).ents
            ]

        # Both transformer pipelines are read through the same keys, so only
        # the choice of pipeline differs between them.
        if model_name == 'bert-large-NER':
            predictions = ner_model(text)
        else:
            predictions = ner_model_bio(text)

        return [
            {
                'text': token['word'],
                'type': token['entity'],
                'start_index': token['start'],
                'end_index': token['end'],
            }
            for token in predictions
        ]
    except Exception as exc:
        return f"Error performing NER: {str(exc)}"


def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Wrap each entity span of `text` in a coloured ``<mark>`` element and return the resulting HTML.

    Entities are processed in the order given. The text between the end of
    one entity and the start of the next is copied through, and each entity
    is rendered as ``<mark ...>entity text (LABEL)</mark>``. All text taken
    from the document is HTML-escaped, so markup-like content in an uploaded
    file is displayed literally instead of being interpreted by the browser.

    Parameters:
    - `text` (str): The raw input text.
    - `entities` (list): Dictionaries with the keys `start_index`, `end_index`
      and `type`. Missing keys default to 0, 0 and 'O' respectively.
    - `color_mapping` (dict): Maps entity labels to background colours;
      labels not in the mapping use '#4D94FF'.
    - `tokenizer`: Accepted for backward compatibility with existing callers;
      it is not used, because the highlighting works purely on character
      offsets.

    Returns:
    - `highlighted_text` (str): HTML markup with the entities highlighted.
    """
    # Imported locally so this function does not depend on a file-level import.
    from html import escape

    fragments = []
    current_pos = 0

    for ent in entities:
        start = ent.get('start_index', 0)
        end = ent.get('end_index', 0)
        label = ent.get('type', 'O')

        # Plain text between the previous entity and this one.
        fragments.append(escape(text[current_pos:start], quote=False))

        color = color_mapping.get(label, '#4D94FF')
        # The label also lands inside a quoted attribute, so quotes are escaped too.
        safe_label = escape(str(label))
        entity_text = escape(text[start:end], quote=False)
        fragments.append(
            f"<mark style='background-color:{color}' title='{safe_label}'>{entity_text} ({safe_label})</mark>"
        )

        current_pos = end

    # Whatever follows the last entity.
    fragments.append(escape(text[current_pos:], quote=False))

    return "".join(fragments)
def highlight_entities(text, entities,model_name):
    """
    Produce HTML in which the recognized entities of `text` are colour-highlighted.

    For 'SpaCy English NER' the text is parsed again with the spaCy model and
    rendered as a full page by displaCy, restricted to the entity types
    present in `entities`. For every other model name the character offsets
    in `entities` are used to wrap spans in coloured ``<mark>`` tags.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): Dictionaries with the keys 'text', 'type',
      'start_index' and 'end_index'.
    - model_name (str): Name of the NER model that produced `entities`.

    Returns:
    - colored_text (str): HTML with coloured annotations.
    - error_message (str): If anything raises, the string
      "Error highlighting entities: <reason>" is returned instead.
    """
    try:
        if model_name != 'SpaCy English NER':
            # Transformer models: colours keyed by the BIO tags.
            bio_palette = {
                'O': 'pink',
                'B-MIS': 'red',
                'I-MIS': 'brown',
                'B-PER': 'green',
                'I-PER': '#FFD54F',
                'B-ORG': 'orange',
                'I-ORG': '#FF6666',
                'B-LOC': 'purple',
                'I-LOC': '#FFCC99',
            }
            return highlight_entities_with_colors_and_labels_tokenized(text, entities, bio_palette, tokenizer)

        # displaCy renders from a spaCy Doc, so the text is parsed here again.
        parsed = spacy_ner_model(text)

        spacy_palette = {
            "DATE": "#4D94FF",         # Blue
            "PERSON": "#4CAF50",       # Green
            "EVENT": "#FF6666",        # Salmon
            "FAC": "#66B2FF",          # Sky Blue
            "GPE": "#FFCC99",          # Light Apricot
            "LANGUAGE": "#FF80BF",     # Pink
            "LAW": "#66FF99",          # Mint
            "LOC": "#809FFF",          # Lavender Blue
            "MONEY": "#FFFF99",        # Light Yellow
            "NORP": "#808000",         # Olive Green
            "ORDINAL": "#FF9999",      # Misty Rose
            "ORG": "#FFB366",          # Light Peach
            "PERCENT": "#FF99FF",      # Orchid
            "PRODUCT": "#FF6666",      # Salmon
            "QUANTITY": "#CC99FF",     # Pastel Purple
            "TIME": "#FFD54F",         # Amber
            "WORK_OF_ART": "#FFC266",  # Light Orange
            "CARDINAL": "#008080",     # Teal
        }

        # Only the labels that actually occur in `entities` are rendered.
        present_labels = [entity['type'] for entity in entities]
        render_options = {"ents": present_labels, "colors": spacy_palette}

        return displacy.render(parsed, style="ent", options=render_options, page=True)

    except Exception as exc:
        return f"Error highlighting entities: {str(exc)}"


def summarize_text(input_text):
    """
    Summarize `input_text` with the module-level Hugging Face summarization pipeline.

    Parameters:
    - input_text (str): The text to summarize.

    Returns:
    - summarized_text (str): The 'summary_text' field of the first pipeline
      result, generated with max_length=150, min_length=50,
      length_penalty=2.0, num_beams=4 and early_stopping=True. An empty
      string is returned when `input_text` is empty or whitespace only.
    """
    # Nothing to summarize: skip the model call entirely.
    if not input_text or not input_text.strip():
        return ""

    # truncation=True asks the pipeline to clip over-long inputs to the
    # model's maximum input length instead of failing on long documents.
    results = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        truncation=True,
    )

    return results[0]['summary_text']


def _read_upload_bytes(file):
    """Return the raw bytes of an upload given as a path, a byte stream, or a file-like object with a `.name` path."""
    if file is None:
        raise ValueError("No file was provided")
    if isinstance(file, str):  # File path
        with open(file, 'rb') as file_stream:
            return file_stream.read()
    if hasattr(file, 'getvalue'):  # In-memory byte stream such as BytesIO
        return file.getvalue()
    # Upload wrappers that only expose the temporary file's path via `.name`.
    path = getattr(file, 'name', None)
    if isinstance(path, str):
        with open(path, 'rb') as file_stream:
            return file_stream.read()
    raise TypeError(f"Unsupported file input: {type(file).__name__}")


def image_ner_tool(file, model_name):
    """
    Extract text from an image or PDF, run NER on it, highlight the entities and summarize the text.

    Parameters:
    - file (str or BytesIO or file-like): A file path, an in-memory byte
      stream, or an object whose `.name` attribute is a file path.
    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    A 4-tuple, in the order the UI outputs are wired:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with coloured annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string describing the recognized entities.
    - summary (str): Summary of the extracted text.

    On failure the tuple is ("Error processing file: <reason>", "",
    <NER JSON produced so far, possibly "">, ""), so the caller always
    receives four values.
    """
    reformatted_ner_output = ""
    try:
        file_bytes = _read_upload_bytes(file)

        text = extract_text_from_image_or_pdf(file_bytes)

        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities,model_name)

        reformatted_ner_output = json.dumps(entities, indent=2)

        summary = summarize_text(text)

        return text, highlighted_text, reformatted_ner_output, summary

    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Same arity as the success path: the click handler maps the result
        # onto four output components.
        return error_message, "", reformatted_ner_output, ""


import pandas as pd
def store_data_to_csv(inputs, outputs):
    """
    Re-run extraction and NER for an upload and append the result to ``log.csv``.

    Parameters:
    - inputs (str or BytesIO): File path or in-memory byte stream of the upload.
    - outputs (str): Despite its name, this is passed to `perform_ner` as the
      model name.

    Returns:
    - None. One row (extracted text, extracted entities) is appended to
      ``log.csv`` without a header or index. Both arguments are also printed
      to stdout.
    """
    print(inputs)
    print(outputs)

    if not isinstance(inputs, str):
        # Byte stream
        raw_bytes = inputs.getvalue()
    else:
        # File path
        with open(inputs, 'rb') as handle:
            raw_bytes = handle.read()

    extracted_text = extract_text_from_image_or_pdf(raw_bytes)
    row = {
        "Extracted Text": [extracted_text],
        "Extracted Entities": [perform_ner(extracted_text, outputs)],
    }
    pd.DataFrame(row).to_csv("log.csv", mode='a', index=False, header=False)





# UI layout: upload + model choice on the left, one tab per result on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <p style="text-align: center; font-weight: bold; font-size: 44px;">
        Intelligent Document Processing
        </p>

        <p style="text-align: center;">
        Upload a PDF or an image file to extract text and identify named entities
        </p>
        """
    )
    with gr.Row() as row:
        with gr.Column():
            text1 =gr.File(label="Upload File")
            model=gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("submit")
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1=gr.Textbox(label="Extracted Text", container= True)
            with gr.Tab("Highlighted Entities"):
                output2=gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3=gr.HTML(label="Summarized text")
            with gr.Tab("Named Entities Extracted"):
                output4=gr.HTML(label="Named Entities")
                store_button = gr.Button("Store Data to CSV")
    gr.Examples(
        [
            [  # One example row, matching the [text1, model] inputs below
                "The year is 2043.pdf",  # Path to the sample PDF
                "SpaCy English NER"  # Selected value for the dropdown menu
            ]
        ],
        [text1, model],
    )
    # image_ner_tool returns (text, highlighted HTML, NER JSON, summary), so
    # the outputs are listed in that order rather than in tab order.
    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    # No outputs: the handler only appends to the CSV log.
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )


demo.launch()