Spaces:

keshva
/

Intelligent-Document-Processing

Runtime error

App Files Files Community

keshva committed on Dec 23, 2023

Commit

206555d

1 Parent(s): b85c912

Create app.py

Browse files

Files changed (1) hide show

app.py +338 -0

app.py ADDED Viewed

	@@ -0,0 +1,338 @@

+import gradio as gr
+import PyPDF2
+from PyPDF2 import PdfReader
+from io import BytesIO
+import pytesseract
+from PIL import Image
+import spacy
+import json
+from transformers import pipeline
+from PyPDF2 import PdfReader
+ner_model = pipeline('token-classification', model='dslim/bert-large-NER')
+summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
+ner_models = {
+    'bert-large-NER': 'dslim/bert-large-NER',
+    'bioNER': 'd4data/biomedical-ner-all',
+    'SpaCy English NER': 'en_core_web_trf',
+}
+spacy.cli.download("en_core_web_trf")
+spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
+ner_model_bio = pipeline('token-classification', model='d4data/biomedical-ner-all')
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+from spacy import displacy
+def extract_text_from_pdf(pdf_bytes):
+  """
+  Extracts text from a PDF file using PyPDF2.
+  Parameters:
+  - pdf_bytes (bytes): The content of the PDF file in bytes.
+  Returns:
+  - text (str): Extracted text from the PDF.
+  """
+  text=''
+  pdf_file=BytesIO(pdf_bytes)
+  pdf_reader=PdfReader(pdf_file)
+  for page_number in range(len(pdf_reader.pages)):
+    page=pdf_reader.pages[page_number]
+    text+=page.extract_text()
+  return text
+def extract_text_from_image_or_pdf(file_bytes):
+    """
+    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.
+    Parameters:
+    - file_bytes (bytes): The content of the file in bytes.
+    Returns:
+    - text (str): Extracted text from the file.
+    """
+    try:
+        if file_bytes.startswith(b'%PDF'):
+            # PDF file
+            text = extract_text_from_pdf(file_bytes)
+        else:
+            # Assume image file
+            image = Image.open(BytesIO(file_bytes))
+            text = pytesseract.image_to_string(image)
+        return text
+    except Exception as e:
+        return f"Error extracting text: {str(e)}"
+def perform_ner(text, model_name):
+    """
+    Performs Named Entity Recognition (NER) on the given text using the specified NER model.
+    Parameters:
+    - text (str): The input text on which NER will be performed.
+    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').
+    Returns:
+    - extracted_entities (list): A list of dictionaries containing information about the recognized entities.
+      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
+    - error_message (str): If an error occurs during the NER process, an error message is returned.
+    """
+    try:
+        if model_name == 'SpaCy English NER':
+            doc = spacy_ner_model(text)
+            extracted_entities = [{'text': ent.text, 'type': ent.label_,
+                                   'start_index': ent.start_char, 'end_index': ent.end_char} for ent in doc.ents]
+        elif model_name == 'bert-large-NER':
+            entities = ner_model(text)
+            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
+                                   'start_index': entity['start'], 'end_index': entity['end']} for entity in entities]
+        else:
+            entities = ner_model_bio(text)
+            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
+                                   'start_index': entity['start'], 'end_index': entity['end']} for entity in entities]
+        return extracted_entities
+    except Exception as e:
+        return f"Error performing NER: {str(e)}"
+def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
+    """
+    This function takes a raw text input, a list of entities with their start and end indices, a color mapping for entity labels, and a tokenizer.
+    It tokenizes the input text, highlights the entities with specified colors and labels, and returns the formatted text with HTML-style markup.
+    Parameters:
+    - `text` (str): The raw input text.
+    - `entities` (list): A list of dictionaries, each containing the start index (`start`), end index (`end`), and type (`type`) of an entity.
+    - `color_mapping` (dict): A dictionary mapping entity labels to background colors for highlighting.
+    - `tokenizer` (transformers.AutoTokenizer): The tokenizer for encoding the entity text.
+    Returns:
+    - `highlighted_text` (str): The formatted text with highlighted entities using HTML-style markup.
+    """
+    highlighted_text = ""
+    current_pos = 0
+    for ent in entities:
+        start, end, label = ent.get('start_index', 0), ent.get('end_index', 0), ent.get('type', 'O')
+        entity_text = text[start:end]
+        # Tokenize the entity text
+        encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)
+        tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)
+        tokenized_entity_length = len(tokenized_entity_text)
+        # Add non-entity text
+        highlighted_text += text[current_pos:start]
+        # Add highlighted entity text with color and label on the same line
+        color = color_mapping.get(label,'#4D94FF')
+        highlighted_text += f"<mark style='background-color:{color}' title='{label}'>{entity_text} ({label})</mark>"
+        # Update current position
+        current_pos = end
+    # Add any remaining non-entity text
+    highlighted_text += text[current_pos:]
+    return highlighted_text
+def highlight_entities(text, entities,model_name):
+    """
+    Highlights named entities in the given text and returns HTML with colored annotations.
+    Parameters:
+    - text (str): The input text containing named entities.
+    - entities (list): A list of dictionaries containing information about the recognized entities.
+      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
+    - model_name (str): The name of the NER model used ('SpaCy English NER').
+    Returns:
+    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
+    - error_message (str): If an error occurs during the highlighting process, an error message is returned.
+    """
+    try:
+      if model_name == 'SpaCy English NER':
+            doc = spacy_ner_model(text)
+            color_mapping = {
+            "DATE": "#4D94FF",        # Blue
+            "PERSON": "#4CAF50",      # Green
+            "EVENT": "#FF6666",       # Salmon
+            "FAC": "#66B2FF",         # Sky Blue
+            "GPE": "#FFCC99",         # Light Apricot
+            "LANGUAGE": "#FF80BF",     # Pink
+            "LAW": "#66FF99",         # Mint
+            "LOC": "#809FFF",         # Lavender Blue
+            "MONEY": "#FFFF99",       # Light Yellow
+            "NORP": "#808000",        # Olive Green
+            "ORDINAL": "#FF9999",     # Misty Rose
+            "ORG": "#FFB366",         # Light Peach
+            "PERCENT": "#FF99FF",     # Orchid
+            "PRODUCT": "#FF6666",     # Salmon
+            "QUANTITY": "#CC99FF",    # Pastel Purple
+            "TIME": "#FFD54F",        # Amber
+            "WORK_OF_ART": "#FFC266" ,  # Light Orange
+            "CARDINAL": "#008080"  # Teal
+            }
+            options = {"ents": [entity['type'] for entity in entities], "colors": color_mapping}
+            html = displacy.render(doc, style="ent", options=options, page=True)
+            colored_text = html
+            return colored_text
+      else:
+            color_mapping = {
+            'O': 'pink',
+            'B-MIS': 'red',
+            'I-MIS': 'brown',
+            'B-PER': 'green',
+            'I-PER': '#FFD54F',
+            'B-ORG': 'orange',
+            'I-ORG': '#FF6666',
+            'B-LOC': 'purple',
+            'I-LOC': '#FFCC99',
+            }
+            highlighted_example = highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer)
+            return highlighted_example
+    except Exception as e:
+        return f"Error highlighting entities: {str(e)}"
+def summarize_text(input_text):
+    """
+    The `summarize_text` function is designed to provide a concise summary of a given input text using the Hugging Face Transformers library's summarization pipeline.
+    The function takes an `input_text` parameter, representing the text that needs to be summarized.
+    Parameters:
+    - **input_text (str):** The input text that needs to be summarized.
+    Returns:
+    - **summarized_text (str):** The function utilizes the summarization pipeline with specific parameters,
+                                  including `max_length`, `min_length`, `length_penalty`, `num_beams`, and `early_stopping`,
+                                  to generate a summary of the input text. The summarized text is then extracted from the pipeline output and returned.
+    """
+    summarized_text = summarization_pipeline(input_text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
+    summarized_text = summarized_text[0]['summary_text']
+    return summarized_text
+def image_ner_tool(file, model_name):
+    """
+    Perform Named Entity Recognition (NER) on the text extracted from an image or PDF file.
+    The extracted text is highlighted with colored annotations based on recognized entities.
+    Parameters:
+    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
+    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').
+    Returns:
+    - text (str): Extracted text from the input file.
+    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
+    - reformatted_ner_output (str): JSON-formatted string containing information about the recognized entities.
+    """
+    reformatted_ner_output = ""
+    try:
+        if isinstance(file, str):  # If the input is a file path
+            with open(file, 'rb') as file_stream:
+                file_bytes = file_stream.read()
+        else:  # If the input is a byte stream
+            file_bytes = file.getvalue()
+        text = extract_text_from_image_or_pdf(file_bytes)
+        entities = perform_ner(text, model_name)
+        highlighted_text = highlight_entities(text, entities,model_name)
+        reformatted_ner_output = json.dumps(entities, indent=2)
+        summary = summarize_text(text)
+        return text, highlighted_text, reformatted_ner_output, summary
+    except Exception as e:
+        error_message = f"Error processing file: {str(e)}"
+        return error_message, "", reformatted_ner_output
+import pandas as pd
+def store_data_to_csv(inputs, outputs):
+    print(inputs)
+    print(outputs)
+    if isinstance(inputs, str):  # If the input is a file path
+        with open(inputs, 'rb') as file_stream:
+            file_bytes = file_stream.read()
+    else:  # If the input is a byte stream
+        file_bytes = inputs.getvalue()
+    extracted_text = extract_text_from_image_or_pdf(file_bytes)
+    named_entities=perform_ner(extracted_text, outputs)
+    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
+    df.to_csv("log.csv", mode='a', index=False, header=False)
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        <p style="text-align: center; font-weight: bold; font-size: 44px;">
+        Intelligent Document Processing
+        </p>
+        <p style="text-align: center;">
+        Upload a PDF or an image file to extract text and identify named entities
+        </p>
+        """
+    )
+    with gr.Row() as row:
+        with gr.Column():
+            text1 =gr.File(label="Upload File")
+            model=gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
+            btn = gr.Button("submit")
+        with gr.Column():
+            with gr.Tab("Extracted Text"):
+                output1=gr.Textbox(label="Extracted Text", container= True)
+            with gr.Tab("Highlighted Entitied"):
+                output2=gr.HTML(label="Highlighted Text")
+            with gr.Tab("Summarized Text"):
+                output3=gr.HTML(label="Summarized text")
+            with gr.Tab("Named Entities Extracted"):
+                output4=gr.HTML(label="Named Entities")
+                store_button = gr.Button("Store Data to CSV")
+    gr.Examples(
+        [
+            [  # Text to display above the image
+                "/content/The year is 2043.pdf",  # Path to the image file
+                "SpaCy English NER"  # Selected value for the dropdown menu
+            ]
+        ],
+        [text1, model],
+    )
+    btn.click(
+        image_ner_tool,
+        [text1, model],
+        [output1, output2, output4, output3],
+    )
+    store_button.click(
+        store_data_to_csv,
+        [text1, model],
+    )
+demo.launch()