import json
from io import BytesIO

import gradio as gr
import pytesseract
import spacy
from PIL import Image
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer, pipeline

# Install the Tesseract OCR binary that pytesseract shells out to.
# (On Hugging Face Spaces this is normally declared in packages.txt instead.)
import subprocess
update_process = subprocess.run(['sudo', 'apt', 'update'], capture_output=True, text=True)
install_process = subprocess.run(['sudo', 'apt', 'install', '-y', 'tesseract-ocr'], capture_output=True, text=True)

# Model registry shown in the UI dropdown.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

# Load the transformer pipelines, the SpaCy model, and the tokenizer once at start-up.
ner_model = pipeline('token-classification', model='dslim/bert-large-NER')
ner_model_bio = pipeline('token-classification', model='d4data/biomedical-ner-all')
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): Extracted text from the PDF.
    """
    text = ''
    pdf_reader = PdfReader(BytesIO(pdf_bytes))
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer.
        text += page.extract_text() or ''
    return text


def extract_text_from_image_or_pdf(file_bytes):
    """
    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): Extracted text from the file, or an error message if extraction fails.
    """
    try:
        if file_bytes.startswith(b'%PDF'):
            # PDF file
            text = extract_text_from_pdf(file_bytes)
        else:
            # Assume image file and run OCR on it.
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error extracting text: {str(e)}"


def perform_ner(text, model_name):
    """
    Performs Named Entity Recognition (NER) on the given text using the specified NER model.

    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): The NER model to use ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    - extracted_entities (list): A list of dictionaries describing the recognized entities.
      Each dictionary has the keys 'text', 'type', 'start_index', and 'end_index'.
    - error_message (str): If an error occurs during the NER process, an error message is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            extracted_entities = [{'text': ent.text, 'type': ent.label_,
                                   'start_index': ent.start_char, 'end_index': ent.end_char}
                                  for ent in doc.ents]
        elif model_name == 'bert-large-NER':
            entities = ner_model(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']}
                                  for entity in entities]
        else:
            entities = ner_model_bio(text)
            extracted_entities = [{'text': entity['word'], 'type': entity['entity'],
                                   'start_index': entity['start'], 'end_index': entity['end']}
                                  for entity in entities]
        return extracted_entities
    except Exception as e:
        return f"Error performing NER: {str(e)}"
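# Illustrative sketch of the return shape (an assumption for a toy input, not output
# produced by actually running the models): for the text "Barack Obama visited Paris."
# the SpaCy backend would return span-level labels along the lines of
#
#   [{'text': 'Barack Obama', 'type': 'PERSON', 'start_index': 0, 'end_index': 12},
#    {'text': 'Paris', 'type': 'GPE', 'start_index': 21, 'end_index': 26}]
#
# while the transformer backends return one dictionary per word piece, with IOB tags
# such as 'B-PER' / 'I-PER' in the 'type' field.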
def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Takes a raw text input, a list of entities with their start and end indices, a color
    mapping for entity labels, and a tokenizer. It highlights each entity with its color
    and label and returns the text with HTML-style markup.

    Parameters:
    - text (str): The raw input text.
    - entities (list): A list of dictionaries, each containing the start index ('start_index'),
      end index ('end_index'), and type ('type') of an entity.
    - color_mapping (dict): A dictionary mapping entity labels to background colors for highlighting.
    - tokenizer (transformers.AutoTokenizer): The tokenizer used to encode the entity text.

    Returns:
    - highlighted_text (str): The text with entities highlighted using HTML-style markup.
    """
    highlighted_text = ""
    current_pos = 0
    for ent in entities:
        start, end, label = ent.get('start_index', 0), ent.get('end_index', 0), ent.get('type', 'O')
        entity_text = text[start:end]

        # Tokenize the entity text (kept for inspection; the token count is not used below).
        encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)
        tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)
        tokenized_entity_length = len(tokenized_entity_text)

        # Add the non-entity text preceding this entity.
        highlighted_text += text[current_pos:start]

        # Add the entity wrapped in an inline-styled span so the color and label render in gr.HTML.
        color = color_mapping.get(label, '#4D94FF')
        highlighted_text += f"<span style='background-color: {color};'>{entity_text} ({label})</span>"

        # Update current position.
        current_pos = end

    # Add any remaining non-entity text.
    highlighted_text += text[current_pos:]
    return highlighted_text


def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries describing the recognized entities.
      Each dictionary has the keys 'text', 'type', 'start_index', and 'end_index'.
    - model_name (str): The name of the NER model used.

    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process, an error message is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            color_mapping = {
                "DATE": "#4D94FF",         # Blue
                "PERSON": "#4CAF50",       # Green
                "EVENT": "#FF6666",        # Salmon
                "FAC": "#66B2FF",          # Sky Blue
                "GPE": "#FFCC99",          # Light Apricot
                "LANGUAGE": "#FF80BF",     # Pink
                "LAW": "#66FF99",          # Mint
                "LOC": "#809FFF",          # Lavender Blue
                "MONEY": "#FFFF99",        # Light Yellow
                "NORP": "#808000",         # Olive Green
                "ORDINAL": "#FF9999",      # Misty Rose
                "ORG": "#FFB366",          # Light Peach
                "PERCENT": "#FF99FF",      # Orchid
                "PRODUCT": "#FF6666",      # Salmon
                "QUANTITY": "#CC99FF",     # Pastel Purple
                "TIME": "#FFD54F",         # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",     # Teal
            }
            options = {"ents": [entity['type'] for entity in entities], "colors": color_mapping}
            colored_text = displacy.render(doc, style="ent", options=options, page=True)
            return colored_text
        else:
            # IOB-tagged labels from the transformer pipelines (B- marks the start of an entity, I- its continuation).
            color_mapping = {
                'O': 'pink',
                'B-MIS': 'red',
                'I-MIS': 'brown',
                'B-PER': 'green',
                'I-PER': '#FFD54F',
                'B-ORG': 'orange',
                'I-ORG': '#FF6666',
                'B-LOC': 'purple',
                'I-LOC': '#FFCC99',
            }
            return highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer)
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"
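# Illustrative usage sketch (an assumption, not part of the app's control flow): the two
# helpers above can be exercised directly from a Python shell, e.g.
#
#   sample = "Angela Merkel met the delegation in Berlin."
#   ents = perform_ner(sample, 'bert-large-NER')
#   html = highlight_entities(sample, ents, 'bert-large-NER')
#
# For the SpaCy backend the HTML comes from displacy.render; for the transformer backends
# each entity is wrapped in an inline-styled <span> with its label appended.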
def summarize_text(input_text):
    """
    Summarizes the given input text using the Hugging Face Transformers summarization pipeline.

    Parameters:
    - input_text (str): The input text that needs to be summarized.

    Returns:
    - summarized_text (str): The summary generated by the pipeline, controlled by the
      max_length, min_length, length_penalty, num_beams, and early_stopping parameters.
    """
    summarized_text = summarization_pipeline(input_text, max_length=150, min_length=50,
                                              length_penalty=2.0, num_beams=4, early_stopping=True)
    return summarized_text[0]['summary_text']


def image_ner_tool(file, model_name):
    """
    Performs Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on the recognized entities.

    Parameters:
    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
    - model_name (str): The NER model to use ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string describing the recognized entities.
    - summary (str): Summary of the extracted text.
    """
    reformatted_ner_output = ""
    try:
        if isinstance(file, str):
            # The input is a file path.
            with open(file, 'rb') as file_stream:
                file_bytes = file_stream.read()
        else:
            # The input is a byte stream.
            file_bytes = file.getvalue()
        text = extract_text_from_image_or_pdf(file_bytes)
        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)
        reformatted_ner_output = json.dumps(entities, indent=2)
        summary = summarize_text(text)
        return text, highlighted_text, reformatted_ner_output, summary
    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Return one value per output component so the UI still updates on failure.
        return error_message, "", reformatted_ner_output, ""


import pandas as pd


def store_data_to_csv(file, model_name):
    """Appends the extracted text and recognized entities for the uploaded file to log.csv."""
    print(file)
    print(model_name)
    if isinstance(file, str):
        # The input is a file path.
        with open(file, 'rb') as file_stream:
            file_bytes = file_stream.read()
    else:
        # The input is a byte stream.
        file_bytes = file.getvalue()
    extracted_text = extract_text_from_image_or_pdf(file_bytes)
    named_entities = perform_ner(extracted_text, model_name)
    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
    df.to_csv("log.csv", mode='a', index=False, header=False)
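# Illustrative usage sketch (an assumption; "sample.pdf" is a hypothetical local file):
# image_ner_tool is also callable outside the Gradio UI, e.g.
#
#   text, highlighted, entities_json, summary = image_ner_tool("sample.pdf", "SpaCy English NER")
#
# The Submit button below wires this same function to the uploaded file and the dropdown selection.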

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Intelligent Document Processing

        Upload a PDF or an image file to extract text and identify named entities.
        """
    )
    with gr.Row():
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("Submit")
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized Text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
            store_button = gr.Button("Store Data to CSV")

    gr.Examples(
        [
            [
                "The year is 2043.pdf",  # Path to the example file
                "SpaCy English NER",     # Pre-selected value for the dropdown menu
            ]
        ],
        [text1, model],
    )

    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )

demo.launch()