File size: 12,725 Bytes
206555d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4acb675
206555d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d51bfff
206555d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# Third-party and stdlib dependencies for the document-processing demo.
import gradio as gr
import PyPDF2
from PyPDF2 import PdfReader
from io import BytesIO
import pytesseract
from PIL import Image
import spacy
import json

from transformers import pipeline
# NOTE: duplicate of the PdfReader import above; harmless.
from PyPDF2 import PdfReader

# Everything below is instantiated once at import time and shared by the
# functions in this file.

# Token-classification pipeline that perform_ner uses for 'bert-large-NER'.
ner_model = pipeline('token-classification', model='dslim/bert-large-NER')
# Summarization pipeline used by summarize_text.
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
# Display name -> model identifier. The keys populate the model dropdown in the
# UI; only the SpaCy entry is read back here, the two pipelines repeat their
# identifiers literally.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])
# Token-classification pipeline that perform_ner uses for every model name
# other than 'SpaCy English NER' and 'bert-large-NER'.
ner_model_bio = pipeline('token-classification', model='d4data/biomedical-ner-all')
from transformers import AutoTokenizer
# Passed to the entity highlighter by highlight_entities.
# NOTE(review): this is the bert-base checkpoint while ner_model loads
# bert-large - confirm the mismatch is intended.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
from spacy import displacy


def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): The text of every page concatenated in page order, with no
      separator inserted between pages.
    """
    pdf_reader = PdfReader(BytesIO(pdf_bytes))

    # Iterate the pages directly instead of indexing by position, and join once
    # rather than growing a string with +=. `or ''` keeps a page that yields no
    # text from breaking the join.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def extract_text_from_image_or_pdf(file_bytes):
    """
    Returns the text content of an uploaded file supplied as raw bytes.

    Content that begins with the b'%PDF' signature is handed to
    extract_text_from_pdf; anything else is opened as an image with PIL and
    run through pytesseract.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): The extracted text. If any exception is raised along the way,
      a message starting with "Error extracting text: " is returned instead.
    """
    try:
        looks_like_pdf = file_bytes.startswith(b'%PDF')
        if not looks_like_pdf:
            # Anything that is not a PDF is assumed to be an image
            picture = Image.open(BytesIO(file_bytes))
            return pytesseract.image_to_string(picture)
        return extract_text_from_pdf(file_bytes)
    except Exception as err:
        return f"Error extracting text: {str(err)}"



def perform_ner(text, model_name):
    """
    Runs Named Entity Recognition over `text` with the selected model.

    Parameters:
    - text (str): The input text to analyse.
    - model_name (str): 'SpaCy English NER' selects the spaCy model and
      'bert-large-NER' the general token-classification pipeline. Any other
      value (including 'bioNER') is served by the biomedical pipeline.

    Returns:
    - extracted_entities (list): One dictionary per recognized entity with the
      keys 'text', 'type', 'start_index', 'end_index'.
    - error_message (str): If anything raises, a message starting with
      "Error performing NER: " is returned instead of the list.
    """
    try:
        if model_name == 'SpaCy English NER':
            found = []
            for span in spacy_ner_model(text).ents:
                found.append({'text': span.text, 'type': span.label_,
                              'start_index': span.start_char, 'end_index': span.end_char})
            return found

        # Both Hugging Face pipelines produce the same record shape, so pick
        # the pipeline first and convert its output once.
        hf_pipeline = ner_model if model_name == 'bert-large-NER' else ner_model_bio
        return [
            {'text': token['word'], 'type': token['entity'],
             'start_index': token['start'], 'end_index': token['end']}
            for token in hf_pipeline(text)
        ]

    except Exception as err:
        return f"Error performing NER: {str(err)}"


def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Wraps each entity span of `text` in a colored `<mark>` element and returns
    the result as an HTML fragment.

    The document text and the entity labels are HTML-escaped before being
    placed in the markup, so characters such as `<` or `&` coming from an
    uploaded file are displayed literally instead of being interpreted.

    Parameters:
    - `text` (str): The raw input text.
    - `entities` (list): A list of dictionaries read through the keys
      `start_index`, `end_index` and `type`; missing keys default to 0, 0 and
      'O'. Entities are emitted in the order given.
    - `color_mapping` (dict): Maps entity labels to CSS background colors.
      Labels without an entry use '#4D94FF'.
    - `tokenizer`: Unused. Kept so existing callers that pass a tokenizer keep
      working; the tokenization it used to perform never affected the output.

    Returns:
    - `highlighted_text` (str): The escaped text with every entity rendered as
      `<mark ...>entity (label)</mark>`.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    pieces = []
    current_pos = 0

    for ent in entities:
        start, end, label = ent.get('start_index', 0), ent.get('end_index', 0), ent.get('type', 'O')

        # Plain text between the previous entity and this one
        pieces.append(escape(text[current_pos:start]))

        color = color_mapping.get(label, '#4D94FF')
        # escape() also encodes quotes, which matters inside title='...'
        safe_label = escape(str(label))
        safe_entity = escape(text[start:end])
        pieces.append(
            f"<mark style='background-color:{color}' title='{safe_label}'>{safe_entity} ({safe_label})</mark>"
        )

        current_pos = end

    # Remaining text after the last entity
    pieces.append(escape(text[current_pos:]))

    return ''.join(pieces)
def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries containing information about the recognized entities.
      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
    - model_name (str): The name of the NER model that produced `entities`.
      'SpaCy English NER' is rendered with displaCy; every other value goes
      through highlight_entities_with_colors_and_labels_tokenized.

    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process, a message
      starting with "Error highlighting entities: " is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            # displaCy renders from a Doc, so the text is parsed again here;
            # `entities` only decides which labels are shown.
            doc = spacy_ner_model(text)

            color_mapping = {
                "DATE": "#4D94FF",         # Blue
                "PERSON": "#4CAF50",       # Green
                "EVENT": "#FF6666",        # Salmon
                "FAC": "#66B2FF",          # Sky Blue
                "GPE": "#FFCC99",          # Light Apricot
                "LANGUAGE": "#FF80BF",     # Pink
                "LAW": "#66FF99",          # Mint
                "LOC": "#809FFF",          # Lavender Blue
                "MONEY": "#FFFF99",        # Light Yellow
                "NORP": "#808000",         # Olive Green
                "ORDINAL": "#FF9999",      # Misty Rose
                "ORG": "#FFB366",          # Light Peach
                "PERCENT": "#FF99FF",      # Orchid
                "PRODUCT": "#FF6666",      # Salmon
                "QUANTITY": "#CC99FF",     # Pastel Purple
                "TIME": "#FFD54F",         # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",     # Teal
            }

            options = {"ents": [entity['type'] for entity in entities], "colors": color_mapping}

            return displacy.render(doc, style="ent", options=options, page=True)

        # The bert NER model tags miscellaneous entities as B-MISC / I-MISC.
        # The previous 'B-MIS' / 'I-MIS' keys never matched, so those entities
        # always fell back to the highlighter's default color.
        color_mapping = {
            'O': 'pink',
            'B-MISC': 'red',
            'I-MISC': 'brown',
            'B-PER': 'green',
            'I-PER': '#FFD54F',
            'B-ORG': 'orange',
            'I-ORG': '#FF6666',
            'B-LOC': 'purple',
            'I-LOC': '#FFCC99',
        }
        return highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer)

    except Exception as e:
        return f"Error highlighting entities: {str(e)}"


def summarize_text(input_text):
    """
    Produces a short summary of `input_text` with the module-level Hugging Face
    summarization pipeline.

    Parameters:
    - **input_text (str):** The input text that needs to be summarized.

    Returns:
    - **summarized_text (str):** The `summary_text` of the first pipeline result,
      generated with `max_length=150`, `min_length=50`, `length_penalty=2.0`,
      `num_beams=4` and `early_stopping=True`. An empty string is returned when
      there is nothing to summarize (empty, whitespace-only or `None` input).
    """
    # Nothing to summarize: skip the model call entirely.
    if not input_text or not input_text.strip():
        return ""

    # truncation=True clips inputs that exceed the model's maximum input
    # length, so whole documents no longer make the pipeline fail.
    results = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        truncation=True,
    )

    return results[0]['summary_text']


def image_ner_tool(file, model_name):
    """
    Perform Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on recognized entities
    and also summarized.

    Parameters:
    - file (str or BytesIO): Either a file path or a BytesIO object containing the image or PDF file.
    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    A tuple of four strings, in this order:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string containing information about the recognized entities.
    - summary (str): Summary of the extracted text.

    If any step raises, the first element is a message starting with
    "Error processing file: ", the NER JSON is whatever had been produced so
    far, and the remaining elements are empty strings.
    """
    reformatted_ner_output = ""
    try:
        if isinstance(file, str):  # If the input is a file path
            with open(file, 'rb') as file_stream:
                file_bytes = file_stream.read()
        else:  # If the input is a byte stream
            file_bytes = file.getvalue()

        text = extract_text_from_image_or_pdf(file_bytes)

        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)

        reformatted_ner_output = json.dumps(entities, indent=2)

        summary = summarize_text(text)

        return text, highlighted_text, reformatted_ner_output, summary

    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Must have the same arity as the success path: the click handler
        # binds this function to four output components.
        return error_message, "", reformatted_ner_output, ""


import pandas as pd
def store_data_to_csv(inputs, outputs):
    """
    Appends the extracted text and entities of a file to "log.csv".

    Parameters:
    - inputs (str or BytesIO): A file path, or an object exposing `getvalue()`.
    - outputs (str): Forwarded to perform_ner as the model name (the UI wires
      the model dropdown to this argument).

    One row with two columns (extracted text, extracted entities) is appended;
    no header row and no index are written.
    """
    print(inputs)
    print(outputs)

    if not isinstance(inputs, str):
        # Byte stream
        raw_bytes = inputs.getvalue()
    else:
        # File path
        with open(inputs, 'rb') as handle:
            raw_bytes = handle.read()

    document_text = extract_text_from_image_or_pdf(raw_bytes)
    found_entities = perform_ner(document_text, outputs)

    row = {"Extracted Text": [document_text], "Extracted Entities": [found_entities]}
    pd.DataFrame(row).to_csv("log.csv", mode='a', index=False, header=False)





# Gradio UI: upload + model selection on the left, one tab per result on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <p style="text-align: center; font-weight: bold; font-size: 44px;">
        Intelligent Document Processing
        </p>

        <p style="text-align: center;">
        Upload a PDF or an image file to extract text and identify named entities
        </p>
        """
    )
    with gr.Row() as row:
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("submit")
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
                store_button = gr.Button("Store Data to CSV")
    gr.Examples(
        [
            [  # One example row: values for [text1, model]
                "The year is 2043.pdf",  # Path to the example document
                "SpaCy English NER"  # Model preselected in the dropdown
            ]
        ],
        [text1, model],
    )
    # image_ner_tool returns (text, highlighted HTML, NER JSON, summary); the
    # output list is ordered to match, hence output4 before output3.
    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    # No outputs: the handler only appends a row to log.csv.
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )


demo.launch()