keshva committed on
Commit
206555d
·
1 Parent(s): b85c912

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +338 -0
app.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ from PyPDF2 import PdfReader
4
+ from io import BytesIO
5
+ import pytesseract
6
+ from PIL import Image
7
+ import spacy
8
+ import json
9
+
10
from transformers import AutoTokenizer, pipeline
from PyPDF2 import PdfReader
from spacy import displacy

# Display name (the keys populate the UI dropdown) -> model identifier.
ner_models = {
    'bert-large-NER': 'dslim/bert-large-NER',
    'bioNER': 'd4data/biomedical-ner-all',
    'SpaCy English NER': 'en_core_web_trf',
}

# Models are loaded once at import time and shared by every request.
# The identifiers are read from `ner_models` so each one is spelled in a single place.
ner_model = pipeline('token-classification', model=ner_models['bert-large-NER'])
ner_model_bio = pipeline('token-classification', model=ner_models['bioNER'])
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

# Fetch the spaCy model only when it is not installed yet, instead of
# re-running the download on every start of the app.
if not spacy.util.is_package(ner_models['SpaCy English NER']):
    spacy.cli.download(ner_models['SpaCy English NER'])
spacy_ner_model = spacy.load(ner_models['SpaCy English NER'])

# Tokenizer handed to the HTML highlighting helper for the transformer models.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
25
+
26
+
27
def extract_text_from_pdf(pdf_bytes):
    """
    Extracts text from a PDF file using PyPDF2.

    Parameters:
    - pdf_bytes (bytes): The content of the PDF file in bytes.

    Returns:
    - text (str): Text of all pages concatenated in page order. No separator
      is inserted between pages; a page that yields no text contributes an
      empty string.
    """
    pdf_reader = PdfReader(BytesIO(pdf_bytes))

    # `or ''` keeps the join safe if a page yields no string at all
    # (for example a scanned, image-only page).
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
46
+
47
+
48
def extract_text_from_image_or_pdf(file_bytes):
    """
    Extracts text from either a PDF or an image file using PyPDF2 and pytesseract.

    Content starting with the `%PDF` signature is parsed as a PDF; anything
    else is assumed to be an image and is passed through OCR.

    Parameters:
    - file_bytes (bytes): The content of the file in bytes.

    Returns:
    - text (str): Extracted text from the file. If extraction fails for any
      reason, a message starting with "Error extracting text:" is returned
      instead of raising, so callers always receive a string.
    """
    try:
        if file_bytes.startswith(b'%PDF'):
            return extract_text_from_pdf(file_bytes)

        # The context manager releases the decoded image as soon as OCR is done.
        with Image.open(BytesIO(file_bytes)) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        # Deliberate best-effort: the failure is reported as text, not raised.
        return f"Error extracting text: {str(e)}"
70
+
71
+
72
+
73
def perform_ner(text, model_name):
    """
    Runs Named Entity Recognition (NER) over `text` with the selected model.

    Parameters:
    - text (str): The input text on which NER will be performed.
    - model_name (str): 'SpaCy English NER' selects the spaCy model and
      'bert-large-NER' selects the BERT pipeline; any other value falls
      through to the biomedical pipeline ('bioNER').

    Returns:
    - extracted_entities (list): One dictionary per recognized entity with the
      keys 'text', 'type', 'start_index' and 'end_index'.
    - error_message (str): Returned instead of the list when anything goes
      wrong; it starts with "Error performing NER:".
    """
    try:
        if model_name == 'SpaCy English NER':
            return [
                {'text': span.text, 'type': span.label_,
                 'start_index': span.start_char, 'end_index': span.end_char}
                for span in spacy_ner_model(text).ents
            ]

        # Both transformer pipelines produce records with the same keys, so
        # only the choice of pipeline differs between them.
        hf_pipeline = ner_model if model_name == 'bert-large-NER' else ner_model_bio
        return [
            {'text': item['word'], 'type': item['entity'],
             'start_index': item['start'], 'end_index': item['end']}
            for item in hf_pipeline(text)
        ]
    except Exception as e:
        return f"Error performing NER: {str(e)}"
104
+
105
+
106
def highlight_entities_with_colors_and_labels_tokenized(text, entities, color_mapping, tokenizer):
    """
    Wraps every entity span of `text` in a coloured `<mark>` element and returns the resulting HTML.

    The text comes from an uploaded document, so the plain text, the entity
    text and the label are all HTML-escaped before being placed in the markup.

    Parameters:
    - `text` (str): The raw input text.
    - `entities` (list): A list of dictionaries, each containing the start index (`start_index`),
      end index (`end_index`) and type (`type`) of an entity. Missing keys default to 0, 0 and 'O'.
      Entities are expected in ascending start order; a span that begins before the end of an
      already emitted span, or whose end precedes its start, is skipped.
    - `color_mapping` (dict): A dictionary mapping entity labels to background colors for highlighting.
      Labels without an entry use '#4D94FF'.
    - `tokenizer`: Not used. Kept only so existing callers that pass a tokenizer keep working.

    Returns:
    - `highlighted_text` (str): The formatted text with highlighted entities using HTML-style markup.
    """
    from html import escape  # local import so the block does not rely on a file-level import

    pieces = []
    current_pos = 0

    for ent in entities:
        start = ent.get('start_index', 0)
        end = ent.get('end_index', 0)
        label = ent.get('type', 'O')

        # Emitting an overlapping or inverted span would repeat text that is
        # already part of the output.
        if start < current_pos or end < start:
            continue

        # Non-entity text between the previous span and this one.
        pieces.append(escape(text[current_pos:start]))

        # Highlighted entity text with color and label on the same line.
        color = color_mapping.get(label, '#4D94FF')
        safe_label = escape(str(label))
        pieces.append(
            f"<mark style='background-color:{color}' title='{safe_label}'>"
            f"{escape(text[start:end])} ({safe_label})</mark>"
        )

        current_pos = end

    # Any remaining non-entity text.
    pieces.append(escape(text[current_pos:]))

    return ''.join(pieces)
148
# Background colors for the labels rendered by displaCy (spaCy model).
_SPACY_ENTITY_COLORS = {
    "DATE": "#4D94FF",  # Blue
    "PERSON": "#4CAF50",  # Green
    "EVENT": "#FF6666",  # Salmon
    "FAC": "#66B2FF",  # Sky Blue
    "GPE": "#FFCC99",  # Light Apricot
    "LANGUAGE": "#FF80BF",  # Pink
    "LAW": "#66FF99",  # Mint
    "LOC": "#809FFF",  # Lavender Blue
    "MONEY": "#FFFF99",  # Light Yellow
    "NORP": "#808000",  # Olive Green
    "ORDINAL": "#FF9999",  # Misty Rose
    "ORG": "#FFB366",  # Light Peach
    "PERCENT": "#FF99FF",  # Orchid
    "PRODUCT": "#FF6666",  # Salmon
    "QUANTITY": "#CC99FF",  # Pastel Purple
    "TIME": "#FFD54F",  # Amber
    "WORK_OF_ART": "#FFC266",  # Light Orange
    "CARDINAL": "#008080",  # Teal
}

# Background colors for the BIO tags of the transformer models.
# NOTE(review): the dslim NER checkpoints appear to emit 'B-MISC'/'I-MISC';
# the original 'B-MIS'/'I-MIS' spellings are kept as aliases — confirm against
# the model's id2label.
_BIO_TAG_COLORS = {
    'O': 'pink',
    'B-MIS': 'red',
    'I-MIS': 'brown',
    'B-MISC': 'red',
    'I-MISC': 'brown',
    'B-PER': 'green',
    'I-PER': '#FFD54F',
    'B-ORG': 'orange',
    'I-ORG': '#FF6666',
    'B-LOC': 'purple',
    'I-LOC': '#FFCC99',
}


def highlight_entities(text, entities, model_name):
    """
    Highlights named entities in the given text and returns HTML with colored annotations.

    For 'SpaCy English NER' the text is run through the spaCy model again and
    rendered with displaCy as a full HTML page, restricted to the entity types
    present in `entities`. For every other model the markup is built from the
    character offsets stored in `entities`.

    Parameters:
    - text (str): The input text containing named entities.
    - entities (list): A list of dictionaries containing information about the recognized entities.
      Each dictionary has the keys: 'text', 'type', 'start_index', 'end_index'.
    - model_name (str): The name of the NER model that produced `entities`.

    Returns:
    - colored_text (str): HTML with colored annotations highlighting the recognized entities.
    - error_message (str): If an error occurs during the highlighting process, a message starting
      with "Error highlighting entities:" is returned instead.
    """
    try:
        if model_name == 'SpaCy English NER':
            doc = spacy_ner_model(text)
            options = {
                "ents": sorted({entity['type'] for entity in entities}),
                "colors": _SPACY_ENTITY_COLORS,
            }
            # jupyter=False: always return the markup as a string, even when a
            # notebook environment is auto-detected (displaCy would otherwise
            # display the output itself and return None).
            return displacy.render(doc, style="ent", options=options, page=True, jupyter=False)

        return highlight_entities_with_colors_and_labels_tokenized(
            text, entities, _BIO_TAG_COLORS, tokenizer
        )
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"
212
+
213
+
214
def summarize_text(input_text):
    """
    Summarizes `input_text` with the module-level Hugging Face summarization pipeline.

    Parameters:
    - input_text (str): The input text that needs to be summarized.

    Returns:
    - summarized_text (str): The 'summary_text' of the first pipeline result, generated with
      `max_length=150`, `min_length=50`, `length_penalty=2.0`, `num_beams=4` and
      `early_stopping=True`. An empty string is returned, without calling the model,
      when the input is empty or contains only whitespace.
    """
    # Nothing to summarize, e.g. OCR found no text in the uploaded file.
    if not input_text or not input_text.strip():
        return ""

    results = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        # Documents longer than the model's input limit are cut to fit
        # instead of making the pipeline fail.
        truncation=True,
    )

    return results[0]['summary_text']
232
+
233
+
234
import pandas as pd


def _read_file_bytes(file):
    """
    Returns the raw content of an uploaded file.

    Accepts bytes, a filesystem path, an in-memory buffer exposing `getvalue()`,
    an object exposing a string `name` path (such as a temporary-file wrapper),
    or any object with `read()`.

    Raises:
    - ValueError: If `file` is None (nothing was uploaded).
    - TypeError: If `file` is none of the supported kinds.
    """
    if file is None:
        raise ValueError("No file was provided")
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)
    if isinstance(file, str) or hasattr(file, '__fspath__'):
        with open(file, 'rb') as file_stream:
            return file_stream.read()
    if hasattr(file, 'getvalue'):
        return file.getvalue()

    # Prefer re-opening by path over read(): the handle may already have been consumed.
    path = getattr(file, 'name', None)
    if isinstance(path, str):
        with open(path, 'rb') as file_stream:
            return file_stream.read()
    if hasattr(file, 'read'):
        return file.read()
    raise TypeError(f"Unsupported file input: {type(file).__name__}")


def image_ner_tool(file, model_name):
    """
    Perform Named Entity Recognition (NER) on the text extracted from an image or PDF file.
    The extracted text is highlighted with colored annotations based on recognized entities
    and is also summarized.

    Parameters:
    - file (str or file-like): Either a file path or an object holding the image or PDF file.
    - model_name (str): The name of the NER model to be used ('bert-large-NER', 'bioNER', or 'SpaCy English NER').

    Returns:
    A 4-tuple, in this order:
    - text (str): Extracted text from the input file.
    - highlighted_text (str): HTML with colored annotations highlighting the recognized entities.
    - reformatted_ner_output (str): JSON-formatted string containing information about the recognized entities.
    - summary (str): Summary of the extracted text.

    If any step raises, the tuple is (error message, "", NER output produced so far, "").
    """
    reformatted_ner_output = ""
    try:
        file_bytes = _read_file_bytes(file)

        text = extract_text_from_image_or_pdf(file_bytes)

        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)

        reformatted_ner_output = json.dumps(entities, indent=2)

        summary = summarize_text(text)

        return text, highlighted_text, reformatted_ner_output, summary

    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Four values, matching the success path, so a UI callback bound to
        # four outputs also receives a complete result on failure.
        return error_message, "", reformatted_ner_output, ""


def store_data_to_csv(inputs, outputs):
    """
    Appends the extracted text and named entities of a file to "log.csv".

    The file is processed again from scratch: text is extracted and NER is run
    with the given model. One row with two columns (extracted text, extracted
    entities) is appended; no header row is written.

    Parameters:
    - inputs (str or file-like): Either a file path or an object holding the image or PDF file.
    - outputs (str): Despite its name, the NER model name forwarded to `perform_ner`.

    Raises:
    - ValueError / TypeError: If `inputs` is missing or of an unsupported kind.
    """
    file_bytes = _read_file_bytes(inputs)

    extracted_text = extract_text_from_image_or_pdf(file_bytes)
    named_entities = perform_ner(extracted_text, outputs)
    df = pd.DataFrame({"Extracted Text": [extracted_text], "Extracted Entities": [named_entities]})
    df.to_csv("log.csv", mode='a', index=False, header=False)
286
+
287
+
288
+
289
+
290
+
291
import os

# Sample document offered as a one-click example in the UI.
EXAMPLE_FILE = "/content/The year is 2043.pdf"

# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened listing — confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <p style="text-align: center; font-weight: bold; font-size: 44px;">
        Intelligent Document Processing
        </p>

        <p style="text-align: center;">
        Upload a PDF or an image file to extract text and identify named entities
        </p>
        """
    )
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            text1 = gr.File(label="Upload File")
            model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model")
            btn = gr.Button("submit")
        # Right column: one tab per result.
        with gr.Column():
            with gr.Tab("Extracted Text"):
                output1 = gr.Textbox(label="Extracted Text", container=True)
            with gr.Tab("Highlighted Entities"):
                output2 = gr.HTML(label="Highlighted Text")
            with gr.Tab("Summarized Text"):
                output3 = gr.HTML(label="Summarized text")
            with gr.Tab("Named Entities Extracted"):
                output4 = gr.HTML(label="Named Entities")
            store_button = gr.Button("Store Data to CSV")

    # Only register the example when the sample file is actually present, so a
    # missing file cannot break start-up.
    if os.path.exists(EXAMPLE_FILE):
        gr.Examples(
            examples=[
                [
                    EXAMPLE_FILE,  # Path to the sample file
                    "SpaCy English NER",  # Selected value for the dropdown menu
                ]
            ],
            inputs=[text1, model],
        )

    # image_ner_tool returns (text, highlighted, entities JSON, summary); the
    # output list is ordered to route each value to its tab.
    btn.click(
        image_ner_tool,
        [text1, model],
        [output1, output2, output4, output3],
    )
    store_button.click(
        store_data_to_csv,
        [text1, model],
    )


demo.launch()