wfranco committed on
Commit
9e4f7af
·
1 Parent(s): f9d18db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -1
app.py CHANGED
@@ -8,4 +8,134 @@ iface.launch()
8
 
9
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
10
 
11
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
10
 
11
### Installing the packages ###
# NOTE: the `!pip install ...` / `!apt install ...` lines that used to live here
# are IPython/Jupyter shell escapes. They are a SyntaxError in a plain Python
# script such as app.py, so they are kept only as documentation of what must be
# installed before this file is run.
#
# Python packages (e.g. list them in requirements.txt):
#   PyPDF2        - to read the PDF file from the repository path.
#   pdfminer.six  - to analyze the PDF layout and extract text.
#   pdfplumber    - to identify tables in a PDF page and extract the
#                   information from them.
#   pdf2image     - to convert the cropped PDF image to a PNG image.
#   Pillow        - (PIL) to read the PNG image.
#   pytesseract   - to extract the text from the images using OCR technology.
#
# System packages (e.g. list them in packages.txt on a Hugging Face Space):
#   poppler-utils
#   tesseract-ocr
#   libtesseract-dev
33
+
34
+ ###Importing libraries ###
35
+ # To read the PDF
36
+ import PyPDF2
37
+ # To analyze the PDF layout and extract text
38
+ from pdfminer.high_level import extract_pages, extract_text
39
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
40
+ # To extract text from tables in PDF
41
+ import pdfplumber
42
+ # To extract the images from the PDFs
43
+ from PIL import Image
44
+ from pdf2image import convert_from_path
45
+ # To perform OCR to extract text from images
46
+ import pytesseract
47
+ # To remove the additional created files
48
+ import os
49
+
50
+ ### Create a function to extract text ###
51
+
52
def text_extraction(element):
    """Return the text of a layout element together with the formats used in it.

    Args:
        element: A layout object that provides ``get_text()`` and can be
            iterated to obtain its lines (presumably a pdfminer
            ``LTTextContainer`` such as a text box -- confirm against callers).

    Returns:
        tuple: ``(line_text, format_per_line)`` where ``line_text`` is the
        result of ``element.get_text()`` and ``format_per_line`` is a list of
        the distinct font names and font sizes found on the element's
        characters, in order of first appearance.
    """
    # Extracting the text from the in-line text element
    line_text = element.get_text()

    # Collect every font name and font size that appears in the element.
    # A dict is used as an ordered set: keys are unique and keep insertion
    # order, so the result is the same on every run (a plain set is not).
    seen_formats = {}
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        # Iterating through each character in the line of text
        for character in text_line:
            if isinstance(character, LTChar):
                seen_formats[character.fontname] = None
                seen_formats[character.size] = None

    # Return a tuple with the text in each line along with its format
    return (line_text, list(seen_formats))
73
+
74
+ ### Step 4: Create a function that reads a PDF file ###
75
+
76
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF, from the top of each page down.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict: Maps ``'Page_<n>'`` (``n`` is the zero-based page index) to a
        five-item list ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``:

        * ``page_text`` -- text of each text element on the page.
        * ``line_format`` -- for each text element, the list of fonts/sizes
          returned by ``text_extraction``.
        * ``text_from_images`` / ``text_from_tables`` -- always empty lists;
          this function does not extract images or tables.
        * ``page_content`` -- same strings as ``page_text``.
    """
    # NOTE: only pdfminer is needed for plain text. No PyPDF2 / pdfplumber
    # handle is opened here because nothing below would read from it; when
    # image or table extraction is added, open those handles once, outside
    # the page loop, inside `with` blocks so they are always closed.
    text_per_page = {}

    # We extract the pages from the PDF
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print("Elaborating Page_" + str(pagenum))

        # Initialize the variables needed for the text extraction from the page
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []

        # Sort the elements by the position of their top side (y1), highest
        # first, so they are visited as they appear on the page.
        ordered_elements = sorted(page, key=lambda item: item.y1, reverse=True)

        for element in ordered_elements:
            # Only text elements contribute to the result
            if not isinstance(element, LTTextContainer):
                continue
            # Use the function to extract the text and format for each text element
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        # Add the list of lists as the value of the page key
        text_per_page['Page_' + str(pagenum)] = [
            page_text,
            line_format,
            text_from_images,
            text_from_tables,
            page_content,
        ]

    return text_per_page