wfranco committed on
Commit
196066f
·
1 Parent(s): db2e1b0

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -140
app.py DELETED
@@ -1,140 +0,0 @@
1
- import gradio as gr
2
-
3
def greet(name):
    """Return the greeting for *name*, in the form ``Hello <name>!!``."""
    prefix, suffix = "Hello ", "!!"
    return prefix + name + suffix
5
-
6
# Wrap greet() in a text-in / text-out Gradio interface and start serving it.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()
# A second interface with the same configuration; it is built here but not
# launched by this code.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
9
-
10
### Installing the packages ###
# NOTE: these commands were written as IPython/notebook shell escapes
# ("! pip install ..."), which are a SyntaxError in a plain .py module.
# They are kept here as comments only; run them in a shell, or declare the
# packages in the deployment's dependency files (e.g. a requirements.txt for
# the pip packages and a system-package list for the apt packages).
#
# PyPDF2: To read the PDF file from the repository path.
#   pip install PyPDF2
#
# pdfminer.six: To analyze the PDF layout and extract text.
#   pip install pdfminer.six
#
# Pdfplumber: To identify tables in a PDF page and extract the information from them.
#   pip install pdfplumber
#
# Pdf2image: To convert the cropped PDF image to a PNG image.
#   pip install pdf2image
#
# PIL: To read the PNG image.
#   pip install Pillow
#
# Pytesseract: To extract the text from the images using OCR technology.
#   pip install pytesseract
#
# Other libraries (system packages)
#   apt-get install poppler-utils
#   apt install tesseract-ocr
#   apt install libtesseract-dev
32
-
33
- ###Importing libraries ###
34
- # To read the PDF
35
- import PyPDF2
36
- # To analyze the PDF layout and extract text
37
- from pdfminer.high_level import extract_pages, extract_text
38
- from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
39
- # To extract text from tables in PDF
40
- import pdfplumber
41
- # To extract the images from the PDFs
42
- from PIL import Image
43
- from pdf2image import convert_from_path
44
- # To perform OCR to extract text from images
45
- import pytesseract
46
- # To remove the additional created files
47
- import os
48
-
49
- ### Create a function to extract text ###
50
-
51
def text_extraction(element):
    """Return an element's text together with the formats used inside it.

    Args:
        element: A text layout element. It must provide ``get_text()`` and
            be iterable over its text lines.

    Returns:
        A ``(line_text, format_per_line)`` tuple. ``format_per_line`` is a
        de-duplicated list that mixes the font names and font sizes of every
        ``LTChar`` found in the element; its order is unspecified because it
        is produced from a set.
    """
    # Grab the raw text first, exactly as the element reports it.
    element_text = element.get_text()

    # Gather font name and font size of each real character, line by line.
    seen_formats = []
    for line in element:
        if not isinstance(line, LTTextContainer):
            continue
        for glyph in line:
            # Only LTChar items are inspected for font information.
            if isinstance(glyph, LTChar):
                seen_formats.extend((glyph.fontname, glyph.size))

    # Collapse repeats so each font name / size is reported once.
    return (element_text, list(set(seen_formats)))
72
-
73
- ### Step 4: Create a function that reads a PDF file ###
74
-
75
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF along with per-line formats.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` is the zero-based page index)
        to a five-item list
        ``[page_text, line_format, text_from_images, text_from_tables,
        page_content]``:

        * ``page_text`` / ``page_content``: text of each text element, in
          the order the elements are visited (descending ``y1``).
        * ``line_format``: the format list returned by ``text_extraction``
          for each of those elements.
        * ``text_from_images`` / ``text_from_tables``: always empty lists.
          This function performs no image or table extraction; the slots
          are kept so the shape of the result stays stable for callers.
    """
    # NOTE: earlier revisions also opened the file with PyPDF2 and re-opened
    # it with pdfplumber on every page (running find_tables()), but none of
    # those results were used and the pdfplumber handles were never closed.
    # That dead work is dropped, so no file handle is left open here.
    text_per_page = {}

    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print(f"Elaborating Page_{pagenum}")

        page_text = []
        line_format = []
        page_content = []

        # Visit the layout elements by descending y1 (the top edge of each
        # element), i.e. in the order they appear on the page. Iterating the
        # page directly avoids reaching into its private ``_objs`` list.
        ordered_elements = sorted(
            page, key=lambda item: item.y1, reverse=True
        )
        for element in ordered_elements:
            # Only text elements contribute to the result.
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        # Image and table slots stay empty (see the docstring).
        text_per_page[f"Page_{pagenum}"] = [
            page_text,
            line_format,
            [],
            [],
            page_content,
        ]

    return text_per_page