barser65 committed on
Commit
403f739
·
1 Parent(s): 2addf3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -1
app.py CHANGED
@@ -1,5 +1,145 @@
1
- import myfunct
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
 
 
3
  import gradio as gr
4
 
5
  #def greet(name):
 
1
def _find_abstract(text_per_page):
    """Return the first non-empty abstract found in the extracted pages.

    Args:
        text_per_page: Mapping of ``'Page_<n>'`` keys to a list of sections,
            each section being a list whose items are joined with a single
            space before searching (the structure built by ``read_pdf``
            inside ``converti``).

    Returns:
        The text between the ``'Abstract\\n'`` heading and the next
        ``'1\\n'`` marker, or ``''`` when no page contains a usable abstract.
    """
    heading = 'Abstract\n'
    # NOTE(review): '1\n' is presumably the number of the first section
    # heading; it can also match earlier text ending in "1" (e.g. a year).
    end_marker = '1\n'

    for sections in text_per_page.values():
        for section in sections:
            joined = ' '.join(map(str, section))
            start = joined.find(heading)
            # find() returns -1 when absent; index 0 is a valid match.
            if start == -1:
                continue
            # Skip the heading itself, then the separator that join() puts
            # between the heading element and the following text.
            body = joined[start + len(heading):].lstrip(' ')
            end = body.find(end_marker)
            # Without an end marker keep the remaining text instead of
            # silently cutting its last character.
            if end != -1:
                body = body[:end]
            if body:
                return body
    return ''


def converti(path):
    """Summarize the abstract of a PDF stored on the mounted Google Drive.

    The function installs its dependencies with pip, mounts Google Drive
    under ``/content/drive``, extracts the text of every page with
    pdfminer, locates the abstract and summarizes it with the
    ``facebook/bart-large-cnn`` summarization pipeline.

    Args:
        path: File name of the PDF, relative to ``/content/drive/MyDrive/``.

    Returns:
        The summary text produced by the summarization pipeline.

    Raises:
        ValueError: If no abstract can be located in the PDF (the previous
            implementation looped forever in that case).
    """
    import subprocess
    import sys

    def install(package):
        # pip does not support being driven through pip.main() inside a
        # running interpreter; invoke it as a subprocess instead. Failures
        # are not raised, as with the pip.main() call this replaces.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', package],
            check=False,
        )

    # NOTE(review): 'poppler-utils', 'tesseract-ocr' and 'libtesseract-dev'
    # look like system (apt) package names rather than Python distributions,
    # and PyPDF2/pdfplumber are no longer used below. The list is kept so
    # the environment is prepared as before; confirm and trim.
    packages = (
        'git+https://github.com/huggingface/transformers.git',
        # Previously passed as the single string 'datasets sentencepiece',
        # which pip cannot parse as one requirement.
        'datasets',
        'sentencepiece',
        'PyPDF2',
        'pdfminer.six',
        'pdfplumber',
        'poppler-utils',
        'tesseract-ocr',
        'libtesseract-dev',
    )
    for package in packages:
        install(package)

    # Imported only after installation so a fresh environment works.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer, LTChar

    def text_extraction(element):
        """Return ``(text, formats)`` for one pdfminer text element."""
        line_text = element.get_text()

        # Collect the font name and size of every character in the element.
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                for character in text_line:
                    if isinstance(character, LTChar):
                        line_formats.append(character.fontname)
                        line_formats.append(character.size)
        # Keep each font name / size once.
        format_per_line = list(set(line_formats))

        return (line_text, format_per_line)

    def read_pdf(pdf_path):
        """Return a dict mapping ``'Page_<n>'`` to the extracted page data.

        Each value is ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``; the image and table lists are
        always empty because no such extraction is performed.
        """
        text_per_page = {}
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" + str(pagenum))
            page_text = []
            line_format = []
            page_content = []

            # Visit the layout elements from the top of the page downwards.
            ordered = sorted(page, key=lambda item: item.y1, reverse=True)
            for element in ordered:
                if isinstance(element, LTTextContainer):
                    line_text, format_per_line = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)

            text_per_page['Page_' + str(pagenum)] = [
                page_text,
                line_format,
                [],
                [],
                page_content,
            ]

        return text_per_page

    # NOTE(review): google.colab only exists inside Colab notebooks; this
    # import fails elsewhere. Confirm the intended runtime.
    from google.colab import drive
    drive.mount('/content/drive')

    pdf_path = '/content/drive/MyDrive/' + path
    text_per_page = read_pdf(pdf_path)

    abstr = _find_abstract(text_per_page)
    if not abstr:
        raise ValueError('No abstract found in ' + pdf_path)
    print(abstr)

    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(abstr, max_length=56)
    summary_text = summary[0]['summary_text']

    return summary_text
142
+
143
  import gradio as gr
144
 
145
  #def greet(name):