barser65 committed on
Commit
ff92222
·
1 Parent(s): 8f0f72b

Delete myfunct.py

Browse files
Files changed (1) hide show
  1. myfunct.py +0 -141
myfunct.py DELETED
@@ -1,141 +0,0 @@
1
# Heading that introduces the abstract in the extracted text.
_ABSTRACT_MARKER = 'Abstract\n'
# Text treated as the end of the abstract. Presumably the number of the
# "1 Introduction" heading; note that any line ending in "1" also matches
# (e.g. "2021\n") -- TODO confirm this heuristic against real inputs.
_SECTION_ONE_MARKER = '1\n'


def _pip_install(*requirements):
    """Best-effort ``pip install`` of *requirements* into this interpreter.

    pip has no supported in-process API, so it is run as a subprocess of the
    current interpreter. A failed install is not raised here (the original
    code ignored pip's exit status as well); a package that is really missing
    surfaces as an ``ImportError`` at the import that needs it.
    """
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, '-m', 'pip', 'install'] + list(requirements),
        check=False,
    )


def _text_extraction(element):
    """Return ``(text, formats)`` for one pdfminer text element.

    ``text`` is ``element.get_text()``. ``formats`` is a list of the distinct
    font names and font sizes of the characters found in the element's lines;
    it is built from a set, so its order is unspecified.
    """
    from pdfminer.layout import LTChar, LTTextContainer

    line_text = element.get_text()

    formats = set()
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                formats.add(character.fontname)
                formats.add(character.size)

    return (line_text, list(formats))


def _read_pdf(pdf_path):
    """Extract the text elements of every page of the PDF at *pdf_path*.

    Returns a dict keyed ``'Page_<n>'`` (n counted from 0). Each value is the
    five-item list ``[page_text, line_format, text_from_images,
    text_from_tables, page_content]``. Image and table extraction is not
    implemented, so those two entries are always empty lists; ``page_text``
    and ``page_content`` hold the same strings in separate lists.
    """
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    text_per_page = {}
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        print("Elaborating Page_" +str(pagenum))
        page_text = []
        line_format = []
        page_content = []

        # Sort by the element's y1 coordinate, largest first, so elements are
        # visited in the order they appear down the page. Iterating the page
        # yields its layout objects without touching the private ``_objs``.
        elements = sorted(page, key=lambda item: item.y1, reverse=True)

        for element in elements:
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = _text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        text_per_page['Page_' + str(pagenum)] = [
            page_text, line_format, [], [], page_content,
        ]

    return text_per_page


def _extract_abstract(text_per_page):
    """Return the abstract found in *text_per_page*, or ``''`` if there is none.

    Every list stored for every page is joined with single spaces and searched
    for the ``'Abstract\\n'`` heading. The abstract is the text after that
    heading up to the first ``'1\\n'``, or up to the end of the joined text
    when no ``'1\\n'`` follows. The first non-empty result wins.
    """
    for page_lists in text_per_page.values():
        for entries in page_lists:
            joined = ' '.join(map(str, entries))
            start = joined.find(_ABSTRACT_MARKER)
            if start == -1:
                # -1 means "not found"; index 0 is a valid hit.
                continue
            # Skip the heading itself, then the separator space added by the
            # join. Measuring the marker instead of hard-coding an offset
            # keeps the first letter when heading and body share one element.
            body = joined[start + len(_ABSTRACT_MARKER):].lstrip(' ')
            end = body.find(_SECTION_ONE_MARKER)
            # Without this check a missing end marker would slice with -1 and
            # silently drop the last character.
            abstract = body if end == -1 else body[:end]
            if abstract:
                return abstract
    return ''


def converti(path):
    """Summarize the abstract of a PDF stored in the mounted Google Drive.

    Installs the Python packages the function imports, mounts Google Drive at
    ``/content/drive``, extracts the text of ``/content/drive/MyDrive/<path>``
    with pdfminer, locates the abstract, prints it, and returns a summary
    produced by the ``facebook/bart-large-cnn`` summarization pipeline.

    Args:
        path: Location of the PDF relative to ``/content/drive/MyDrive/``.

    Returns:
        The ``summary_text`` string of the first summarizer result.

    Raises:
        ValueError: If no abstract can be located in the document. (The
            previous implementation looped forever in that case.)
    """
    # 'datasets' and 'sentencepiece' are two requirements; passing them as one
    # string made pip reject the pair as a single invalid requirement.
    # NOTE(review): the former installs of 'poppler-utils', 'tesseract-ocr'
    # and 'libtesseract-dev' were dropped -- those look like apt package
    # names, and nothing here performs OCR. 'PyPDF2' and 'pdfplumber' were
    # dropped too because their results were never read.
    _pip_install('git+https://github.com/huggingface/transformers.git')
    _pip_install('datasets', 'sentencepiece')
    _pip_install('pdfminer.six')

    from google.colab import drive
    drive.mount('/content/drive')

    pdf_path = '/content/drive/MyDrive/' + path
    text_per_page = _read_pdf(pdf_path)

    abstr = _extract_abstract(text_per_page)
    if not abstr:
        raise ValueError("No 'Abstract' section found in " + pdf_path)
    print(abstr)

    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(abstr, max_length=56)
    return summary[0]['summary_text']