Spaces:
Running
Running
wangrongsheng
committed on
Commit
·
a04bf88
1
Parent(s):
2ccd2a4
Update app.py
Browse files
app.py
CHANGED
@@ -60,6 +60,7 @@ class Paper:
|
|
60 |
self.section_names = [] # 段落标题
|
61 |
self.section_texts = {} # 段落内容
|
62 |
self.abs = abs
|
|
|
63 |
if title == '':
|
64 |
self.pdf = fitz.open(self.path) # pdf文档
|
65 |
self.title = self.get_title()
|
@@ -79,7 +80,17 @@ class Paper:
|
|
79 |
print("section_page_dict", self.section_page_dict)
|
80 |
self.section_text_dict = self._get_all_page() # 段落与内容的对应字典
|
81 |
self.section_text_dict.update({"title": self.title})
|
82 |
-
self.pdf.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
def get_image_path(self, image_path=''):
|
85 |
"""
|
@@ -195,6 +206,7 @@ class Paper:
|
|
195 |
cur_title += cur_string
|
196 |
else:
|
197 |
cur_title += ' ' + cur_string
|
|
|
198 |
# break
|
199 |
title = cur_title.replace('\n', ' ')
|
200 |
return title
|
@@ -433,6 +445,7 @@ class Reader:
|
|
433 |
text += 'Title:' + paper.title
|
434 |
text += 'Url:' + paper.url
|
435 |
text += 'Abstrat:' + paper.abs
|
|
|
436 |
# intro
|
437 |
text += list(paper.section_text_dict.values())[0]
|
438 |
#max_token = 2500 * 4
|
|
|
60 |
self.section_names = [] # 段落标题
|
61 |
self.section_texts = {} # 段落内容
|
62 |
self.abs = abs
|
63 |
+
self.title_page = 0
|
64 |
if title == '':
|
65 |
self.pdf = fitz.open(self.path) # pdf文档
|
66 |
self.title = self.get_title()
|
|
|
80 |
print("section_page_dict", self.section_page_dict)
|
81 |
self.section_text_dict = self._get_all_page() # 段落与内容的对应字典
|
82 |
self.section_text_dict.update({"title": self.title})
|
83 |
+
self.pdf.close()
|
84 |
+
|
85 |
+
def get_paper_info(self):
    """
    Return the title page's text with the abstract and introduction removed.

    The page at index ``self.title_page`` is read from ``self.pdf`` and every
    verbatim occurrence of the abstract text and of the introduction text is
    stripped from it; whatever remains on the page is returned.
    NOTE(review): presumably the remainder is the title/author/affiliation
    header -- confirm against real papers.

    The abstract comes from ``self.section_text_dict['Abstract']`` when that
    key exists, otherwise from ``self.abs``.  A missing ``'Introduction'``
    key, or an empty/None fragment, is treated as "nothing to strip" rather
    than raising.

    NOTE(review): this needs ``self.pdf`` to still be open; elsewhere in this
    file ``self.pdf.close()`` is called -- confirm this method runs before it.

    Returns:
        str: the remaining text of the title page.
    """
    first_page_text = self.pdf[self.title_page].get_text()
    sections = self.section_text_dict

    # Prefer the abstract parsed out of the PDF; fall back to the one
    # supplied when the paper object was built.
    abstract_text = sections['Abstract'] if 'Abstract' in sections else self.abs
    # Not every paper has a section named exactly "Introduction"; the plain
    # subscript used previously raised KeyError in that case.
    introduction_text = sections.get('Introduction', '')

    for fragment in (abstract_text, introduction_text):
        # Skip empty/None fragments: there is nothing to remove, and
        # str.replace would reject None.
        if fragment:
            first_page_text = first_page_text.replace(fragment, "")
    return first_page_text
|
94 |
|
95 |
def get_image_path(self, image_path=''):
|
96 |
"""
|
|
|
206 |
cur_title += cur_string
|
207 |
else:
|
208 |
cur_title += ' ' + cur_string
|
209 |
+
self.title_page = page_index
|
210 |
# break
|
211 |
title = cur_title.replace('\n', ' ')
|
212 |
return title
|
|
|
445 |
text += 'Title:' + paper.title
|
446 |
text += 'Url:' + paper.url
|
447 |
text += 'Abstrat:' + paper.abs
|
448 |
+
text += 'Paper_info:' + paper.section_text_dict['paper_info']
|
449 |
# intro
|
450 |
text += list(paper.section_text_dict.values())[0]
|
451 |
#max_token = 2500 * 4
|