# NOTE: removed non-Python extraction artifacts (file-size line, commit hash,
# and a dumped line-number ruler) that would break this module at import time.
import fitz
import os
import io
import arxiv
import tempfile
from PIL import Image
from urllib.parse import urlparse
class Paper:
    """Download an arXiv paper's PDF and parse it into per-section text.

    Construction does everything: resolves the arXiv id from ``url``,
    fetches metadata, downloads the PDF into ``./pdf`` (cached), and fills
    ``self.paper_instance['content']`` with ``{section name: text}``.

    Raises:
        Exception: if the id cannot be found on arXiv.
    """

    def __init__(self, url=''):
        self.url = url
        self.parse_url()    # sets self.paper_id
        self.get_pdf()      # sets self.paper_arxiv and self.path (may download)
        self.paper_instance = {
            'title': self.paper_arxiv.title,
            'authors': self.paper_arxiv.authors,
            'arxiv_id': self.paper_id,
            'abstract': self.paper_arxiv.summary,
            'pdf_url': self.paper_arxiv.pdf_url,
            'categories': self.paper_arxiv.categories,
            'published': self.paper_arxiv.published,
            'updated': self.paper_arxiv.updated,
            'content': {}
        }
        self.parse_pdf()    # fills paper_instance['content']

    def get_paper(self):
        """Return the assembled paper dict (metadata plus sectioned content)."""
        return self.paper_instance

    def parse_url(self):
        """Derive the arXiv id from ``self.url``.

        Strips any ``.pdf`` suffix, then takes the last path component, so
        both ``.../abs/<id>`` and ``.../pdf/<id>.pdf`` forms work.
        """
        self.url = self.url.replace('.pdf', '')
        parsed_url = urlparse(self.url)
        self.paper_id = os.path.basename(parsed_url.path)

    def get_pdf(self):
        """Look up ``self.paper_id`` on arXiv and download its PDF into ./pdf.

        The download is skipped when the file already exists (simple cache).
        Sets ``self.paper_arxiv`` (metadata result) and ``self.path``.
        """
        search = arxiv.Search(id_list=[self.paper_id], max_results=1)
        # NOTE(review): Search.results() is deprecated in arxiv>=2.0; if the
        # installed version is 2.x, prefer arxiv.Client().results(search).
        results = search.results()
        # BUGFIX: a bare next(results) raises StopIteration on an empty result
        # set, which made the "not found" branch below unreachable.
        paper_arxiv = next(results, None)
        if paper_arxiv is None:
            raise Exception("无法找到论文,请检查 URL 是否正确。")
        filename = f'{self.paper_id}.pdf'
        dir_path = "./pdf"
        os.makedirs(dir_path, exist_ok=True)
        save_path = os.path.join(dir_path, filename)
        if not os.path.exists(save_path):   # cached copy wins over re-download
            paper_arxiv.download_pdf(dirpath=dir_path, filename=filename)
        self.paper_arxiv = paper_arxiv
        self.path = save_path

    def parse_pdf(self):
        """Open the downloaded PDF, extract all page text, and split sections."""
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            self._parse_paper()
        finally:
            # BUGFIX: close the document even when parsing raises.
            self.pdf.close()

    def _get_sections(self):
        """Populate ``self.sections`` with the canonical heading names to scan for."""
        sections = 'Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References'
        self.sections = sections.split(',')

    def _get_all_page_index(self):
        """Map each recognized heading to the page index it appears on.

        A heading found on several pages keeps the LAST page (later pages
        overwrite), while dict insertion order records first discovery.
        Headings are matched as ``Name\\n`` or ``NAME\\n``; 'Abstract' is
        matched anywhere on the page as a special case.
        """
        section_list = self.sections
        section_page_dict = {}
        for page_index, page in enumerate(self.pdf):
            cur_text = page.get_text()
            for section_name in section_list:
                if section_name == "Abstract" and section_name in cur_text:
                    section_page_dict[section_name] = page_index
                    continue
                if (section_name + '\n' in cur_text
                        or section_name.upper() + '\n' in cur_text):
                    section_page_dict[section_name] = page_index
        self.section_page_dict = section_page_dict

    def _parse_paper(self):
        """Slice the extracted text into ``{section name: content}``.

        Stores the result in ``self.paper_instance['content']``. Each
        section runs from its own heading up to the next detected heading.
        """
        self._get_sections()
        self._get_all_page_index()
        # Reuse the text already extracted in parse_pdf instead of
        # re-running get_text() over every page.
        text_list = self.text_list
        section_keys = list(self.section_page_dict.keys())
        section_count = len(section_keys)
        section_dict = {}
        last_page = len(text_list) - 1
        for sec_index, sec_name in enumerate(section_keys):
            if sec_index == 0:
                # NOTE(review): the first detected heading (typically
                # 'Abstract') is deliberately skipped, as in the original.
                continue
            start_page = self.section_page_dict[sec_name]
            has_next = sec_index < section_count - 1
            # BUGFIX: the page holding the NEXT heading must be included
            # (and trimmed at that heading); excluding it dropped all text
            # on that page that precedes the next heading.
            end_page = (self.section_page_dict[section_keys[sec_index + 1]]
                        if has_next else last_page)
            cur_sec_text = []
            for page_i in range(start_page, end_page + 1):
                page_text = text_list[page_i]
                if page_i == start_page:
                    start_i = page_text.find(sec_name)
                    if start_i == -1:
                        start_i = page_text.find(sec_name.upper())
                    # BUGFIX: find() returns -1 when absent — keep the whole
                    # page rather than slicing from the last character.
                    page_text = page_text[max(start_i, 0):]
                if page_i == end_page and has_next:
                    next_sec = section_keys[sec_index + 1]
                    end_i = page_text.find(next_sec)
                    if end_i == -1:
                        end_i = page_text.find(next_sec.upper())
                    if end_i != -1:     # BUGFIX: don't chop a char when absent
                        page_text = page_text[:end_i]
                cur_sec_text.append(page_text)
            # De-hyphenate line breaks, then flatten remaining newlines.
            section_dict[sec_name] = (''.join(cur_sec_text)
                                      .replace('-\n', '')
                                      .replace('\n', ' '))
        self.paper_instance['content'] = section_dict