# SciPIP/src/utils/paper_crawling.py
import os
import re
import random
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from .hash import generate_hash_id
from .header import get_dir
from loguru import logger

# Pool of User-Agent strings; one is sampled at random per request so the
# crawler's traffic looks less uniform.
with open(get_dir("./assets/data/user_agents.txt"), "r", encoding="utf8") as f:
    user_agents = [line.rstrip() for line in f]


def extract_title_from_index(index_url):
    """Fetch an ACL Anthology paper page and return its title, or None on failure."""
    try:
        headers = {"User-Agent": random.choice(user_agents)}
        response_title = requests.get(index_url, headers=headers)
        response_title.raise_for_status()
        soup = BeautifulSoup(response_title.content, "html.parser")
        # The title is rendered as <h2 id="title"> on Anthology paper pages.
        title_tag = soup.find("h2", id="title")
        if title_tag is not None:
            return title_tag.text.strip()
        return None
    except RequestException as e:
        logger.error(f"Failed to extract title from {index_url}: {e}")
        return None
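
# Usage sketch (the Anthology ID below is hypothetical, for illustration only):
#
#     title = extract_title_from_index("https://aclanthology.org/2023.acl-long.1/")
#     # -> the paper's title string, or None on a network/parse failure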


def extract_year_from_index(index_url):
    """Fetch an ACL Anthology paper page and return its publication year, or None."""
    try:
        headers = {"User-Agent": random.choice(user_agents)}
        response_year = requests.get(index_url, headers=headers)
        response_year.raise_for_status()
        soup = BeautifulSoup(response_year.content, "html.parser")
        # The year appears in a definition list as "<dt>Year:</dt><dd>YYYY</dd>".
        year_tag = soup.find("dt", string="Year:")
        if year_tag:
            year_dd = year_tag.find_next_sibling("dd")
            if year_dd:
                return year_dd.text.strip()
        logger.warning(f"Year not found in {index_url}")
        return None
    except RequestException as e:
        logger.error(f"Failed to extract year from {index_url}: {e}")
        return None
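
# Usage sketch (same hypothetical ID as above):
#
#     year = extract_year_from_index("https://aclanthology.org/2023.acl-long.1/")
#     # -> e.g. "2023", or None if the "Year:" field is missing or the request fails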


def extract_pdf_url_from_index(index_url):
    """Fetch an ACL Anthology paper page and return the href of its PDF link, or None."""
    try:
        headers = {"User-Agent": random.choice(user_agents)}
        response = requests.get(index_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Match the first anchor whose visible text contains the word "PDF".
        pdf_link = soup.find("a", href=True, string=re.compile(r"\bPDF\b", re.I))
        if pdf_link:
            return pdf_link["href"]
        logger.warning(f"No PDF link found on {index_url}")
        return None
    except RequestException as e:
        logger.error(f"Failed to extract PDF URL from {index_url}: {e}")
        return None
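
# Usage sketch (hypothetical ID):
#
#     pdf_url = extract_pdf_url_from_index("https://aclanthology.org/2023.acl-long.1/")
#     # -> href of the first "PDF" link on the page, or None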


class PaperCrawling:
    """Crawls paper metadata and PDFs from NeurIPS, CVPR, ICML, and ACL Anthology venues."""

    def __init__(self, config, data_type="train") -> None:
        self.base_url = "https://aclanthology.org/"
        self.data_type = data_type
        self.paper_pdf_folder = config.DEFAULT.pdf_cached
        if not os.path.exists(self.paper_pdf_folder):
            os.makedirs(self.paper_pdf_folder)
            logger.info(f"Created directory '{self.paper_pdf_folder}'")

    def need_to_parse(self, paper: dict):
        """Return True if any key section of the paper still needs extraction."""
        return (
            paper["abstract"] is None
            or paper["introduction"] is None
            or paper["reference"] is None
        )

    def get_title(self, paper):
        index_url = f"{self.base_url}{paper['id']}/"
        return extract_title_from_index(index_url)

    def get_year(self, paper):
        index_url = f"{self.base_url}{paper['id']}/"
        return extract_year_from_index(index_url)

    def get_pdf_url(self, paper):
        # Resolve the PDF URL only if it has not been filled in already.
        if paper.get("pdf_url") is None:
            index_url = f"{self.base_url}{paper['id']}/"
            paper["pdf_url"] = extract_pdf_url_from_index(index_url)

    def download_paper(self, paper):
        """Download the paper's PDF into the cache folder; return True on success."""
        headers = {"User-Agent": random.choice(user_agents)}
        pdf_folder = os.path.join(
            self.paper_pdf_folder, f"{paper['venue_name']}", f"{paper['year']}"
        )
        file_path = os.path.join(pdf_folder, f"{paper['hash_id']}.pdf")
        paper["pdf_path"] = file_path
        paper_url = paper["pdf_url"]
        if not os.path.exists(pdf_folder):
            os.makedirs(pdf_folder)
        if os.path.exists(file_path):
            # Already cached; nothing to download.
            return True
        try:
            response = requests.get(paper_url, headers=headers, timeout=10)
            response.raise_for_status()
        except Exception:
            logger.error(f"download failed... {paper['pdf_url']}")
            return False
        # raise_for_status() has already rejected non-2xx responses, so the
        # body can be written out directly.
        with open(file_path, "wb") as f:
            f.write(response.content)
        logger.info("download success {}".format(paper_url))
        logger.info(f"save {file_path}")
        return True
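
    # Usage sketch (hypothetical, pre-populated paper dict):
    #
    #     ok = crawler.download_paper({
    #         "venue_name": "acl", "year": "2023",
    #         "hash_id": "...", "pdf_url": "https://..."})
    #     # -> True if the PDF was fetched or already cached, else False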

    def get_page(self, url):
        """GET a page and return its decoded text, or None on failure."""
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                response.encoding = response.apparent_encoding
                return response.text
            return None
        except RequestException as e:
            logger.error(f"Failed to fetch {url}: {e}")
            return None

    def crawling(self, year, venue_name):
        """Collect {hash_id, year, venue_name, title, pdf_url} entries for one venue/year."""
        paper_list = []
        paper_html_list = []

        def append_paper_to_list(pdf_url, title):
            # Deduplicate by title; warn when one title maps to two different URLs.
            for paper in paper_html_list:
                if paper["title"] == title:
                    if paper["pdf_url"] != pdf_url:
                        logger.warning(
                            f"Different PDF URL found for the same title '{title}'."
                        )
                    return
            paper_html_list.append({"pdf_url": pdf_url, "title": title})

        if venue_name == "nips":
            # NeurIPS 2024 is not supported by this crawler.
            if year == "2024":
                return []
base_url = "https://papers.nips.cc/paper_files/paper/{}"
target_url = base_url.format(year)
target_html = self.get_page(target_url)
soup = BeautifulSoup(target_html, "html.parser")
            items = soup.find("div", {"class": "container-fluid"}).find_all("li")
            for item in items:
                a = item.find("a")
                href = a.attrs.get("href")
                # Rewrite the abstract-page path (".../hash/...-Abstract.html")
                # into the corresponding PDF path (".../file/...-Paper.pdf").
                pdf_url = "https://papers.nips.cc{}".format(
                    href.replace("hash", "file")
                    .replace("Abstract", "Paper")
                    .replace("html", "pdf")
                )
                title = a.text
                append_paper_to_list(pdf_url, title)
for paper_html in paper_html_list:
title = paper_html["title"]
pdf_url = paper_html["pdf_url"]
hash_id = generate_hash_id(title)
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
elif venue_name == "cvpr":
base_url = "https://openaccess.thecvf.com/CVPR{}"
dict_cvpr = {
"2018": ["2018-06-19", "2018-06-20", "2018-06-21"],
"2019": ["2019-06-18", "2019-06-28", "2019-06-20"],
"2020": ["2020-06-16", "2020-06-17", "2020-06-18"],
"2021": ["all"],
"2022": ["all"],
"2023": ["all"],
}
if year in dict_cvpr.keys():
day_list = dict_cvpr[year]
target_url = [
base_url.format(year) + "?day={}".format(day) for day in day_list
]
else:
target_url = [base_url.format(year)]
print("paper list from {}".format(target_url))
for url in target_url:
target_html = self.get_page(url)
soup = BeautifulSoup(target_html, "html.parser")
dl_elements = soup.find("div", {"id": "content"}).find_all("dl")
for dl in dl_elements:
dt_elements = dl.find_all("dt")
dd_elements = dl.find_all("dd")
                    # Per-day pages include one extra leading <dd>; drop it so
                    # titles and link blocks stay aligned.
                    if year in dict_cvpr.keys():
                        dd_elements.pop(0)
                    # Each entry contributes one <dt> (title) and two <dd>s;
                    # the second <dd> of each pair holds the PDF link.
                    for idx in range(len(dt_elements)):
                        title = dt_elements[idx].text
                        href = dd_elements[idx * 2 + 1].find("a").attrs.get("href")
                        pdf_url = "https://openaccess.thecvf.com/{}".format(href)
hash_id = generate_hash_id(title)
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
elif venue_name == "emnlp":
if year == "2024":
return []
if year not in ["2020", "2021", "2022", "2023"]:
dev_id = "main-container"
else:
dev_id = "{}emnlp-main".format(year)
base_url = "https://aclanthology.org/events/emnlp-{}"
target_url = base_url.format(year)
target_html = self.get_page(target_url)
soup = BeautifulSoup(target_html, "html.parser")
            entries = soup.find("div", {"id": dev_id}).find_all("p")
            for entry in entries:
                a = entry.find("a")
                pdf_url = a.attrs.get("href")
                title = entry.find("strong").get_text()
                append_paper_to_list(pdf_url, title)
for paper_html in paper_html_list:
title = paper_html["title"]
hash_id = generate_hash_id(title)
pdf_url = paper_html["pdf_url"]
if "http" not in pdf_url:
continue
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
elif venue_name == "naacl":
# https://aclanthology.org/
if year in ["2023", "2020", "2017", "2014"]:
return []
dev_id = "main-container"
base_url = "https://aclanthology.org/events/naacl-{}/"
target_url = base_url.format(year)
target_html = self.get_page(target_url)
soup = BeautifulSoup(target_html, "html.parser")
            entries = soup.find("div", {"id": dev_id}).find_all("p")
            for entry in entries:
                a = entry.find("a")
                pdf_url = a.attrs.get("href")
                title = entry.find("strong").get_text()
                append_paper_to_list(pdf_url, title)
for paper_html in paper_html_list:
title = paper_html["title"]
hash_id = generate_hash_id(title)
pdf_url = paper_html["pdf_url"]
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
elif venue_name == "acl":
dev_id = "main-container"
base_url = "https://aclanthology.org/events/acl-{}/"
target_url = base_url.format(year)
target_html = self.get_page(target_url)
soup = BeautifulSoup(target_html, "html.parser")
            entries = soup.find("div", {"id": dev_id}).find_all("p")
            for entry in entries:
                a = entry.find("a")
                pdf_url = a.attrs.get("href")
                title = entry.find("strong").get_text()
                append_paper_to_list(pdf_url, title)
for paper_html in paper_html_list:
title = paper_html["title"]
hash_id = generate_hash_id(title)
pdf_url = paper_html["pdf_url"]
if "http" not in pdf_url:
continue
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
elif venue_name == "icml":
hit = {
"2024": "v235",
"2023": "v202",
"2022": "v162",
"2021": "v139",
"2020": "v119",
"2019": "v97",
"2018": "v80",
"2017": "v70",
"2016": "v48",
"2015": "v37",
"2014": "v32",
"2013": "v28",
}
dev_id = "container"
base_url = "https://proceedings.mlr.press/{}/"
target_url = base_url.format(hit[year])
target_html = self.get_page(target_url)
soup = BeautifulSoup(target_html, "html.parser")
ids = soup.find("main", {"class": "page-content"}).find_all(
"div", {"class": "paper"}
)
for id in ids:
title = id.find("p", class_="title").text
pdf_url = id.find("a", text="Download PDF")["href"]
append_paper_to_list(pdf_url, title)
for paper_html in paper_html_list:
title = paper_html["title"]
hash_id = generate_hash_id(title)
pdf_url = paper_html["pdf_url"]
paper_list.append(
{
"hash_id": hash_id,
"year": year,
"venue_name": venue_name,
"title": title,
"pdf_url": pdf_url,
}
)
return paper_list
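

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the library. It assumes a config
    # object exposing DEFAULT.pdf_cached (the PDF cache directory), matching
    # how PaperCrawling reads it above; the cache path here is hypothetical.
    from types import SimpleNamespace

    config = SimpleNamespace(DEFAULT=SimpleNamespace(pdf_cached="./pdf_cache"))
    crawler = PaperCrawling(config)
    papers = crawler.crawling("2023", "acl")
    logger.info(f"collected metadata for {len(papers)} papers")
    if papers:
        crawler.download_paper(papers[0])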