import os import re import random import requests from requests.exceptions import RequestException import re from bs4 import BeautifulSoup from .hash import generate_hash_id from .header import get_dir from loguru import logger with open(get_dir("./assets/data/user_agents.txt"), "r", encoding="utf8") as f: user_agents = [l.rstrip() for l in f.readlines()] def extract_title_from_index(index_url): try: headers = {"User-Agent": random.choice(user_agents)} response_title = requests.get(index_url, headers=headers) response_title.raise_for_status() soup = BeautifulSoup(response_title.content, "html.parser") papers = soup.find_all("h2", id="title") for paper in papers: title = paper.text.strip() return title except RequestException as e: logger.error(f"Failed to extract title from {index_url}: {e}") return None def extract_year_from_index(index_url): try: headers = {"User-Agent": random.choice(user_agents)} response_year = requests.get(index_url, headers=headers) response_year.raise_for_status() soup = BeautifulSoup(response_year.content, "html.parser") year_tag = soup.find("dt", text="Year:") if year_tag: year_dd = year_tag.find_next_sibling("dd") if year_dd: year = year_dd.text.strip() return year else: print(f"Year not found in {index_url}") return None except requests.RequestException as e: print(f"Failed to extract year from {index_url}: {e}") return None def extract_pdf_url_from_index(index_url, id): try: headers = {"User-Agent": random.choice(user_agents)} response = requests.get(index_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.content, "html.parser") pdf_link = soup.find("a", href=True, string=re.compile(r"\bPDF\b", re.I)) if pdf_link: pdf_url = pdf_link["href"] return pdf_url else: logger.warning(f"No PDF link found on {index_url}") return None except RequestException as e: logger.error(f"Failed to extract PDF URL from {index_url}: {e}") return None class PaperCrawling: def __init__(self, config, data_type="train") -> None: self.base_url = "https://aclanthology.org/" self.data_type = data_type self.paper_pdf_folder = config.DEFAULT.pdf_cached if not os.path.exists(self.paper_pdf_folder): os.makedirs(self.paper_pdf_folder) logger.info(f"Created directory '{self.paper_pdf_folder}'") def need_to_parse(self, paper: dict): if ( paper["abstract"] is None or paper["introduction"] is None or paper["reference"] is None ): return True return False def get_title(self, paper): index_url = f"{self.base_url}{paper['id']}/" title = extract_title_from_index(index_url) return title def get_year(self, paper): index_url = f"{self.base_url}{paper['id']}/" year = extract_year_from_index(index_url) return year def get_pdf_url(self, paper): if "pdf_url" not in paper.keys() or paper["pdf_url"] is None: index_url = f"{self.base_url}{paper['id']}/" paper["pdf_url"] = extract_pdf_url_from_index(index_url, paper["id"]) def download_paper(self, paper): headers = {"User-Agent": random.choice(user_agents)} pdf_folder = os.path.join( self.paper_pdf_folder, f"{paper['venue_name']}", f"{paper['year']}" ) file_path = os.path.join(pdf_folder, f"{paper['hash_id']}.pdf") paper["pdf_path"] = file_path paper_url = paper["pdf_url"] if not os.path.exists(pdf_folder): os.makedirs(pdf_folder) if os.path.exists(file_path): # print("pdf file {} exist ...".format(file_path)) return True try: response = requests.get(paper_url, headers=headers, timeout=10) response.raise_for_status() except Exception: print(f"download failed... {paper['pdf_url']}") return False if response.status_code == 200: with open(file_path, "wb") as f: f.write(response.content) logger.info("download success {}".format(paper_url)) logger.info(f"save {file_path}") return True else: print("download failed, status code: {}".format(response.status_code)) return False def get_page(self, url): headers = {"User-Agent": random.choice(user_agents)} try: response = requests.get(url, headers=headers) if response.status_code == 200: response.encoding = response.apparent_encoding return response.text return None except RequestException as e: print(e) def crawling(self, year, venue_name): """ Args: Returns: paper_list (List of Dict):[ { "hash_id": hash_id, hash id of the paper "year": year, published year "venue_name": venue_name, venue name "title": title, paper title "pdf_url": pdf_url, paper url } ] """ paper_list = [] paper_html_list = [] def append_paper_to_list(pdf_url, title): for paper in paper_html_list: if paper["title"] == title: if paper["pdf_url"] != pdf_url: logger.warning( f"Different PDF URL found for the same title '{title}'." ) return paper_html_list.append({"pdf_url": pdf_url, "title": title}) if venue_name == "nips": if year == "2024": return [] base_url = "https://papers.nips.cc/paper_files/paper/{}" target_url = base_url.format(year) target_html = self.get_page(target_url) soup = BeautifulSoup(target_html, "html.parser") ids = soup.find("div", {"class": "container-fluid"}).find_all("li") for id in ids: a = id.find("a") href = a.attrs.get("href") pdf_url = "https://papers.nips.cc{}".format( href.replace("hash", "file") .replace("Abstract", "Paper") .replace("html", "pdf") ) title = a.text append_paper_to_list(pdf_url, title) for paper_html in paper_html_list: title = paper_html["title"] pdf_url = paper_html["pdf_url"] hash_id = generate_hash_id(title) paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) elif venue_name == "cvpr": base_url = "https://openaccess.thecvf.com/CVPR{}" dict_cvpr = { "2018": ["2018-06-19", "2018-06-20", "2018-06-21"], "2019": ["2019-06-18", "2019-06-28", "2019-06-20"], "2020": ["2020-06-16", "2020-06-17", "2020-06-18"], "2021": ["all"], "2022": ["all"], "2023": ["all"], } if year in dict_cvpr.keys(): day_list = dict_cvpr[year] target_url = [ base_url.format(year) + "?day={}".format(day) for day in day_list ] else: target_url = [base_url.format(year)] print("paper list from {}".format(target_url)) for url in target_url: target_html = self.get_page(url) soup = BeautifulSoup(target_html, "html.parser") dl_elements = soup.find("div", {"id": "content"}).find_all("dl") for dl in dl_elements: dt_elements = dl.find_all("dt") dd_elements = dl.find_all("dd") if year in dict_cvpr.keys(): dd_elements.pop(0) for idx in range(len(dt_elements)): title = dt_elements[idx].text href = dd_elements[idx * 2 + 1].find("a").attrs.get("href") pdf_url = "https://openaccess.thecvf.com/{}".format(href) hash_id = generate_hash_id(title) paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) elif venue_name == "emnlp": if year == "2024": return [] if year not in ["2020", "2021", "2022", "2023"]: dev_id = "main-container" else: dev_id = "{}emnlp-main".format(year) base_url = "https://aclanthology.org/events/emnlp-{}" target_url = base_url.format(year) target_html = self.get_page(target_url) soup = BeautifulSoup(target_html, "html.parser") ids = soup.find("div", {"id": dev_id}).find_all("p") for id in ids: a = id.find("a") pdf_url = a.attrs.get("href") title = id.find("strong").get_text() append_paper_to_list(pdf_url, title) for paper_html in paper_html_list: title = paper_html["title"] hash_id = generate_hash_id(title) pdf_url = paper_html["pdf_url"] if "http" not in pdf_url: continue paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) elif venue_name == "naacl": # https://aclanthology.org/ if year in ["2023", "2020", "2017", "2014"]: return [] dev_id = "main-container" base_url = "https://aclanthology.org/events/naacl-{}/" target_url = base_url.format(year) target_html = self.get_page(target_url) soup = BeautifulSoup(target_html, "html.parser") ids = soup.find("div", {"id": dev_id}).find_all("p") for id in ids: a = id.find("a") pdf_url = a.attrs.get("href") title = id.find("strong").get_text() append_paper_to_list(pdf_url, title) for paper_html in paper_html_list: title = paper_html["title"] hash_id = generate_hash_id(title) pdf_url = paper_html["pdf_url"] paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) elif venue_name == "acl": dev_id = "main-container" base_url = "https://aclanthology.org/events/acl-{}/" target_url = base_url.format(year) target_html = self.get_page(target_url) soup = BeautifulSoup(target_html, "html.parser") ids = soup.find("div", {"id": dev_id}).find_all("p") for id in ids: a = id.find("a") pdf_url = a.attrs.get("href") title = id.find("strong").get_text() append_paper_to_list(pdf_url, title) for paper_html in paper_html_list: title = paper_html["title"] hash_id = generate_hash_id(title) pdf_url = paper_html["pdf_url"] if "http" not in pdf_url: continue paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) elif venue_name == "icml": hit = { "2024": "v235", "2023": "v202", "2022": "v162", "2021": "v139", "2020": "v119", "2019": "v97", "2018": "v80", "2017": "v70", "2016": "v48", "2015": "v37", "2014": "v32", "2013": "v28", } dev_id = "container" base_url = "https://proceedings.mlr.press/{}/" target_url = base_url.format(hit[year]) target_html = self.get_page(target_url) soup = BeautifulSoup(target_html, "html.parser") ids = soup.find("main", {"class": "page-content"}).find_all( "div", {"class": "paper"} ) for id in ids: title = id.find("p", class_="title").text pdf_url = id.find("a", text="Download PDF")["href"] append_paper_to_list(pdf_url, title) for paper_html in paper_html_list: title = paper_html["title"] hash_id = generate_hash_id(title) pdf_url = paper_html["pdf_url"] paper_list.append( { "hash_id": hash_id, "year": year, "venue_name": venue_name, "title": title, "pdf_url": pdf_url, } ) return paper_list