research-compass / arxiv_stuff.py
nomadicsynth's picture
Add new categories for Quantitative Biology and Quantitative Finance in ARXIV_CATEGORIES
4bf7cef
import random
import re
from datetime import datetime, timedelta, timezone
from typing import Optional, Union
from urllib.parse import urlsplit

import arxiv
import requests
# Shared arXiv API client, created once at import time and reused by the
# retrieval functions in this module.
arxiv_client = arxiv.Client()
# arXiv subject taxonomy used by this module.
# Outer keys are top-level group names; each inner dict maps an arXiv category
# code (as used in ``cat:`` search queries) to its human-readable name.
ARXIV_CATEGORIES: dict[str, dict[str, str]] = {
    "Computer Science": {
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
    },
    "Physics": {
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "gr-qc": "General Relativity and Quantum Cosmology",
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        "math-ph": "Mathematical Physics",
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        "quant-ph": "Quantum Physics",
    },
    "Mathematics": {
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
    },
    "Quantitative Biology": {
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
    },
    "Quantitative Finance": {
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        # The arXiv taxonomy names q-fin.ST "Statistical Finance".
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
    },
    "Statistics": {
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Theory",
    },
    "Economics": {
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Economic Theory",
    },
    "Electrical Engineering and Systems Science": {
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
    },
}
# Flat lookup table: category code -> "<group>: <name> (<code>)" display label.
ARXIV_CATEGORIES_FLAT: dict[str, str] = {
    code: f"{group}: {label} ({code})"
    for group, members in ARXIV_CATEGORIES.items()
    for code, label in members.items()
}
# Old-style arXiv identifiers such as "hep-th/9901001" or "math.GT/0309136v2".
_OLD_STYLE_ARXIV_ID = re.compile(r"[a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?")

# Lower-cased prefix of DOIs of the form "10.48550/arXiv.<id>", which embed
# the arXiv ID directly.
_ARXIV_DOI_PREFIX = "10.48550/arxiv."


def clean_doi(doi: str) -> str:
    """Normalize an arXiv URL, ``arXiv:`` reference, DOI or bare ID to an arXiv ID.

    Args:
        doi: One of: an ``arxiv.org`` abstract or PDF URL, a string prefixed
            with ``arXiv:``, a DOI starting with ``10.``, or a bare arXiv ID
            (new style such as ``2301.00001v2`` or old style such as
            ``hep-th/9901001``). Surrounding whitespace is ignored.

    Returns:
        The extracted arXiv ID on success. On failure a human-readable error
        message is returned instead of raising:
        ``"Invalid arXiv link. ..."`` for URLs that are not arXiv abstract/PDF
        links, ``"No paper found with that DOI."`` when the DOI cannot be
        resolved to an arXiv ID, and ``"Invalid arXiv ID or DOI. ..."`` for
        anything else.

    Raises:
        requests.RequestException: If the DOI lookup fails at the network
            level (including the 10 second timeout).
    """
    doi = doi.strip()

    if doi.startswith(("http://", "https://")):
        url = urlsplit(doi)
        if url.netloc.lower() in ("arxiv.org", "www.arxiv.org"):
            for prefix in ("/abs/", "/pdf/"):
                if not url.path.startswith(prefix):
                    continue
                # Keep everything after the prefix so old-style IDs such as
                # "hep-th/9901001" retain their archive part; urlsplit has
                # already separated any query string or fragment.
                paper_id = url.path[len(prefix):].strip("/")
                if prefix == "/pdf/":
                    paper_id = paper_id.removesuffix(".pdf")
                if paper_id:
                    return paper_id
        return "Invalid arXiv link. Please provide a link to the abstract page."

    if doi.lower().startswith("arxiv:"):
        return doi[len("arxiv:"):]

    if doi.startswith("10."):
        # DOIs with the arXiv prefix carry the ID in the DOI itself, so no
        # network round trip is needed for them.
        if doi.lower().startswith(_ARXIV_DOI_PREFIX):
            return doi[len(_ARXIV_DOI_PREFIX):]

        # Ask the DOI resolver for BibTeX and look for an arXiv eprint field.
        response = requests.get(
            "https://doi.org/" + doi,
            headers={"Accept": "application/x-bibtex"},
            timeout=10,
        )
        if response.status_code != 200:
            return "No paper found with that DOI."
        _, marker, tail = response.text.rpartition("eprint = {arXiv:")
        if not marker:
            # The record exists but has no arXiv eprint; do not return
            # arbitrary BibTeX text as if it were an ID.
            return "No paper found with that DOI."
        return tail.split("}")[0]

    if doi.replace("v", "").replace(".", "").isdigit() or _OLD_STYLE_ARXIV_ID.fullmatch(doi):
        return doi

    return "Invalid arXiv ID or DOI. Please provide a valid arXiv ID, DOI, or arXiv URL."
def retrieve_arxiv_paper(arxiv_id: str) -> dict:
    """Retrieve a single paper's metadata from arXiv.

    Args:
        arxiv_id: The arXiv ID of the paper to retrieve.

    Returns:
        A dict with the keys ``arxiv_id``, ``title``, ``authors`` (list of
        author names), ``categories``, ``abstract`` and ``published_date``.

    Raises:
        ValueError: If the search yields no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(arxiv_client.results(search), None)
    if paper is None:
        raise ValueError("No paper found with that arXiv ID.")

    return dict(
        # Split on "/abs/" rather than the last "/" so old-style IDs such as
        # "hep-th/9901001v1" keep their archive prefix.
        arxiv_id=paper.entry_id.split("/abs/")[-1],
        title=paper.title,
        authors=[author.name for author in paper.authors],
        categories=list(paper.categories),
        abstract=paper.summary,
        published_date=paper.published,
    )
# Lower bound used when only an end date is supplied; it predates every arXiv
# submission (arXiv launched in 1991).
_ARXIV_EARLIEST_TIMESTAMP = "199101010000"


def _to_arxiv_timestamp(moment: datetime) -> str:
    """Format a datetime as the YYYYMMDDHHMM string used by submittedDate filters."""
    if moment.tzinfo is not None:
        # The API expects GMT; naive datetimes are taken as already being GMT.
        moment = moment.astimezone(timezone.utc)
    return moment.strftime("%Y%m%d%H%M")


def build_arxiv_category_query(
    categories: Union[str, list[str]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    start: int = 0,
    max_results: int = 5,
) -> arxiv.Search:
    """Build an ``arxiv.Search`` for the given categories and optional date range.

    Args:
        categories: A single arXiv category code or a list of codes; multiple
            codes are combined with ``OR``.
        start_date: Optional earliest submission time. If omitted while
            ``end_date`` is given, the range starts before the first arXiv
            submission.
        end_date: Optional latest submission time. If omitted while
            ``start_date`` is given, the current UTC time is used.
        start: Not used when building the search; kept for interface
            compatibility. Result offsets are applied when the search is
            executed by the client.
        max_results: Value passed through as the search's ``max_results``.

    Returns:
        An ``arxiv.Search`` sorted by submission date, newest first.
    """
    if isinstance(categories, str):
        categories = [categories]

    if start_date or end_date:
        lower = _to_arxiv_timestamp(start_date) if start_date else _ARXIV_EARLIEST_TIMESTAMP
        upper = _to_arxiv_timestamp(end_date if end_date else datetime.now(timezone.utc))
        # Use plain spaces around TO: the client URL-encodes the query, which
        # turns spaces into the "+TO+" form the API documents, whereas a
        # literal "+" would be sent percent-encoded.
        date_filter = f"submittedDate:[{lower} TO {upper}]"
    else:
        date_filter = ""

    category_filter = " OR ".join(f"cat:{cat}" for cat in categories) if categories else ""

    if category_filter and date_filter:
        query = f"({category_filter}) AND {date_filter}"
    else:
        # Avoid emitting "() AND ..." when no categories were given.
        query = category_filter or date_filter

    return arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
def retrieve_arxiv_papers(
    categories: Union[str, list[str]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    start: int = 0,
    max_results: int = 5,
) -> list[dict]:
    """Search arXiv for papers in the given categories.

    Args:
        categories: A single arXiv category code or a list of codes.
        start_date: Date to start searching from.
        end_date: Date to stop searching at.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        A list of dicts, one per paper, with the keys ``arxiv_id``,
        ``title``, ``authors``, ``categories``, ``abstract`` and
        ``published_date``.
    """
    # The client treats the search's max_results as an absolute cap that
    # includes the skipped offset, so request start + max_results to actually
    # get up to max_results papers beginning at index `start`.
    search = build_arxiv_category_query(categories, start_date, end_date, start, start + max_results)

    return [
        dict(
            # Split on "/abs/" rather than the last "/" so old-style IDs such
            # as "hep-th/9901001v1" keep their archive prefix.
            arxiv_id=result.entry_id.split("/abs/")[-1],
            title=result.title,
            authors=[author.name for author in result.authors],
            categories=list(result.categories),
            abstract=result.summary,
            published_date=result.published,
        )
        for result in arxiv_client.results(search, offset=start)
    ]
def fetch_todays_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
    """Fetch papers submitted since 00:00 UTC today in the given categories.

    Args:
        categories: A single arXiv category code or a list of codes.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        Whatever ``retrieve_arxiv_papers`` returns for that window.
    """
    category_list = [categories] if isinstance(categories, str) else categories
    midnight_utc = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    return retrieve_arxiv_papers(
        category_list,
        start_date=midnight_utc,
        start=start,
        max_results=max_results,
    )
def fetch_24_hours_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
    """Fetch papers submitted during the last 24 hours in the given categories.

    Args:
        categories: A single arXiv category code or a list of codes.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        Whatever ``retrieve_arxiv_papers`` returns for that window.
    """
    category_list = [categories] if isinstance(categories, str) else categories
    window_start = datetime.now(timezone.utc) - timedelta(days=1)
    return retrieve_arxiv_papers(
        category_list,
        start_date=window_start,
        start=start,
        max_results=max_results,
    )
def random_arxiv_category():
    """Return one randomly chosen value from ``ARXIV_CATEGORIES_FLAT``."""
    labels = list(ARXIV_CATEGORIES_FLAT.values())
    return random.choice(labels)