Spaces:
Sleeping
Sleeping
import random | |
from datetime import datetime, timedelta, timezone | |
from typing import Optional, Union | |
import arxiv | |
import requests | |
# Module-level arXiv API client, created once at import time and shared by
# retrieve_arxiv_paper() and retrieve_arxiv_papers() in this module.
arxiv_client = arxiv.Client()
# arXiv taxonomy used by this module: top-level group name -> {category code -> category name}.
# Category codes are the identifiers used in `cat:` search clauses.
ARXIV_CATEGORIES = {
    "Computer Science": {
        "cs.AI": "Artificial Intelligence",
        "cs.AR": "Hardware Architecture",
        "cs.CC": "Computational Complexity",
        "cs.CE": "Computational Engineering",
        "cs.CG": "Computational Geometry",
        "cs.CL": "Computation and Language",
        "cs.CR": "Cryptography and Security",
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.CY": "Computers and Society",
        "cs.DB": "Databases",
        "cs.DC": "Distributed Computing",
        "cs.DL": "Digital Libraries",
        "cs.DM": "Discrete Mathematics",
        "cs.DS": "Data Structures and Algorithms",
        "cs.ET": "Emerging Technologies",
        "cs.FL": "Formal Languages and Automata Theory",
        "cs.GL": "General Literature",
        "cs.GR": "Graphics",
        "cs.GT": "Computer Science and Game Theory",
        "cs.HC": "Human-Computer Interaction",
        "cs.IR": "Information Retrieval",
        "cs.IT": "Information Theory",
        "cs.LG": "Machine Learning",
        "cs.LO": "Logic in Computer Science",
        "cs.MA": "Multiagent Systems",
        "cs.MM": "Multimedia",
        "cs.MS": "Mathematical Software",
        "cs.NA": "Numerical Analysis",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.NI": "Networking and Internet Architecture",
        "cs.OH": "Other Computer Science",
        "cs.OS": "Operating Systems",
        "cs.PF": "Performance",
        "cs.PL": "Programming Languages",
        "cs.RO": "Robotics",
        "cs.SC": "Symbolic Computation",
        "cs.SD": "Sound",
        "cs.SE": "Software Engineering",
        "cs.SI": "Social and Information Networks",
        "cs.SY": "Systems and Control",
    },
    "Physics": {
        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
        "astro-ph.EP": "Earth and Planetary Astrophysics",
        "astro-ph.GA": "Astrophysics of Galaxies",
        "astro-ph.HE": "High Energy Astrophysical Phenomena",
        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
        "astro-ph.SR": "Solar and Stellar Astrophysics",
        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
        "cond-mat.mtrl-sci": "Materials Science",
        "cond-mat.other": "Other Condensed Matter",
        "cond-mat.quant-gas": "Quantum Gases",
        "cond-mat.soft": "Soft Condensed Matter",
        "cond-mat.stat-mech": "Statistical Mechanics",
        "cond-mat.str-el": "Strongly Correlated Electrons",
        "cond-mat.supr-con": "Superconductivity",
        "gr-qc": "General Relativity and Quantum Cosmology",
        "hep-ex": "High Energy Physics - Experiment",
        "hep-lat": "High Energy Physics - Lattice",
        "hep-ph": "High Energy Physics - Phenomenology",
        "hep-th": "High Energy Physics - Theory",
        "math-ph": "Mathematical Physics",
        "nlin.AO": "Adaptation and Self-Organizing Systems",
        "nlin.CD": "Chaotic Dynamics",
        "nlin.CG": "Cellular Automata and Lattice Gases",
        "nlin.PS": "Pattern Formation and Solitons",
        "nlin.SI": "Exactly Solvable and Integrable Systems",
        "nucl-ex": "Nuclear Experiment",
        "nucl-th": "Nuclear Theory",
        "physics.acc-ph": "Accelerator Physics",
        "physics.ao-ph": "Atmospheric and Oceanic Physics",
        "physics.app-ph": "Applied Physics",
        "physics.atm-clus": "Atomic and Molecular Clusters",
        "physics.atom-ph": "Atomic Physics",
        "physics.bio-ph": "Biological Physics",
        "physics.chem-ph": "Chemical Physics",
        "physics.class-ph": "Classical Physics",
        "physics.comp-ph": "Computational Physics",
        "physics.data-an": "Data Analysis, Statistics and Probability",
        "physics.ed-ph": "Physics Education",
        "physics.flu-dyn": "Fluid Dynamics",
        "physics.gen-ph": "General Physics",
        "physics.geo-ph": "Geophysics",
        "physics.hist-ph": "History and Philosophy of Physics",
        "physics.ins-det": "Instrumentation and Detectors",
        "physics.med-ph": "Medical Physics",
        "physics.optics": "Optics",
        "physics.plasm-ph": "Plasma Physics",
        "physics.pop-ph": "Popular Physics",
        "physics.soc-ph": "Physics and Society",
        "physics.space-ph": "Space Physics",
        "quant-ph": "Quantum Physics",
    },
    "Mathematics": {
        "math.AC": "Commutative Algebra",
        "math.AG": "Algebraic Geometry",
        "math.AP": "Analysis of PDEs",
        "math.AT": "Algebraic Topology",
        "math.CA": "Classical Analysis and ODEs",
        "math.CO": "Combinatorics",
        "math.CT": "Category Theory",
        "math.CV": "Complex Variables",
        "math.DG": "Differential Geometry",
        "math.DS": "Dynamical Systems",
        "math.FA": "Functional Analysis",
        "math.GM": "General Mathematics",
        "math.GN": "General Topology",
        "math.GR": "Group Theory",
        "math.GT": "Geometric Topology",
        "math.HO": "History and Overview",
        "math.IT": "Information Theory",
        "math.KT": "K-Theory and Homology",
        "math.LO": "Logic",
        "math.MG": "Metric Geometry",
        "math.MP": "Mathematical Physics",
        "math.NA": "Numerical Analysis",
        "math.NT": "Number Theory",
        "math.OA": "Operator Algebras",
        "math.OC": "Optimization and Control",
        "math.PR": "Probability",
        "math.QA": "Quantum Algebra",
        "math.RA": "Rings and Algebras",
        "math.RT": "Representation Theory",
        "math.SG": "Symplectic Geometry",
        "math.SP": "Spectral Theory",
        "math.ST": "Statistics Theory",
    },
    "Quantitative Biology": {
        "q-bio.BM": "Biomolecules",
        "q-bio.CB": "Cell Behavior",
        "q-bio.GN": "Genomics",
        "q-bio.MN": "Molecular Networks",
        "q-bio.NC": "Neurons and Cognition",
        "q-bio.OT": "Other Quantitative Biology",
        "q-bio.PE": "Populations and Evolution",
        "q-bio.QM": "Quantitative Methods",
        "q-bio.SC": "Subcellular Processes",
        "q-bio.TO": "Tissues and Organs",
    },
    "Quantitative Finance": {
        "q-fin.CP": "Computational Finance",
        "q-fin.EC": "Economics",
        "q-fin.GN": "General Finance",
        "q-fin.MF": "Mathematical Finance",
        "q-fin.PM": "Portfolio Management",
        "q-fin.PR": "Pricing of Securities",
        "q-fin.RM": "Risk Management",
        # The arXiv taxonomy names q-fin.ST "Statistical Finance" (not "Statistical Arbitrage").
        "q-fin.ST": "Statistical Finance",
        "q-fin.TR": "Trading and Market Microstructure",
    },
    "Statistics": {
        "stat.AP": "Applications",
        "stat.CO": "Computation",
        "stat.ME": "Methodology",
        "stat.ML": "Machine Learning",
        "stat.OT": "Other Statistics",
        "stat.TH": "Theory",
    },
    "Economics": {
        "econ.EM": "Econometrics",
        "econ.GN": "General Economics",
        "econ.TH": "Economic Theory",
    },
    "Electrical Engineering and Systems Science": {
        "eess.AS": "Audio and Speech Processing",
        "eess.IV": "Image and Video Processing",
        "eess.SP": "Signal Processing",
        "eess.SY": "Systems and Control",
    },
}
# Flat lookup for easy access: category code -> "<group>: <name> (<code>)" display label.
# Built with a comprehension so no loop variables are left behind in the module namespace.
ARXIV_CATEGORIES_FLAT: dict[str, str] = {
    cat_code: f"{main_cat}: {cat_name} ({cat_code})"
    for main_cat, subcats in ARXIV_CATEGORIES.items()
    for cat_code, cat_name in subcats.items()
}
def clean_doi(doi: str) -> str:
    """Normalise user input (arXiv ID, arXiv URL, ``arXiv:`` tag or DOI) to an arXiv ID.

    Problems are reported by returning a human-readable message instead of
    raising, so the result is always a string.

    Args:
        doi: An arXiv ID (``2301.00001``, ``2301.00001v2``, ``hep-th/9901001``),
            an ``arXiv:``-prefixed ID, an arxiv.org ``/abs/`` or ``/pdf/`` URL,
            or a DOI starting with ``10.``.

    Returns:
        The extracted arXiv ID, or one of the error messages
        "Invalid arXiv link. ...", "No paper found with that DOI." or
        "Invalid arXiv ID or DOI. ...".
    """
    invalid_link = "Invalid arXiv link. Please provide a link to the abstract page."
    doi = doi.strip()
    lowered = doi.lower()

    # Accept http/https and an optional "www." so every arxiv.org spelling shares one path.
    for site in (
        "https://arxiv.org/",
        "http://arxiv.org/",
        "https://www.arxiv.org/",
        "http://www.arxiv.org/",
    ):
        if not lowered.startswith(site):
            continue
        # Ignore any query string, fragment or trailing slash after the ID.
        path = doi[len(site):].split("?", 1)[0].split("#", 1)[0].rstrip("/")
        if path.startswith("abs/"):
            # Keep everything after "abs/": old-style IDs such as
            # "hep-th/9901001" contain a slash and must not be truncated.
            arxiv_id = path[len("abs/"):]
        elif path.startswith("pdf/"):
            arxiv_id = path[len("pdf/"):].removesuffix(".pdf")
        else:
            arxiv_id = ""
        return arxiv_id or invalid_link

    if lowered.startswith("arxiv:"):
        return doi.split(":")[-1].strip()

    if lowered.startswith("http"):
        return invalid_link

    if doi.startswith("10."):
        # arXiv-issued DOIs have the form 10.48550/arXiv.<id>; no lookup needed.
        arxiv_doi_prefix = "10.48550/arxiv."
        if lowered.startswith(arxiv_doi_prefix) and len(doi) > len(arxiv_doi_prefix):
            return doi[len(arxiv_doi_prefix):]

        # Otherwise resolve the DOI to BibTeX and read the arXiv ID from its eprint field.
        response = requests.get(
            "http://dx.doi.org/" + doi,
            headers={"Accept": "application/x-bibtex"},
            timeout=10,  # never hang indefinitely on an unresponsive resolver
        )
        if response.status_code != 200:
            return "No paper found with that DOI."
        _, marker, tail = response.text.rpartition("eprint = {arXiv:")
        if not marker:
            # The record has no arXiv eprint; do not return a chunk of BibTeX as an ID.
            return "No paper found with that DOI."
        return tail.split("}")[0]

    # New-style ID, e.g. "2301.00001" or "2301.00001v2".
    if doi.replace("v", "").replace(".", "").isdigit():
        return doi

    # Old-style ID, e.g. "hep-th/9901001" or "math.GT/0309136v1".
    archive, slash, number = doi.partition("/")
    if slash and archive.replace("-", "").replace(".", "").isalpha() and number.replace("v", "").isdigit():
        return doi

    return "Invalid arXiv ID or DOI. Please provide a valid arXiv ID, DOI, or arXiv URL."
def retrieve_arxiv_paper(arxiv_id: str) -> dict:
    """Retrieve a single paper's metadata from arXiv.

    Args:
        arxiv_id: The arXiv ID of the paper to retrieve.

    Returns:
        A dict with the keys ``arxiv_id``, ``title``, ``authors``,
        ``categories``, ``abstract`` and ``published_date``.

    Raises:
        ValueError: If arXiv returns no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    # Only the first hit is needed; a default of None replaces the StopIteration dance.
    paper = next(arxiv_client.results(search), None)
    if paper is None:
        raise ValueError("No paper found with that arXiv ID.")
    return dict(
        # get_short_id() keeps the archive prefix of old-style IDs
        # ("quant-ph/0201082v1"), which splitting entry_id on "/" would drop.
        arxiv_id=paper.get_short_id(),
        title=paper.title,
        authors=[author.name for author in paper.authors],
        categories=list(paper.categories),
        abstract=paper.summary,
        published_date=paper.published,
    )
def build_arxiv_category_query(
    categories: Union[str, list[str]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    start: int = 0,
    max_results: int = 5,
) -> arxiv.Search:
    """Build an ``arxiv.Search`` for the given categories and optional date window.

    Results are sorted by submission date, newest first.

    Args:
        categories: One arXiv category code or a list of them; they are OR-ed.
        start_date: Optional lower bound on the submission date. When only
            ``end_date`` is given, the window starts before arXiv existed.
        end_date: Optional upper bound on the submission date. When only
            ``start_date`` is given, the window ends at the current UTC time.
        start: Index of the first result to return. Not used when building the
            search; ``retrieve_arxiv_papers`` applies it as the ``offset`` of
            ``Client.results``. Kept for signature compatibility.
        max_results: Maximum number of results the search may yield.

    Returns:
        arxiv.Search object with the constructed query.
    """

    def _stamp(moment: datetime) -> str:
        # The arXiv API reads submittedDate bounds as GMT, so aware datetimes
        # are converted to UTC first; naive ones are taken as already UTC.
        if moment.tzinfo is not None:
            moment = moment.astimezone(timezone.utc)
        return moment.strftime("%Y%m%d%H%M")

    if isinstance(categories, str):
        categories = [categories]

    query = " OR ".join(f"cat:{cat}" for cat in categories) if categories else ""

    if start_date is not None or end_date is not None:
        # arXiv launched in August 1991, so this lower bound excludes nothing.
        lower = _stamp(start_date) if start_date is not None else "199101010000"
        upper = _stamp(end_date if end_date is not None else datetime.now(timezone.utc))
        # Plain spaces around TO: the arxiv client URL-encodes the query, which
        # turns spaces into the "+" separators shown in the API manual, whereas
        # literal "+" characters would be sent percent-encoded.
        date_clause = f"submittedDate:[{lower} TO {upper}]"
        # Without categories, "() AND ..." would be a malformed query.
        query = f"({query}) AND {date_clause}" if query else date_clause

    return arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
def retrieve_arxiv_papers(
    categories: Union[str, list[str]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    start: int = 0,
    max_results: int = 5,
) -> list[dict]:
    """Search arXiv for papers in the given categories.

    Args:
        categories: One arXiv category code or a list of them.
        start_date: Date to start searching from.
        end_date: Date to stop searching at.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        A list of dicts, one per paper, with the keys ``arxiv_id``, ``title``,
        ``authors``, ``categories``, ``abstract`` and ``published_date``.
    """
    # arxiv.Client.results() documents that a nonzero offset discards leading
    # records *of the max_results-capped result set*. The search therefore has
    # to cover start + max_results records for a full page to remain after
    # skipping the first `start`.
    search = build_arxiv_category_query(categories, start_date, end_date, start, start + max_results)
    return [
        dict(
            # get_short_id() keeps the archive prefix of old-style IDs
            # ("quant-ph/0201082v1"), which splitting entry_id on "/" would drop.
            arxiv_id=result.get_short_id(),
            title=result.title,
            authors=[author.name for author in result.authors],
            categories=list(result.categories),
            abstract=result.summary,
            published_date=result.published,
        )
        for result in arxiv_client.results(search, offset=start)
    ]
def fetch_todays_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
    """Fetch papers submitted since 00:00 UTC today in the given categories.

    Args:
        categories: One arXiv category code or a list of them.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        The list of paper dicts produced by ``retrieve_arxiv_papers``.
    """
    category_list = [categories] if isinstance(categories, str) else categories
    # Midnight (UTC) of the current day is the lower bound of the search window.
    midnight_utc = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
    return retrieve_arxiv_papers(
        category_list,
        start_date=midnight_utc,
        start=start,
        max_results=max_results,
    )
def fetch_24_hours_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
    """Fetch papers submitted during the last 24 hours in the given categories.

    Args:
        categories: One arXiv category code or a list of them.
        start: Index of the first result to return.
        max_results: Maximum number of results to return.

    Returns:
        The list of paper dicts produced by ``retrieve_arxiv_papers``.
    """
    category_list = [categories] if isinstance(categories, str) else categories
    # The window opens exactly 24 hours before the current UTC time.
    window_start = datetime.now(timezone.utc) - timedelta(hours=24)
    return retrieve_arxiv_papers(
        category_list,
        start_date=window_start,
        start=start,
        max_results=max_results,
    )
def random_arxiv_category():
    """Return the display label of one randomly chosen arXiv category.

    The label is a value of ``ARXIV_CATEGORIES_FLAT``, i.e. it has the form
    "<group>: <name> (<code>)".
    """
    labels = [*ARXIV_CATEGORIES_FLAT.values()]
    return random.choice(labels)