import os
import re
import time
import urllib.parse
import xml.etree.ElementTree as ET

import requests
from dotenv import load_dotenv
from tqdm import tqdm

from asg_query import generate_generic_query_qwen, generate_query_qwen

load_dotenv()

PARENT_FOLDER = "arxiv_downloads_new_new_new"
os.makedirs(PARENT_FOLDER, exist_ok=True)

def sanitize_filename(filename):
    filename = filename.replace("\n", "").strip()
    filename = re.sub(r'[\/:*?"<>|]', '_', filename)
    return filename[:100] + ".pdf"

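# Illustrative example (hypothetical title, not taken from the topic lists below):
#   sanitize_filename('A Hypothetical Survey:\n Graphs, Queries, and "Indexes"?')
#   -> 'A Hypothetical Survey_ Graphs, Queries, and _Indexes__.pdf'
# Newlines are removed, filesystem-unsafe characters become underscores, and the
# name is truncated to 100 characters before ".pdf" is appended.
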
def search_arxiv_papers(topic, max_results=50):
    query_qwen = generate_query_qwen(topic)
    encoded_query = urllib.parse.quote_plus(query_qwen)
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
    # base_url = "http://export.arxiv.org/api/query?"
    # query = f"search_query=all:{topic.replace(' ', '+')}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
    # url = base_url + query

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data for {topic}: {response.status_code}")
        return []

    root = ET.fromstring(response.text)
    entries = root.findall("{http://www.w3.org/2005/Atom}entry")

    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    return papers

def download_pdf(url, folder, filename):
    file_path = os.path.join(folder, filename)
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
    else:
        print(f"Failed to download {url} (status code: {response.status_code})")

def download_arxiv_papers(topic, max_results=50):
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)

    papers = search_arxiv_papers(topic, max_results)
    if not papers:
        print(f"No papers found for topic: {topic}")
        return

    print(f"Downloading {len(papers)} papers for topic: {topic}")
    for paper in tqdm(papers, total=len(papers)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)
    print(f"Download complete. Papers saved in: {folder_name}")

def search_arxiv_with_query(query, max_results=50):
    """
    Query the arXiv API with a given query string.

    Parameters:
        query (str): The query string (URL-unencoded).
        max_results (int): Maximum number of results to request.

    Returns:
        list: A list of dictionaries containing paper metadata.
    """
    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data with query: {query} | status code: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as e:
        print("Error parsing XML:", e)
        return []

    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    return papers

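# Example usage (illustrative sketch, not part of the batch pipeline below; it assumes
# network access and a query written in arXiv's fielded syntax, e.g. all:"..." AND cat:cs.DB):
#
#   papers = search_arxiv_with_query('all:"graph databases" AND cat:cs.DB', max_results=5)
#   for p in papers:
#       print(p["title"], "->", p["pdf_link"])
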
def download_arxiv_papers_new(topic, max_results=50, min_results=10):
    """
    Download arXiv papers for a given topic.

    Process:
        1. Use a strict query generated by generate_query_qwen(topic) to query arXiv.
        2. If the number of results is fewer than `min_results`, generate a more generic query
           using generate_generic_query_qwen() and run additional searches.
        3. Combine non-duplicate papers (filtered by title) until reaching max_results or exhausting attempts.
        4. Download the PDF of each paper.

    Parameters:
        topic (str): The research topic.
        max_results (int): Total maximum number of papers to download (default is 50).
        min_results (int): Minimum acceptable number of papers from the first query (default is 10).
    """
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)

    # 1. Initial strict query.
    strict_query = generate_query_qwen(topic)
    papers = search_arxiv_with_query(strict_query, max_results=max_results)

    # Use a dict keyed by title to avoid duplicates.
    total_papers = {paper["title"]: paper for paper in papers}
    print(f"[Strict Query] Found {len(total_papers)} papers for topic: {topic}")

    # 2. If the strict query returns fewer than min_results papers,
    #    use the generic query to broaden the search.
    attempts = 0
    MAX_ATTEMPTS = 5  # Limit attempts to avoid infinite loops.
    while len(total_papers) < max_results and len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
        # Generate a less strict (generic) query.
        generic_query = generate_generic_query_qwen(strict_query, topic)
        print(f"[Generic Query Attempt {attempts + 1}] Using generic query: {generic_query}")
        generic_papers = search_arxiv_with_query(generic_query, max_results=max_results)

        new_count = 0
        for paper in generic_papers:
            if paper["title"] not in total_papers:
                total_papers[paper["title"]] = paper
                new_count += 1
            if len(total_papers) >= max_results:
                break
        print(f"[Generic Query Attempt {attempts + 1}] Added {new_count} new papers.")

        attempts += 1
        strict_query = generic_query  # Update the query for the next iteration.

    total_paper_list = list(total_papers.values())[:max_results]
    if not total_paper_list:
        print(f"No papers found for topic: {topic}")
        return

    print(f"Downloading {len(total_paper_list)} papers for topic: {topic}")
    for paper in tqdm(total_paper_list, total=len(total_paper_list)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)  # Delay to avoid overwhelming the arXiv API
    print(f"Download complete. Papers saved in: {folder_name}")

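# Example usage (sketch; mirrors the batch loops in __main__ below, but for a single
# topic and with smaller limits for a quick manual test):
#
#   download_arxiv_papers_new("vector databases", max_results=10, min_results=5)
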
first_topics = [
    "quantum computing: bqp, quantum supremacy, and related concepts",
    "fixed-parameter tractability and related concepts in computational complexity",
    "fundamental concepts in computational complexity theory",
    "pcp theorem and its implications in approximation and complexity theory",
    "interconnections in theoretical computer science: seth, 3sum, apsp, and related concepts",
    "nosql database systems for flexible and scalable data management",
    "temporal databases, real-time databases, and data management systems",
    "large language model integration with databases for enhanced data management and survey analysis",
    "ai-driven database management",
    "distributed systems and databases: key concepts and technologies",
    "graph databases and query languages: traversal, indexing, and analytics",
    "graph databases: models, data modeling, and applications",
    "multi-model databases: mongodb, arangodb, and jsonb",
    "time-series data management and analytics",
    "advanced data management and retrieval techniques",
    "vector databases and their role in modern data management and retrieval",
    "content delivery networks: technologies and strategies for optimization",
    "lpwan technologies: lora, zigbee 3.0, 6lowpan, and related protocols in iot",
    "network slicing and emerging technologies in 6g networks",
    "advanced concepts and technologies in software-defined networking and network function virtualization",
    "battery electrolyte formulation in lithium-ion batteries",
    "flow batteries as energy storage systems",
    "internal consistency, self-feedback, and reliability in large language models",
    "attention mechanisms in large language models",
    "controlled text generation with large language models in natural language processing",
    "domain adaptation and specialized nlp applications",
    "evaluation of large language models for natural language processing",
    "information extraction and large language models in natural language processing",
    "techniques for low-resource natural language processing",
    "model compression techniques for transformer models",
    "multi-agent offline policy reinforcement learning: decentralized learning and cooperative policy optimization",
    "multimodal learning and its applications",
    "reasoning capabilities of large language models",
    "transformer models in natural language processing"
]

second_topics = [
    "semi-supervised learning",
    "out-of-distribution detection",
    "in-context learning"
]

if __name__ == '__main__':
    for topic in first_topics:
        print(f"\nProcessing topic (first list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)

    for topic in second_topics:
        print(f"\nProcessing topic (second list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)