import os
import re
import time
import urllib.parse
import xml.etree.ElementTree as ET

import requests
from dotenv import load_dotenv
from tqdm import tqdm

from asg_query import generate_generic_query_qwen, generate_query_qwen

load_dotenv()

# Root folder for downloaded papers; each topic gets its own subfolder.
PARENT_FOLDER = "arxiv_downloads_new_new_new"
os.makedirs(PARENT_FOLDER, exist_ok=True)

def sanitize_filename(filename):
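    """Strip newlines, replace characters that are invalid in filenames, and truncate to 100 chars before appending ".pdf"."""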
    filename = filename.replace("\n", "").strip()
    filename = re.sub(r'[\/:*?"<>|]', '_', filename)
    return filename[:100] + ".pdf"

def search_arxiv_papers(topic, max_results=50):
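    """Search arXiv with a Qwen-generated query for `topic` and return a list of {"title", "pdf_link"} dicts."""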
    query_qwen = generate_query_qwen(topic)
    encoded_query = urllib.parse.quote_plus(query_qwen)
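    # Query the arXiv Atom API (export.arxiv.org), sorting results by submission date.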
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"

    # base_url = "http://export.arxiv.org/api/query?"
    # query = f"search_query=all:{topic.replace(' ', '+')}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
    # url = base_url + query
    
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data for {topic}: {response.status_code}")
        return []
    
    root = ET.fromstring(response.text)
    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
    
    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text
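        # The Atom <id> field is the abstract URL; swapping "abs" for "pdf" yields the PDF link.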
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    
    return papers

def download_pdf(url, folder, filename):
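    """Stream the PDF at `url` into folder/filename, writing it in 1 KB chunks."""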
    file_path = os.path.join(folder, filename)
    
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
    else:
        print(f"Failed to download {url} (status code: {response.status_code})")

def download_arxiv_papers(topic, max_results=50):
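    """Download up to `max_results` papers for `topic` into a topic-specific subfolder of PARENT_FOLDER."""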
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)

    papers = search_arxiv_papers(topic, max_results)

    if not papers:
        print(f"No papers found for topic: {topic}")
        return
    
    print(f"Downloading {len(papers)} papers for topic: {topic}")

    for paper in tqdm(papers, total=len(papers)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)
        
    print(f"Download complete. Papers saved in: {folder_name}")


def search_arxiv_with_query(query, max_results=50):
    """
    Query the arXiv API with a given query string.
    
    Parameters:
      query (str): The query string (URL-unencoded).
      max_results (int): Maximum number of results to request.
      
    Returns:
      list: A list of dictionaries containing paper metadata.
    """
    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"
    
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data with query: {query} | status code: {response.status_code}")
        return []
    
    try:
        root = ET.fromstring(response.text)
    except Exception as e:
        print("Error parsing XML:", e)
        return []
    
    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    return papers

def download_arxiv_papers_new(topic, max_results=50, min_results=10):
    """
    Download arXiv papers for a given topic.
    
    Process:
      1. Use a strict query generated by generate_query_qwen(topic) to query arXiv.
      2. If the number of results is fewer than `min_results`, then generate a more generic query
         using generate_generic_query_qwen() and run additional searches.
      3. Combine non-duplicate papers (filtered by title) until reaching max_results or exhausting attempts.
      4. Download the PDF of each paper.
      
    Parameters:
      topic (str): The research topic.
      max_results (int): Total maximum number of papers to download (default is 50).
      min_results (int): Minimum acceptable number of papers from the first query (default is 10).
    """
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)
    
    # 1. Initial strict query.
    strict_query = generate_query_qwen(topic)
    papers = search_arxiv_with_query(strict_query, max_results=max_results)
    
    # Use a dict keyed by title to avoid duplicates.
    total_papers = {paper["title"]: paper for paper in papers}
    print(f"[Strict Query] Found {len(total_papers)} papers for topic: {topic}")
    
    # 2. If the strict query returns fewer than min_results papers,
    #    use the generic query to broaden the search.
    attempts = 0
    MAX_ATTEMPTS = 5  # Limit attempts to avoid infinite loops.
    while len(total_papers) < max_results and len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
        # Generate a less strict (generic) query
        generic_query = generate_generic_query_qwen(strict_query, topic)
        print(f"[Generic Query Attempt {attempts + 1}] Using generic query: {generic_query}")
        generic_papers = search_arxiv_with_query(generic_query, max_results=max_results)
        
        new_count = 0
        for paper in generic_papers:
            if paper["title"] not in total_papers:
                total_papers[paper["title"]] = paper
                new_count += 1
            if len(total_papers) >= max_results:
                break
        
        print(f"[Generic Query Attempt {attempts + 1}] Added {new_count} new papers (running total: {len(total_papers)}).")
        attempts += 1
        strict_query = generic_query  # Update the query for the next iteration.

    total_paper_list = list(total_papers.values())[:max_results]
    
    if not total_paper_list:
        print(f"No papers found for topic: {topic}")
        return
    
    print(f"Downloading {len(total_paper_list)} papers for topic: {topic}")
    for paper in tqdm(total_paper_list, total=len(total_paper_list)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)  # Delay to avoid overwhelming the arXiv API
        
    print(f"Download complete. Papers saved in: {folder_name}")

first_topics = [
    "quantum computing: bqp, quantum supremacy, and related concepts",
    "fixed-parameter tractability and related concepts in computational complexity",
    "fundamental concepts in computational complexity theory",
    "pcp theorem and its implications in approximation and complexity theory",
    "interconnections in theoretical computer science: seth, 3sum, apsp, and related concepts",
    "nosql database systems for flexible and scalable data management",
    "temporal databases, real-time databases, and data management systems",
    "large language model integration with databases for enhanced data management and survey analysis",
    "ai-driven database management",
    "distributed systems and databases: key concepts and technologies",
    "graph databases and query languages: traversal, indexing, and analytics",
    "graph databases: models, data modeling, and applications",
    "multi-model databases: mongodb, arangodb, and jsonb",
    "time-series data management and analytics",
    "advanced data management and retrieval techniques",
    "vector databases and their role in modern data management and retrieval",
    "content delivery networks: technologies and strategies for optimization",
    "lpwan technologies: lora, zigbee 3.0, 6lowpan, and related protocols in iot",
    "network slicing and emerging technologies in 6g networks",
    "advanced concepts and technologies in software-defined networking and network function virtualization",
    "battery electrolyte formulation in lithium-ion batteries",
    "flow batteries as energy storage systems",
    "internal consistency, self-feedback, and reliability in large language models",
    "attention mechanisms in large language models",
    "controlled text generation with large language models in natural language processing",
    "domain adaptation and specialized nlp applications",
    "evaluation of large language models for natural language processing",
    "information extraction and large language models in natural language processing",
    "techniques for low-resource natural language processing",
    "model compression techniques for transformer models",
    "multi-agent offline policy reinforcement learning: decentralized learning and cooperative policy optimization",
    "multimodal learning and its applications",
    "reasoning capabilities of large language models",
    "transformer models in natural language processing"
]

second_topics = [
    "semi-supervised learning",
    "out-of-distribution detection",
    "in-context learning"
]

if __name__ == '__main__':
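    # Fetch up to 50 papers per topic; require at least 20 hits from the strict query before broadening with generic queries.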
    for topic in first_topics:
        print(f"\nProcessing topic (first list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)
    for topic in second_topics:
        print(f"\nProcessing topic (second list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)