Commit e6a1391 · 1 parent: 2a20f7f
Flatten repo history

Changed files:
- .gitignore +1 -0
- README.md +8 -2
- app.py +630 -0
- arxiv_stuff.py +372 -0
- requirements.txt +9 -0

.gitignore
ADDED
@@ -0,0 +1 @@
+.env

README.md
CHANGED
@@ -1,14 +1,20 @@
 ---
 title: Research Compass
 emoji: 🧭
-colorFrom:
+colorFrom: blue
 colorTo: red
+python_version: 3.10
 sdk: gradio
 sdk_version: 5.22.0
 app_file: app.py
 pinned: false
 license: agpl-3.0
 short_description: Connect research papers. Discover new insights.
+datasets:
+- "nomadicsynth/arxiv-dataset-abstract-embeddings"
+models:
+- "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+- "meta-llama/Llama-3.2-3B-Instruct"
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
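
Note: the `datasets:` and `models:` entries added above declare the Hub resources this Space depends on, and they match the IDs that app.py pins at startup. A minimal sketch of that correspondence (revision strings copied from app.py, not from the README):

    from datasets import load_dataset
    from sentence_transformers import SentenceTransformer

    # Same dataset and embedding model IDs as declared in the README front matter,
    # pinned to the revisions app.py uses.
    dataset = load_dataset("nomadicsynth/arxiv-dataset-abstract-embeddings", revision="v1.0.0")
    model = SentenceTransformer(
        "nomadicsynth/research-compass-arxiv-abstracts-embedding-model",
        revision="2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final",
    )
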
app.py
ADDED
@@ -0,0 +1,630 @@
+import json
+import os
+
+import faiss
+import gradio as gr
+import pandas as pd
+import spaces
+import torch
+from datasets import load_dataset
+from huggingface_hub import InferenceClient, hf_hub_download
+from huggingface_hub import login as hf_hub_login
+from huggingface_hub import upload_file
+from sentence_transformers import SentenceTransformer
+
+from arxiv_stuff import ARXIV_CATEGORIES_FLAT
+
+# Get HF_TOKEN from environment variables
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Login to Hugging Face Hub
+hf_hub_login(token=HF_TOKEN, add_to_git_credential=True)
+
+# Dataset details
+dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
+dataset_revision = "v1.0.0"
+local_index_path = "arxiv_faiss_index.faiss"
+
+# Embedding model details
+embedding_model_name = "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+embedding_model_revision = "2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final"
+
+# Analysis model details
+
+# Settings for Llama-3.3-70B-Instruct
+reasoning_model_id = "meta-llama/Llama-3.3-70B-Instruct"
+max_length = 1024 * 4
+temperature = None
+top_p = None
+presence_penalty = None
+
+# Settings for QwQ-32B
+# reasoning_model_id = "Qwen/QwQ-32B"
+# reasoning_start_tag = "<think>"
+# reasoning_end_tag = "</think>"
+# max_length = 1024 * 4
+# temperature = 0.6
+# top_p = 0.95
+# presence_penalty = 0.1
+
+# Global variables
+dataset = None
+embedding_model = None
+reasoning_model = None
+
+
+def save_faiss_index_to_hub():
+    """Save the FAISS index to the Hub for easy access"""
+    global dataset, local_index_path
+    # 1. Save the index to a local file
+    dataset["train"].save_faiss_index("embedding", local_index_path)
+    print(f"FAISS index saved locally to {local_index_path}")
+
+    # 2. Upload the index file to the Hub
+    remote_path = upload_file(
+        path_or_fileobj=local_index_path,
+        path_in_repo=local_index_path,  # Same name on the Hub
+        repo_id=dataset_name,  # Use your dataset repo
+        token=HF_TOKEN,
+        repo_type="dataset",  # This is a dataset file
+        revision=dataset_revision,  # Use the same revision as the dataset
+        commit_message="Add FAISS index",  # Commit message
+    )
+
+    print(f"FAISS index uploaded to Hub at {remote_path}")
+
+    # Remove the local file. It's now stored on the Hub.
+    os.remove(local_index_path)
+
+
+def setup_dataset():
+    """Load dataset with FAISS index"""
+    global dataset
+    print("Loading dataset from Hugging Face...")
+
+    # Load dataset
+    dataset = load_dataset(
+        dataset_name,
+        revision=dataset_revision,
+    )
+
+    # Try to load the index from the Hub
+    try:
+        print("Downloading pre-built FAISS index...")
+        index_path = hf_hub_download(
+            repo_id=dataset_name,
+            filename="arxiv_faiss_index.faiss",
+            revision=dataset_revision,
+            token=HF_TOKEN,
+            repo_type="dataset",
+        )
+
+        print("Loading pre-built FAISS index...")
+        dataset["train"].load_faiss_index("embedding", index_path)
+        print("Pre-built FAISS index loaded successfully")
+
+    except Exception as e:
+        print(f"Could not load pre-built index: {e}")
+        print("Building new FAISS index...")
+
+        # Add FAISS index if it doesn't exist
+        if not dataset["train"].features.get("embedding"):
+            print("Dataset doesn't have 'embedding' column, cannot create FAISS index")
+            raise ValueError("Dataset doesn't have 'embedding' column")
+
+        dataset["train"].add_faiss_index(
+            column="embedding",
+            metric_type=faiss.METRIC_INNER_PRODUCT,
+            string_factory="HNSW,RFlat",  # Using reranking
+        )
+
+        # Save the FAISS index to the Hub
+        save_faiss_index_to_hub()
+
+    print(f"Dataset loaded with {len(dataset['train'])} items and FAISS index ready")
+
+
+def init_embedding_model(model_name_or_path: str, model_revision: str = None) -> SentenceTransformer:
+    global embedding_model
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    embedding_model = SentenceTransformer(
+        model_name_or_path,
+        revision=model_revision,
+        token=HF_TOKEN,
+        device=device,
+    )
+    return embedding_model
+
+
+def init_reasoning_model(model_name: str) -> InferenceClient:
+    global reasoning_model
+    reasoning_model = InferenceClient(
+        model=model_name,
+        provider="hf-inference",
+        api_key=HF_TOKEN,
+    )
+    return reasoning_model
+
+
+def generate(messages: list[dict[str, str]]) -> str:
+    """
+    Generate a response to a list of messages.
+
+    Args:
+        messages: A list of message dictionaries with a "role" and "content" key.
+
+    Returns:
+        The generated response as a string.
+    """
+    global reasoning_model
+
+    system_message = {
+        "role": "system",
+        "content": "You are an expert in evaluating connections between research papers.",
+    }
+
+    messages.insert(0, system_message)
+
+    response_schema = r"""{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Generated schema for Root",
+  "type": "object",
+  "properties": {
+    "reasoning": {
+      "type": "string"
+    },
+    "key_connections": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "connection": {
+            "type": "string"
+          },
+          "description": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "connection",
+          "description"
+        ]
+      }
+    },
+    "synergies_and_complementarities": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "type": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "description": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "type",
+          "description"
+        ]
+      }
+    },
+    "research_potential": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "potential": {
+            "type": "string"
+          },
+          "description": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "potential",
+          "description"
+        ]
+      }
+    },
+    "rating": {
+      "type": "number"
+    },
+    "confidence": {
+      "type": "number"
+    }
+  },
+  "required": [
+    "reasoning",
+    "key_connections",
+    "synergies_and_complementarities",
+    "research_potential",
+    "rating",
+    "confidence"
+  ]
+}"""
+
+    response_format = {
+        "type": "json",
+        "value": response_schema,
+    }
+
+    result = reasoning_model.chat.completions.create(
+        messages=messages,
+        max_tokens=max_length,
+        temperature=temperature,
+        presence_penalty=presence_penalty,
+        response_format=response_format,
+        top_p=top_p,
+    )
+
+    output = result.choices[0].message.content.strip()
+    return output
+
+
+@spaces.GPU
+def embed_text(text: str | list[str]) -> torch.Tensor:
+    global embedding_model
+
+    # Strip any leading/trailing whitespace
+    text = text.strip() if isinstance(text, str) else [t.strip() for t in text]
+    embeddings = embedding_model.encode(text, normalize_embeddings=True)  # Ensure vectors are normalized
+    return embeddings
+
+
+def analyse_abstracts(query_abstract: str, compare_abstract: dict) -> str:
+    """Analyze the relationship between two abstracts and return formatted analysis"""
+    # Highlight the synergies in these papers that would justify further research
+    messages = [
+        {
+            "role": "user",
+            "content": f"""You are trained in evaluating connections between research papers. Please **identify and analyze the links** between these two papers:
+
+Paper 1 Abstract:
+{query_abstract}
+
+Paper 2 Abstract:
+{compare_abstract["abstract"]}
+
+Consider the following aspects in your evaluation:
+
+* **Methodological Cross-Pollination**: How do the methods or approaches from one paper **directly enhance or inform** the other?
+* **Principle or Mechanism Extension**: Do the papers **share underlying principles or mechanisms** that can be **combined or extended** to yield new insights?
+* **Interdisciplinary Connections**: Are there **clear opportunities** for interdisciplinary collaborations or knowledge transfer between the two papers?
+* **Solution or Application Bridge**: Can the solutions or applications presented in one paper be **directly adapted or integrated** with the other to create **novel, actionable outcomes**?
+
+Consider the connections in either direction, that is, from Paper 1 -> Paper 2, or vice versa, from Paper 2 -> Paper 1.
+
+Return a valid JSON object with this structure:
+{{
+    "reasoning": "Step-by-step analysis of the papers, highlighting **key established connections**, identified synergies, and **concrete complementarities**. Emphasize the most **critical, actionable insights** or **key takeaways** from the analysis using markdown bold.",
+
+    # Main connecting concepts, methods, or principles
+    "key_connections": [
+        {{
+            "connection": "connection 1",
+            "description": "Brief description (1-2 sentences) for the **established connection**, explaining its **direct relevance** to the synergy analysis."
+        }},
+        ...
+    ],
+
+    "synergies_and_complementarities": [
+        {{
+            "type": ["Methodological Cross-Pollination", "Principle or Mechanism Extension", "Interdisciplinary Connections", "Solution or Application Bridge"],  # Choose only one type per entry, and only include relevant types to this analysis
+            "description": "Brief explanation (1-2 sentences) of the **identified, concrete synergy** or **complementarity**, and a **specific, actionable example** to illustrate the concept."
+        }},
+        ...
+    ],
+
+    # Novel, actionable outcomes or applications emerging from the synergies
+    "research_potential": [
+        {{
+            "potential": "Actionable outcome or application 1",
+            "description": "Brief description (1-2 sentences) of the **concrete potential outcome** or **application**, and a **specific scenario** to illustrate its **direct impact**."
+        }},
+        ...
+    ],
+
+    "rating": 1-5,  # Overall rating of the papers' synergy potential, where:
+    # 1 = **No synergy or connection** (definitely no link between the papers)
+    # 2 = **Low potential for synergy** (some vague or speculative connection, but highly uncertain)
+    # 3 = **Plausible synergy potential** (some potential connections, but requiring further investigation to confirm)
+    # 4 = **Established synergy with potential for growth** (clear connections with opportunities for further development)
+    # 5 = **High established synergy with direct, clear opportunities** (strong, concrete links with immediate, actionable outcomes)
+
+    "confidence": 0.0-1.0,  # Confidence in your analysis, as a floating-point value representing the probability of your assessment being accurate
+}}
+
+Return only the JSON object, with double quotes around key names and all string values.""",
+        },
+    ]
+
+    # Generate analysis
+    try:
+        output = generate(messages)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Parse the JSON output
+    try:
+        output = json.loads(output)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Format the output as markdown for better display
+    key_connections = ""
+    synergies_and_complementarities = ""
+    research_potential = ""
+    if "key_connections" in output:
+        for connection in output["key_connections"]:
+            key_connections += f"- {connection['connection']}: {connection['description']}\n"
+
+    if "synergies_and_complementarities" in output:
+        for synergy in output["synergies_and_complementarities"]:
+            synergies_and_complementarities += f"- {', '.join(synergy['type'])}: {synergy['description']}\n"
+
+    if "research_potential" in output:
+        for potential in output["research_potential"]:
+            research_potential += f"- {potential['potential']}: {potential['description']}\n"
+
+    formatted_output = f"""## Synergy Analysis
+
+**Rating**: {'⭐' * output['rating']}{'☆' * (5 - output['rating'])} **Confidence**: {'⭐' * round(output['confidence'] * 5)}{'☆' * round((1 - output['confidence']) * 5)}
+
+### Key Connections
+{key_connections}
+
+### Synergies and Complementarities
+{synergies_and_complementarities}
+
+### Research Potential
+{research_potential}
+
+### Reasoning
+{output['reasoning']}
+"""
+    return formatted_output
+    # return '```"""\n' + output + '\n"""```'
+
+
+# arXiv Embedding Dataset Details
+# DatasetDict({
+#     train: Dataset({
+#         features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'update_date', 'embedding', 'timestamp', 'embedding_model'],
+#         num_rows: 2689088
+#     })
+# })
+
+
+def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
+    """Find papers synergistic with the given abstract using FAISS with cosine similarity"""
+    global dataset
+
+    # Generate embedding for the query abstract (normalized for cosine similarity)
+    abstract_embedding = embed_text(abstract)
+
+    # Search for similar papers using FAISS with inner product (cosine similarity for normalized vectors)
+    scores, examples = dataset["train"].get_nearest_examples("embedding", abstract_embedding, k=limit)
+
+    papers = []
+    for i in range(len(scores)):
+        # With cosine similarity, higher scores are better (closer to 1)
+        paper_dict = {
+            "id": examples["id"][i],
+            "title": examples["title"][i],
+            "authors": examples["authors"][i],
+            "categories": examples["categories"][i],
+            "abstract": examples["abstract"][i],
+            "update_date": examples["update_date"][i],
+            "synergy_score": float(scores[i]),  # Convert to float for serialization
+        }
+        papers.append(paper_dict)
+
+    return papers
+
+
+def format_search_results(abstract: str) -> tuple[pd.DataFrame, list[dict]]:
+    """Format search results as a DataFrame for display"""
+    # Find papers synergistic with the given abstract
+    papers = find_synergistic_papers(abstract)
+
+    # Convert to DataFrame for display
+    df = pd.DataFrame(
+        [
+            {
+                "Title": p["title"],
+                "Authors": p["authors"][:50] + "..." if len(p["authors"]) > 50 else p["authors"],
+                "Categories": p["categories"],
+                "Date": p["update_date"],
+                "Match Score": f"{int(p['synergy_score'] * 100)}%",
+                "ID": p["id"],  # Hidden column for reference
+            }
+            for p in papers
+        ]
+    )
+
+    return df, papers  # Return both DataFrame and original data
+
+
+def format_paper_as_markdown(paper: dict) -> str:
+    # Convert category codes to full names, handling unknown categories
+    subjects = []
+    for subject in paper["categories"].split():
+        if subject in ARXIV_CATEGORIES_FLAT:
+            subjects.append(ARXIV_CATEGORIES_FLAT[subject])
+        else:
+            subjects.append(f"Unknown Category ({subject})")
+
+    paper["title"] = paper["title"].replace("\n", " ").strip()
+    paper["authors"] = paper["authors"].replace("\n", " ").strip()
+
+    return f"""# {paper["title"]}
+### {paper["authors"]}
+#### {', '.join(subjects)} | {paper["update_date"]} | **Score**: {int(paper['synergy_score'] * 100)}%
+**[arxiv:{paper["id"]}](https://arxiv.org/abs/{paper["id"]})** - [PDF](https://arxiv.org/pdf/{paper["id"]})<br>
+
+{paper["abstract"]}
+"""
+
+
+latex_delimiters = [
+    {"left": "$$", "right": "$$", "display": True},
+    # {"left": "$", "right": "$", "display": False},
+    # {"left": "\\(", "right": "\\)", "display": False},
+    # {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
+    # {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
+    # {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
+    # {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
+    # {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
+    # {"left": "\\[", "right": "\\]", "display": True},
+    # {"left": "\\underline{", "right": "}", "display": False},
+    # {"left": "\\textit{", "right": "}", "display": False},
+    # {"left": "{", "right": "}", "display": False},
+]
+
+
+def create_interface():
+    with gr.Blocks(
+        css="""
+        .cell-menu-button {
+            display: none;
+        }"""
+    ) as demo:
+        gr.HTML(
+            """
+            <div style="text-align: center; margin-bottom: 1rem">
+                <h1>Research Compass</h1>
+                <p>Find synergistic papers to enrich your research</p>
+                <p>An experiment in AI-driven research synergy analysis</p>
+            </div>
+            """
+        )
+
+        with gr.Accordion(label="Instructions", open=False):
+            gr.Markdown(
+                """
+                1. **Enter Abstract**: Paste an abstract or describe your research details in the text box.
+                2. **Search for Synergistic Papers**: Click the button to find papers with similar themes.
+                3. **Select a Paper**: Click on a row in the results table to view paper details.
+                4. **Analyze Connection Potential**: Click the button to analyze the synergy potential between the papers.
+                5. **Synergy Analysis**: View the detailed analysis of the connection potential between the papers.
+                """
+            )
+
+        abstract_input = gr.Textbox(
+            label="Paper Abstract or Description",
+            placeholder="Paste an abstract or describe research details...",
+            lines=8,
+            key="abstract",
+        )
+        search_btn = gr.Button("Search for Synergistic Papers", variant="primary")
+
+        # Store full paper data
+        paper_data_state = gr.State([])
+
+        # Store query abstract
+        query_abstract_state = gr.State("")
+
+        # Store selected paper
+        selected_paper_state = gr.State(None)
+
+        # Use Dataframe for results
+        results_df = gr.Dataframe(
+            headers=["Title", "Authors", "Categories", "Date", "Match Score"],
+            datatype=["markdown", "markdown", "str", "date", "str"],
+            latex_delimiters=latex_delimiters,
+            label="Synergistic Papers",
+            interactive=False,
+            wrap=False,
+            line_breaks=False,
+            column_widths=["40%", "20%", "20%", "10%", "10%", "0%"],  # Hide ID column
+            key="results",
+        )
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                paper_details_output = gr.Markdown(
+                    value="# Paper Details",
+                    label="Paper Details",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="paper_details",
+                )
+                analyze_btn = gr.Button("Analyze Connection Potential", variant="primary", interactive=False)
+            with gr.Column(scale=1):
+                # Analysis output
+                analysis_output = gr.Markdown(
+                    value="# Synergy Analysis",
+                    label="Synergy Analysis",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="analysis_output",
+                )
+
+        # Display paper details when row is selected
+        def on_select(evt: gr.SelectData, papers, query):
+            selected_index = evt.index[0]  # Get the row index
+            selected = papers[selected_index]
+
+            # Format paper details
+            details_md = format_paper_as_markdown(selected)
+
+            return details_md, selected
+
+        # Connect search button to the search function
+        search_btn.click(
+            format_search_results,
+            inputs=[abstract_input],
+            outputs=[results_df, paper_data_state],
+        ).then(
+            lambda x: x,  # Identity function to pass through the abstract
+            inputs=[abstract_input],
+            outputs=[query_abstract_state],
+        ).then(
+            lambda: None,  # Reset selected paper
+            outputs=[selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=False),  # Disable analyze button until paper selected
+            outputs=[analyze_btn],
+        ).then(
+            lambda: "# Synergy Analysis",  # Clear previous analysis
+            outputs=[analysis_output],
+        )
+
+        # Use built-in select event from Dataframe
+        results_df.select(
+            on_select,
+            inputs=[paper_data_state, query_abstract_state],
+            outputs=[paper_details_output, selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=True),  # Enable analyze button when paper selected
+            outputs=[analyze_btn],
+        )
+
+        # Connect analyze button to run analysis
+        analyze_btn.click(
+            analyse_abstracts,
+            inputs=[query_abstract_state, selected_paper_state],
+            outputs=[analysis_output],
+            show_progress_on=[paper_details_output, analysis_output],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    # Load dataset with FAISS index
+    setup_dataset()
+
+    # Initialize the embedding model
+    init_embedding_model(embedding_model_name, embedding_model_revision)
+
+    # Initialize the reasoning model
+    reasoning_model = init_reasoning_model(reasoning_model_id)
+
+    demo = create_interface()
+    demo.queue().launch(ssr_mode=False)
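
Note: the core retrieval path in app.py is an inner-product FAISS search over unit-normalized abstract embeddings, so the returned scores are cosine similarities (higher is better). A minimal sketch of that flow, reusing the `dataset` and `model` objects from the sketch after the README section and building the index in memory (the deployed app instead downloads a pre-built index from the Hub; the query text below is a placeholder):

    import faiss

    # Index the precomputed "embedding" column with inner product as the metric.
    dataset["train"].add_faiss_index(column="embedding", metric_type=faiss.METRIC_INNER_PRODUCT)

    # Embed the query the same way the corpus was embedded, normalized to unit length.
    query = model.encode("An abstract describing the research idea...", normalize_embeddings=True)

    # Top-5 nearest abstracts; inner product over unit vectors is cosine similarity.
    scores, examples = dataset["train"].get_nearest_examples("embedding", query, k=5)
    for score, title in zip(scores, examples["title"]):
        print(f"{score:.2f}  {title}")
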
arxiv_stuff.py
ADDED
@@ -0,0 +1,372 @@
+import random
+from datetime import datetime, timedelta, timezone
+from typing import Optional, Union
+
+import arxiv
+import requests
+
+# Initialize the arXiv API client
+arxiv_client = arxiv.Client()
+
+ARXIV_CATEGORIES = {
+    "Computer Science": {
+        "cs.AI": "Artificial Intelligence",
+        "cs.AR": "Hardware Architecture",
+        "cs.CC": "Computational Complexity",
+        "cs.CE": "Computational Engineering",
+        "cs.CG": "Computational Geometry",
+        "cs.CL": "Computation and Language",
+        "cs.CR": "Cryptography and Security",
+        "cs.CV": "Computer Vision and Pattern Recognition",
+        "cs.CY": "Computers and Society",
+        "cs.DB": "Databases",
+        "cs.DC": "Distributed Computing",
+        "cs.DL": "Digital Libraries",
+        "cs.DM": "Discrete Mathematics",
+        "cs.DS": "Data Structures and Algorithms",
+        "cs.ET": "Emerging Technologies",
+        "cs.FL": "Formal Languages and Automata Theory",
+        "cs.GL": "General Literature",
+        "cs.GR": "Graphics",
+        "cs.GT": "Computer Science and Game Theory",
+        "cs.HC": "Human-Computer Interaction",
+        "cs.IR": "Information Retrieval",
+        "cs.IT": "Information Theory",
+        "cs.LG": "Machine Learning",
+        "cs.LO": "Logic in Computer Science",
+        "cs.MA": "Multiagent Systems",
+        "cs.MM": "Multimedia",
+        "cs.MS": "Mathematical Software",
+        "cs.NA": "Numerical Analysis",
+        "cs.NE": "Neural and Evolutionary Computing",
+        "cs.NI": "Networking and Internet Architecture",
+        "cs.OH": "Other Computer Science",
+        "cs.OS": "Operating Systems",
+        "cs.PF": "Performance",
+        "cs.PL": "Programming Languages",
+        "cs.RO": "Robotics",
+        "cs.SC": "Symbolic Computation",
+        "cs.SD": "Sound",
+        "cs.SE": "Software Engineering",
+        "cs.SI": "Social and Information Networks",
+        "cs.SY": "Systems and Control",
+    },
+    "Physics": {
+        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+        "astro-ph.EP": "Earth and Planetary Astrophysics",
+        "astro-ph.GA": "Astrophysics of Galaxies",
+        "astro-ph.HE": "High Energy Astrophysical Phenomena",
+        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+        "astro-ph.SR": "Solar and Stellar Astrophysics",
+        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+        "cond-mat.mtrl-sci": "Materials Science",
+        "cond-mat.other": "Other Condensed Matter",
+        "cond-mat.quant-gas": "Quantum Gases",
+        "cond-mat.soft": "Soft Condensed Matter",
+        "cond-mat.stat-mech": "Statistical Mechanics",
+        "cond-mat.str-el": "Strongly Correlated Electrons",
+        "cond-mat.supr-con": "Superconductivity",
+        "gr-qc": "General Relativity and Quantum Cosmology",
+        "hep-ex": "High Energy Physics - Experiment",
+        "hep-lat": "High Energy Physics - Lattice",
+        "hep-ph": "High Energy Physics - Phenomenology",
+        "hep-th": "High Energy Physics - Theory",
+        "math-ph": "Mathematical Physics",
+        "nlin.AO": "Adaptation and Self-Organizing Systems",
+        "nlin.CD": "Chaotic Dynamics",
+        "nlin.CG": "Cellular Automata and Lattice Gases",
+        "nlin.PS": "Pattern Formation and Solitons",
+        "nlin.SI": "Exactly Solvable and Integrable Systems",
+        "nucl-ex": "Nuclear Experiment",
+        "nucl-th": "Nuclear Theory",
+        "physics.acc-ph": "Accelerator Physics",
+        "physics.ao-ph": "Atmospheric and Oceanic Physics",
+        "physics.app-ph": "Applied Physics",
+        "physics.atm-clus": "Atomic and Molecular Clusters",
+        "physics.atom-ph": "Atomic Physics",
+        "physics.bio-ph": "Biological Physics",
+        "physics.chem-ph": "Chemical Physics",
+        "physics.class-ph": "Classical Physics",
+        "physics.comp-ph": "Computational Physics",
+        "physics.data-an": "Data Analysis, Statistics and Probability",
+        "physics.ed-ph": "Physics Education",
+        "physics.flu-dyn": "Fluid Dynamics",
+        "physics.gen-ph": "General Physics",
+        "physics.geo-ph": "Geophysics",
+        "physics.hist-ph": "History and Philosophy of Physics",
+        "physics.ins-det": "Instrumentation and Detectors",
+        "physics.med-ph": "Medical Physics",
+        "physics.optics": "Optics",
+        "physics.plasm-ph": "Plasma Physics",
+        "physics.pop-ph": "Popular Physics",
+        "physics.soc-ph": "Physics and Society",
+        "physics.space-ph": "Space Physics",
+        "quant-ph": "Quantum Physics",
+    },
+    "Mathematics": {
+        "math.AC": "Commutative Algebra",
+        "math.AG": "Algebraic Geometry",
+        "math.AP": "Analysis of PDEs",
+        "math.AT": "Algebraic Topology",
+        "math.CA": "Classical Analysis and ODEs",
+        "math.CO": "Combinatorics",
+        "math.CT": "Category Theory",
+        "math.CV": "Complex Variables",
+        "math.DG": "Differential Geometry",
+        "math.DS": "Dynamical Systems",
+        "math.FA": "Functional Analysis",
+        "math.GM": "General Mathematics",
+        "math.GN": "General Topology",
+        "math.GR": "Group Theory",
+        "math.GT": "Geometric Topology",
+        "math.HO": "History and Overview",
+        "math.IT": "Information Theory",
+        "math.KT": "K-Theory and Homology",
+        "math.LO": "Logic",
+        "math.MG": "Metric Geometry",
+        "math.MP": "Mathematical Physics",
+        "math.NA": "Numerical Analysis",
+        "math.NT": "Number Theory",
+        "math.OA": "Operator Algebras",
+        "math.OC": "Optimization and Control",
+        "math.PR": "Probability",
+        "math.QA": "Quantum Algebra",
+        "math.RA": "Rings and Algebras",
+        "math.RT": "Representation Theory",
+        "math.SG": "Symplectic Geometry",
+        "math.SP": "Spectral Theory",
+        "math.ST": "Statistics Theory",
+    },
+    "Biology": {
+        "q-bio.BM": "Biomolecules",
+        "q-bio.CB": "Cell Behavior",
+        "q-bio.GN": "Genomics",
+        "q-bio.MN": "Molecular Networks",
+        "q-bio.NC": "Neurons and Cognition",
+        "q-bio.OT": "Other Quantitative Biology",
+        "q-bio.PE": "Populations and Evolution",
+        "q-bio.QM": "Quantitative Methods",
+        "q-bio.SC": "Subcellular Processes",
+        "q-bio.TO": "Tissues and Organs",
+    },
+    "Statistics": {
+        "stat.AP": "Applications",
+        "stat.CO": "Computation",
+        "stat.ME": "Methodology",
+        "stat.ML": "Machine Learning",
+        "stat.OT": "Other Statistics",
+        "stat.TH": "Theory",
+    },
+    "Economics": {
+        "econ.EM": "Econometrics",
+        "econ.GN": "General Economics",
+        "econ.TH": "Economic Theory",
+    },
+    "Electrical Engineering and Systems Science": {
+        "eess.AS": "Audio and Speech Processing",
+        "eess.IV": "Image and Video Processing",
+        "eess.SP": "Signal Processing",
+        "eess.SY": "Systems and Control",
+    },
+}
+
+# Flatten categories for easy access
+
+ARXIV_CATEGORIES_FLAT: dict[str, str] = {}
+
+for main_cat, subcats in ARXIV_CATEGORIES.items():
+    for cat_code, cat_name in subcats.items():
+        ARXIV_CATEGORIES_FLAT[cat_code] = f"{main_cat}: {cat_name} ({cat_code})"
+
+
+def clean_doi(doi: str) -> str:
+    if doi.startswith("https://arxiv.org/abs/"):
+        return doi.split("/")[-1]
+    elif doi.startswith("https://arxiv.org/pdf/"):
+        return doi.split("/")[-1].split(".pdf")[0]
+    elif doi.startswith("arXiv:"):
+        return doi.split(":")[-1]
+    elif doi.startswith("http"):
+        return "Invalid arXiv link. Please provide a link to the abstract page."
+    elif doi.startswith("10."):
+        # Fetch the arXiv ID from the DOI
+        base_url = "http://dx.doi.org/"
+        headers = {"Accept": "application/x-bibtex"}
+        response = requests.get(base_url + doi, headers=headers)
+
+        if response.status_code != 200:
+            return "No paper found with that DOI."
+
+        bibtext = response.text
+        return bibtext.split("eprint = {arXiv:")[-1].split("}")[0]
+    elif doi.replace("v", "").replace(".", "").isdigit():
+        return doi
+    else:
+        return "Invalid arXiv ID or DOI. Please provide a valid arXiv ID, DOI, or arXiv URL."
+
+
+def retrieve_arxiv_paper(arxiv_id: str) -> dict:
+    """Retrieve the paper from arXiv.
+
+    Args:
+        arxiv_id: The arXiv ID of the paper to retrieve.
+
+    Returns:
+        A dict object representing the paper.
+    """
+    global arxiv_client
+    query_string = arxiv.Search(id_list=[arxiv_id])
+
+    results = arxiv_client.results(query_string)
+    try:
+        paper = next(results)
+    except StopIteration:
+        raise ValueError("No paper found with that arXiv ID.")
+
+    return dict(
+        arxiv_id=paper.entry_id.split("/")[-1],
+        title=paper.title,
+        authors=[author.name for author in paper.authors],
+        categories=[category for category in paper.categories],
+        abstract=paper.summary,
+        published_date=paper.published,
+    )
+
+
+def build_arxiv_category_query(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> arxiv.Search:
+    """Builds a query string for the arXiv API.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Optional datetime to start search from.
+        end_date: Optional datetime to end search at.
+        start: Index of first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        arxiv.Search object with the constructed query.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    if start_date and end_date:
+        date_str = f"{start_date.strftime('%Y%m%d%H%M')}+TO+{end_date.strftime('%Y%m%d%H%M')}"
+    elif start_date:
+        date_str = start_date.strftime("%Y%m%d%H%M")
+        date_str = f"{date_str}+TO+{datetime.now(timezone.utc).strftime('%Y%m%d%H%M')}"
+    else:
+        date_str = ""
+
+    # Construct the category string, including the date range if provided
+    cat_str = " OR ".join([f"cat:{cat}" for cat in categories]) if categories else ""
+    if date_str:
+        cat_str = f"({cat_str}) AND submittedDate:[{date_str}]"
+
+    search = arxiv.Search(
+        query=cat_str,
+        max_results=max_results,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+        sort_order=arxiv.SortOrder.Descending,
+    )
+
+    return search
+
+
+def retrieve_arxiv_papers(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> list[dict]:
+    """Searches arXiv for papers in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Date to start searching from.
+        end_date: Date to stop searching at.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        A list of dict objects.
+    """
+
+    global arxiv_client
+    query_string = build_arxiv_category_query(categories, start_date, end_date, start, max_results)
+
+    papers = []
+    for result in arxiv_client.results(query_string, offset=start):
+        papers.append(
+            dict(
+                arxiv_id=result.entry_id.split("/")[-1],
+                title=result.title,
+                authors=[author.name for author in result.authors],
+                categories=[category for category in result.categories],
+                abstract=result.summary,
+                published_date=result.published,
+            )
+        )
+
+    return papers
+
+
+def fetch_todays_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from today in the given categories
+
+    Args:
+        categories: List of arXiv categories to search
+        start: Index of the first result to return
+        max_results: Maximum number of results to return
+
+    Returns:
+        A list of dict objects
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0),
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def fetch_24_hours_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from the last 24 hours in the given categories
+
+    Args:
+        categories: List of arXiv categories to search
+        start: Index of the first result to return
+        max_results: Maximum number of results to return
+
+    Returns:
+        A list of dict objects
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    twenty_four_hours_ago = datetime.now(timezone.utc) - timedelta(days=1)
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=twenty_four_hours_ago,
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def random_arxiv_category():
+    return random.choice(list(ARXIV_CATEGORIES_FLAT.values()))
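
Note: clean_doi normalizes several input shapes (abstract URLs, PDF URLs, "arXiv:" prefixes, bare IDs, and DOIs resolved through dx.doi.org) down to a bare arXiv ID, which retrieve_arxiv_paper then looks up. A few illustrative calls (the ID below is just an example paper):

    from arxiv_stuff import clean_doi, retrieve_arxiv_paper

    # Each of these normalizes to the bare arXiv ID "1706.03762".
    print(clean_doi("https://arxiv.org/abs/1706.03762"))
    print(clean_doi("arXiv:1706.03762"))
    print(clean_doi("1706.03762"))

    # The cleaned ID can then be fetched from the arXiv API.
    paper = retrieve_arxiv_paper("1706.03762")
    print(paper["title"], paper["categories"])
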
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+accelerate
+arxiv
+bitsandbytes
+datasets
+faiss-cpu
+gradio
+sentence-transformers
+spaces
+torch