nomadicsynth committed
Commit e6a1391 · 1 Parent(s): 2a20f7f

Flatten repo history

Files changed (5)
  1. .gitignore +1 -0
  2. README.md +8 -2
  3. app.py +630 -0
  4. arxiv_stuff.py +372 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1 @@
+.env
README.md CHANGED
@@ -1,14 +1,20 @@
 ---
 title: Research Compass
 emoji: 🌍
-colorFrom: red
+colorFrom: blue
 colorTo: red
+python_version: 3.10
 sdk: gradio
 sdk_version: 5.22.0
 app_file: app.py
 pinned: false
 license: agpl-3.0
 short_description: Connect research papers. Discover new insights.
+datasets:
+- "nomadicsynth/arxiv-dataset-abstract-embeddings"
+models:
+- "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+- "meta-llama/Llama-3.2-3B-Instruct"
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
app.py ADDED
@@ -0,0 +1,630 @@
+import json
+import os
+
+import faiss
+import gradio as gr
+import pandas as pd
+import spaces
+import torch
+from datasets import load_dataset
+from huggingface_hub import InferenceClient, hf_hub_download
+from huggingface_hub import login as hf_hub_login
+from huggingface_hub import upload_file
+from sentence_transformers import SentenceTransformer
+
+from arxiv_stuff import ARXIV_CATEGORIES_FLAT
+
+# Get HF_TOKEN from environment variables
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Login to Hugging Face Hub
+hf_hub_login(token=HF_TOKEN, add_to_git_credential=True)
+
+# Dataset details
+dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
+dataset_revision = "v1.0.0"
+local_index_path = "arxiv_faiss_index.faiss"
+
+# Embedding model details
+embedding_model_name = "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+embedding_model_revision = "2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final"
+
+# Analysis model details
+
+# Settings for Llama-3.3-70B-Instruct
+reasoning_model_id = "meta-llama/Llama-3.3-70B-Instruct"
+max_length = 1024 * 4
+temperature = None
+top_p = None
+presence_penalty = None
+
+# Settings for QwQ-32B
+# reasoning_model_id = "Qwen/QwQ-32B"
+# reasoning_start_tag = "<think>"
+# reasoning_end_tag = "</think>"
+# max_length = 1024 * 4
+# temperature = 0.6
+# top_p = 0.95
+# presence_penalty = 0.1
+
+# Global variables
+dataset = None
+embedding_model = None
+reasoning_model = None
+
+
+def save_faiss_index_to_hub():
+    """Save the FAISS index to the Hub for easy access"""
+    global dataset, local_index_path
+    # 1. Save the index to a local file
+    dataset["train"].save_faiss_index("embedding", local_index_path)
+    print(f"FAISS index saved locally to {local_index_path}")
+
+    # 2. Upload the index file to the Hub
+    remote_path = upload_file(
+        path_or_fileobj=local_index_path,
+        path_in_repo=local_index_path,  # Same name on the Hub
+        repo_id=dataset_name,  # Use your dataset repo
+        token=HF_TOKEN,
+        repo_type="dataset",  # This is a dataset file
+        revision=dataset_revision,  # Use the same revision as the dataset
+        commit_message="Add FAISS index",  # Commit message
+    )
+
+    print(f"FAISS index uploaded to Hub at {remote_path}")
+
+    # Remove the local file. It's now stored on the Hub.
+    os.remove(local_index_path)
+
+
+def setup_dataset():
+    """Load dataset with FAISS index"""
+    global dataset
+    print("Loading dataset from Hugging Face...")
+
+    # Load dataset
+    dataset = load_dataset(
+        dataset_name,
+        revision=dataset_revision,
+    )
+
+    # Try to load the index from the Hub
+    try:
+        print("Downloading pre-built FAISS index...")
+        index_path = hf_hub_download(
+            repo_id=dataset_name,
+            filename="arxiv_faiss_index.faiss",
+            revision=dataset_revision,
+            token=HF_TOKEN,
+            repo_type="dataset",
+        )
+
+        print("Loading pre-built FAISS index...")
+        dataset["train"].load_faiss_index("embedding", index_path)
+        print("Pre-built FAISS index loaded successfully")
+
+    except Exception as e:
+        print(f"Could not load pre-built index: {e}")
+        print("Building new FAISS index...")
+
+        # Add FAISS index if it doesn't exist
+        if not dataset["train"].features.get("embedding"):
+            print("Dataset doesn't have 'embedding' column, cannot create FAISS index")
+            raise ValueError("Dataset doesn't have 'embedding' column")
+
+        dataset["train"].add_faiss_index(
+            column="embedding",
+            metric_type=faiss.METRIC_INNER_PRODUCT,
+            string_factory="HNSW,RFlat",  # Using reranking
+        )
+
+        # Save the FAISS index to the Hub
+        save_faiss_index_to_hub()
+
+    print(f"Dataset loaded with {len(dataset['train'])} items and FAISS index ready")
+
+
+def init_embedding_model(model_name_or_path: str, model_revision: str = None) -> SentenceTransformer:
+    global embedding_model
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    embedding_model = SentenceTransformer(
+        model_name_or_path,
+        revision=model_revision,
+        token=HF_TOKEN,
+        device=device,
+    )
+
+
+def init_reasoning_model(model_name: str) -> InferenceClient:
+    global reasoning_model
+    reasoning_model = InferenceClient(
+        model=model_name,
+        provider="hf-inference",
+        api_key=HF_TOKEN,
+    )
+    return reasoning_model
+
+
+def generate(messages: list[dict[str, str]]) -> str:
+    """
+    Generate a response to a list of messages.
+
+    Args:
+        messages: A list of message dictionaries with a "role" and "content" key.
+
+    Returns:
+        The generated response as a string.
+    """
+    global reasoning_model
+
+    system_message = {
+        "role": "system",
+        "content": "You are an expert in evaluating connections between research papers.",
+    }
+
+    messages.insert(0, system_message)
+
+    response_schema = r"""{
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "title": "Generated schema for Root",
+        "type": "object",
+        "properties": {
+            "reasoning": {"type": "string"},
+            "key_connections": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "connection": {"type": "string"},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["connection", "description"]
+                }
+            },
+            "synergies_and_complementarities": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "type": {"type": "array", "items": {"type": "string"}},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["type", "description"]
+                }
+            },
+            "research_potential": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "potential": {"type": "string"},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["potential", "description"]
+                }
+            },
+            "rating": {"type": "number"},
+            "confidence": {"type": "number"}
+        },
+        "required": [
+            "reasoning",
+            "key_connections",
+            "synergies_and_complementarities",
+            "research_potential",
+            "rating",
+            "confidence"
+        ]
+    }"""
+
+    response_format = {
+        "type": "json",
+        "value": response_schema,
+    }
+
+    result = reasoning_model.chat.completions.create(
+        messages=messages,
+        max_tokens=max_length,
+        temperature=temperature,
+        presence_penalty=presence_penalty,
+        response_format=response_format,
+        top_p=top_p,
+    )
+
+    output = result.choices[0].message.content.strip()
+    return output
+
+
+@spaces.GPU
+def embed_text(text: str | list[str]) -> torch.Tensor:
+    global embedding_model
+
+    # Strip any leading/trailing whitespace
+    text = text.strip() if isinstance(text, str) else [t.strip() for t in text]
+    embeddings = embedding_model.encode(text, normalize_embeddings=True)  # Ensure vectors are normalized
+    return embeddings
+
+
+def analyse_abstracts(query_abstract: str, compare_abstract: dict) -> str:
+    """Analyze the relationship between two abstracts and return formatted analysis"""
+    # Highlight the synergies in these papers that would justify further research
+    messages = [
+        {
+            "role": "user",
+            "content": f"""You are trained in evaluating connections between research papers. Please **identify and analyze the links** between these two papers:
+
+Paper 1 Abstract:
+{query_abstract}
+
+Paper 2 Abstract:
+{compare_abstract["abstract"]}
+
+Consider the following aspects in your evaluation:
+
+* **Methodological Cross-Pollination**: How do the methods or approaches from one paper **directly enhance or inform** the other?
+* **Principle or Mechanism Extension**: Do the papers **share underlying principles or mechanisms** that can be **combined or extended** to yield new insights?
+* **Interdisciplinary Connections**: Are there **clear opportunities** for interdisciplinary collaborations or knowledge transfer between the two papers?
+* **Solution or Application Bridge**: Can the solutions or applications presented in one paper be **directly adapted or integrated** with the other to create **novel, actionable outcomes**?
+
+Consider the connections in either direction, that is, from Paper 1 -> Paper 2, or vice versa, from Paper 2 -> Paper 1.
+
+Return a valid JSON object with this structure:
+{{
+    "reasoning": "Step-by-step analysis of the papers, highlighting **key established connections**, identified synergies, and **concrete complementarities**. Emphasize the most **critical, actionable insights** or **key takeaways** from the analysis using markdown bold.",
+
+    # Main connecting concepts, methods, or principles
+    "key_connections": [
+        {{
+            "connection": "connection 1",
+            "description": "Brief description (1-2 sentences) for the **established connection**, explaining its **direct relevance** to the synergy analysis."
+        }},
+        ...
+    ],
+
+    "synergies_and_complementarities": [
+        {{
+            "type": ["Methodological Cross-Pollination", "Principle or Mechanism Extension", "Interdisciplinary Connections", "Solution or Application Bridge"],  # Choose only one type per entry, and include only the types relevant to this analysis
+            "description": "Brief explanation (1-2 sentences) of the **identified, concrete synergy** or **complementarity**, and a **specific, actionable example** to illustrate the concept."
+        }},
+        ...
+    ],
+
+    # Novel, actionable outcomes or applications emerging from the synergies
+    "research_potential": [
+        {{
+            "potential": "Actionable outcome or application 1",
+            "description": "Brief description (1-2 sentences) of the **concrete potential outcome** or **application**, and a **specific scenario** to illustrate its **direct impact**."
+        }},
+        ...
+    ],
+
+    "rating": 1-5,  # Overall rating of the papers' synergy potential, where:
+                    # 1 = **No synergy or connection** (definitely no link between the papers)
+                    # 2 = **Low potential for synergy** (some vague or speculative connection, but highly uncertain)
+                    # 3 = **Plausible synergy potential** (some potential connections, but requiring further investigation to confirm)
+                    # 4 = **Established synergy with potential for growth** (clear connections with opportunities for further development)
+                    # 5 = **High established synergy with direct, clear opportunities** (strong, concrete links with immediate, actionable outcomes)
+
+    "confidence": 0.0-1.0,  # Confidence in your analysis, as a floating-point value representing the probability of your assessment being accurate
+}}
+
+Return only the JSON object, with double quotes around key names and all string values.""",
+        },
+    ]
+
+    # Generate analysis
+    try:
+        output = generate(messages)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Parse the JSON output
+    try:
+        output = json.loads(output)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Format the output as markdown for better display
+    key_connections = ""
+    synergies_and_complementarities = ""
+    research_potential = ""
+    if "key_connections" in output:
+        for connection in output["key_connections"]:
+            key_connections += f"- {connection['connection']}: {connection['description']}\n"
+
+    if "synergies_and_complementarities" in output:
+        for synergy in output["synergies_and_complementarities"]:
+            synergies_and_complementarities += f"- {', '.join(synergy['type'])}: {synergy['description']}\n"
+
+    if "research_potential" in output:
+        for potential in output["research_potential"]:
+            research_potential += f"- {potential['potential']}: {potential['description']}\n"
+
+    formatted_output = f"""## Synergy Analysis
+
+**Rating**: {'★' * output['rating']}{'☆' * (5 - output['rating'])} **Confidence**: {'★' * round(output['confidence'] * 5)}{'☆' * round((1 - output['confidence']) * 5)}
+
+### Key Connections
+{key_connections}
+
+### Synergies and Complementarities
+{synergies_and_complementarities}
+
+### Research Potential
+{research_potential}
+
+### Reasoning
+{output['reasoning']}
+"""
+    return formatted_output
+    # return '```"""\n' + output + '\n"""```'
+
+
+# arXiv Embedding Dataset Details
+# DatasetDict({
+#     train: Dataset({
+#         features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'update_date', 'embedding', 'timestamp', 'embedding_model'],
+#         num_rows: 2689088
+#     })
+# })
+
+
+def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
+    """Find papers synergistic with the given abstract using FAISS with cosine similarity"""
+    global dataset
+
+    # Generate embedding for the query abstract (normalized for cosine similarity)
+    abstract_embedding = embed_text(abstract)
+
+    # Search for similar papers using FAISS with inner product (cosine similarity for normalized vectors)
+    scores, examples = dataset["train"].get_nearest_examples("embedding", abstract_embedding, k=limit)
+
+    papers = []
+    for i in range(len(scores)):
+        # With cosine similarity, higher scores are better (closer to 1)
+        paper_dict = {
+            "id": examples["id"][i],
+            "title": examples["title"][i],
+            "authors": examples["authors"][i],
+            "categories": examples["categories"][i],
+            "abstract": examples["abstract"][i],
+            "update_date": examples["update_date"][i],
+            "synergy_score": float(scores[i]),  # Convert to float for serialization
+        }
+        papers.append(paper_dict)
+
+    return papers
+
+
+def format_search_results(abstract: str) -> tuple[pd.DataFrame, list[dict]]:
+    """Format search results as a DataFrame for display"""
+    # Find papers synergistic with the given abstract
+    papers = find_synergistic_papers(abstract)
+
+    # Convert to DataFrame for display
+    df = pd.DataFrame(
+        [
+            {
+                "Title": p["title"],
+                "Authors": p["authors"][:50] + "..." if len(p["authors"]) > 50 else p["authors"],
+                "Categories": p["categories"],
+                "Date": p["update_date"],
+                "Match Score": f"{int(p['synergy_score'] * 100)}%",
+                "ID": p["id"],  # Hidden column for reference
+            }
+            for p in papers
+        ]
+    )
+
+    return df, papers  # Return both DataFrame and original data
+
+
+def format_paper_as_markdown(paper: dict) -> str:
+    # Convert category codes to full names, handling unknown categories
+    subjects = []
+    for subject in paper["categories"].split():
+        if subject in ARXIV_CATEGORIES_FLAT:
+            subjects.append(ARXIV_CATEGORIES_FLAT[subject])
+        else:
+            subjects.append(f"Unknown Category ({subject})")
+
+    paper["title"] = paper["title"].replace("\n", " ").strip()
+    paper["authors"] = paper["authors"].replace("\n", " ").strip()
+
+    return f"""# {paper["title"]}
+### {paper["authors"]}
+#### {', '.join(subjects)} | {paper["update_date"]} | **Score**: {int(paper['synergy_score'] * 100)}%
+**[arxiv:{paper["id"]}](https://arxiv.org/abs/{paper["id"]})** - [PDF](https://arxiv.org/pdf/{paper["id"]})<br>
+
+{paper["abstract"]}
+"""
+
+
+latex_delimiters = [
+    {"left": "$$", "right": "$$", "display": True},
+    # {"left": "$", "right": "$", "display": False},
+    # {"left": "\\(", "right": "\\)", "display": False},
+    # {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
+    # {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
+    # {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
+    # {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
+    # {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
+    # {"left": "\\[", "right": "\\]", "display": True},
+    # {"left": "\\underline{", "right": "}", "display": False},
+    # {"left": "\\textit{", "right": "}", "display": False},
+    # {"left": "{", "right": "}", "display": False},
+]
+
+
+def create_interface():
+    with gr.Blocks(
+        css="""
+        .cell-menu-button {
+            display: none;
+        }"""
+    ) as demo:
+        gr.HTML(
+            """
+            <div style="text-align: center; margin-bottom: 1rem">
+                <h1>Research Compass</h1>
+                <p>Find synergistic papers to enrich your research</p>
+                <p>An experiment in AI-driven research synergy analysis</p>
+            </div>
+            """
+        )
+
+        with gr.Accordion(label="Instructions", open=False):
+            gr.Markdown(
+                """
+                1. **Enter Abstract**: Paste an abstract or describe your research details in the text box.
+                2. **Search for Synergistic Papers**: Click the button to find papers with similar themes.
+                3. **Select a Paper**: Click on a row in the results table to view paper details.
+                4. **Analyze Connection Potential**: Click the button to analyze the synergy potential between the papers.
+                5. **Synergy Analysis**: View the detailed analysis of the connection potential between the papers.
+                """
+            )
+
+        abstract_input = gr.Textbox(
+            label="Paper Abstract or Description",
+            placeholder="Paste an abstract or describe research details...",
+            lines=8,
+            key="abstract",
+        )
+        search_btn = gr.Button("Search for Synergistic Papers", variant="primary")
+
+        # Store full paper data
+        paper_data_state = gr.State([])
+
+        # Store query abstract
+        query_abstract_state = gr.State("")
+
+        # Store selected paper
+        selected_paper_state = gr.State(None)
+
+        # Use Dataframe for results
+        results_df = gr.Dataframe(
+            headers=["Title", "Authors", "Categories", "Date", "Match Score"],
+            datatype=["markdown", "markdown", "str", "date", "str"],
+            latex_delimiters=latex_delimiters,
+            label="Synergistic Papers",
+            interactive=False,
+            wrap=False,
+            line_breaks=False,
+            column_widths=["40%", "20%", "20%", "10%", "10%", "0%"],  # Hide the trailing ID column
+            key="results",
+        )
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                paper_details_output = gr.Markdown(
+                    value="# Paper Details",
+                    label="Paper Details",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="paper_details",
+                )
+                analyze_btn = gr.Button("Analyze Connection Potential", variant="primary", interactive=False)
+            with gr.Column(scale=1):
+                # Analysis output
+                analysis_output = gr.Markdown(
+                    value="# Synergy Analysis",
+                    label="Synergy Analysis",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="analysis_output",
+                )
+
+        # Display paper details when a row is selected
+        def on_select(evt: gr.SelectData, papers, query):
+            selected_index = evt.index[0]  # Get the row index
+            selected = papers[selected_index]
+
+            # Format paper details
+            details_md = format_paper_as_markdown(selected)
+
+            return details_md, selected
+
+        # Connect search button to the search function
+        search_btn.click(
+            format_search_results,
+            inputs=[abstract_input],
+            outputs=[results_df, paper_data_state],
+        ).then(
+            lambda x: x,  # Identity function to pass through the abstract
+            inputs=[abstract_input],
+            outputs=[query_abstract_state],
+        ).then(
+            lambda: None,  # Reset selected paper
+            outputs=[selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=False),  # Disable analyze button until a paper is selected
+            outputs=[analyze_btn],
+        ).then(
+            lambda: "# Synergy Analysis",  # Clear previous analysis
+            outputs=[analysis_output],
+        )
+
+        # Use built-in select event from Dataframe
+        results_df.select(
+            on_select,
+            inputs=[paper_data_state, query_abstract_state],
+            outputs=[paper_details_output, selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=True),  # Enable analyze button when a paper is selected
+            outputs=[analyze_btn],
+        )
+
+        # Connect analyze button to run analysis
+        analyze_btn.click(
+            analyse_abstracts,
+            inputs=[query_abstract_state, selected_paper_state],
+            outputs=[analysis_output],
+            show_progress_on=[paper_details_output, analysis_output],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    # Load dataset with FAISS index
+    setup_dataset()
+
+    # Initialize the embedding model
+    init_embedding_model(embedding_model_name, embedding_model_revision)
+
+    # Initialize the reasoning model
+    reasoning_model = init_reasoning_model(reasoning_model_id)
+
+    demo = create_interface()
+    demo.queue().launch(ssr_mode=False)
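For reference, a minimal sketch of the retrieval flow app.py implements, runnable outside the Space. It uses the same dataset, index file, and embedding model repos as above; the query string is made up for illustration, and a valid `HF_TOKEN` is assumed.

```python
import os

from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

token = os.getenv("HF_TOKEN")

# Load the dataset and attach the pre-built FAISS index instead of rebuilding it
ds = load_dataset("nomadicsynth/arxiv-dataset-abstract-embeddings", revision="v1.0.0", token=token)
index_path = hf_hub_download(
    repo_id="nomadicsynth/arxiv-dataset-abstract-embeddings",
    filename="arxiv_faiss_index.faiss",
    revision="v1.0.0",
    repo_type="dataset",
    token=token,
)
ds["train"].load_faiss_index("embedding", index_path)

# Embed a query abstract with the same model the Space uses
model = SentenceTransformer("nomadicsynth/research-compass-arxiv-abstracts-embedding-model", token=token)
query = model.encode("Sparse attention for long-context transformers", normalize_embeddings=True)

# Inner-product search over normalized vectors is cosine similarity
scores, examples = ds["train"].get_nearest_examples("embedding", query, k=5)
for score, title in zip(scores, examples["title"]):
    print(f"{score:.3f}  {title}")
```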
arxiv_stuff.py ADDED
@@ -0,0 +1,372 @@
+import random
+from datetime import datetime, timedelta, timezone
+from typing import Optional, Union
+
+import arxiv
+import requests
+
+# Initialize the arXiv API client
+arxiv_client = arxiv.Client()
+
+ARXIV_CATEGORIES = {
+    "Computer Science": {
+        "cs.AI": "Artificial Intelligence",
+        "cs.AR": "Hardware Architecture",
+        "cs.CC": "Computational Complexity",
+        "cs.CE": "Computational Engineering",
+        "cs.CG": "Computational Geometry",
+        "cs.CL": "Computation and Language",
+        "cs.CR": "Cryptography and Security",
+        "cs.CV": "Computer Vision and Pattern Recognition",
+        "cs.CY": "Computers and Society",
+        "cs.DB": "Databases",
+        "cs.DC": "Distributed Computing",
+        "cs.DL": "Digital Libraries",
+        "cs.DM": "Discrete Mathematics",
+        "cs.DS": "Data Structures and Algorithms",
+        "cs.ET": "Emerging Technologies",
+        "cs.FL": "Formal Languages and Automata Theory",
+        "cs.GL": "General Literature",
+        "cs.GR": "Graphics",
+        "cs.GT": "Computer Science and Game Theory",
+        "cs.HC": "Human-Computer Interaction",
+        "cs.IR": "Information Retrieval",
+        "cs.IT": "Information Theory",
+        "cs.LG": "Machine Learning",
+        "cs.LO": "Logic in Computer Science",
+        "cs.MA": "Multiagent Systems",
+        "cs.MM": "Multimedia",
+        "cs.MS": "Mathematical Software",
+        "cs.NA": "Numerical Analysis",
+        "cs.NE": "Neural and Evolutionary Computing",
+        "cs.NI": "Networking and Internet Architecture",
+        "cs.OH": "Other Computer Science",
+        "cs.OS": "Operating Systems",
+        "cs.PF": "Performance",
+        "cs.PL": "Programming Languages",
+        "cs.RO": "Robotics",
+        "cs.SC": "Symbolic Computation",
+        "cs.SD": "Sound",
+        "cs.SE": "Software Engineering",
+        "cs.SI": "Social and Information Networks",
+        "cs.SY": "Systems and Control",
+    },
+    "Physics": {
+        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+        "astro-ph.EP": "Earth and Planetary Astrophysics",
+        "astro-ph.GA": "Astrophysics of Galaxies",
+        "astro-ph.HE": "High Energy Astrophysical Phenomena",
+        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+        "astro-ph.SR": "Solar and Stellar Astrophysics",
+        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+        "cond-mat.mtrl-sci": "Materials Science",
+        "cond-mat.other": "Other Condensed Matter",
+        "cond-mat.quant-gas": "Quantum Gases",
+        "cond-mat.soft": "Soft Condensed Matter",
+        "cond-mat.stat-mech": "Statistical Mechanics",
+        "cond-mat.str-el": "Strongly Correlated Electrons",
+        "cond-mat.supr-con": "Superconductivity",
+        "gr-qc": "General Relativity and Quantum Cosmology",
+        "hep-ex": "High Energy Physics - Experiment",
+        "hep-lat": "High Energy Physics - Lattice",
+        "hep-ph": "High Energy Physics - Phenomenology",
+        "hep-th": "High Energy Physics - Theory",
+        "math-ph": "Mathematical Physics",
+        "nlin.AO": "Adaptation and Self-Organizing Systems",
+        "nlin.CD": "Chaotic Dynamics",
+        "nlin.CG": "Cellular Automata and Lattice Gases",
+        "nlin.PS": "Pattern Formation and Solitons",
+        "nlin.SI": "Exactly Solvable and Integrable Systems",
+        "nucl-ex": "Nuclear Experiment",
+        "nucl-th": "Nuclear Theory",
+        "physics.acc-ph": "Accelerator Physics",
+        "physics.ao-ph": "Atmospheric and Oceanic Physics",
+        "physics.app-ph": "Applied Physics",
+        "physics.atm-clus": "Atomic and Molecular Clusters",
+        "physics.atom-ph": "Atomic Physics",
+        "physics.bio-ph": "Biological Physics",
+        "physics.chem-ph": "Chemical Physics",
+        "physics.class-ph": "Classical Physics",
+        "physics.comp-ph": "Computational Physics",
+        "physics.data-an": "Data Analysis, Statistics and Probability",
+        "physics.ed-ph": "Physics Education",
+        "physics.flu-dyn": "Fluid Dynamics",
+        "physics.gen-ph": "General Physics",
+        "physics.geo-ph": "Geophysics",
+        "physics.hist-ph": "History and Philosophy of Physics",
+        "physics.ins-det": "Instrumentation and Detectors",
+        "physics.med-ph": "Medical Physics",
+        "physics.optics": "Optics",
+        "physics.plasm-ph": "Plasma Physics",
+        "physics.pop-ph": "Popular Physics",
+        "physics.soc-ph": "Physics and Society",
+        "physics.space-ph": "Space Physics",
+        "quant-ph": "Quantum Physics",
+    },
+    "Mathematics": {
+        "math.AC": "Commutative Algebra",
+        "math.AG": "Algebraic Geometry",
+        "math.AP": "Analysis of PDEs",
+        "math.AT": "Algebraic Topology",
+        "math.CA": "Classical Analysis and ODEs",
+        "math.CO": "Combinatorics",
+        "math.CT": "Category Theory",
+        "math.CV": "Complex Variables",
+        "math.DG": "Differential Geometry",
+        "math.DS": "Dynamical Systems",
+        "math.FA": "Functional Analysis",
+        "math.GM": "General Mathematics",
+        "math.GN": "General Topology",
+        "math.GR": "Group Theory",
+        "math.GT": "Geometric Topology",
+        "math.HO": "History and Overview",
+        "math.IT": "Information Theory",
+        "math.KT": "K-Theory and Homology",
+        "math.LO": "Logic",
+        "math.MG": "Metric Geometry",
+        "math.MP": "Mathematical Physics",
+        "math.NA": "Numerical Analysis",
+        "math.NT": "Number Theory",
+        "math.OA": "Operator Algebras",
+        "math.OC": "Optimization and Control",
+        "math.PR": "Probability",
+        "math.QA": "Quantum Algebra",
+        "math.RA": "Rings and Algebras",
+        "math.RT": "Representation Theory",
+        "math.SG": "Symplectic Geometry",
+        "math.SP": "Spectral Theory",
+        "math.ST": "Statistics Theory",
+    },
+    "Biology": {
+        "q-bio.BM": "Biomolecules",
+        "q-bio.CB": "Cell Behavior",
+        "q-bio.GN": "Genomics",
+        "q-bio.MN": "Molecular Networks",
+        "q-bio.NC": "Neurons and Cognition",
+        "q-bio.OT": "Other Quantitative Biology",
+        "q-bio.PE": "Populations and Evolution",
+        "q-bio.QM": "Quantitative Methods",
+        "q-bio.SC": "Subcellular Processes",
+        "q-bio.TO": "Tissues and Organs",
+    },
+    "Statistics": {
+        "stat.AP": "Applications",
+        "stat.CO": "Computation",
+        "stat.ME": "Methodology",
+        "stat.ML": "Machine Learning",
+        "stat.OT": "Other Statistics",
+        "stat.TH": "Theory",
+    },
+    "Economics": {
+        "econ.EM": "Econometrics",
+        "econ.GN": "General Economics",
+        "econ.TH": "Economic Theory",
+    },
+    "Electrical Engineering and Systems Science": {
+        "eess.AS": "Audio and Speech Processing",
+        "eess.IV": "Image and Video Processing",
+        "eess.SP": "Signal Processing",
+        "eess.SY": "Systems and Control",
+    },
+}
+
+# Flatten categories for easy access
+ARXIV_CATEGORIES_FLAT: dict[str, str] = {}
+
+for main_cat, subcats in ARXIV_CATEGORIES.items():
+    for cat_code, cat_name in subcats.items():
+        ARXIV_CATEGORIES_FLAT[cat_code] = f"{main_cat}: {cat_name} ({cat_code})"
+
+
+def clean_doi(doi: str) -> str:
+    """Normalize an arXiv URL, arXiv ID, or DOI to a bare arXiv ID, or return an error message."""
+    if doi.startswith("https://arxiv.org/abs/"):
+        return doi.split("/")[-1]
+    elif doi.startswith("https://arxiv.org/pdf/"):
+        return doi.split("/")[-1].split(".pdf")[0]
+    elif doi.startswith("arXiv:"):
+        return doi.split(":")[-1]
+    elif doi.startswith("http"):
+        return "Invalid arXiv link. Please provide a link to the abstract page."
+    elif doi.startswith("10."):
+        # Fetch the arXiv ID from the DOI
+        base_url = "http://dx.doi.org/"
+        headers = {"Accept": "application/x-bibtex"}
+        response = requests.get(base_url + doi, headers=headers)
+
+        if response.status_code != 200:
+            return "No paper found with that DOI."
+
+        bibtext = response.text
+        return bibtext.split("eprint = {arXiv:")[-1].split("}")[0]
+    elif doi.replace("v", "").replace(".", "").isdigit():
+        return doi
+    else:
+        return "Invalid arXiv ID or DOI. Please provide a valid arXiv ID, DOI, or arXiv URL."
+
+
+def retrieve_arxiv_paper(arxiv_id: str) -> dict:
+    """Retrieve the paper from arXiv.
+
+    Args:
+        arxiv_id: The arXiv ID of the paper to retrieve.
+
+    Returns:
+        A dict object representing the paper.
+    """
+    global arxiv_client
+    query_string = arxiv.Search(id_list=[arxiv_id])
+
+    results = arxiv_client.results(query_string)
+    try:
+        paper = next(results)
+    except StopIteration:
+        raise ValueError("No paper found with that arXiv ID.")
+
+    return dict(
+        arxiv_id=paper.entry_id.split("/")[-1],
+        title=paper.title,
+        authors=[author.name for author in paper.authors],
+        categories=list(paper.categories),
+        abstract=paper.summary,
+        published_date=paper.published,
+    )
+
+
+def build_arxiv_category_query(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> arxiv.Search:
+    """Builds an arXiv API search over the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Optional datetime to start the search from.
+        end_date: Optional datetime to end the search at.
+        start: Index of the first result to return (applied later via the client's offset, not in the query).
+        max_results: Maximum number of results to return.
+
+    Returns:
+        arxiv.Search object with the constructed query.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    if start_date and end_date:
+        date_str = f"{start_date.strftime('%Y%m%d%H%M')}+TO+{end_date.strftime('%Y%m%d%H%M')}"
+    elif start_date:
+        date_str = start_date.strftime("%Y%m%d%H%M")
+        date_str = f"{date_str}+TO+{datetime.now(timezone.utc).strftime('%Y%m%d%H%M')}"
+    else:
+        date_str = ""
+
+    # Construct the category string, including the date range if provided
+    cat_str = " OR ".join([f"cat:{cat}" for cat in categories]) if categories else ""
+    if date_str:
+        cat_str = f"({cat_str}) AND submittedDate:[{date_str}]"
+
+    search = arxiv.Search(
+        query=cat_str,
+        max_results=max_results,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+        sort_order=arxiv.SortOrder.Descending,
+    )
+
+    return search
+
+
+def retrieve_arxiv_papers(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> list[dict]:
+    """Searches arXiv for papers in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Date to start searching from.
+        end_date: Date to stop searching at.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+    
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    global arxiv_client
+    query_string = build_arxiv_category_query(categories, start_date, end_date, start, max_results)
+
+    papers = []
+    for result in arxiv_client.results(query_string, offset=start):
+        papers.append(
+            dict(
+                arxiv_id=result.entry_id.split("/")[-1],
+                title=result.title,
+                authors=[author.name for author in result.authors],
+                categories=list(result.categories),
+                abstract=result.summary,
+                published_date=result.published,
+            )
+        )
+
+    return papers
+
+
+def fetch_todays_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from today in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0),
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def fetch_24_hours_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from the last 24 hours in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    twenty_four_hours_ago = datetime.now(timezone.utc) - timedelta(days=1)
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=twenty_four_hours_ago,
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def random_arxiv_category():
+    return random.choice(list(ARXIV_CATEGORIES_FLAT.values()))
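A short usage sketch for these helpers, assuming arxiv_stuff.py is importable; the arXiv ID is just an illustrative example.

```python
from arxiv_stuff import clean_doi, fetch_24_hours_papers, retrieve_arxiv_paper

# Normalize an abstract-page URL to a bare arXiv ID, then fetch the paper
paper_id = clean_doi("https://arxiv.org/abs/1706.03762")
paper = retrieve_arxiv_paper(paper_id)
print(paper["title"])

# Grab a handful of papers posted in the last 24 hours
for p in fetch_24_hours_papers(["cs.LG", "cs.CL"], max_results=3):
    print(p["published_date"], p["title"])
```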
requirements.txt ADDED
@@ -0,0 +1,9 @@
+accelerate
+arxiv
+bitsandbytes
+datasets
+faiss-cpu
+gradio
+sentence-transformers
+spaces
+torch