nomadicsynth committed
Commit e6a1391 · 1 Parent(s): 2a20f7f

Flatten repo history

Files changed (5)
  1. .gitignore +1 -0
  2. README.md +8 -2
  3. app.py +630 -0
  4. arxiv_stuff.py +372 -0
  5. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1 @@
+.env
README.md CHANGED
@@ -1,14 +1,20 @@
 ---
 title: Research Compass
 emoji: 🌍
-colorFrom: red
+colorFrom: blue
 colorTo: red
+python_version: 3.10
 sdk: gradio
 sdk_version: 5.22.0
 app_file: app.py
 pinned: false
 license: agpl-3.0
 short_description: Connect research papers. Discover new insights.
+datasets:
+- "nomadicsynth/arxiv-dataset-abstract-embeddings"
+models:
+- "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+- "meta-llama/Llama-3.2-3B-Instruct"
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>
app.py ADDED
@@ -0,0 +1,630 @@
+import json
+import os
+
+import faiss
+import gradio as gr
+import pandas as pd
+import spaces
+import torch
+from datasets import load_dataset
+from huggingface_hub import InferenceClient, hf_hub_download
+from huggingface_hub import login as hf_hub_login
+from huggingface_hub import upload_file
+from sentence_transformers import SentenceTransformer
+
+from arxiv_stuff import ARXIV_CATEGORIES_FLAT
+
+# Get HF_TOKEN from environment variables
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Login to Hugging Face Hub
+hf_hub_login(token=HF_TOKEN, add_to_git_credential=True)
+
+# Dataset details
+dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
+dataset_revision = "v1.0.0"
+local_index_path = "arxiv_faiss_index.faiss"
+
+# Embedding model details
+embedding_model_name = "nomadicsynth/research-compass-arxiv-abstracts-embedding-model"
+embedding_model_revision = "2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final"
+
+# Analysis model details
+
+# Settings for Llama-3.3-70B-Instruct
+reasoning_model_id = "meta-llama/Llama-3.3-70B-Instruct"
+max_length = 1024 * 4
+temperature = None
+top_p = None
+presence_penalty = None
+
+# Settings for QwQ-32B
+# reasoning_model_id = "Qwen/QwQ-32B"
+# reasoning_start_tag = "<think>"
+# reasoning_end_tag = "</think>"
+# max_length = 1024 * 4
+# temperature = 0.6
+# top_p = 0.95
+# presence_penalty = 0.1
+
+# Global variables
+dataset = None
+embedding_model = None
+reasoning_model = None
+
+
+def save_faiss_index_to_hub():
+    """Save the FAISS index to the Hub for easy access"""
+    global dataset, local_index_path
+    # 1. Save the index to a local file
+    dataset["train"].save_faiss_index("embedding", local_index_path)
+    print(f"FAISS index saved locally to {local_index_path}")
+
+    # 2. Upload the index file to the Hub
+    remote_path = upload_file(
+        path_or_fileobj=local_index_path,
+        path_in_repo=local_index_path,  # Same name on the Hub
+        repo_id=dataset_name,  # Use your dataset repo
+        token=HF_TOKEN,
+        repo_type="dataset",  # This is a dataset file
+        revision=dataset_revision,  # Use the same revision as the dataset
+        commit_message="Add FAISS index",  # Commit message
+    )
+
+    print(f"FAISS index uploaded to Hub at {remote_path}")
+
+    # Remove the local file. It's now stored on the Hub.
+    os.remove(local_index_path)
+
+
+def setup_dataset():
+    """Load dataset with FAISS index"""
+    global dataset
+    print("Loading dataset from Hugging Face...")
+
+    # Load dataset
+    dataset = load_dataset(
+        dataset_name,
+        revision=dataset_revision,
+    )
+
+    # Try to load the index from the Hub
+    try:
+        print("Downloading pre-built FAISS index...")
+        index_path = hf_hub_download(
+            repo_id=dataset_name,
+            filename="arxiv_faiss_index.faiss",
+            revision=dataset_revision,
+            token=HF_TOKEN,
+            repo_type="dataset",
+        )
+
+        print("Loading pre-built FAISS index...")
+        dataset["train"].load_faiss_index("embedding", index_path)
+        print("Pre-built FAISS index loaded successfully")
+
+    except Exception as e:
+        print(f"Could not load pre-built index: {e}")
+        print("Building new FAISS index...")
+
+        # Add FAISS index if it doesn't exist
+        if not dataset["train"].features.get("embedding"):
+            print("Dataset doesn't have 'embedding' column, cannot create FAISS index")
+            raise ValueError("Dataset doesn't have 'embedding' column")
+
+        dataset["train"].add_faiss_index(
+            column="embedding",
+            metric_type=faiss.METRIC_INNER_PRODUCT,
+            string_factory="HNSW,RFlat",  # Using reranking
+        )
+
+        # Save the FAISS index to the Hub
+        save_faiss_index_to_hub()
+
+    print(f"Dataset loaded with {len(dataset['train'])} items and FAISS index ready")
+
+
+def init_embedding_model(model_name_or_path: str, model_revision: str = None) -> SentenceTransformer:
+    global embedding_model
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    embedding_model = SentenceTransformer(
+        model_name_or_path,
+        revision=model_revision,
+        token=HF_TOKEN,
+        device=device,
+    )
+
+
+def init_reasoning_model(model_name: str) -> InferenceClient:
+    global reasoning_model
+    reasoning_model = InferenceClient(
+        model=model_name,
+        provider="hf-inference",
+        api_key=HF_TOKEN,
+    )
+    return reasoning_model
+
+
+def generate(messages: list[dict[str, str]]) -> str:
+    """
+    Generate a response to a list of messages.
+
+    Args:
+        messages: A list of message dictionaries with a "role" and "content" key.
+
+    Returns:
+        The generated response as a string.
+    """
+    global reasoning_model
+
+    system_message = {
+        "role": "system",
+        "content": "You are an expert in evaluating connections between research papers.",
+    }
+
+    messages.insert(0, system_message)
+
+    response_schema = r"""{
+        "$schema": "http://json-schema.org/draft-07/schema#",
+        "title": "Generated schema for Root",
+        "type": "object",
+        "properties": {
+            "reasoning": {"type": "string"},
+            "key_connections": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "connection": {"type": "string"},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["connection", "description"]
+                }
+            },
+            "synergies_and_complementarities": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "type": {"type": "array", "items": {"type": "string"}},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["type", "description"]
+                }
+            },
+            "research_potential": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "potential": {"type": "string"},
+                        "description": {"type": "string"}
+                    },
+                    "required": ["potential", "description"]
+                }
+            },
+            "rating": {"type": "number"},
+            "confidence": {"type": "number"}
+        },
+        "required": [
+            "reasoning",
+            "key_connections",
+            "synergies_and_complementarities",
+            "research_potential",
+            "rating",
+            "confidence"
+        ]
+    }"""
+
+    response_format = {
+        "type": "json",
+        "value": response_schema,
+    }
+
+    result = reasoning_model.chat.completions.create(
+        messages=messages,
+        max_tokens=max_length,
+        temperature=temperature,
+        presence_penalty=presence_penalty,
+        response_format=response_format,
+        top_p=top_p,
+    )
+
+    output = result.choices[0].message.content.strip()
+    return output
+
+
+@spaces.GPU
+def embed_text(text: str | list[str]) -> torch.Tensor:
+    global embedding_model
+
+    # Strip any leading/trailing whitespace
+    text = text.strip() if isinstance(text, str) else [t.strip() for t in text]
+    embeddings = embedding_model.encode(text, normalize_embeddings=True)  # Ensure vectors are normalized
+    return embeddings
+
+
+def analyse_abstracts(query_abstract: str, compare_abstract: dict) -> str:
+    """Analyze the relationship between two abstracts and return formatted analysis"""
+    # Highlight the synergies in these papers that would justify further research
+    messages = [
+        {
+            "role": "user",
+            "content": f"""You are trained in evaluating connections between research papers. Please **identify and analyze the links** between these two papers:
+
+Paper 1 Abstract:
+{query_abstract}
+
+Paper 2 Abstract:
+{compare_abstract["abstract"]}
+
+Consider the following aspects in your evaluation:
+
+* **Methodological Cross-Pollination**: How do the methods or approaches from one paper **directly enhance or inform** the other?
+* **Principle or Mechanism Extension**: Do the papers **share underlying principles or mechanisms** that can be **combined or extended** to yield new insights?
+* **Interdisciplinary Connections**: Are there **clear opportunities** for interdisciplinary collaborations or knowledge transfer between the two papers?
+* **Solution or Application Bridge**: Can the solutions or applications presented in one paper be **directly adapted or integrated** with the other to create **novel, actionable outcomes**?
+
+Consider the connections in either direction, that is, from Paper 1 -> Paper 2, or vice versa, from Paper 2 -> Paper 1.
+
+Return a valid JSON object with this structure:
+{{
+    "reasoning": "Step-by-step analysis of the papers, highlighting **key established connections**, identified synergies, and **concrete complementarities**. Emphasize the most **critical, actionable insights** or **key takeaways** from the analysis using markdown bold.",
+
+    # Main connecting concepts, methods, or principles
+    "key_connections": [
+        {{
+            "connection": "connection 1",
+            "description": "Brief description (1-2 sentences) for the **established connection**, explaining its **direct relevance** to the synergy analysis."
+        }},
+        ...
+    ],
+
+    "synergies_and_complementarities": [
+        {{
+            "type": ["Methodological Cross-Pollination", "Principle or Mechanism Extension", "Interdisciplinary Connections", "Solution or Application Bridge"],  # Choose only one type per entry, and include only the types relevant to this analysis
+            "description": "Brief explanation (1-2 sentences) of the **identified, concrete synergy** or **complementarity**, and a **specific, actionable example** to illustrate the concept."
+        }},
+        ...
+    ],
+
+    # Novel, actionable outcomes or applications emerging from the synergies
+    "research_potential": [
+        {{
+            "potential": "Actionable outcome or application 1",
+            "description": "Brief description (1-2 sentences) of the **concrete potential outcome** or **application**, and a **specific scenario** to illustrate its **direct impact**."
+        }},
+        ...
+    ],
+
+    "rating": 1-5,  # Overall rating of the papers' synergy potential, where:
+                    # 1 = **No synergy or connection** (definitely no link between the papers)
+                    # 2 = **Low potential for synergy** (some vague or speculative connection, but highly uncertain)
+                    # 3 = **Plausible synergy potential** (some potential connections, but requiring further investigation to confirm)
+                    # 4 = **Established synergy with potential for growth** (clear connections with opportunities for further development)
+                    # 5 = **High established synergy with direct, clear opportunities** (strong, concrete links with immediate, actionable outcomes)
+
+    "confidence": 0.0-1.0,  # Confidence in your analysis, as a floating-point value representing the probability of your assessment being accurate
+}}
+
+Return only the JSON object, with double quotes around key names and all string values.""",
+        },
+    ]
+
+    # Generate analysis
+    try:
+        output = generate(messages)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Parse the JSON output
+    try:
+        output = json.loads(output)
+    except Exception as e:
+        return f"Error: {e}"
+
+    # Format the output as markdown for better display
+    key_connections = ""
+    synergies_and_complementarities = ""
+    research_potential = ""
+    if "key_connections" in output:
+        for connection in output["key_connections"]:
+            key_connections += f"- {connection['connection']}: {connection['description']}\n"
+
+    if "synergies_and_complementarities" in output:
+        for synergy in output["synergies_and_complementarities"]:
+            synergies_and_complementarities += f"- {', '.join(synergy['type'])}: {synergy['description']}\n"
+
+    if "research_potential" in output:
+        for potential in output["research_potential"]:
+            research_potential += f"- {potential['potential']}: {potential['description']}\n"
+
+    formatted_output = f"""## Synergy Analysis
+
+**Rating**: {'★' * output['rating']}{'☆' * (5 - output['rating'])} **Confidence**: {'★' * round(output['confidence'] * 5)}{'☆' * round((1 - output['confidence']) * 5)}
+
+### Key Connections
+{key_connections}
+
+### Synergies and Complementarities
+{synergies_and_complementarities}
+
+### Research Potential
+{research_potential}
+
+### Reasoning
+{output['reasoning']}
+"""
+    return formatted_output
+    # return '```"""\n' + output + '\n"""```'
+
+
+# arXiv Embedding Dataset Details
+# DatasetDict({
+#     train: Dataset({
+#         features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'update_date', 'embedding', 'timestamp', 'embedding_model'],
+#         num_rows: 2689088
+#     })
+# })
+
+
+def find_synergistic_papers(abstract: str, limit=25) -> list[dict]:
+    """Find papers synergistic with the given abstract using FAISS with cosine similarity"""
+    global dataset
+
+    # Generate embedding for the query abstract (normalized for cosine similarity)
+    abstract_embedding = embed_text(abstract)
+
+    # Search for similar papers using FAISS with inner product (cosine similarity for normalized vectors)
+    scores, examples = dataset["train"].get_nearest_examples("embedding", abstract_embedding, k=limit)
+
+    papers = []
+    for i in range(len(scores)):
+        # With cosine similarity, higher scores are better (closer to 1)
+        paper_dict = {
+            "id": examples["id"][i],
+            "title": examples["title"][i],
+            "authors": examples["authors"][i],
+            "categories": examples["categories"][i],
+            "abstract": examples["abstract"][i],
+            "update_date": examples["update_date"][i],
+            "synergy_score": float(scores[i]),  # Convert to float for serialization
+        }
+        papers.append(paper_dict)
+
+    return papers
+
+
+def format_search_results(abstract: str) -> tuple[pd.DataFrame, list[dict]]:
+    """Format search results as a DataFrame for display"""
+    # Find papers synergistic with the given abstract
+    papers = find_synergistic_papers(abstract)
+
+    # Convert to DataFrame for display
+    df = pd.DataFrame(
+        [
+            {
+                "Title": p["title"],
+                "Authors": p["authors"][:50] + "..." if len(p["authors"]) > 50 else p["authors"],
+                "Categories": p["categories"],
+                "Date": p["update_date"],
+                "Match Score": f"{int(p['synergy_score'] * 100)}%",
+                "ID": p["id"],  # Hidden column for reference
+            }
+            for p in papers
+        ]
+    )
+
+    return df, papers  # Return both DataFrame and original data
+
+
+def format_paper_as_markdown(paper: dict) -> str:
+    # Convert category codes to full names, handling unknown categories
+    subjects = []
+    for subject in paper["categories"].split():
+        if subject in ARXIV_CATEGORIES_FLAT:
+            subjects.append(ARXIV_CATEGORIES_FLAT[subject])
+        else:
+            subjects.append(f"Unknown Category ({subject})")
+
+    paper["title"] = paper["title"].replace("\n", " ").strip()
+    paper["authors"] = paper["authors"].replace("\n", " ").strip()
+
+    return f"""# {paper["title"]}
+### {paper["authors"]}
+#### {', '.join(subjects)} | {paper["update_date"]} | **Score**: {int(paper['synergy_score'] * 100)}%
+**[arxiv:{paper["id"]}](https://arxiv.org/abs/{paper["id"]})** - [PDF](https://arxiv.org/pdf/{paper["id"]})<br>
+
+{paper["abstract"]}
+"""
+
+
+latex_delimiters = [
+    {"left": "$$", "right": "$$", "display": True},
+    # {"left": "$", "right": "$", "display": False},
+    # {"left": "\\(", "right": "\\)", "display": False},
+    # {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
+    # {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
+    # {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
+    # {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
+    # {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
+    # {"left": "\\[", "right": "\\]", "display": True},
+    # {"left": "\\underline{", "right": "}", "display": False},
+    # {"left": "\\textit{", "right": "}", "display": False},
+    # {"left": "{", "right": "}", "display": False},
+]
+
+
+def create_interface():
+    with gr.Blocks(
+        css="""
+        .cell-menu-button {
+            display: none;
+        }"""
+    ) as demo:
+        gr.HTML(
+            """
+            <div style="text-align: center; margin-bottom: 1rem">
+                <h1>Research Compass</h1>
+                <p>Find synergistic papers to enrich your research</p>
+                <p>An experiment in AI-driven research synergy analysis</p>
+            </div>
+            """
+        )
+
+        with gr.Accordion(label="Instructions", open=False):
+            gr.Markdown(
+                """
+                1. **Enter Abstract**: Paste an abstract or describe your research details in the text box.
+                2. **Search for Synergistic Papers**: Click the button to find papers with similar themes.
+                3. **Select a Paper**: Click on a row in the results table to view paper details.
+                4. **Analyze Connection Potential**: Click the button to analyze the synergy potential between the papers.
+                5. **Synergy Analysis**: View the detailed analysis of the connection potential between the papers.
+                """
+            )
+
+        abstract_input = gr.Textbox(
+            label="Paper Abstract or Description",
+            placeholder="Paste an abstract or describe research details...",
+            lines=8,
+            key="abstract",
+        )
+        search_btn = gr.Button("Search for Synergistic Papers", variant="primary")
+
+        # Store full paper data
+        paper_data_state = gr.State([])
+
+        # Store query abstract
+        query_abstract_state = gr.State("")
+
+        # Store selected paper
+        selected_paper_state = gr.State(None)
+
+        # Use Dataframe for results
+        results_df = gr.Dataframe(
+            headers=["Title", "Authors", "Categories", "Date", "Match Score"],
+            datatype=["markdown", "markdown", "str", "date", "str"],
+            latex_delimiters=latex_delimiters,
+            label="Synergistic Papers",
+            interactive=False,
+            wrap=False,
+            line_breaks=False,
+            column_widths=["40%", "20%", "20%", "10%", "10%", "0%"],  # Hide the trailing ID column
+            key="results",
+        )
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                paper_details_output = gr.Markdown(
+                    value="# Paper Details",
+                    label="Paper Details",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="paper_details",
+                )
+                analyze_btn = gr.Button("Analyze Connection Potential", variant="primary", interactive=False)
+            with gr.Column(scale=1):
+                # Analysis output
+                analysis_output = gr.Markdown(
+                    value="# Synergy Analysis",
+                    label="Synergy Analysis",
+                    latex_delimiters=latex_delimiters,
+                    show_copy_button=True,
+                    key="analysis_output",
+                )
+
+        # Display paper details when a row is selected
+        def on_select(evt: gr.SelectData, papers, query):
+            selected_index = evt.index[0]  # Get the row index
+            selected = papers[selected_index]
+
+            # Format paper details
+            details_md = format_paper_as_markdown(selected)
+
+            return details_md, selected
+
+        # Connect search button to the search function
+        search_btn.click(
+            format_search_results,
+            inputs=[abstract_input],
+            outputs=[results_df, paper_data_state],
+        ).then(
+            lambda x: x,  # Identity function to pass through the abstract
+            inputs=[abstract_input],
+            outputs=[query_abstract_state],
+        ).then(
+            lambda: None,  # Reset selected paper
+            outputs=[selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=False),  # Disable analyze button until a paper is selected
+            outputs=[analyze_btn],
+        ).then(
+            lambda: "# Synergy Analysis",  # Clear previous analysis
+            outputs=[analysis_output],
+        )
+
+        # Use built-in select event from Dataframe
+        results_df.select(
+            on_select,
+            inputs=[paper_data_state, query_abstract_state],
+            outputs=[paper_details_output, selected_paper_state],
+        ).then(
+            lambda: gr.update(interactive=True),  # Enable analyze button when a paper is selected
+            outputs=[analyze_btn],
+        )
+
+        # Connect analyze button to run analysis
+        analyze_btn.click(
+            analyse_abstracts,
+            inputs=[query_abstract_state, selected_paper_state],
+            outputs=[analysis_output],
+            show_progress_on=[paper_details_output, analysis_output],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    # Load dataset with FAISS index
+    setup_dataset()
+
+    # Initialize the embedding model
+    init_embedding_model(embedding_model_name, embedding_model_revision)
+
+    # Initialize the reasoning model
+    reasoning_model = init_reasoning_model(reasoning_model_id)
+
+    demo = create_interface()
+    demo.queue().launch(ssr_mode=False)
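For reference, a minimal sketch of the retrieval flow app.py implements, runnable outside the Space. It uses the same dataset, index file, and embedding model repos as above; the query string is made up for illustration, and a valid `HF_TOKEN` is assumed.

```python
import os

from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

token = os.getenv("HF_TOKEN")

# Load the dataset and attach the pre-built FAISS index instead of rebuilding it
ds = load_dataset("nomadicsynth/arxiv-dataset-abstract-embeddings", revision="v1.0.0", token=token)
index_path = hf_hub_download(
    repo_id="nomadicsynth/arxiv-dataset-abstract-embeddings",
    filename="arxiv_faiss_index.faiss",
    revision="v1.0.0",
    repo_type="dataset",
    token=token,
)
ds["train"].load_faiss_index("embedding", index_path)

# Embed a query abstract with the same model the Space uses
model = SentenceTransformer("nomadicsynth/research-compass-arxiv-abstracts-embedding-model", token=token)
query = model.encode("Sparse attention for long-context transformers", normalize_embeddings=True)

# Inner-product search over normalized vectors is cosine similarity
scores, examples = ds["train"].get_nearest_examples("embedding", query, k=5)
for score, title in zip(scores, examples["title"]):
    print(f"{score:.3f}  {title}")
```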
arxiv_stuff.py ADDED
@@ -0,0 +1,372 @@
+import random
+from datetime import datetime, timedelta, timezone
+from typing import Optional, Union
+
+import arxiv
+import requests
+
+# Initialize the arXiv API client
+arxiv_client = arxiv.Client()
+
+ARXIV_CATEGORIES = {
+    "Computer Science": {
+        "cs.AI": "Artificial Intelligence",
+        "cs.AR": "Hardware Architecture",
+        "cs.CC": "Computational Complexity",
+        "cs.CE": "Computational Engineering",
+        "cs.CG": "Computational Geometry",
+        "cs.CL": "Computation and Language",
+        "cs.CR": "Cryptography and Security",
+        "cs.CV": "Computer Vision and Pattern Recognition",
+        "cs.CY": "Computers and Society",
+        "cs.DB": "Databases",
+        "cs.DC": "Distributed Computing",
+        "cs.DL": "Digital Libraries",
+        "cs.DM": "Discrete Mathematics",
+        "cs.DS": "Data Structures and Algorithms",
+        "cs.ET": "Emerging Technologies",
+        "cs.FL": "Formal Languages and Automata Theory",
+        "cs.GL": "General Literature",
+        "cs.GR": "Graphics",
+        "cs.GT": "Computer Science and Game Theory",
+        "cs.HC": "Human-Computer Interaction",
+        "cs.IR": "Information Retrieval",
+        "cs.IT": "Information Theory",
+        "cs.LG": "Machine Learning",
+        "cs.LO": "Logic in Computer Science",
+        "cs.MA": "Multiagent Systems",
+        "cs.MM": "Multimedia",
+        "cs.MS": "Mathematical Software",
+        "cs.NA": "Numerical Analysis",
+        "cs.NE": "Neural and Evolutionary Computing",
+        "cs.NI": "Networking and Internet Architecture",
+        "cs.OH": "Other Computer Science",
+        "cs.OS": "Operating Systems",
+        "cs.PF": "Performance",
+        "cs.PL": "Programming Languages",
+        "cs.RO": "Robotics",
+        "cs.SC": "Symbolic Computation",
+        "cs.SD": "Sound",
+        "cs.SE": "Software Engineering",
+        "cs.SI": "Social and Information Networks",
+        "cs.SY": "Systems and Control",
+    },
+    "Physics": {
+        "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
+        "astro-ph.EP": "Earth and Planetary Astrophysics",
+        "astro-ph.GA": "Astrophysics of Galaxies",
+        "astro-ph.HE": "High Energy Astrophysical Phenomena",
+        "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
+        "astro-ph.SR": "Solar and Stellar Astrophysics",
+        "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
+        "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
+        "cond-mat.mtrl-sci": "Materials Science",
+        "cond-mat.other": "Other Condensed Matter",
+        "cond-mat.quant-gas": "Quantum Gases",
+        "cond-mat.soft": "Soft Condensed Matter",
+        "cond-mat.stat-mech": "Statistical Mechanics",
+        "cond-mat.str-el": "Strongly Correlated Electrons",
+        "cond-mat.supr-con": "Superconductivity",
+        "gr-qc": "General Relativity and Quantum Cosmology",
+        "hep-ex": "High Energy Physics - Experiment",
+        "hep-lat": "High Energy Physics - Lattice",
+        "hep-ph": "High Energy Physics - Phenomenology",
+        "hep-th": "High Energy Physics - Theory",
+        "math-ph": "Mathematical Physics",
+        "nlin.AO": "Adaptation and Self-Organizing Systems",
+        "nlin.CD": "Chaotic Dynamics",
+        "nlin.CG": "Cellular Automata and Lattice Gases",
+        "nlin.PS": "Pattern Formation and Solitons",
+        "nlin.SI": "Exactly Solvable and Integrable Systems",
+        "nucl-ex": "Nuclear Experiment",
+        "nucl-th": "Nuclear Theory",
+        "physics.acc-ph": "Accelerator Physics",
+        "physics.ao-ph": "Atmospheric and Oceanic Physics",
+        "physics.app-ph": "Applied Physics",
+        "physics.atm-clus": "Atomic and Molecular Clusters",
+        "physics.atom-ph": "Atomic Physics",
+        "physics.bio-ph": "Biological Physics",
+        "physics.chem-ph": "Chemical Physics",
+        "physics.class-ph": "Classical Physics",
+        "physics.comp-ph": "Computational Physics",
+        "physics.data-an": "Data Analysis, Statistics and Probability",
+        "physics.ed-ph": "Physics Education",
+        "physics.flu-dyn": "Fluid Dynamics",
+        "physics.gen-ph": "General Physics",
+        "physics.geo-ph": "Geophysics",
+        "physics.hist-ph": "History and Philosophy of Physics",
+        "physics.ins-det": "Instrumentation and Detectors",
+        "physics.med-ph": "Medical Physics",
+        "physics.optics": "Optics",
+        "physics.plasm-ph": "Plasma Physics",
+        "physics.pop-ph": "Popular Physics",
+        "physics.soc-ph": "Physics and Society",
+        "physics.space-ph": "Space Physics",
+        "quant-ph": "Quantum Physics",
+    },
+    "Mathematics": {
+        "math.AC": "Commutative Algebra",
+        "math.AG": "Algebraic Geometry",
+        "math.AP": "Analysis of PDEs",
+        "math.AT": "Algebraic Topology",
+        "math.CA": "Classical Analysis and ODEs",
+        "math.CO": "Combinatorics",
+        "math.CT": "Category Theory",
+        "math.CV": "Complex Variables",
+        "math.DG": "Differential Geometry",
+        "math.DS": "Dynamical Systems",
+        "math.FA": "Functional Analysis",
+        "math.GM": "General Mathematics",
+        "math.GN": "General Topology",
+        "math.GR": "Group Theory",
+        "math.GT": "Geometric Topology",
+        "math.HO": "History and Overview",
+        "math.IT": "Information Theory",
+        "math.KT": "K-Theory and Homology",
+        "math.LO": "Logic",
+        "math.MG": "Metric Geometry",
+        "math.MP": "Mathematical Physics",
+        "math.NA": "Numerical Analysis",
+        "math.NT": "Number Theory",
+        "math.OA": "Operator Algebras",
+        "math.OC": "Optimization and Control",
+        "math.PR": "Probability",
+        "math.QA": "Quantum Algebra",
+        "math.RA": "Rings and Algebras",
+        "math.RT": "Representation Theory",
+        "math.SG": "Symplectic Geometry",
+        "math.SP": "Spectral Theory",
+        "math.ST": "Statistics Theory",
+    },
+    "Biology": {
+        "q-bio.BM": "Biomolecules",
+        "q-bio.CB": "Cell Behavior",
+        "q-bio.GN": "Genomics",
+        "q-bio.MN": "Molecular Networks",
+        "q-bio.NC": "Neurons and Cognition",
+        "q-bio.OT": "Other Quantitative Biology",
+        "q-bio.PE": "Populations and Evolution",
+        "q-bio.QM": "Quantitative Methods",
+        "q-bio.SC": "Subcellular Processes",
+        "q-bio.TO": "Tissues and Organs",
+    },
+    "Statistics": {
+        "stat.AP": "Applications",
+        "stat.CO": "Computation",
+        "stat.ME": "Methodology",
+        "stat.ML": "Machine Learning",
+        "stat.OT": "Other Statistics",
+        "stat.TH": "Theory",
+    },
+    "Economics": {
+        "econ.EM": "Econometrics",
+        "econ.GN": "General Economics",
+        "econ.TH": "Economic Theory",
+    },
+    "Electrical Engineering and Systems Science": {
+        "eess.AS": "Audio and Speech Processing",
+        "eess.IV": "Image and Video Processing",
+        "eess.SP": "Signal Processing",
+        "eess.SY": "Systems and Control",
+    },
+}
+
+# Flatten categories for easy access
+ARXIV_CATEGORIES_FLAT: dict[str, str] = {}
+
+for main_cat, subcats in ARXIV_CATEGORIES.items():
+    for cat_code, cat_name in subcats.items():
+        ARXIV_CATEGORIES_FLAT[cat_code] = f"{main_cat}: {cat_name} ({cat_code})"
+
+
+def clean_doi(doi: str) -> str:
+    """Normalize an arXiv URL, arXiv ID, or DOI to a bare arXiv ID, or return an error message."""
+    if doi.startswith("https://arxiv.org/abs/"):
+        return doi.split("/")[-1]
+    elif doi.startswith("https://arxiv.org/pdf/"):
+        return doi.split("/")[-1].split(".pdf")[0]
+    elif doi.startswith("arXiv:"):
+        return doi.split(":")[-1]
+    elif doi.startswith("http"):
+        return "Invalid arXiv link. Please provide a link to the abstract page."
+    elif doi.startswith("10."):
+        # Fetch the arXiv ID from the DOI
+        base_url = "http://dx.doi.org/"
+        headers = {"Accept": "application/x-bibtex"}
+        response = requests.get(base_url + doi, headers=headers)
+
+        if response.status_code != 200:
+            return "No paper found with that DOI."
+
+        bibtext = response.text
+        return bibtext.split("eprint = {arXiv:")[-1].split("}")[0]
+    elif doi.replace("v", "").replace(".", "").isdigit():
+        return doi
+    else:
+        return "Invalid arXiv ID or DOI. Please provide a valid arXiv ID, DOI, or arXiv URL."
+
+
+def retrieve_arxiv_paper(arxiv_id: str) -> dict:
+    """Retrieve the paper from arXiv.
+
+    Args:
+        arxiv_id: The arXiv ID of the paper to retrieve.
+
+    Returns:
+        A dict object representing the paper.
+    """
+    global arxiv_client
+    query_string = arxiv.Search(id_list=[arxiv_id])
+
+    results = arxiv_client.results(query_string)
+    try:
+        paper = next(results)
+    except StopIteration:
+        raise ValueError("No paper found with that arXiv ID.")
+
+    return dict(
+        arxiv_id=paper.entry_id.split("/")[-1],
+        title=paper.title,
+        authors=[author.name for author in paper.authors],
+        categories=list(paper.categories),
+        abstract=paper.summary,
+        published_date=paper.published,
+    )
+
+
+def build_arxiv_category_query(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> arxiv.Search:
+    """Builds an arXiv API search over the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Optional datetime to start the search from.
+        end_date: Optional datetime to end the search at.
+        start: Index of the first result to return (applied later via the client's offset, not in the query).
+        max_results: Maximum number of results to return.
+
+    Returns:
+        arxiv.Search object with the constructed query.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    if start_date and end_date:
+        date_str = f"{start_date.strftime('%Y%m%d%H%M')}+TO+{end_date.strftime('%Y%m%d%H%M')}"
+    elif start_date:
+        date_str = start_date.strftime("%Y%m%d%H%M")
+        date_str = f"{date_str}+TO+{datetime.now(timezone.utc).strftime('%Y%m%d%H%M')}"
+    else:
+        date_str = ""
+
+    # Construct the category string, including the date range if provided
+    cat_str = " OR ".join([f"cat:{cat}" for cat in categories]) if categories else ""
+    if date_str:
+        cat_str = f"({cat_str}) AND submittedDate:[{date_str}]"
+
+    search = arxiv.Search(
+        query=cat_str,
+        max_results=max_results,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+        sort_order=arxiv.SortOrder.Descending,
+    )
+
+    return search
+
+
+def retrieve_arxiv_papers(
+    categories: Union[str, list[str]],
+    start_date: Optional[datetime] = None,
+    end_date: Optional[datetime] = None,
+    start: int = 0,
+    max_results: int = 5,
+) -> list[dict]:
+    """Searches arXiv for papers in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start_date: Date to start searching from.
+        end_date: Date to stop searching at.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+    
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    global arxiv_client
+    query_string = build_arxiv_category_query(categories, start_date, end_date, start, max_results)
+
+    papers = []
+    for result in arxiv_client.results(query_string, offset=start):
+        papers.append(
+            dict(
+                arxiv_id=result.entry_id.split("/")[-1],
+                title=result.title,
+                authors=[author.name for author in result.authors],
+                categories=list(result.categories),
+                abstract=result.summary,
+                published_date=result.published,
+            )
+        )
+
+    return papers
+
+
+def fetch_todays_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from today in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0),
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def fetch_24_hours_papers(categories: Union[str, list[str]], start: int = 0, max_results: int = 5) -> list[dict]:
+    """Fetch papers from the last 24 hours in the given categories.
+
+    Args:
+        categories: List of arXiv categories to search.
+        start: Index of the first result to return.
+        max_results: Maximum number of results to return.
+
+    Returns:
+        A list of dict objects, one per paper.
+    """
+    if isinstance(categories, str):
+        categories = [categories]
+
+    twenty_four_hours_ago = datetime.now(timezone.utc) - timedelta(days=1)
+
+    papers = retrieve_arxiv_papers(
+        categories,
+        start_date=twenty_four_hours_ago,
+        start=start,
+        max_results=max_results,
+    )
+
+    return papers
+
+
+def random_arxiv_category():
+    return random.choice(list(ARXIV_CATEGORIES_FLAT.values()))
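A short usage sketch for these helpers, assuming arxiv_stuff.py is importable; the arXiv ID is just an illustrative example.

```python
from arxiv_stuff import clean_doi, fetch_24_hours_papers, retrieve_arxiv_paper

# Normalize an abstract-page URL to a bare arXiv ID, then fetch the paper
paper_id = clean_doi("https://arxiv.org/abs/1706.03762")
paper = retrieve_arxiv_paper(paper_id)
print(paper["title"])

# Grab a handful of papers posted in the last 24 hours
for p in fetch_24_hours_papers(["cs.LG", "cs.CL"], max_results=3):
    print(p["published_date"], p["title"])
```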
requirements.txt ADDED
@@ -0,0 +1,9 @@
+accelerate
+arxiv
+bitsandbytes
+datasets
+faiss-cpu
+gradio
+sentence-transformers
+spaces
+torch