Spaces:
Sleeping
Sleeping
import streamlit as st | |
st.set_page_config(layout="wide") | |
import numpy as np | |
from abc import ABC, abstractmethod | |
from typing import List, Dict, Any, Tuple | |
from collections import defaultdict | |
from tqdm import tqdm | |
import pandas as pd | |
from datetime import datetime, date | |
from datasets import load_dataset, load_from_disk | |
from collections import Counter | |
import yaml, json, requests, sys, os, time, hickle | |
import concurrent.futures | |
ts = time.time() | |
from nltk.corpus import stopwords | |
import nltk | |
from openai import OpenAI | |
import anthropic | |
import cohere | |
import faiss | |
import spacy | |
from string import punctuation | |
import pytextrank | |
# Build the spaCy English pipeline once at import time and attach the
# "textrank" component to it (pytextrank is imported above for this purpose).
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Make sure the NLTK stopword corpus is available. NLTK raises LookupError
# when a corpus has not been downloaded yet; catch only that so unrelated
# failures (and Ctrl-C) are not silently turned into a download attempt.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords.words('english')
from bokeh.plotting import figure | |
from bokeh.models import ColumnDataSource | |
from bokeh.palettes import Spectral10 | |
# try to load the data from disk; if that doesn't work, pull it from huggingface and save a local copy
# Page header: the project logo followed by a collapsed "About" panel whose
# markdown body describes the tool. The text is rendered to users as-is.
st.image('local_files/pathfinder_logo.png')
st.expander("About", expanded=False).write(
    """
Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
**👈 Select a tool from the sidebar** to see some examples
of what this framework can do!
### Tool summary:
- Please wait while the initial data loads and compiles, this takes about a minute initially.
- `Paper search` looks for relevant papers given an arxiv id or a question.
This is not meant to be a replacement to existing tools like the
[ADS](https://ui.adsabs.harvard.edu/),
[arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
that otherwise might be missed during a literature survey.
It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
if you are interested in extending it please reach out!
Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
atlas that shows well studied (forests) and currently uncharted areas (water).
"""
)
# Load the arXiv corpus once per Streamlit session and keep it, plus several
# of its columns, in st.session_state so that script reruns reuse them.
if 'arxiv_corpus' not in st.session_state:
    with st.spinner('loading data...'):
        try:
            # Fast path: a copy written to 'data/' by an earlier run.
            arxiv_corpus = load_from_disk('data/')
        except Exception:
            # No usable local copy: fetch the dataset and save it so the
            # next cold start can take the fast path. `Exception` (rather
            # than a bare except) keeps the best-effort fallback without
            # swallowing KeyboardInterrupt/SystemExit.
            st.write('downloading data')
            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
            arxiv_corpus.save_to_disk('data/')
        st.session_state.arxiv_corpus = arxiv_corpus
        st.toast('loaded arxiv corpus')
else:
    # Always bind the module-level name, so the column caching below never
    # sees an undefined `arxiv_corpus`.
    arxiv_corpus = st.session_state.arxiv_corpus

if 'ids' not in st.session_state:
    # Cache the individual columns under the session-state names the app uses.
    st.session_state.ids = arxiv_corpus['ads_id']
    st.session_state.titles = arxiv_corpus['title']
    st.session_state.abstracts = arxiv_corpus['abstract']
    st.session_state.cites = arxiv_corpus['cites']
    st.session_state.years = arxiv_corpus['date']
    # `ts` is the timestamp taken near the top of the script.
    st.toast('done caching. time:taken: {}'.format(time.time() - ts))
# Function to simulate question answering (replace with actual implementation) | |
def answer_question(question, keywords, toggles, method, question_type):
    """Return a placeholder answer string for ``question``.

    Only ``question``, ``method`` and ``question_type`` appear in the
    result; ``keywords`` and ``toggles`` are accepted but not used by this
    stand-in implementation.
    """
    template = "Answer to '{}' using method {} for {} question."
    reply = template.format(question, method, question_type)
    return reply
# Function to simulate paper retrieval (replace with actual implementation) | |
def get_papers():
    """Return a placeholder table of papers.

    The result is a DataFrame with a 'Title' column and a 'Relevance'
    column holding three hard-coded rows.
    """
    titles = ['Paper 1', 'Paper 2', 'Paper 3']
    relevance_scores = [0.9, 0.7, 0.5]
    table = pd.DataFrame({'Title': titles, 'Relevance': relevance_scores})
    return table
# Function to create embedding plot (replace with actual implementation) | |
def create_embedding_plot():
    """Return a placeholder Bokeh figure titled "Embedding Map".

    Five hard-coded points are drawn as circles, each coloured with one of
    the first five Spectral10 entries. The 'labels' column is stored in the
    data source but is not referenced by the glyph.
    """
    point_data = {
        'x': [1, 2, 3, 4, 5],
        'y': [6, 7, 2, 4, 5],
        'colors': Spectral10[0:5],
        'labels': ['A', 'B', 'C', 'D', 'E'],
    }
    source = ColumnDataSource(data=point_data)

    plot = figure(width=400, height=400, title="Embedding Map")
    plot.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
    return plot
# Function to simulate keyword extraction (replace with actual implementation) | |
def extract_keywords(question):
    """Return a placeholder keyword list; ``question`` is not inspected."""
    placeholder_keywords = ('keyword1', 'keyword2', 'keyword3')
    # Hand back a fresh list on every call.
    return list(placeholder_keywords)
# Function to estimate consensus (replace with actual implementation) | |
def estimate_consensus():
    """Return a placeholder consensus estimate (a fixed fraction)."""
    simulated_consensus = 0.75
    return simulated_consensus
# Streamlit app | |
def main():
    """Render the sidebar inputs and, after "Submit", the results page.

    The sidebar collects a question, optional comma-separated keywords,
    three toggles, a method, a question type and a "store the output"
    checkbox. Until the submit button is pressed only an info hint is
    shown. After submission the outputs of the helper functions are laid
    out in two columns.
    """
    # st.title("Question Answering App")

    # Sidebar (Inputs)
    sidebar = st.sidebar
    sidebar.header("Inputs")
    question = sidebar.text_input("Enter your question:")
    extra_keywords = sidebar.text_input("Enter extra keywords (comma-separated):")
    sidebar.subheader("Toggles")
    toggle_a = sidebar.checkbox("Toggle A")
    toggle_b = sidebar.checkbox("Toggle B")
    toggle_c = sidebar.checkbox("Toggle C")
    method = sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
    question_type = sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
    store_output = sidebar.checkbox("Store the output")
    submit_button = sidebar.button("Submit")

    # Nothing submitted yet: show the hint and stop.
    if not submit_button:
        st.info("Use the sidebar to input parameters and submit to see results.")
        return

    # Process inputs. Blank entries (e.g. from "a,,b" or a trailing comma)
    # are dropped so downstream code never receives empty keywords.
    keywords = [kw.strip() for kw in extra_keywords.split(',') if kw.strip()]
    toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}

    # Generate outputs
    answer = answer_question(question, keywords, toggles, method, question_type)
    papers_df = get_papers()
    embedding_plot = create_embedding_plot()
    triggered_keywords = extract_keywords(question)
    consensus = estimate_consensus()

    # Main page (Outputs)
    st.header("Results")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Answer")
        st.write(answer)
        st.subheader("Papers Used")
        st.dataframe(papers_df)
        st.subheader("Triggered Keywords")
        st.write(", ".join(triggered_keywords))
    with col2:
        st.subheader("Embedding Map")
        st.bokeh_chart(embedding_plot)
        st.subheader("Question Type")
        st.write(question_type)
        st.subheader("Consensus Estimate")
        st.write(f"{consensus:.2%}")

    if store_output:
        # NOTE(review): nothing is persisted here yet; only the message is shown.
        st.success("Output stored successfully!")


if __name__ == "__main__":
    main()