Spaces:

kiyer
/

synthesist

Sleeping

File size: 7,465 Bytes

import streamlit as st
# Use the wide page layout. This call sits directly after the streamlit import,
# ahead of every other st.* call in the file.
# NOTE(review): Streamlit presumably requires set_page_config to run before other
# st commands — confirm before moving this line.
st.set_page_config(layout="wide")

import numpy as np
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Tuple
from collections import defaultdict
from tqdm import tqdm
import pandas as pd 
from datetime import datetime, date
from datasets import load_dataset, load_from_disk
from collections import Counter

import yaml, json, requests, sys, os, time, hickle
import concurrent.futures
# Wall-clock start of this script run; read again further down to report how long
# the data loading/caching took.
ts = time.time()

from nltk.corpus import stopwords
import nltk
from openai import OpenAI
import anthropic
import cohere
import faiss

import spacy
from string import punctuation
import pytextrank

# Module-level spaCy pipeline built from the small English model, with a
# "textrank" component appended.
# NOTE(review): the "textrank" factory is presumably registered as a side effect of
# `import pytextrank` above, which is why that import has no direct use — confirm.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Make sure the NLTK English stopword list is usable, fetching it on first use.
try:
    stopwords.words('english')
except LookupError:
    # NLTK signals a missing corpus with LookupError. Catching only that (instead
    # of a bare `except:`) lets unrelated errors, KeyboardInterrupt and SystemExit
    # surface instead of silently triggering a download.
    nltk.download('stopwords')
    stopwords.words('english')

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral10

# Page header: logo followed by a collapsed "About" panel describing the tool.
# (The corpus itself is loaded further down: local copy under data/ first, with a
# download from the Hugging Face Hub as the fallback.)

# Logo image, read from a path relative to the working directory.
st.image('local_files/pathfinder_logo.png')

# Collapsed-by-default expander; the text below is rendered through .write().
# NOTE(review): the paragraph starting "Also add:" reads like a developer TODO but is
# part of the user-facing text — confirm whether it should be shown.
st.expander("About", expanded=False).write(
        """
        Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
        sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
        
        This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.

        **👈 Select a tool from the sidebar** to see some examples
        of what this framework can do!

        ### Tool summary:
        - Please wait while the initial data loads and compiles, this takes about a minute initially.
        - `Paper search` looks for relevant papers given an arxiv id or a question.

        This is not meant to be a replacement to existing tools like the
        [ADS](https://ui.adsabs.harvard.edu/),
        [arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
        that otherwise might be missed during a literature survey.
        It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
        if you are interested in extending it please reach out!
        
        
        Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.

        The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
        using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
        atlas that shows well studied (forests) and currently uncharted areas (water).
        """
    )
    
    
    
# Load the corpus once per session: read the local copy under data/ when it exists,
# otherwise download it from the Hugging Face Hub and save it for the next start.
if 'arxiv_corpus' not in st.session_state:
    with st.spinner('loading data...'):
        try:
            arxiv_corpus = load_from_disk('data/')
        except Exception:
            # Any failure to read the local copy falls back to a fresh download.
            # `Exception` rather than a bare `except:` so that KeyboardInterrupt,
            # SystemExit and other BaseException-level signals are not swallowed.
            st.write('downloading data')
            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
            arxiv_corpus.save_to_disk('data/')
        st.session_state.arxiv_corpus = arxiv_corpus
        st.toast('loaded arxiv corpus')

# Always bind the module-level name from session state. Previously this assignment
# lived in an `else:` attached to the 'ids' check below, so a session that already
# held 'arxiv_corpus' but not yet 'ids' reached the column caching with
# `arxiv_corpus` undefined and raised NameError.
arxiv_corpus = st.session_state.arxiv_corpus

# Cache the individual columns in session state the first time through; 'ids' is
# the key that marks the whole group as cached.
if 'ids' not in st.session_state:
    st.session_state.ids = arxiv_corpus['ads_id']
    st.session_state.titles = arxiv_corpus['title']
    st.session_state.abstracts = arxiv_corpus['abstract']
    st.session_state.cites = arxiv_corpus['cites']
    st.session_state.years = arxiv_corpus['date']
    # `ts` was taken near the top of the script, so this reports time since startup.
    st.toast('done caching. time:taken: {}'.format(time.time()-ts))
# Function to simulate question answering (replace with actual implementation)
def answer_question(question, keywords, toggles, method, question_type):
    """Return a placeholder answer sentence for ``question``.

    Stub implementation: only ``question``, ``method`` and ``question_type``
    appear in the result; ``keywords`` and ``toggles`` are accepted but unused.
    """
    reply = f"Answer to '{question}' using method {method} for {question_type} question."
    return reply

# Function to simulate paper retrieval (replace with actual implementation)
def get_papers():
    """Return a placeholder table of papers.

    Stub implementation: a fixed three-row DataFrame with a ``Title`` column
    and a ``Relevance`` column, standing in for real retrieval results.
    """
    placeholder_rows = {
        'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
        'Relevance': [0.9, 0.7, 0.5],
    }
    return pd.DataFrame(placeholder_rows)

# Function to create embedding plot (replace with actual implementation)
def create_embedding_plot():
    """Build and return a placeholder Bokeh figure titled "Embedding Map".

    Stub implementation: five hard-coded points, coloured with the first five
    entries of the Spectral10 palette, drawn as circles on a 400x400 figure.
    """
    # Fixed sample data standing in for real embedding coordinates.
    points = dict(
        x=[1, 2, 3, 4, 5],
        y=[6, 7, 2, 4, 5],
        colors=Spectral10[0:5],
        labels=['A', 'B', 'C', 'D', 'E'],
    )
    source = ColumnDataSource(data=points)

    plot = figure(width=400, height=400, title="Embedding Map")
    # 'x', 'y' and 'colors' are column names looked up in `source`.
    plot.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
    return plot

# Function to simulate keyword extraction (replace with actual implementation)
def extract_keywords(question):
    """Return placeholder keywords for ``question``.

    Stub implementation: ``question`` is ignored and the same three keywords
    are returned, as a new list on every call.
    """
    placeholder = ('keyword1', 'keyword2', 'keyword3')
    return list(placeholder)

# Function to estimate consensus (replace with actual implementation)
def estimate_consensus():
    """Return a placeholder consensus score.

    Stub implementation: always the constant 0.75.
    """
    placeholder_score = 0.75
    return placeholder_score

# Streamlit app
def main():
    """Render the question-answering page.

    The sidebar collects the question, optional comma-separated keywords,
    three toggles, a method, a question type and a "store output" flag.
    When "Submit" is pressed, the (currently stubbed) helpers are called and
    their results are shown in a two-column layout; otherwise a hint is shown.
    """
    # Sidebar (Inputs)
    st.sidebar.header("Inputs")
    question = st.sidebar.text_input("Enter your question:")
    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")

    st.sidebar.subheader("Toggles")
    toggle_a = st.sidebar.checkbox("Toggle A")
    toggle_b = st.sidebar.checkbox("Toggle B")
    toggle_c = st.sidebar.checkbox("Toggle C")

    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
    store_output = st.sidebar.checkbox("Store the output")

    submit_button = st.sidebar.button("Submit")

    # Main page (Outputs)
    if submit_button:
        # Process inputs. Strip each comma-separated piece and drop the empty
        # ones, so inputs like "a,,b", "a," or " " no longer yield '' keywords.
        stripped = (piece.strip() for piece in extra_keywords.split(','))
        keywords = [kw for kw in stripped if kw]
        toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}

        # Generate outputs
        answer = answer_question(question, keywords, toggles, method, question_type)
        papers_df = get_papers()
        embedding_plot = create_embedding_plot()
        triggered_keywords = extract_keywords(question)
        consensus = estimate_consensus()

        # Display outputs
        st.header("Results")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Answer")
            st.write(answer)

            st.subheader("Papers Used")
            st.dataframe(papers_df)

            st.subheader("Triggered Keywords")
            st.write(", ".join(triggered_keywords))

        with col2:
            st.subheader("Embedding Map")
            st.bokeh_chart(embedding_plot)

            st.subheader("Question Type")
            st.write(question_type)

            st.subheader("Consensus Estimate")
            # Consensus is a fraction; render it as a percentage with 2 decimals.
            st.write(f"{consensus:.2%}")

        if store_output:
            # NOTE(review): nothing is persisted here yet; only the message is shown.
            st.success("Output stored successfully!")
    else:
        st.info("Use the sidebar to input parameters and submit to see results.")

# Entry point: render the page when this file is executed as a script (not on import).
if __name__ == "__main__":
    main()