import streamlit as st

# set_page_config must run before any other Streamlit command, so it sits
# directly after the streamlit import and ahead of everything else.
st.set_page_config(layout="wide")

import numpy as np
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Tuple
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
from datetime import datetime, date
from datasets import load_dataset, load_from_disk
from collections import Counter
import yaml
import json
import requests
import sys
import os
import time
import hickle
import concurrent.futures

# Start-up timestamp; reported in the "done caching" toast further down.
ts = time.time()

from nltk.corpus import stopwords
import nltk
from openai import OpenAI
import anthropic
import cohere
import faiss
import spacy
from string import punctuation
import pytextrank

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Probe the NLTK stopword corpus and download it on first use. NLTK signals a
# missing resource with LookupError, so only that is caught here (a bare
# `except` would also swallow KeyboardInterrupt / SystemExit).
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords.words('english')

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral10

st.image('local_files/pathfinder_logo.png')

st.expander("About", expanded=False).write(
    """
    Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.

    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.

    **👈 Select a tool from the sidebar** to see some examples of what this framework can do!

    ### Tool summary:
    - Please wait while the initial data loads and compiles, this takes about a minute initially.
    - `Paper search` looks for relevant papers given an arxiv id or a question.

    This is not meant to be a replacement to existing tools like the [ADS](https://ui.adsabs.harvard.edu/), [arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers that otherwise might be missed during a literature survey. It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata, if you are interested in extending it please reach out!

    Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.

    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an atlas that shows well studied (forests) and currently uncharted areas (water).
    """
)

# Load the corpus once per session: try the local copy under data/ first and,
# if that fails, pull it from the Hugging Face hub and save it to data/ so the
# next start can read it from disk.
if 'arxiv_corpus' not in st.session_state:
    with st.spinner('loading data...'):
        try:
            arxiv_corpus = load_from_disk('data/')
        except Exception:
            # Deliberately broad: any failure to read the local copy falls
            # back to a fresh download rather than crashing the app.
            st.write('downloading data')
            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
            arxiv_corpus.save_to_disk('data/')
        st.session_state.arxiv_corpus = arxiv_corpus
        st.toast('loaded arxiv corpus')
else:
    arxiv_corpus = st.session_state.arxiv_corpus

# Cache the metadata columns separately. This runs after `arxiv_corpus` is
# bound on both branches above, so it cannot hit an undefined name even when
# the corpus is already cached but the columns are not.
if 'ids' not in st.session_state:
    st.session_state.ids = arxiv_corpus['ads_id']
    st.session_state.titles = arxiv_corpus['title']
    st.session_state.abstracts = arxiv_corpus['abstract']
    st.session_state.cites = arxiv_corpus['cites']
    # Stored under `years`, but the value is the corpus 'date' column as-is.
    st.session_state.years = arxiv_corpus['date']
    st.toast('done caching. time:taken: {}'.format(time.time()-ts))


# Function to simulate question answering (replace with actual implementation)
def answer_question(question, keywords, toggles, method, question_type):
    """Return a placeholder answer string for the given question.

    Only ``question``, ``method`` and ``question_type`` appear in the
    returned text; ``keywords`` and ``toggles`` are accepted but currently
    unused by this simulated implementation.

    Args:
        question: The user's question text.
        keywords: Extra keywords supplied by the user (unused).
        toggles: Mapping of toggle names to their checkbox state (unused).
        method: Name of the selected method.
        question_type: Selected question type label.

    Returns:
        A formatted string describing the simulated answer.
    """
    # Simulated answer (replace with actual logic)
    return f"Answer to '{question}' using method {method} for {question_type} question."
# Function to simulate paper retrieval (replace with actual implementation)
def get_papers():
    """Return a placeholder table of retrieved papers.

    Returns:
        A pandas DataFrame with a 'Title' column and a 'Relevance' column,
        holding three hard-coded rows.
    """
    # Simulated paper data (replace with actual data retrieval)
    return pd.DataFrame({
        'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
        'Relevance': [0.9, 0.7, 0.5],
    })


# Function to create embedding plot (replace with actual implementation)
def create_embedding_plot():
    """Build a placeholder Bokeh figure with five hard-coded points.

    Returns:
        A 400x400 Bokeh figure titled "Embedding Map".
    """
    # Simulated embedding data (replace with actual embedding calculation)
    source = ColumnDataSource(data=dict(
        x=[1, 2, 3, 4, 5],
        y=[6, 7, 2, 4, 5],
        colors=Spectral10[0:5],
        labels=['A', 'B', 'C', 'D', 'E'],
    ))
    p = figure(width=400, height=400, title="Embedding Map")
    # NOTE(review): newer Bokeh releases deprecate circle(size=...) in favour
    # of scatter(); confirm against the Bokeh version this app pins.
    p.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
    return p


# Function to simulate keyword extraction (replace with actual implementation)
def extract_keywords(question):
    """Return a fixed placeholder keyword list; ``question`` is not inspected."""
    # Simulated keyword extraction (replace with actual logic)
    return ['keyword1', 'keyword2', 'keyword3']


# Function to estimate consensus (replace with actual implementation)
def estimate_consensus():
    """Return a fixed placeholder consensus score (0.75)."""
    # Simulated consensus estimation (replace with actual calculation)
    return 0.75


def _parse_keywords(raw):
    """Split a comma-separated string into stripped, non-empty keywords."""
    if not raw:
        return []
    # Drop blanks so inputs like "a,,b" or a trailing comma do not produce
    # empty-string keywords.
    return [kw for kw in (part.strip() for part in raw.split(',')) if kw]


# Streamlit app
def main():
    """Render the sidebar inputs and, on submit, the simulated results page."""
    # Sidebar (Inputs)
    st.sidebar.header("Inputs")
    question = st.sidebar.text_input("Enter your question:")
    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")

    st.sidebar.subheader("Toggles")
    toggle_a = st.sidebar.checkbox("Toggle A")
    toggle_b = st.sidebar.checkbox("Toggle B")
    toggle_c = st.sidebar.checkbox("Toggle C")

    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
    store_output = st.sidebar.checkbox("Store the output")
    submit_button = st.sidebar.button("Submit")

    # Main page (Outputs)
    if not submit_button:
        st.info("Use the sidebar to input parameters and submit to see results.")
        return

    # Do not run the pipeline on a blank question.
    if not question or not question.strip():
        st.warning("Please enter a question before submitting.")
        return

    # Process inputs
    keywords = _parse_keywords(extra_keywords)
    toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}

    # Generate outputs
    answer = answer_question(question, keywords, toggles, method, question_type)
    papers_df = get_papers()
    embedding_plot = create_embedding_plot()
    triggered_keywords = extract_keywords(question)
    consensus = estimate_consensus()

    # Display outputs
    st.header("Results")
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Answer")
        st.write(answer)
        st.subheader("Papers Used")
        st.dataframe(papers_df)
        st.subheader("Triggered Keywords")
        st.write(", ".join(triggered_keywords))

    with col2:
        st.subheader("Embedding Map")
        st.bokeh_chart(embedding_plot)
        st.subheader("Question Type")
        st.write(question_type)
        st.subheader("Consensus Estimate")
        st.write(f"{consensus:.2%}")

    if store_output:
        # NOTE: nothing is persisted yet; this only shows the confirmation.
        st.success("Output stored successfully!")


if __name__ == "__main__":
    main()