File size: 7,465 Bytes
6931cbb
ea7a22d
6931cbb
ea7a22d
 
 
 
 
 
 
 
 
6931cbb
ea7a22d
 
 
6931cbb
ea7a22d
 
 
 
 
 
6931cbb
ea7a22d
 
 
6931cbb
ea7a22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6931cbb
ea7a22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6931cbb
ea7a22d
 
 
 
6931cbb
ea7a22d
 
 
 
6931cbb
ea7a22d
 
6931cbb
ea7a22d
6931cbb
ea7a22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6931cbb
ea7a22d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6931cbb
ea7a22d
 
 
 
6931cbb
ea7a22d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import streamlit as st
st.set_page_config(layout="wide")

import numpy as np
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Tuple
from collections import defaultdict
from tqdm import tqdm
import pandas as pd 
from datetime import datetime, date
from datasets import load_dataset, load_from_disk
from collections import Counter

import yaml, json, requests, sys, os, time, hickle
import concurrent.futures
ts = time.time()

from nltk.corpus import stopwords
import nltk
from openai import OpenAI
import anthropic
import cohere
import faiss

import spacy
from string import punctuation
import pytextrank

# Module-level spaCy pipeline with the "textrank" component appended
# (presumably provided by the pytextrank import above -- confirm).
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Make sure the NLTK English stopword list is usable; download it on first run.
try:
    stopwords.words('english')
except LookupError:
    # NLTK signals a missing corpus with LookupError. Catching only that
    # (instead of a bare ``except:``) keeps KeyboardInterrupt/SystemExit and
    # unrelated failures from being silently turned into a download attempt.
    nltk.download('stopwords')
    stopwords.words('english')

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral10

# Try to load the dataset from the local 'data/' directory; if that fails, pull it from Hugging Face and save a local copy for next time.

# Page header: project logo followed by a collapsed "About" panel.
st.image('local_files/pathfinder_logo.png')

# The text is rendered as markdown. The pointing-hand emoji below replaces a
# mis-encoded byte sequence that previously showed up as garbage characters.
st.expander("About", expanded=False).write(
        """
        Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
        sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
        
        This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.

        **👈 Select a tool from the sidebar** to see some examples
        of what this framework can do!

        ### Tool summary:
        - Please wait while the initial data loads and compiles, this takes about a minute initially.
        - `Paper search` looks for relevant papers given an arxiv id or a question.

        This is not meant to be a replacement to existing tools like the
        [ADS](https://ui.adsabs.harvard.edu/),
        [arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
        that otherwise might be missed during a literature survey.
        It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
        if you are interested in extending it please reach out!
        
        
        Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.

        The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
        using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
        atlas that shows well studied (forests) and currently uncharted areas (water).
        """
    )
    
    
    
# Load the arXiv corpus once per session and keep it in st.session_state so
# that later script reruns reuse it instead of reloading.
if 'arxiv_corpus' not in st.session_state:
    with st.spinner('loading data...'):
        try:
            arxiv_corpus = load_from_disk('data/')
        except Exception:
            # Best-effort fallback: any failure to read the local copy leads to
            # a fresh download that is then saved to 'data/'. ``Exception`` is
            # used rather than a bare ``except:`` so that BaseException-derived
            # signals (KeyboardInterrupt, SystemExit, ...) are not swallowed
            # and mistaken for a missing dataset.
            st.write('downloading data')
            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
            arxiv_corpus.save_to_disk('data/')
        st.session_state.arxiv_corpus = arxiv_corpus
        st.toast('loaded arxiv corpus')

# Always bind the module-level name from session state. Previously it was only
# bound when the corpus had just been loaded or when 'ids' was already cached,
# so a session holding 'arxiv_corpus' but not 'ids' raised NameError below.
arxiv_corpus = st.session_state.arxiv_corpus

# Cache the individual columns used by the app alongside the corpus.
if 'ids' not in st.session_state:
    st.session_state.ids = arxiv_corpus['ads_id']
    st.session_state.titles = arxiv_corpus['title']
    st.session_state.abstracts = arxiv_corpus['abstract']
    st.session_state.cites = arxiv_corpus['cites']
    st.session_state.years = arxiv_corpus['date']
    # ``ts`` is the module-level timestamp taken at the top of the script.
    st.toast('done caching. time:taken: {}'.format(time.time()-ts))
# Placeholder question-answering routine (swap in the real implementation).
def answer_question(question, keywords, toggles, method, question_type):
    """Return a canned answer string for ``question``.

    Only ``question``, ``method`` and ``question_type`` appear in the result;
    ``keywords`` and ``toggles`` are accepted but not used by this stub.
    """
    simulated_answer = (
        f"Answer to '{question}' using method {method} for {question_type} question."
    )
    return simulated_answer

# Placeholder paper retrieval (swap in the real implementation).
def get_papers():
    """Return a fixed three-row DataFrame with 'Title' and 'Relevance' columns."""
    titles = ['Paper 1', 'Paper 2', 'Paper 3']
    relevance_scores = [0.9, 0.7, 0.5]
    return pd.DataFrame({'Title': titles, 'Relevance': relevance_scores})

# Placeholder embedding plot (swap in the real implementation).
def create_embedding_plot():
    """Build and return a 400x400 Bokeh figure titled "Embedding Map".

    The figure shows five hard-coded points drawn as circles, coloured with
    the first five entries of the ``Spectral10`` palette.
    """
    demo_points = {
        'x': [1, 2, 3, 4, 5],
        'y': [6, 7, 2, 4, 5],
        'colors': Spectral10[0:5],
        'labels': ['A', 'B', 'C', 'D', 'E'],
    }
    source = ColumnDataSource(data=demo_points)

    plot = figure(width=400, height=400, title="Embedding Map")
    # The glyph reads its coordinates and colours by column name from ``source``.
    plot.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
    return plot

# Placeholder keyword extraction (swap in the real implementation).
def extract_keywords(question):
    """Return three fixed placeholder keywords; ``question`` is not inspected."""
    return [f'keyword{index}' for index in (1, 2, 3)]

# Placeholder consensus estimate (swap in the real implementation).
def estimate_consensus():
    """Return the fixed placeholder consensus score of 0.75."""
    placeholder_score = 0.75
    return placeholder_score

# Streamlit app
def main():
    """Render the sidebar inputs and, once submitted, the results page.

    The sidebar collects a question, optional comma-separated keywords, three
    toggles, a method, a question type and a "store output" flag. When the
    Submit button is pressed the placeholder helpers (``answer_question``,
    ``get_papers``, ``create_embedding_plot``, ``extract_keywords``,
    ``estimate_consensus``) are called and their results are laid out in two
    columns. Returns ``None``.
    """
    # Sidebar (Inputs)
    st.sidebar.header("Inputs")
    question = st.sidebar.text_input("Enter your question:")
    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")

    st.sidebar.subheader("Toggles")
    toggle_a = st.sidebar.checkbox("Toggle A")
    toggle_b = st.sidebar.checkbox("Toggle B")
    toggle_c = st.sidebar.checkbox("Toggle C")

    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
    store_output = st.sidebar.checkbox("Store the output")

    submit_button = st.sidebar.button("Submit")

    # Main page (Outputs)
    if not submit_button:
        st.info("Use the sidebar to input parameters and submit to see results.")
        return

    # Do not produce results for a blank (or whitespace-only) question.
    if not question.strip():
        st.warning("Please enter a question before submitting.")
        return

    # Process inputs. Empty entries (e.g. from "a,,b" or a trailing comma) are
    # dropped so no blank keywords are passed on.
    keywords = [kw for kw in (part.strip() for part in extra_keywords.split(',')) if kw]
    toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}

    # Generate outputs
    answer = answer_question(question, keywords, toggles, method, question_type)
    papers_df = get_papers()
    embedding_plot = create_embedding_plot()
    triggered_keywords = extract_keywords(question)
    consensus = estimate_consensus()

    # Display outputs
    st.header("Results")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Answer")
        st.write(answer)

        st.subheader("Papers Used")
        st.dataframe(papers_df)

        st.subheader("Triggered Keywords")
        st.write(", ".join(triggered_keywords))

    with col2:
        st.subheader("Embedding Map")
        st.bokeh_chart(embedding_plot)

        st.subheader("Question Type")
        st.write(question_type)

        st.subheader("Consensus Estimate")
        st.write(f"{consensus:.2%}")

    if store_output:
        # NOTE(review): nothing is persisted here yet; only the message is shown.
        st.success("Output stored successfully!")

# Build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()