kartheikiyer committed
Commit ea7a22d · 1 Parent(s): 888f3cd

new app skeleton
app.py CHANGED
@@ -1,52 +1,202 @@
 import streamlit as st
-from fns import *
-
-st.set_page_config(
-    page_title="Synthesist",
-    page_icon="👋",
-)
-
-# st.write("# Welcome to Pathfinder! 👋")
-st.image('local_files/synth_logo.png')
-
-st.sidebar.success("Select a function above.")
-st.sidebar.markdown("Current functions include visualizing papers in the arxiv embedding, searching for similar papers to an input paper or prompt phrase, or answering quick questions.")
-
-st.markdown("")
-st.markdown(
-    """
-    **Synthesist** (from Peter Watts' [Blindsight](https://scalar.usc.edu/works/network-ecologies/on-peter-watts-blindsight)) is a framework for searching and visualizing papers on the [arXiv](https://arxiv.org/) using the context
-    sensitivity of modern large language models (LLMs) to better parse patterns in paper contexts.
-
-    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
-
-    **👈 Select a tool from the sidebar** to see some examples
-    of what this framework can do!
-
-    ### Tool summary:
-    - Please wait while the initial data loads and compiles; this takes about a minute at first.
-    - `Paper search` looks for relevant papers given an arxiv id or a question.
-
-    This is not meant to be a replacement for existing tools like
-    [ADS](https://ui.adsabs.harvard.edu/),
-    [arxivsorter](https://www.arxivsorter.org/), semantic search, or Google Scholar, but rather a supplement to find papers
-    that might otherwise be missed during a literature survey.
-    It is trained on astro-ph.GA (astrophysics of galaxies) papers up to last-year-ish, mined from arxiv and supplemented with ADS metadata;
-    if you are interested in extending it, please reach out!
-
-    Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
-
-    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
-    using the `Arxiv embedding` page. The papers tend to cluster together by similarity, resulting in an
-    atlas that shows well-studied areas (forests) and currently uncharted ones (water).
-    """
-)
-
-s = time.time()
-st.markdown('Loading data for retrieval system, please wait before jumping to one of the pages....')
-st.session_state.retrieval_system = EmbeddingRetrievalSystem()
-st.session_state.dataset = load_dataset('arxiv_corpus/', split="train")
-st.markdown('Loaded retrieval system, time taken: %.1f sec' % (time.time() - s))
+st.set_page_config(layout="wide")
+
+import numpy as np
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict, Counter
+from tqdm import tqdm
+import pandas as pd
+from datetime import datetime, date
+from datasets import load_dataset, load_from_disk
+
+import yaml, json, requests, sys, os, time, hickle
+import concurrent.futures
+ts = time.time()
+
+import nltk
+from nltk.corpus import stopwords
+from openai import OpenAI
+import anthropic
+import cohere
+import faiss
+
+import spacy
+from string import punctuation
+import pytextrank
+
+nlp = spacy.load("en_core_web_sm")
+nlp.add_pipe("textrank")
+
+# make sure the NLTK stopword list is available
+try:
+    stopwords.words('english')
+except LookupError:
+    nltk.download('stopwords')
+    stopwords.words('english')
+
+from bokeh.plotting import figure
+from bokeh.models import ColumnDataSource
+from bokeh.palettes import Spectral10
+
+# try to load the data; if that fails, pull it from huggingface and cache it on disk
+
+st.image('local_files/pathfinder_logo.png')
+
+st.expander("About", expanded=False).write(
+    """
+    Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
+    sensitivity of modern large language models (LLMs) to better parse patterns in paper contexts.
+
+    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
+
+    **👈 Select a tool from the sidebar** to see some examples
+    of what this framework can do!
+
+    ### Tool summary:
+    - Please wait while the initial data loads and compiles; this takes about a minute at first.
+    - `Paper search` looks for relevant papers given an arxiv id or a question.
+
+    This is not meant to be a replacement for existing tools like
+    [ADS](https://ui.adsabs.harvard.edu/),
+    [arxivsorter](https://www.arxivsorter.org/), semantic search, or Google Scholar, but rather a supplement to find papers
+    that might otherwise be missed during a literature survey.
+    It is trained on astro-ph.GA (astrophysics of galaxies) papers up to last-year-ish, mined from arxiv and supplemented with ADS metadata;
+    if you are interested in extending it, please reach out!
+
+    Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
+
+    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
+    using the `Arxiv embedding` page. The papers tend to cluster together by similarity, resulting in an
+    atlas that shows well-studied areas (forests) and currently uncharted ones (water).
+    """
+)
+
+# cache the raw corpus across Streamlit reruns
+if 'arxiv_corpus' not in st.session_state:
+    with st.spinner('loading data...'):
+        try:
+            arxiv_corpus = load_from_disk('data/')
+        except Exception:
+            st.write('downloading data')
+            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
+            arxiv_corpus.save_to_disk('data/')
+        st.session_state.arxiv_corpus = arxiv_corpus
+    st.toast('loaded arxiv corpus')
+else:
+    arxiv_corpus = st.session_state.arxiv_corpus
+
+# cache the frequently used columns as well
+if 'ids' not in st.session_state:
+    st.session_state.ids = arxiv_corpus['ads_id']
+    st.session_state.titles = arxiv_corpus['title']
+    st.session_state.abstracts = arxiv_corpus['abstract']
+    st.session_state.cites = arxiv_corpus['cites']
+    st.session_state.years = arxiv_corpus['date']
+    st.toast('done caching. time taken: {:.1f} sec'.format(time.time() - ts))
+
+# Function to simulate question answering (replace with actual implementation)
+def answer_question(question, keywords, toggles, method, question_type):
+    # Simulated answer (replace with actual logic)
+    return f"Answer to '{question}' using method {method} for {question_type} question."
+
+# Function to simulate paper retrieval (replace with actual implementation)
+def get_papers():
+    # Simulated paper data (replace with actual data retrieval)
+    return pd.DataFrame({
+        'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
+        'Relevance': [0.9, 0.7, 0.5]
+    })
+
+# Function to create embedding plot (replace with actual implementation)
+def create_embedding_plot():
+    # Simulated embedding data (replace with actual embedding calculation)
+    source = ColumnDataSource(data=dict(
+        x=[1, 2, 3, 4, 5],
+        y=[6, 7, 2, 4, 5],
+        colors=Spectral10[0:5],
+        labels=['A', 'B', 'C', 'D', 'E']
+    ))
+    p = figure(width=400, height=400, title="Embedding Map")
+    p.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
+    return p
+
+# Function to simulate keyword extraction (replace with actual implementation)
+def extract_keywords(question):
+    # Simulated keyword extraction (replace with actual logic)
+    return ['keyword1', 'keyword2', 'keyword3']
+
+# Function to estimate consensus (replace with actual implementation)
+def estimate_consensus():
+    # Simulated consensus estimation (replace with actual calculation)
+    return 0.75
+
+# Streamlit app
+def main():
+    # st.title("Question Answering App")
+
+    # Sidebar (Inputs)
+    st.sidebar.header("Inputs")
+    question = st.sidebar.text_input("Enter your question:")
+    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
+
+    st.sidebar.subheader("Toggles")
+    toggle_a = st.sidebar.checkbox("Toggle A")
+    toggle_b = st.sidebar.checkbox("Toggle B")
+    toggle_c = st.sidebar.checkbox("Toggle C")
+
+    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
+    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
+    store_output = st.sidebar.checkbox("Store the output")
+
+    submit_button = st.sidebar.button("Submit")
+
+    # Main page (Outputs)
+    if submit_button:
+        # Process inputs
+        keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
+        toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}
+
+        # Generate outputs
+        answer = answer_question(question, keywords, toggles, method, question_type)
+        papers_df = get_papers()
+        embedding_plot = create_embedding_plot()
+        triggered_keywords = extract_keywords(question)
+        consensus = estimate_consensus()
+
+        # Display outputs
+        st.header("Results")
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.subheader("Answer")
+            st.write(answer)
+
+            st.subheader("Papers Used")
+            st.dataframe(papers_df)
+
+            st.subheader("Triggered Keywords")
+            st.write(", ".join(triggered_keywords))
+
+        with col2:
+            st.subheader("Embedding Map")
+            st.bokeh_chart(embedding_plot)
+
+            st.subheader("Question Type")
+            st.write(question_type)
+
+            st.subheader("Consensus Estimate")
+            st.write(f"{consensus:.2%}")
+
+        if store_output:
+            st.success("Output stored successfully!")
+    else:
+        st.info("Use the sidebar to input parameters and submit to see results.")
+
+if __name__ == "__main__":
+    main()
 
 
app.py:Zone.Identifier ADDED
File without changes
local_files/arxiv_ads_corpus_coordsonly_v3.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:84bf489fe113fa63fbb21def14b97c080d01b91146fc9e867c53012721770780
-size 4348940

local_files/embeddings_matrix.npy DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4be06660e0ce1cf21461c03bc51bca09bc86d5f7d1c60d283eb6b28bbeb10788
-size 3336732800

local_files/index_mapping.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:87bc197008b1faf15a9d64ac42fc4ee03b7f6e5d4bd9ffc6ac64c4156c3a8e24
-size 7656835

local_files/keyword_index.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b100ce52f6bff88576430aae32d4617722deb2168b3740842e96ea265d3aaf59
-size 138660705

local_files/metadata.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f5baae76dffec0f2326a6c0d4138e61ef2e81faca762c6f1fb43076c29a61ca3
-size 1180396608

pages/1 retrieval.py DELETED
@@ -1,124 +0,0 @@
-import time
-s = time.time()
-
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-from scipy import ndimage
-
-from langchain_openai import AzureOpenAIEmbeddings
-# from langchain.llms import OpenAI
-from langchain_community.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-
-from fns import *
-
-st.image('local_files/synth_logo.png')
-st.markdown("")
-
-query = st.text_input('Ask me anything:',
-                      value="What causes galaxy quenching at high redshifts?")
-
-arxiv_id = None
-top_k = st.slider('How many papers should I show?', 1, 30, 6)
-
-retrieval_system = st.session_state.retrieval_system
-results = retrieval_system.retrieve(query, arxiv_id, top_k)
-
-aids = st.session_state.dataset['id']
-titles = st.session_state.dataset['title']
-auths = st.session_state.dataset['author']
-bibcodes = st.session_state.dataset['bibcode']
-all_keywords = st.session_state.dataset['keyword_search']
-allyrs = st.session_state.dataset['year']
-ret_indices = np.array([aids.index(results[i]) for i in range(top_k)])
-
-# expand two-digit years (pivot at 50: below 50 -> 20xx, otherwise 19xx)
-yrs = []
-for i in range(len(ret_indices)):
-    yr = allyrs[ret_indices[i]]
-    if yr < 50:
-        yr = yr + 2000
-    else:
-        yr = yr + 1900
-    yrs.append(yr)
-
-print_titles = [titles[ret_indices[i]][0] for i in range(len(ret_indices))]
-print_auths = [auths[ret_indices[i]][0] + ' et al. ' + str(yrs[i]) for i in range(len(ret_indices))]
-print_links = ['[' + bibcodes[ret_indices[i]] + '](https://ui.adsabs.harvard.edu/abs/' + bibcodes[ret_indices[i]] + '/abstract)' for i in range(len(ret_indices))]
-
-st.divider()
-st.header('top-k papers:')
-
-for i in range(len(ret_indices)):
-    st.subheader(str(i + 1) + '. ' + print_titles[i])
-    st.write(print_auths[i] + ' ' + print_links[i])
-
-st.divider()
-st.header('top-k papers in context:')
-
-gtkws = get_keywords(query, ret_indices, all_keywords)
-
-umap, clbls, all_kws = load_umapcoords('local_files/arxiv_ads_corpus_coordsonly_v3.pkl')
-
-fig = plt.figure(figsize=(12 * 1.8 * 1.2, 9 * 2. * 1.2))
-im = plt.imread('local_files/astro_worldmap.png')
-implot = plt.imshow(im)
-
-# rescale UMAP coordinates into the pixel frame of the background map
-xax = (umap[0:, 1] - np.amin(umap[0:, 1])) + .0
-xax = xax / np.amax(xax)
-xax = xax * 1580 + 170
-yax = (umap[0:, 0] - np.amin(umap[0:, 0])) + .0
-yax = yax / np.amax(yax)
-yax = (np.amax(yax) - yax) * 1700 + 30
-# plt.scatter(xax, yax, s=2, alpha=0.7, c='k')
-
-# label each cluster at the median position of its members
-for i in range(np.amax(clbls)):
-    clust_ids = np.arange(len(clbls))[clbls == i]
-    clust_centroid = (np.median(xax[clust_ids]), np.median(yax[clust_ids]))
-    # plt.text(clust_centroid[1], clust_centroid[0], all_kws[i], fontsize=9, ha="center", va="center",
-    #          bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3', alpha=0.3))
-    plt.text(clust_centroid[0], clust_centroid[1], all_kws[i], fontsize=9, ha="center", va="center",
-             fontfamily='serif', color='w',
-             bbox=dict(facecolor='k', edgecolor='none', boxstyle='round,pad=0.1', alpha=0.3))
-
-# highlight the retrieved papers; the top hit gets a white marker
-plt.scatter(xax[ret_indices], yax[ret_indices], c='k', s=300, zorder=100)
-plt.scatter(xax[ret_indices], yax[ret_indices], c='firebrick', s=100, zorder=101)
-plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='k', s=300, zorder=101)
-plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='w', s=100, zorder=101)
-
-tempx = plt.xlim(); tempy = plt.ylim()
-plt.text(0.012 * tempx[1], (0.012 + 0.03) * tempy[0], 'The world of astronomy literature', fontsize=36, fontfamily='serif')
-plt.text(0.012 * tempx[1], (0.012 + 0.06) * tempy[0], 'Query: ' + query, fontsize=18, fontfamily='serif')
-plt.text(0.012 * tempx[1], (0.012 + 0.08) * tempy[0], gtkws, fontsize=18, fontfamily='serif', va='top')
-plt.axis('off')
-st.pyplot(fig, transparent=True, bbox_inches='tight')
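
The coordinate block above maps UMAP space into the pixel frame of the background map image. A hedged restatement as a standalone function (the extents and offsets are the hard-coded values from the deleted page; the function name is ours):

```python
import numpy as np

def umap_to_pixels(umap, x_extent=1580, x_off=170, y_extent=1700, y_off=30):
    """Normalize UMAP coordinates to [0, 1] and scale them into pixel space.

    The y axis is flipped because image pixel coordinates grow downward.
    """
    xax = umap[:, 1] - np.amin(umap[:, 1])
    xax = xax / np.amax(xax) * x_extent + x_off
    yax = umap[:, 0] - np.amin(umap[:, 0])
    yax = yax / np.amax(yax)               # now in [0, 1]
    yax = (1.0 - yax) * y_extent + y_off   # flip, then scale and offset
    return xax, yax
```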
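
Likewise, the two-digit-year handling in the deleted page pivots at 50; restated as a standalone helper (name ours) with a usage example:

```python
def expand_two_digit_year(yr, pivot=50):
    """Expand a two-digit year: below the pivot -> 20xx, otherwise 19xx."""
    return yr + 2000 if yr < pivot else yr + 1900

assert expand_two_digit_year(7) == 2007
assert expand_two_digit_year(98) == 1998
```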