Spaces:
Sleeping
Sleeping
kartheikiyer
committed on
Commit
·
ea7a22d
1
Parent(s):
888f3cd
new app skeleton
Browse files
- app.py +187 -37
- app.py:Zone.Identifier +0 -0
- local_files/arxiv_ads_corpus_coordsonly_v3.pkl +0 -3
- local_files/embeddings_matrix.npy +0 -3
- local_files/index_mapping.pkl +0 -3
- local_files/keyword_index.json +0 -3
- local_files/metadata.json +0 -3
- pages/1 retrieval.py +0 -124
app.py
CHANGED
@@ -1,52 +1,202 @@
|
|
1 |
import streamlit as st
|
2 |
-
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
14 |
|
|
|
|
|
|
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
[arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
|
34 |
-
that otherwise might be missed during a literature survey.
|
35 |
-
It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
|
36 |
-
if you are interested in extending it please reach out!
|
37 |
|
|
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
st.session_state.retrieval_system = EmbeddingRetrievalSystem()
|
51 |
-
st.session_state.dataset = load_dataset('arxiv_corpus/', split = "train")
|
52 |
-
st.markdown(f'Loaded retrieval system, time taken: %.1f sec' %(time.time()-s))
|
|
|
1 |
import streamlit as st
|
2 |
+
st.set_page_config(layout="wide")
|
3 |
|
4 |
+
import numpy as np
|
5 |
+
from abc import ABC, abstractmethod
|
6 |
+
from typing import List, Dict, Any, Tuple
|
7 |
+
from collections import defaultdict
|
8 |
+
from tqdm import tqdm
|
9 |
+
import pandas as pd
|
10 |
+
from datetime import datetime, date
|
11 |
+
from datasets import load_dataset, load_from_disk
|
12 |
+
from collections import Counter
|
13 |
|
14 |
+
import yaml, json, requests, sys, os, time, hickle
|
15 |
+
import concurrent.futures
|
16 |
+
ts = time.time()
|
17 |
|
18 |
+
from nltk.corpus import stopwords
|
19 |
+
import nltk
|
20 |
+
from openai import OpenAI
|
21 |
+
import anthropic
|
22 |
+
import cohere
|
23 |
+
import faiss
|
24 |
|
25 |
+
import spacy
|
26 |
+
from string import punctuation
|
27 |
+
import pytextrank
|
28 |
|
29 |
+
nlp = spacy.load("en_core_web_sm")
|
30 |
+
nlp.add_pipe("textrank")
|
31 |
+
|
32 |
+
try:
|
33 |
+
stopwords.words('english')
|
34 |
+
except:
|
35 |
+
nltk.download('stopwords')
|
36 |
+
stopwords.words('english')
|
37 |
+
|
38 |
+
from bokeh.plotting import figure
|
39 |
+
from bokeh.models import ColumnDataSource
|
40 |
+
from bokeh.palettes import Spectral10
|
41 |
+
|
42 |
+
# try to load the data, if it doesn't work, pull from huggingface and make the pickle files
|
43 |
+
|
44 |
+
st.image('local_files/pathfinder_logo.png')
|
45 |
+
|
46 |
+
st.expander("About", expanded=False).write(
|
47 |
+
"""
|
48 |
+
Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
|
49 |
+
sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
|
50 |
+
|
51 |
+
This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
|
52 |
+
|
53 |
+
**👈 Select a tool from the sidebar** to see some examples
|
54 |
+
of what this framework can do!
|
55 |
+
|
56 |
+
### Tool summary:
|
57 |
+
- Please wait while the initial data loads and compiles, this takes about a minute initially.
|
58 |
+
- `Paper search` looks for relevant papers given an arxiv id or a question.
|
59 |
+
|
60 |
+
This is not meant to be a replacement to existing tools like the
|
61 |
+
[ADS](https://ui.adsabs.harvard.edu/),
|
62 |
+
[arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
|
63 |
+
that otherwise might be missed during a literature survey.
|
64 |
+
It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
|
65 |
+
if you are interested in extending it please reach out!
|
66 |
+
|
67 |
+
|
68 |
+
Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
|
69 |
+
|
70 |
+
The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
|
71 |
+
using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
|
72 |
+
atlas that shows well studied (forests) and currently uncharted areas (water).
|
73 |
+
"""
|
74 |
+
)
|
75 |
+
|
76 |
|
77 |
+
|
78 |
+
if 'arxiv_corpus' not in st.session_state:
|
79 |
+
with st.spinner('loading data...'):
|
80 |
+
try:
|
81 |
+
arxiv_corpus = load_from_disk('data/')
|
82 |
+
except:
|
83 |
+
st.write('downloading data')
|
84 |
+
arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data',split='train')
|
85 |
+
arxiv_corpus.save_to_disk('data/')
|
86 |
+
st.session_state.arxiv_corpus = arxiv_corpus
|
87 |
+
st.toast('loaded arxiv corpus')
|
88 |
+
|
89 |
+
if 'ids' not in st.session_state:
|
90 |
+
st.session_state.ids = arxiv_corpus['ads_id']
|
91 |
+
st.session_state.titles = arxiv_corpus['title']
|
92 |
+
st.session_state.abstracts = arxiv_corpus['abstract']
|
93 |
+
st.session_state.cites = arxiv_corpus['cites']
|
94 |
+
st.session_state.years = arxiv_corpus['date']
|
95 |
+
st.toast('done caching. time:taken: {}'.format(time.time()-ts))
|
96 |
+
|
97 |
+
|
98 |
+
else:
|
99 |
+
arxiv_corpus = st.session_state.arxiv_corpus
|
100 |
+
# Function to simulate question answering (replace with actual implementation)
|
101 |
+
def answer_question(question, keywords, toggles, method, question_type):
|
102 |
+
# Simulated answer (replace with actual logic)
|
103 |
+
return f"Answer to '{question}' using method {method} for {question_type} question."
|
104 |
+
|
105 |
+
# Function to simulate paper retrieval (replace with actual implementation)
|
106 |
+
def get_papers():
|
107 |
+
# Simulated paper data (replace with actual data retrieval)
|
108 |
+
return pd.DataFrame({
|
109 |
+
'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
|
110 |
+
'Relevance': [0.9, 0.7, 0.5]
|
111 |
+
})
|
112 |
+
|
113 |
+
# Function to create embedding plot (replace with actual implementation)
|
114 |
+
def create_embedding_plot():
|
115 |
+
# Simulated embedding data (replace with actual embedding calculation)
|
116 |
+
source = ColumnDataSource(data=dict(
|
117 |
+
x=[1, 2, 3, 4, 5],
|
118 |
+
y=[6, 7, 2, 4, 5],
|
119 |
+
colors=Spectral10[0:5],
|
120 |
+
labels=['A', 'B', 'C', 'D', 'E']
|
121 |
+
))
|
122 |
+
|
123 |
+
p = figure(width=400, height=400, title="Embedding Map")
|
124 |
+
p.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
|
125 |
+
return p
|
126 |
|
127 |
+
# Function to simulate keyword extraction (replace with actual implementation)
|
128 |
+
def extract_keywords(question):
|
129 |
+
# Simulated keyword extraction (replace with actual logic)
|
130 |
+
return ['keyword1', 'keyword2', 'keyword3']
|
131 |
|
132 |
+
# Function to estimate consensus (replace with actual implementation)
|
133 |
+
def estimate_consensus():
|
134 |
+
# Simulated consensus estimation (replace with actual calculation)
|
135 |
+
return 0.75
|
136 |
|
137 |
+
# Streamlit app
|
138 |
+
def main():
|
|
|
|
|
|
|
|
|
139 |
|
140 |
+
# st.title("Question Answering App")
|
141 |
|
142 |
+
# Sidebar (Inputs)
|
143 |
+
st.sidebar.header("Inputs")
|
144 |
+
question = st.sidebar.text_input("Enter your question:")
|
145 |
+
extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
|
146 |
+
|
147 |
+
st.sidebar.subheader("Toggles")
|
148 |
+
toggle_a = st.sidebar.checkbox("Toggle A")
|
149 |
+
toggle_b = st.sidebar.checkbox("Toggle B")
|
150 |
+
toggle_c = st.sidebar.checkbox("Toggle C")
|
151 |
+
|
152 |
+
method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
|
153 |
+
question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
|
154 |
+
store_output = st.sidebar.checkbox("Store the output")
|
155 |
+
|
156 |
+
submit_button = st.sidebar.button("Submit")
|
157 |
+
|
158 |
+
# Main page (Outputs)
|
159 |
+
if submit_button:
|
160 |
+
# Process inputs
|
161 |
+
keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
|
162 |
+
toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}
|
163 |
+
|
164 |
+
# Generate outputs
|
165 |
+
answer = answer_question(question, keywords, toggles, method, question_type)
|
166 |
+
papers_df = get_papers()
|
167 |
+
embedding_plot = create_embedding_plot()
|
168 |
+
triggered_keywords = extract_keywords(question)
|
169 |
+
consensus = estimate_consensus()
|
170 |
|
171 |
+
# Display outputs
|
172 |
+
st.header("Results")
|
173 |
+
|
174 |
+
col1, col2 = st.columns(2)
|
175 |
+
|
176 |
+
with col1:
|
177 |
+
st.subheader("Answer")
|
178 |
+
st.write(answer)
|
179 |
+
|
180 |
+
st.subheader("Papers Used")
|
181 |
+
st.dataframe(papers_df)
|
182 |
+
|
183 |
+
st.subheader("Triggered Keywords")
|
184 |
+
st.write(", ".join(triggered_keywords))
|
185 |
+
|
186 |
+
with col2:
|
187 |
+
st.subheader("Embedding Map")
|
188 |
+
st.bokeh_chart(embedding_plot)
|
189 |
+
|
190 |
+
st.subheader("Question Type")
|
191 |
+
st.write(question_type)
|
192 |
+
|
193 |
+
st.subheader("Consensus Estimate")
|
194 |
+
st.write(f"{consensus:.2%}")
|
195 |
|
196 |
+
if store_output:
|
197 |
+
st.success("Output stored successfully!")
|
198 |
+
else:
|
199 |
+
st.info("Use the sidebar to input parameters and submit to see results.")
|
200 |
|
201 |
+
if __name__ == "__main__":
|
202 |
+
main()
|
|
|
|
|
|
app.py:Zone.Identifier
ADDED
File without changes
|
local_files/arxiv_ads_corpus_coordsonly_v3.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:84bf489fe113fa63fbb21def14b97c080d01b91146fc9e867c53012721770780
|
3 |
-
size 4348940
|
|
|
|
|
|
|
|
local_files/embeddings_matrix.npy
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4be06660e0ce1cf21461c03bc51bca09bc86d5f7d1c60d283eb6b28bbeb10788
|
3 |
-
size 3336732800
|
|
|
|
|
|
|
|
local_files/index_mapping.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:87bc197008b1faf15a9d64ac42fc4ee03b7f6e5d4bd9ffc6ac64c4156c3a8e24
|
3 |
-
size 7656835
|
|
|
|
|
|
|
|
local_files/keyword_index.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b100ce52f6bff88576430aae32d4617722deb2168b3740842e96ea265d3aaf59
|
3 |
-
size 138660705
|
|
|
|
|
|
|
|
local_files/metadata.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f5baae76dffec0f2326a6c0d4138e61ef2e81faca762c6f1fb43076c29a61ca3
|
3 |
-
size 1180396608
|
|
|
|
|
|
|
|
pages/1 retrieval.py
DELETED
@@ -1,124 +0,0 @@
|
|
1 |
-
import time
|
2 |
-
s = time.time()
|
3 |
-
|
4 |
-
import os
|
5 |
-
import datetime
|
6 |
-
import faiss
|
7 |
-
import streamlit as st
|
8 |
-
import feedparser
|
9 |
-
import urllib
|
10 |
-
import cloudpickle as cp
|
11 |
-
import pickle
|
12 |
-
from urllib.request import urlopen
|
13 |
-
from summa import summarizer
|
14 |
-
import numpy as np
|
15 |
-
import matplotlib.pyplot as plt
|
16 |
-
import requests
|
17 |
-
import json
|
18 |
-
from scipy import ndimage
|
19 |
-
|
20 |
-
from langchain_openai import AzureOpenAIEmbeddings
|
21 |
-
# from langchain.llms import OpenAI
|
22 |
-
from langchain_community.llms import OpenAI
|
23 |
-
from langchain_openai import AzureChatOpenAI
|
24 |
-
|
25 |
-
from fns import *
|
26 |
-
|
27 |
-
st.image('local_files/synth_logo.png')
|
28 |
-
st.markdown("")
|
29 |
-
|
30 |
-
query = st.text_input('Ask me anything:',
|
31 |
-
value="What causes galaxy quenching at high redshifts?")
|
32 |
-
|
33 |
-
arxiv_id = None
|
34 |
-
top_k = st.slider('How many papers should I show?', 1, 30, 6)
|
35 |
-
|
36 |
-
retrieval_system = st.session_state.retrieval_system
|
37 |
-
results = retrieval_system.retrieve(query, arxiv_id, top_k)
|
38 |
-
|
39 |
-
aids = st.session_state.dataset['id']
|
40 |
-
titles = st.session_state.dataset['title']
|
41 |
-
auths = st.session_state.dataset['author']
|
42 |
-
bibcodes = st.session_state.dataset['bibcode']
|
43 |
-
all_keywords = st.session_state.dataset['keyword_search']
|
44 |
-
allyrs = st.session_state.dataset['year']
|
45 |
-
ret_indices = np.array([aids.index(results[i]) for i in range(top_k)])
|
46 |
-
yrs = []
|
47 |
-
for i in range(len(ret_indices)):
|
48 |
-
yr = allyrs[ret_indices[i]]
|
49 |
-
if yr < 50:
|
50 |
-
yr = yr + 2000
|
51 |
-
else:
|
52 |
-
yr = yr + 1900
|
53 |
-
yrs.append(yr)
|
54 |
-
print_titles = [titles[ret_indices[i]][0] for i in range(len(ret_indices))]
|
55 |
-
print_auths = [auths[ret_indices[i]][0]+' et al. '+str(yrs[i]) for i in range(len(ret_indices))]
|
56 |
-
print_links = ['['+bibcodes[ret_indices[i]]+'](https://ui.adsabs.harvard.edu/abs/'+bibcodes[ret_indices[i]]+'/abstract)' for i in range(len(ret_indices))]
|
57 |
-
|
58 |
-
st.divider()
|
59 |
-
st.header('top-k papers:')
|
60 |
-
|
61 |
-
for i in range(len(ret_indices)):
|
62 |
-
st.subheader(str(i+1)+'. '+print_titles[i])
|
63 |
-
st.write(print_auths[i]+' '+print_links[i])
|
64 |
-
|
65 |
-
|
66 |
-
st.divider()
|
67 |
-
st.header('top-k papers in context:')
|
68 |
-
|
69 |
-
gtkws = get_keywords(query, ret_indices, all_keywords)
|
70 |
-
|
71 |
-
umap, clbls, all_kws = load_umapcoords('local_files/arxiv_ads_corpus_coordsonly_v3.pkl')
|
72 |
-
|
73 |
-
fig = plt.figure(figsize=(12*1.8*1.2,9*2.*1.2))
|
74 |
-
im = plt.imread('local_files/astro_worldmap.png')
|
75 |
-
implot = plt.imshow(im,)
|
76 |
-
|
77 |
-
xax = (umap[0:,1]-np.amin(umap[0:,1]))+.0
|
78 |
-
xax = xax / np.amax(xax)
|
79 |
-
xax = xax * 1580 + 170
|
80 |
-
yax = (umap[0:,0]-np.amin(umap[0:,0]))+.0
|
81 |
-
yax = yax / np.amax(yax)
|
82 |
-
yax = (np.amax(yax)-yax) * 1700 + 30
|
83 |
-
# plt.scatter(xax, yax,s=2,alpha=0.7,c='k')
|
84 |
-
|
85 |
-
for i in range(np.amax(clbls)):
|
86 |
-
|
87 |
-
clust_ids = np.arange(len(clbls))[clbls == i]
|
88 |
-
clust_centroid = (np.median(xax[clust_ids]),np.median(yax[clust_ids]))
|
89 |
-
# plt.text(clust_centroid[1], clust_centroid[0], all_kws[i],fontsize=9,ha="center", va="center",
|
90 |
-
# bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3',alpha=0.3))
|
91 |
-
plt.text(clust_centroid[0], clust_centroid[1], all_kws[i],fontsize=9,ha="center", va="center",
|
92 |
-
fontfamily='serif',color='w',
|
93 |
-
bbox=dict(facecolor='k', edgecolor='none', boxstyle='round,pad=0.1',alpha=0.3))
|
94 |
-
|
95 |
-
plt.scatter(xax[ret_indices], yax[ret_indices], c='k',s=300,zorder=100)
|
96 |
-
plt.scatter(xax[ret_indices], yax[ret_indices], c='firebrick',s=100,zorder=101)
|
97 |
-
plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='k',s=300,zorder=101)
|
98 |
-
plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='w',s=100,zorder=101)
|
99 |
-
|
100 |
-
tempx = plt.xlim(); tempy = plt.ylim()
|
101 |
-
plt.text(0.012*tempx[1], (0.012+0.03)*tempy[0], 'The world of astronomy literature',fontsize=36, fontfamily='serif')
|
102 |
-
plt.text(0.012*tempx[1], (0.012+0.06)*tempy[0], 'Query: '+query,fontsize=18, fontfamily='serif')
|
103 |
-
plt.text(0.012*tempx[1], (0.012+0.08)*tempy[0], gtkws,fontsize=18, fontfamily='serif', va='top')
|
104 |
-
plt.axis('off')
|
105 |
-
st.pyplot(fig, transparent = True, bbox_inches='tight')
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|