Spaces:
Running
Running
updated to pull files locally
Browse files
local_files/astro_ph_ga_embedding_27-Jun-2023.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:476fb3dc5155a733d6427b9f3cc126134d8be881cb0d598df2876f9a61dd672d
|
3 |
+
size 265762
|
local_files/astro_ph_ga_feeds_ada_embedding_27-Jun-2023.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:655af27d3c033e15ded2051d0b3a668dd429f64ec3bed5ceac3eacd969e618dd
|
3 |
+
size 407961763
|
local_files/astro_ph_ga_feeds_upto_27-Jun-2023.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:29237e0e973a5fcd4df826c09432a069e6a471d5725fdfc9a0f8c7c62b69e188
|
3 |
+
size 89228171
|
pages/1_paper_search.py
CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
|
|
7 |
import feedparser
|
8 |
import urllib
|
9 |
import cloudpickle as cp
|
|
|
10 |
from urllib.request import urlopen
|
11 |
from summa import summarizer
|
12 |
import numpy as np
|
@@ -19,14 +20,21 @@ os.environ["OPENAI_API_KEY"] = openai.api_key
|
|
19 |
|
20 |
@st.cache_data
|
21 |
def get_feeds_data(url):
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
24 |
return data
|
25 |
|
26 |
embeddings = OpenAIEmbeddings()
|
27 |
|
28 |
-
feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
|
29 |
-
embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
|
|
|
|
|
|
|
|
|
30 |
gal_feeds = get_feeds_data(feeds_link)
|
31 |
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
32 |
|
@@ -174,6 +182,7 @@ model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_author
|
|
174 |
|
175 |
st.title('ArXiv similarity search:')
|
176 |
st.markdown('Search for similar papers by arxiv id or phrase:')
|
|
|
177 |
|
178 |
search_type = st.radio(
|
179 |
"What are you searching by?",
|
|
|
7 |
import feedparser
|
8 |
import urllib
|
9 |
import cloudpickle as cp
|
10 |
+
import pickle
|
11 |
from urllib.request import urlopen
|
12 |
from summa import summarizer
|
13 |
import numpy as np
|
|
|
20 |
|
21 |
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    Despite the parameter name, ``url`` is passed straight to ``open`` and is
    therefore a filesystem path. A success message is posted to the sidebar
    once the object has been read.
    """
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data!")
    return payload
|
29 |
|
30 |
embeddings = OpenAIEmbeddings()

# Snapshot date; it is part of the file name of every pickle in local_files/.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

# Both pickles are read through the same cached loader.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
40 |
|
|
|
182 |
|
183 |
# Page header: title, usage hint, then the snapshot date of the loaded data.
st.title('ArXiv similarity search:')
st.markdown('Search for similar papers by arxiv id or phrase:')
st.markdown(f'[Includes papers up to: `{dateval}`]')
|
186 |
|
187 |
search_type = st.radio(
|
188 |
"What are you searching by?",
|
pages/2_arxiv_embedding.py
CHANGED
@@ -7,33 +7,78 @@ from bokeh.palettes import OrRd
|
|
7 |
from bokeh.plotting import figure, show
|
8 |
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
|
9 |
import cloudpickle as cp
|
|
|
10 |
from scipy import stats
|
11 |
from urllib.request import urlopen
|
12 |
|
13 |
st.title("ArXiv+GPT3 embedding explorer")
|
|
|
14 |
st.markdown("This is an explorer for astro-ph.GA papers on the arXiv (up to Apt 18th, 2023). The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector correpsonding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
|
15 |
st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimiliar as you move farther away. The axes do not have any physical meaning.")
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
@st.cache_data
|
18 |
def get_embedding_data(url):
|
19 |
-
data = cp.load(urlopen(url))
|
|
|
|
|
20 |
st.sidebar.success("Fetched data from API!")
|
21 |
return data
|
22 |
|
23 |
-
url = "
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
|
27 |
-
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
28 |
-
positions = np.vstack([X.ravel(), Y.ravel()])
|
29 |
-
values = np.vstack([m1, m2])
|
30 |
-
kernel = stats.gaussian_kde(values)
|
31 |
Z = np.reshape(kernel(positions).T, X.shape)
|
32 |
return X, Y, Z
|
33 |
-
|
34 |
st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
|
35 |
st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')
|
36 |
-
|
37 |
st.sidebar.text_input("Search query", key="phrase", value="")
|
38 |
alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.1)
|
39 |
size_value = st.sidebar.slider("Pick the hexbin size",0.0,2.0,0.2)
|
@@ -44,7 +89,7 @@ phrase_flags = np.zeros((len(all_text),))
|
|
44 |
for i in range(len(all_text)):
|
45 |
if phrase.lower() in all_text[i].lower():
|
46 |
phrase_flags[i] = 1
|
47 |
-
|
48 |
|
49 |
source = ColumnDataSource(data=dict(
|
50 |
x=embedding[0:,0],
|
@@ -61,11 +106,11 @@ ID: $index
|
|
61 |
@link <br> <br>
|
62 |
</div>
|
63 |
"""
|
64 |
-
|
65 |
p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
|
66 |
title="UMAP projection of trained ArXiv corpus | heatmap keyword: "+phrase)
|
67 |
|
68 |
-
p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
|
69 |
palette = np.flip(OrRd[8]), alpha=alpha_value)
|
70 |
p.circle('x', 'y', size=3, source=source, alpha=0.3)
|
71 |
|
|
|
7 |
from bokeh.plotting import figure, show
|
8 |
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
|
9 |
import cloudpickle as cp
|
10 |
+
import pickle
|
11 |
from scipy import stats
|
12 |
from urllib.request import urlopen
|
13 |
|
14 |
# Snapshot date of the bundled data. It is part of every pickle file name and
# is shown in the page header, so it must be assigned before the header is
# rendered (it used to be referenced first, which raised NameError).
dateval = "27-Jun-2023"

# Page header, rendered exactly once.
st.title("ArXiv+GPT3 embedding explorer")
st.markdown(f'[Includes papers up to: `{dateval}`]')
st.markdown(
    f"This is an explorer for astro-ph.GA papers on the arXiv (up to {dateval}). "
    "The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) "
    "after which the collected abstracts are run through `text-embedding-ada-002` with "
    "[langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) "
    "to generate a unique vector corresponding to each paper. "
    "These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, "
    "and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). "
    "The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). "
    "Upgrade to chaotic neural suggested by Jo Ciucă, thank you! "
    "More to come (hopefully) with GPT-4 and its applications!"
)
st.markdown(
    "Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space "
    "that tries to conserve as much similarity information as possible. "
    "Nearby points in UMAP space are similar, and grow dissimilar as you move farther away. "
    "The axes do not have any physical meaning."
)


@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the local file at ``url``.

    ``url`` is kept as the parameter name for compatibility, but it is passed
    to ``open`` and is therefore a filesystem path. The pickles read here are
    the ones shipped in ``local_files/``; do not point this at untrusted data,
    since unpickling can execute arbitrary code.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data comes from disk, not from a remote API, so say so.
    st.sidebar.success("Loaded data!")
    return data


@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the embedding object stored at the path ``url``.

    Same contract as ``get_feeds_data``; kept as a separate function so that
    existing callers of either name keep working.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data!")
    return data


feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)

url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"
embedding = get_embedding_data(url)

from tqdm import tqdm

ctr = -1
num_chunks = len(gal_feeds)
all_text = []
all_titles = []
all_arxivid = []
all_links = []

# Flatten every feed chunk into parallel per-paper lists.
for nc in tqdm(range(num_chunks)):
    for entry in gal_feeds[nc].entries:
        # Collapse newlines and drop backslashes from the summary text.
        text = entry.summary.replace('\n', ' ').replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # NOTE(review): [0:-2] presumably strips a two-character version
        # suffix such as "v1" from the id; a suffix like "v10" would leave a
        # stray "v" behind -- confirm against the feed data.
        all_arxivid.append(entry.id.split('/')[-1][0:-2])
        all_links.append(entry.links[1].href)
|
69 |
+
|
70 |
|
71 |
def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15, n_grid=100):
    """Evaluate a 2-D Gaussian kernel density estimate on a regular grid.

    Args:
        m1: 1-D sequence with the first coordinate of each sample.
        m2: 1-D sequence with the second coordinate, same length as ``m1``.
        xmin, xmax: Inclusive extent of the grid along the first coordinate.
        ymin, ymax: Inclusive extent of the grid along the second coordinate.
        n_grid: Number of grid points per axis. Defaults to 100, the value
            that used to be hard-coded.

    Returns:
        A tuple ``(X, Y, Z)`` of arrays shaped ``(n_grid, n_grid)``: the two
        coordinate grids produced by ``np.mgrid`` and the estimated density
        at every grid node.

    Errors raised by ``scipy.stats.gaussian_kde`` propagate unchanged.
    """
    # A complex "step" makes np.mgrid treat the value as a point count and
    # include both end points.
    steps = complex(0, n_grid)
    X, Y = np.mgrid[xmin:xmax:steps, ymin:ymax:steps]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([m1, m2])
    kernel = stats.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)
    return X, Y, Z
|
78 |
+
|
79 |
# Sidebar: usage notes first, then the heatmap controls. The text input
# stores its value in session state under the key "phrase".
with st.sidebar:
    st.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
    st.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')

    st.text_input("Search query", key="phrase", value="")
    alpha_value = st.slider("Pick the hexbin opacity", 0.0, 1.0, 0.1)
    size_value = st.slider("Pick the hexbin size", 0.0, 2.0, 0.2)
|
|
|
89 |
# Flag every entry of all_text that contains the search phrase, ignoring
# case. The lowered phrase is computed once instead of on every iteration.
needle = phrase.lower()
for i, abstract in enumerate(all_text):
    if needle in abstract.lower():
        phrase_flags[i] = 1
|
92 |
+
|
93 |
|
94 |
source = ColumnDataSource(data=dict(
|
95 |
x=embedding[0:,0],
|
|
|
106 |
@link <br> <br>
|
107 |
</div>
|
108 |
"""
|
109 |
+
|
110 |
p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5, 15),
           title="UMAP projection of trained ArXiv corpus | heatmap keyword: " + phrase)

# Draw the heatmap only when at least one abstract matched. With no matches
# the hexbin call receives empty arrays, which is the error case the sidebar
# note warns about; show a hint instead and keep the scatter plot usable.
matched = phrase_flags == 1
if matched.any():
    p.hexbin(embedding[matched, 0], embedding[matched, 1], size=size_value,
             palette=np.flip(OrRd[8]), alpha=alpha_value)
else:
    st.sidebar.warning("No abstracts contain this phrase; showing the papers without a heatmap.")
p.circle('x', 'y', size=3, source=source, alpha=0.3)
|
116 |
|
pages/3_qa_sources.py
CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
|
|
7 |
import feedparser
|
8 |
import urllib
|
9 |
import cloudpickle as cp
|
|
|
10 |
from urllib.request import urlopen
|
11 |
from summa import summarizer
|
12 |
import numpy as np
|
@@ -26,25 +27,34 @@ os.environ["OPENAI_API_KEY"] = openai.api_key
|
|
26 |
|
27 |
@st.cache_data
|
28 |
def get_feeds_data(url):
|
29 |
-
data = cp.load(urlopen(url))
|
30 |
-
|
|
|
|
|
31 |
return data
|
32 |
|
33 |
embeddings = OpenAIEmbeddings()
|
34 |
|
35 |
-
feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
|
36 |
-
embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
|
|
|
|
|
|
|
37 |
gal_feeds = get_feeds_data(feeds_link)
|
38 |
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
39 |
|
40 |
@st.cache_data
|
41 |
def get_embedding_data(url):
|
42 |
-
data = cp.load(urlopen(url))
|
|
|
|
|
43 |
st.sidebar.success("Fetched data from API!")
|
44 |
return data
|
45 |
|
46 |
-
url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
|
47 |
-
|
|
|
|
|
48 |
|
49 |
ctr = -1
|
50 |
num_chunks = len(gal_feeds)
|
@@ -286,6 +296,7 @@ def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources =
|
|
286 |
return output
|
287 |
|
288 |
st.title('ArXiv-based question answering')
|
|
|
289 |
st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
|
290 |
|
291 |
query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
|
|
|
7 |
import feedparser
|
8 |
import urllib
|
9 |
import cloudpickle as cp
|
10 |
+
import pickle
|
11 |
from urllib.request import urlopen
|
12 |
from summa import summarizer
|
13 |
import numpy as np
|
|
|
27 |
|
28 |
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    Despite the parameter name, ``url`` is passed straight to ``open`` and is
    therefore a filesystem path. A success message is posted to the sidebar
    once the object has been read.
    """
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
|
35 |
|
36 |
embeddings = OpenAIEmbeddings()

# Snapshot date; it is part of the file name of every pickle in local_files/.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

# Both pickles are read through the same cached loader.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
45 |
|
46 |
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the embedding object stored at the path ``url``.

    ``url`` is kept as the parameter name for compatibility, but it is passed
    to ``open`` and is therefore a filesystem path. Only point this at
    trusted pickles, since unpickling can execute arbitrary code.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data is read from disk, so do not claim it was fetched from an API.
    st.sidebar.success("Loaded data")
    return data
|
53 |
|
54 |
+
# Embedding pickle for the same snapshot date, read from local_files/.
url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"
e2d = get_embedding_data(url)
|
58 |
|
59 |
ctr = -1
|
60 |
num_chunks = len(gal_feeds)
|
|
|
296 |
return output
|
297 |
|
298 |
# Page header: title, snapshot date of the loaded data, then the usage note.
st.title('ArXiv-based question answering')
st.markdown(f'[Includes papers up to: `{dateval}`]')
st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')

# Free-text question box, pre-filled with an example question.
query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
|