kiyer committed on
Commit
237026a
·
1 Parent(s): ec7d775

updated to pull files locally

Browse files
local_files/astro_ph_ga_embedding_27-Jun-2023.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:476fb3dc5155a733d6427b9f3cc126134d8be881cb0d598df2876f9a61dd672d
3
+ size 265762
local_files/astro_ph_ga_feeds_ada_embedding_27-Jun-2023.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:655af27d3c033e15ded2051d0b3a668dd429f64ec3bed5ceac3eacd969e618dd
3
+ size 407961763
local_files/astro_ph_ga_feeds_upto_27-Jun-2023.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29237e0e973a5fcd4df826c09432a069e6a471d5725fdfc9a0f8c7c62b69e188
3
+ size 89228171
pages/1_paper_search.py CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
7
  import feedparser
8
  import urllib
9
  import cloudpickle as cp
 
10
  from urllib.request import urlopen
11
  from summa import summarizer
12
  import numpy as np
@@ -19,14 +20,21 @@ os.environ["OPENAI_API_KEY"] = openai.api_key
19
 
20
  @st.cache_data
21
  def get_feeds_data(url):
22
- data = cp.load(urlopen(url))
23
- st.sidebar.success("Fetched data from API!")
 
 
 
24
  return data
25
 
26
  embeddings = OpenAIEmbeddings()
27
 
28
- feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
29
- embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
 
 
 
 
30
  gal_feeds = get_feeds_data(feeds_link)
31
  arxiv_ada_embeddings = get_feeds_data(embed_link)
32
 
@@ -174,6 +182,7 @@ model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_author
174
 
175
  st.title('ArXiv similarity search:')
176
  st.markdown('Search for similar papers by arxiv id or phrase:')
 
177
 
178
  search_type = st.radio(
179
  "What are you searching by?",
 
7
  import feedparser
8
  import urllib
9
  import cloudpickle as cp
10
+ import pickle
11
  from urllib.request import urlopen
12
  from summa import summarizer
13
  import numpy as np
 
20
 
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    The parameter keeps its historical name, but it is passed to ``open``
    and is therefore a filesystem path (the callers in this script pass
    paths under ``local_files/``).

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at
    files that ship with the repository.
    """
    with open(url, "rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data!")
    return payload
29
 
# OpenAI embedding wrapper (also handed to the search helpers via model_data).
embeddings = OpenAIEmbeddings()

# The feed and embedding pickles were previously downloaded from remote
# Google Drive / Dropbox links; they are now read from local_files/.
# `dateval` is the snapshot date baked into the file names.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
40
 
 
182
 
183
  st.title('ArXiv similarity search:')
184
  st.markdown('Search for similar papers by arxiv id or phrase:')
185
+ st.markdown('[Includes papers up to: `'+dateval+'`]')
186
 
187
  search_type = st.radio(
188
  "What are you searching by?",
pages/2_arxiv_embedding.py CHANGED
@@ -7,33 +7,78 @@ from bokeh.palettes import OrRd
7
  from bokeh.plotting import figure, show
8
  from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
9
  import cloudpickle as cp
 
10
  from scipy import stats
11
  from urllib.request import urlopen
12
 
13
  st.title("ArXiv+GPT3 embedding explorer")
 
14
  st.markdown("This is an explorer for astro-ph.GA papers on the arXiv (up to Apt 18th, 2023). The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector correpsonding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
15
  st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimiliar as you move farther away. The axes do not have any physical meaning.")
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  @st.cache_data
18
  def get_embedding_data(url):
19
- data = cp.load(urlopen(url))
 
 
20
  st.sidebar.success("Fetched data from API!")
21
  return data
22
 
23
- url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
24
- embedding, all_text, all_titles, all_arxivid, all_links = get_embedding_data(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
27
- X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
28
- positions = np.vstack([X.ravel(), Y.ravel()])
29
- values = np.vstack([m1, m2])
30
- kernel = stats.gaussian_kde(values)
31
  Z = np.reshape(kernel(positions).T, X.shape)
32
  return X, Y, Z
33
-
34
  st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
35
  st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')
36
-
37
  st.sidebar.text_input("Search query", key="phrase", value="")
38
  alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.1)
39
  size_value = st.sidebar.slider("Pick the hexbin size",0.0,2.0,0.2)
@@ -44,7 +89,7 @@ phrase_flags = np.zeros((len(all_text),))
44
  for i in range(len(all_text)):
45
  if phrase.lower() in all_text[i].lower():
46
  phrase_flags[i] = 1
47
-
48
 
49
  source = ColumnDataSource(data=dict(
50
  x=embedding[0:,0],
@@ -61,11 +106,11 @@ ID: $index
61
  @link <br> <br>
62
  </div>
63
  """
64
-
65
  p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
66
  title="UMAP projection of trained ArXiv corpus | heatmap keyword: "+phrase)
67
 
68
- p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
69
  palette = np.flip(OrRd[8]), alpha=alpha_value)
70
  p.circle('x', 'y', size=3, source=source, alpha=0.3)
71
 
 
7
  from bokeh.plotting import figure, show
8
  from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
9
  import cloudpickle as cp
10
+ import pickle
11
  from scipy import stats
12
  from urllib.request import urlopen
13
 
# Snapshot date of the bundled data files. It must be assigned before the
# page header, which interpolates it (using it earlier raises NameError).
dateval = "27-Jun-2023"

# Page header: emitted exactly once, before the (slow) data load.
st.title("ArXiv+GPT3 embedding explorer")
st.markdown('[Includes papers up to: `'+dateval+'`]')
st.markdown("This is an explorer for astro-ph.GA papers on the arXiv (up to " + dateval + "). The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector correpsonding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimiliar as you move farther away. The axes do not have any physical meaning.")


@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the local file ``url``.

    ``url`` keeps its historical name but is passed to ``open``, i.e. it
    is a filesystem path.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on files
    that ship with the repository.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data is read from disk now, so do not claim an API fetch.
    st.sidebar.success("Loaded data!")
    return data


feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)


@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the embedding stored in the local file ``url``.

    Kept as a separate cached function from ``get_feeds_data`` so existing
    call sites keep working.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on files
    that ship with the repository.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data!")
    return data


# The local pickle is unpacked as a single object; the plotting code indexes
# it as embedding[:, 0] / embedding[:, 1].
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
embedding = get_embedding_data(url)
51
+
from tqdm import tqdm

ctr = -1
num_chunks = len(gal_feeds)
all_text = []
all_titles = []
all_arxivid = []
all_links = []

# Flatten every entry of every feed chunk into parallel lists
# (abstract text, title, arXiv id, link).
for nc in tqdm(range(num_chunks)):
    for entry in gal_feeds[nc].entries:
        # Collapse newlines and drop backslashes from the abstract.
        text = entry.summary.replace('\n', ' ').replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)

        # The last path component of the entry id looks like "<id>v<N>".
        # Strip the whole version suffix rather than a fixed two characters,
        # so versions >= 10 (and ids without a suffix) are handled correctly.
        raw_id = entry.id.split('/')[-1]
        base, sep, version = raw_id.rpartition('v')
        all_arxivid.append(base if sep and version.isdigit() else raw_id)

        all_links.append(entry.links[1].href)
69
+
70
 
def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15, ngrid=100):
    """Evaluate a 2-D Gaussian kernel density estimate on a regular grid.

    Parameters
    ----------
    m1, m2 : array-like
        x and y coordinates of the sample points (same length).
    xmin, ymin, xmax, ymax : float
        Bounds of the evaluation grid; both endpoints are included.
    ngrid : int
        Number of grid points along each axis (default 100, the value that
        was previously hard-coded).

    Returns
    -------
    X, Y, Z : ndarray
        Grid coordinates and the density evaluated on them, each of shape
        ``(ngrid, ngrid)``.
    """
    # An imaginary "step" makes np.mgrid treat it as a point count and
    # include the stop value.
    step = complex(0, ngrid)
    X, Y = np.mgrid[xmin:xmax:step, ymin:ymax:step]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([m1, m2])
    kernel = stats.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)
    return X, Y, Z
78
+
# Sidebar help text for the phrase heatmap.
st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')

# Sidebar controls: the query is stored in session state under "phrase";
# the two sliders feed the hexbin opacity and size.
st.sidebar.text_input("Search query", value="", key="phrase")
alpha_value = st.sidebar.slider(
    "Pick the hexbin opacity", min_value=0.0, max_value=1.0, value=0.1
)
# NOTE(review): the minimum of 0.0 allows a zero hexbin size - confirm the
# plot copes with that.
size_value = st.sidebar.slider(
    "Pick the hexbin size", min_value=0.0, max_value=2.0, value=0.2
)
 
# Flag every abstract that contains the search phrase (case-insensitive).
needle = phrase.lower()
for i, abstract in enumerate(all_text):
    if needle in abstract.lower():
        phrase_flags[i] = 1
92
+
93
 
94
  source = ColumnDataSource(data=dict(
95
  x=embedding[0:,0],
 
106
  @link <br> <br>
107
  </div>
108
  """
109
+
110
  p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
111
  title="UMAP projection of trained ArXiv corpus | heatmap keyword: "+phrase)
112
 
113
+ p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
114
  palette = np.flip(OrRd[8]), alpha=alpha_value)
115
  p.circle('x', 'y', size=3, source=source, alpha=0.3)
116
 
pages/3_qa_sources.py CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
7
  import feedparser
8
  import urllib
9
  import cloudpickle as cp
 
10
  from urllib.request import urlopen
11
  from summa import summarizer
12
  import numpy as np
@@ -26,25 +27,34 @@ os.environ["OPENAI_API_KEY"] = openai.api_key
26
 
27
  @st.cache_data
28
  def get_feeds_data(url):
29
- data = cp.load(urlopen(url))
30
- st.sidebar.success("Fetched data from API!")
 
 
31
  return data
32
 
33
  embeddings = OpenAIEmbeddings()
34
 
35
- feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
36
- embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
 
 
 
37
  gal_feeds = get_feeds_data(feeds_link)
38
  arxiv_ada_embeddings = get_feeds_data(embed_link)
39
 
40
  @st.cache_data
41
  def get_embedding_data(url):
42
- data = cp.load(urlopen(url))
 
 
43
  st.sidebar.success("Fetched data from API!")
44
  return data
45
 
46
- url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
47
- e2d, _, _, _, _ = get_embedding_data(url)
 
 
48
 
49
  ctr = -1
50
  num_chunks = len(gal_feeds)
@@ -286,6 +296,7 @@ def run_query(query, return_n = 3, show_pure_answer = False, show_all_sources =
286
  return output
287
 
288
  st.title('ArXiv-based question answering')
 
289
  st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
290
 
291
  query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")
 
7
  import feedparser
8
  import urllib
9
  import cloudpickle as cp
10
+ import pickle
11
  from urllib.request import urlopen
12
  from summa import summarizer
13
  import numpy as np
 
27
 
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    ``url`` keeps its historical name but is passed to ``open``, i.e. it
    is a filesystem path.

    NOTE: ``pickle.load`` can execute arbitrary code; only point this at
    files that ship with the repository.
    """
    with open(url, "rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
35
 
# OpenAI embedding wrapper.
embeddings = OpenAIEmbeddings()

# The feed and embedding pickles were previously downloaded from remote
# Google Drive / Dropbox links; they are now read from local_files/.
# `dateval` is the snapshot date baked into the file names.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
45
 
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the embedding stored in the local file ``url``.

    ``url`` keeps its historical name but is passed to ``open``, i.e. it
    is a filesystem path.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on files
    that ship with the repository.
    """
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data is read from disk now, so do not claim an API fetch; the
    # wording matches get_feeds_data in this file.
    st.sidebar.success("Loaded data")
    return data
53
 
# 2-D embedding, read from the bundled local pickle. The file is unpacked as
# a single object (the previous remote file was unpacked as a 5-tuple).
url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"
e2d = get_embedding_data(url)
58
 
59
  ctr = -1
60
  num_chunks = len(gal_feeds)
 
296
  return output
297
 
298
  st.title('ArXiv-based question answering')
299
+ st.markdown('[Includes papers up to: `'+dateval+'`]')
300
  st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. Please use sparingly because it costs me money right now. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
301
 
302
  query = st.text_input('Your question here:', value="What sersic index does a disk galaxy have?")