Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
added more up to date corpus (2024)
Browse files- local_files/astro-ph_ga_feeds_upto_16-Jun-2024.pkl +3 -0
- local_files/astro_ph_ga_embedding_16-Jun-2024.pkl +3 -0
- local_files/astro_ph_ga_feeds_ada_embedding_16-Jun-2024.pkl +3 -0
- pages/7_answering_questions_2024.py +352 -0
- pages/8_arxiv_embedding_explorer_2024.py +121 -0
- pages/9_research_hotspots_2024.py +130 -0
local_files/astro-ph_ga_feeds_upto_16-Jun-2024.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89114c7ff34595e424f1585d32aec5665a07f26399e75bb8b40b4de7737ac2d0
|
3 |
+
size 134799303
|
local_files/astro_ph_ga_embedding_16-Jun-2024.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8149e23eb9102bdaa41019eb0ed33ec0fb5fcd8f1868cd0a5a12cac52538a99
|
3 |
+
size 400163
|
local_files/astro_ph_ga_feeds_ada_embedding_16-Jun-2024.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7142d6cbd1eed73405990fa80b791d231da401208200ab1987a9b61d861f6c17
|
3 |
+
size 614400163
|
pages/7_answering_questions_2024.py
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import datetime
|
3 |
+
import faiss
|
4 |
+
import streamlit as st
|
5 |
+
import feedparser
|
6 |
+
import urllib
|
7 |
+
import cloudpickle as cp
|
8 |
+
import pickle
|
9 |
+
from urllib.request import urlopen
|
10 |
+
from summa import summarizer
|
11 |
+
import numpy as np
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import requests
|
14 |
+
import json
|
15 |
+
|
16 |
+
from langchain.document_loaders import TextLoader
|
17 |
+
from langchain.indexes import VectorstoreIndexCreator
|
18 |
+
from langchain_openai import AzureOpenAIEmbeddings
|
19 |
+
from langchain.llms import OpenAI
|
20 |
+
from langchain_openai import AzureChatOpenAI
|
21 |
+
from langchain import hub
|
22 |
+
from langchain_core.prompts import PromptTemplate
|
23 |
+
from langchain_core.runnables import RunnablePassthrough
|
24 |
+
from langchain_core.output_parsers import StrOutputParser
|
25 |
+
from langchain_core.runnables import RunnableParallel
|
26 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
27 |
+
from langchain_community.vectorstores import Chroma
|
28 |
+
|
29 |
+
# Azure OpenAI settings exported as environment variables; the endpoint and
# key come from Streamlit secrets.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# Embedding client: used below to embed free-text queries and to build the
# per-question Chroma vector store in run_rag.
embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)

# Chat model used as the generation step of the RAG chain in run_rag. It is
# configured with its own endpoint/key ("endpoint2"/"key2") and API version.
llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.
)
|
48 |
+
|
49 |
+
|
50 |
+
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
|
57 |
+
|
58 |
+
# Earlier remote locations of the corpus, kept for reference:
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
dateval = "16-Jun-2024"
# The feeds pickle added with this corpus is named "astro-ph_ga_feeds_upto_<date>.pkl"
# (hyphen in "astro-ph"), unlike the two embedding pickles which use "astro_ph".
feeds_link = "local_files/astro-ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
# Parsed arXiv feed chunks and the per-paper ada embeddings.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
65 |
+
|
66 |
+
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Fetched data from API!")
    return payload
|
73 |
+
|
74 |
+
# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
# 2-d embedding of the corpus (columns 0 and 1 are plotted in run_rag).
e2d = get_embedding_data(url)
# e2d, _, _, _, _ = get_embedding_data(url)

ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors = [], [], [], [], []

# Flatten the chunked feeds into parallel per-paper lists.
for feed in gal_feeds:
    for entry in feed.entries:
        text = entry.summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # Drop the trailing version suffix ("v1", "v12", ...) from the id.
        # Slicing off a fixed two characters left a stray "v" for versions >= 10.
        all_arxivid.append(entry.id.split('/')[-1].rsplit('v', 1)[0])
        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)

# Exact L2 nearest-neighbour index over the ada embeddings.
d = arxiv_ada_embeddings.shape[1] # dimension
nb = arxiv_ada_embeddings.shape[0] # database size
xb = arxiv_ada_embeddings.astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)
|
100 |
+
|
101 |
+
def run_simple_query(search_query = 'all:sed+fitting', max_results = 10, start = 0, sort_by = 'lastUpdatedDate', sort_order = 'descending'):
    """
    Query ArXiv to return search results for a particular query
    Parameters
    ----------
    search_query: str
        query term. use prefixes ti, au, abs, co, jr, cat, m, id, all as applicable.
        The value is inserted into the URL verbatim, so it must already be URL-safe.
    max_results: int, default = 10
        number of results to return. numbers > 1000 generally lead to timeouts
    start: int, default = 0
        start index for results reported. use this if you're interested in running chunks.
    sort_by: str, default = 'lastUpdatedDate'
        value sent as the ``sortBy`` query parameter.
    sort_order: str, default = 'descending'
        value sent as the ``sortOrder`` query parameter.
    Returns
    -------
    feed: dict
        object containing requested results parsed with feedparser
    Notes
    -----
    add functionality for chunk parsing, as well as storage and retrieval
    """

    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i&sortBy=%s&sortOrder=%s' % (
        search_query, start, max_results, sort_by, sort_order)

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(base_url+query) as response:
        payload = response.read()
    return feedparser.parse(payload)
|
129 |
+
|
130 |
+
def find_papers_by_author(auth_name):
    """Return corpus indices of papers with an author name containing ``auth_name``.

    Matching is a case-insensitive substring test against every author entry.
    Each match is printed, and a paper's index is appended once per matching
    author entry.
    """
    matches = []
    for doc_id, author_list in enumerate(all_authors):
        for author in author_list:
            listed_name = author['name']
            if auth_name.lower() not in listed_name.lower():
                continue
            print('Doc ID: ',doc_id, ' | arXiv: ', all_arxivid[doc_id], '| ', all_titles[doc_id],' | Author entry: ', listed_name)
            matches.append(doc_id)
    return matches
|
140 |
+
|
141 |
+
def faiss_based_indices(input_vector, nindex=10):
    """Search the module-level faiss ``index`` for the ``nindex`` nearest rows.

    ``input_vector`` is reshaped into a single float32 query row. Returns
    ``(indices, distances)`` for that one query.
    """
    query_row = input_vector.reshape(1, -1).astype('float32')
    distances, neighbours = index.search(query_row, nindex)
    return neighbours[0], distances[0]
|
145 |
+
|
146 |
+
def list_similar_papers_v2(model_data,
                        doc_id = [], input_type = 'doc_id',
                        show_authors = False, show_summary = False,
                        return_n = 10):
    """Find the corpus papers nearest to a document, an arXiv id, or a text query.

    ``model_data`` is unpacked positionally as
    (paper embeddings, embedding client, titles, abstracts, authors).
    ``input_type`` selects how ``doc_id`` is interpreted:

    * ``'doc_id'``   - row index into the paper embeddings; the first neighbour
      is skipped.
    * ``'arxiv_id'`` - looked up through ``run_simple_query`` and its abstract
      is embedded.
    * ``'keywords'`` - ``doc_id`` is embedded directly as text.

    Returns ``(markdown_text, abstracts, file_headers, neighbour_indices)``.
    Returns ``None`` after printing a message when the arXiv id is not found
    or ``input_type`` is not recognised.
    """
    vectors, embedder, titles, _abstracts, authors = model_data

    if input_type == 'doc_id':
        print('Doc ID: ',doc_id,', title: ',titles[doc_id])
        # inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        inferred_vector = vectors[doc_id,0:]
        first_rank = 1
    elif input_type == 'arxiv_id':
        print('ArXiv id: ',doc_id)
        arxiv_query_feed = run_simple_query(search_query='id:'+str(doc_id))
        if len(arxiv_query_feed.entries) == 0:
            print('error: arxiv id not found.')
            return
        print('Title: '+arxiv_query_feed.entries[0].title)
        inferred_vector = np.array(embedder.embed_query(arxiv_query_feed.entries[0].summary))
        first_rank = 0
    elif input_type == 'keywords':
        inferred_vector = np.array(embedder.embed_query(doc_id))
        first_rank = 0
    else:
        print('unrecognized input type.')
        return

    # Ask for two extra neighbours so the doc_id offset still leaves return_n hits.
    sims, dists = faiss_based_indices(inferred_vector, return_n+2)
    wants_authors = show_authors == True
    wants_summary = show_summary == True

    pieces = []
    abstracts_relevant = []
    fhdrs = []
    for rank in range(first_rank, first_rank+return_n):
        hit = sims[rank]
        arxiv_id = all_arxivid[hit]

        abstracts_relevant.append(all_text[hit])
        # Header: <corpus index>_<first author's last name token><first two id chars>_<arxiv id>
        fhdrs.append(str(hit)+'_'+authors[hit][0]['name'].split()[-1] + arxiv_id[0:2] +'_'+ arxiv_id)

        pieces.append(str(rank+1)+'. **'+ titles[hit] +'** (Distance: %.2f' %dists[rank]+') \n')
        pieces.append('**ArXiv:** ['+arxiv_id+'](https://arxiv.org/abs/'+arxiv_id+') \n')
        if wants_authors:
            pieces.append('**Authors:** ')
            names = [person.name for person in authors[hit]]
            if names:
                pieces.append(', '.join(names) + ' \n')
        if wants_summary:
            pieces.append('**Summary:** ')
            flattened = all_text[hit].replace('\n', ' ')
            pieces.append(summarizer.summarize(flattened) + ' \n')
        if wants_authors or wants_summary:
            pieces.append(' ')
        pieces.append(' \n')

    return ''.join(pieces), abstracts_relevant, fhdrs, sims
|
204 |
+
|
205 |
+
|
206 |
+
def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
    """POST a chat-completion request and return the first choice's message text.

    NOTE(review): ``openai`` and ``API_ENDPOINT`` are not imported or defined
    anywhere in the visible file, so calling this function would raise
    ``NameError``. No visible code calls it - confirm whether it is dead code.

    Parameters
    ----------
    messages:
        value sent as the ``messages`` field of the JSON body.
    model: str, default = "gpt-4"
        value sent as the ``model`` field.
    temperature: default = 1
        value sent as the ``temperature`` field.
    max_tokens: default = None
        sent as ``max_tokens`` only when not None.

    Raises
    ------
    Exception
        when the response status code is not 200; the message carries the
        status code and response text.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}",
    }

    data = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }

    # Only include max_tokens when the caller supplied one.
    if max_tokens is not None:
        data["max_tokens"] = max_tokens
    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data))
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        raise Exception(f"Error {response.status_code}: {response.text}")
|
225 |
+
|
226 |
+
# Bundle that list_similar_papers_v2 unpacks positionally:
# (paper embeddings, embedding client, titles, abstracts, authors).
model_data = [arxiv_ada_embeddings, embeddings, all_titles, all_text, all_authors]
|
227 |
+
|
228 |
+
def format_docs(docs):
    """Concatenate each document's ``page_content``, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)
|
230 |
+
|
231 |
+
def get_textstr(i, show_authors=False, show_summary=False):
    """Build the markdown snippet for corpus paper ``i``.

    The snippet always contains the bold title and an arXiv link. The author
    list and a ``summa`` summary of the abstract are added when the
    corresponding flag equals True.
    """
    arxiv_id = all_arxivid[i]
    wants_authors = show_authors == True
    wants_summary = show_summary == True

    pieces = [
        '**'+ all_titles[i] +'** \n',
        '**ArXiv:** ['+arxiv_id+'](https://arxiv.org/abs/'+arxiv_id+') \n',
    ]
    if wants_authors:
        pieces.append('**Authors:** ')
        names = [person.name for person in all_authors[i]]
        if names:
            pieces.append(', '.join(names) + ' \n')
    if wants_summary:
        pieces.append('**Summary:** ')
        flattened = all_text[i].replace('\n', ' ')
        pieces.append(summarizer.summarize(flattened) + ' \n')
    if wants_authors or wants_summary:
        pieces.append(' ')
    pieces.append(' \n')

    return ''.join(pieces)
|
253 |
+
|
254 |
+
|
255 |
+
def run_rag(query, return_n = 10, show_authors = True, show_summary = True):
    """Answer ``query`` from the most similar abstracts and render the result.

    Steps:
    1. retrieve the ``return_n`` nearest papers with ``list_similar_papers_v2``
       (keyword mode);
    2. write each abstract to ``absts/<header>.txt`` and load it with ``TextLoader``;
    3. split, embed into a Chroma store, and run the prompt | llm chain;
    4. render the answer, the distinct source papers, a scatter plot of the
       2-d embedding, and the list of retrieved papers through Streamlit.

    Parameters
    ----------
    query: str
        the user's question.
    return_n: int, default = 10
        number of similar papers to retrieve.
    show_authors, show_summary: bool, default = True
        forwarded to ``list_similar_papers_v2``.

    Returns
    -------
    dict
        the chain output; its 'context', and 'answer' keys are read here.
    """

    sims, absts, fhdrs, simids = list_similar_papers_v2(model_data,
                                        doc_id = query,
                                        input_type='keywords',
                                        show_authors = show_authors, show_summary = show_summary,
                                        return_n = return_n)

    # Create the scratch directory up front instead of relying on a bare
    # except around open(), which also hid unrelated errors.
    os.makedirs('absts', exist_ok=True)
    loaders = []
    for abstract, fhdr in zip(absts, fhdrs):
        abst_path = "absts/"+fhdr+".txt"
        with open(abst_path, "w") as text_file:
            text_file.write(abstract)
        loaders.append(TextLoader(abst_path))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    splits = text_splitter.split_documents([loader.load()[0] for loader in loaders])
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    template = (
        "You are an assistant with expertise in astrophysics for question-answering tasks.\n"
        "Use the following pieces of retrieved context from the literature to answer the question.\n"
        "If you don't know the answer, just say that you don't know.\n"
        "Use six sentences maximum and keep the answer concise.\n"
        "\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "\n"
        "Answer:"
    )
    custom_rag_prompt = PromptTemplate.from_template(template)

    rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

    rag_chain_with_source = RunnableParallel(
        {"context": retriever, "question": RunnablePassthrough()}
    ).assign(answer=rag_chain_from_docs)

    rag_answer = rag_chain_with_source.invoke(query)

    st.markdown('### User query: '+query)

    st.markdown(rag_answer['answer'])
    # NOTE(review): this heading is built but never rendered - confirm whether
    # it was meant to be passed to st.markdown.
    opstr = '#### Primary sources: \n'

    # Collect the source file of EVERY retrieved chunk. The previous code read
    # context[0] on each iteration, so only the first source was ever listed.
    srcnames = np.unique([doc.metadata['source'] for doc in rag_answer['context']])

    srcindices = []
    for srcname in srcnames:
        # Source paths look like "absts/<index>_<LastName><yy>_<arxiv id>.txt".
        fields = srcname.split('_')
        src_index = int(fields[0].split('/')[1])
        srcindices.append(src_index)
        temp = fields[1]
        # Two-digit prefix below 40 is rendered as 20yy, otherwise as 19yy.
        if int(temp[-2:]) < 40:
            temp = temp[0:-2] + ' et al. 20' + temp[-2:]
        else:
            temp = temp[0:-2] + ' et al. 19' + temp[-2:]
        temp = '['+temp+']('+all_links[src_index]+')'
        st.markdown(temp)
    # Integer dtype keeps the array usable as an index even when it is empty.
    abs_indices = np.array(srcindices, dtype=int)

    fig = plt.figure(figsize=(9,9))
    plt.scatter(e2d[0:,0], e2d[0:,1],s=2)
    plt.scatter(e2d[simids,0], e2d[simids,1],s=30)
    plt.scatter(e2d[abs_indices,0], e2d[abs_indices,1],s=100,color='k',marker='d')
    plt.title('localization for question: '+query)
    st.pyplot(fig)

    st.markdown('\n #### List of relevant papers:')
    st.markdown(sims)

    return rag_answer
|
340 |
+
|
341 |
+
|
342 |
+
# Page header and usage notes.
st.title('ArXiv-based question answering')
st.markdown('[Includes papers up to: `'+dateval+'`]')
st.markdown('Concise answers for questions using arxiv abstracts + GPT-4. You might need to wait for a few seconds for the GPT-4 query to return an answer (check top right corner to see if it is still running).')
st.markdown('The answers are followed by relevant source(s) used in the answer, a graph showing which part of the astro-ph.GA manifold it drew the answer from (tightly clustered points generally indicate high quality/consensus answers) followed by a bunch of relevant papers used by the RAG to compose the answer.')
st.markdown('If this does not satisfactorily answer your question or rambles too much, you can also try the older `qa_sources_v1` page.')

# User inputs: the question and how many similar papers to retrieve (1-30, default 10).
query = st.text_input('Your question here:',
                      value="What causes galaxy quenching at high redshifts?")
return_n = st.slider('How many papers should I show?', 1, 30, 10)

# run_rag renders its own output; the value bound here is the chain result it returns.
sims = run_rag(query, return_n = return_n)
|
pages/8_arxiv_embedding_explorer_2024.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import pickle
|
6 |
+
from bokeh.palettes import OrRd
|
7 |
+
from bokeh.plotting import figure, show
|
8 |
+
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
|
9 |
+
import cloudpickle as cp
|
10 |
+
import pickle
|
11 |
+
from scipy import stats
|
12 |
+
from urllib.request import urlopen
|
13 |
+
|
14 |
+
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Fetched data from API!")
    return payload
|
21 |
+
|
22 |
+
# embeddings = OpenAIEmbeddings()
|
23 |
+
|
24 |
+
dateval = "16-Jun-2024"
# The feeds pickle added with this corpus is named "astro-ph_ga_feeds_upto_<date>.pkl"
# (hyphen in "astro-ph"), unlike the two embedding pickles which use "astro_ph".
feeds_link = "local_files/astro-ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
# Parsed arXiv feed chunks and the per-paper ada embeddings.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
29 |
+
|
30 |
+
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Fetched data from API!")
    return payload
|
37 |
+
|
38 |
+
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
# e2d, _, _, _, _ = get_embedding_data(url)
# 2-d embedding of the corpus; columns 0 and 1 are the plotted coordinates.
embedding = get_embedding_data(url)

st.title("ArXiv+GPT3 embedding explorer")
st.markdown('[Includes papers up to: `'+dateval+'`]')
# The cut-off date in the description is taken from dateval so it cannot drift
# from the corpus actually loaded (it previously said "Apt 18th, 2023").
st.markdown("This is an explorer for astro-ph.GA papers on the arXiv (up to "+dateval+"). The papers have been preprocessed with `chaotic_neural` [(link)](http://chaotic-neural.readthedocs.io/) after which the collected abstracts are run through `text-embedding-ada-002` with [langchain](https://python.langchain.com/en/latest/ecosystem/openai.html) to generate a unique vector correpsonding to each paper. These are then compressed using [umap](https://umap-learn.readthedocs.io/en/latest/) and shown here, and can be used for similarity searches with methods like [faiss](https://github.com/facebookresearch/faiss). The scatterplot here can be paired with a heatmap for more targeted searches looking at a specific topic or area (see sidebar). Upgrade to chaotic neural suggested by Jo Ciucă, thank you! More to come (hopefully) with GPT-4 and its applications!")
st.markdown("Interpreting the UMAP plot: the algorithm creates a 2d embedding from the high-dim vector space that tries to conserve as much similarity information as possible. Nearby points in UMAP space are similar, and grow dissimiliar as you move farther away. The axes do not have any physical meaning.")
|
46 |
+
|
47 |
+
from tqdm import tqdm
ctr = -1
num_chunks = len(gal_feeds)
all_text = []
all_titles = []
all_arxivid = []
all_links = []

# Flatten the chunked feeds into parallel per-paper lists.
for feed in tqdm(gal_feeds):
    for entry in feed.entries:
        text = entry.summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # Drop the trailing version suffix ("v1", "v12", ...) from the id.
        # Slicing off a fixed two characters left a stray "v" for versions >= 10.
        all_arxivid.append(entry.id.split('/')[-1].rsplit('v', 1)[0])
        all_links.append(entry.links[1].href)
|
64 |
+
|
65 |
+
|
66 |
+
def density_estimation(m1, m2, xmin=0, ymin=0, xmax=15, ymax=15):
    """Evaluate a Gaussian KDE of the points ``(m1, m2)`` on a 100 x 100 grid.

    The grid spans ``[xmin, xmax]`` by ``[ymin, ymax]``. Returns ``(X, Y, Z)``:
    the two coordinate grids and the density, reshaped to the grid's shape.
    """
    grid_x, grid_y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    samples = np.vstack([m1, m2])
    estimator = stats.gaussian_kde(samples)
    grid_points = np.vstack([grid_x.ravel(), grid_y.ravel()])
    density = estimator(grid_points).T
    return grid_x, grid_y, np.reshape(density, grid_x.shape)
|
73 |
+
|
74 |
+
# Sidebar: usage notes followed by the search controls.
st.sidebar.markdown('This is a widget that allows you to look for papers containing specific phrases in the dataset and show it as a heatmap. Enter the phrase of interest, then change the size and opacity of the heatmap as desired to find the high-density regions. Hover over blue points to see the details of individual papers.')
st.sidebar.markdown('`Note`: (i) if you enter a query that is not in the corpus of abstracts, it will return an error. just enter a different query in that case. (ii) there are some empty tooltips when you hover, these correspond to the underlying hexbins, and can be ignored.')

# The text input stores its value in st.session_state under the key "phrase".
st.sidebar.text_input("Search query", key="phrase", value="Quenching")
alpha_value = st.sidebar.slider("Pick the hexbin opacity",0.0,1.0,0.81)
size_value = st.sidebar.slider("Pick the hexbin gridsize",10,50,20)

phrase=st.session_state.phrase

# Flag (1/0) every abstract that contains the phrase, case-insensitively.
phrase_flags = np.zeros((len(all_text),))
for i in range(len(all_text)):
    if phrase.lower() in all_text[i].lower():
        phrase_flags[i] = 1


# Data for the interactive bokeh scatter: 2-d coordinates plus title and link
# for the hover tooltip.
source = ColumnDataSource(data=dict(
    x=embedding[0:,0],
    y=embedding[0:,1],
    title=all_titles,
    link=all_links,
))

TOOLTIPS = """
<div style="width:300px;">
ID: $index
($x, $y)
@title <br>
@link <br> <br>
</div>
"""

p = figure(width=700, height=583, tooltips=TOOLTIPS, x_range=(0, 15), y_range=(2.5,15),
           title="UMAP projection of embeddings for the astro-ph.GA corpus"+phrase)

# p.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1], size=size_value,
#          palette = np.flip(OrRd[8]), alpha=alpha_value)
p.circle('x', 'y', size=3, source=source, alpha=0.3)
st.bokeh_chart(p)

# Static matplotlib view: faint scatter of the whole corpus with a hexbin
# heatmap of the papers whose abstract matched the phrase.
fig = plt.figure(figsize=(10.5,9*0.8328))
plt.scatter(embedding[0:,0], embedding[0:,1],s=2,alpha=0.1)
plt.hexbin(embedding[phrase_flags==1,0],embedding[phrase_flags==1,1],
           gridsize=size_value, cmap = 'viridis', alpha=alpha_value,extent=(-1,16,1.5,16),mincnt=10)
plt.title("UMAP localization of heatmap keyword: "+phrase)
plt.axis([0,15,2.5,15]);
clbr = plt.colorbar(); clbr.set_label('# papers')
plt.axis('off')
st.pyplot(fig)
|
pages/9_research_hotspots_2024.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import datetime
|
3 |
+
import faiss
|
4 |
+
import streamlit as st
|
5 |
+
import feedparser
|
6 |
+
import urllib
|
7 |
+
import cloudpickle as cp
|
8 |
+
import pickle
|
9 |
+
from urllib.request import urlopen
|
10 |
+
from summa import summarizer
|
11 |
+
import numpy as np
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import requests
|
14 |
+
import json
|
15 |
+
from scipy import ndimage
|
16 |
+
|
17 |
+
from langchain_openai import AzureOpenAIEmbeddings
|
18 |
+
from langchain.llms import OpenAI
|
19 |
+
from langchain_openai import AzureChatOpenAI
|
20 |
+
|
21 |
+
# Azure OpenAI settings exported as environment variables; the endpoint and
# key come from Streamlit secrets.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# NOTE(review): neither `embeddings` nor `llm` is referenced anywhere else in
# the visible part of this page - confirm whether these clients are needed here.
embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)

llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.
)
|
40 |
+
|
41 |
+
|
42 |
+
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
|
49 |
+
|
50 |
+
# Earlier remote locations of the corpus, kept for reference:
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
dateval = "16-Jun-2024"
# The feeds pickle added with this corpus is named "astro-ph_ga_feeds_upto_<date>.pkl"
# (hyphen in "astro-ph"), unlike the two embedding pickles which use "astro_ph".
feeds_link = "local_files/astro-ph_ga_feeds_upto_"+dateval+".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
# Parsed arXiv feed chunks and the per-paper ada embeddings.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
57 |
+
|
58 |
+
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the object stored at the local path ``url``.

    The function is wrapped in ``st.cache_data``. When the body runs it also
    posts a success message to the Streamlit sidebar.
    """
    # data = cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Fetched data from API!")
    return payload
|
65 |
+
|
66 |
+
# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
# 2-d embedding of the corpus; columns 0 and 1 are histogrammed below.
e2d = get_embedding_data(url)
# e2d, _, _, _, _ = get_embedding_data(url)

ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []

# One reference instant for the whole pass, so every paper's age is measured
# against the same "now".
reference_now = datetime.datetime.now()

# Flatten the chunked feeds into parallel per-paper lists.
for feed in gal_feeds:
    for entry in feed.entries:
        text = entry.summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # Drop the trailing version suffix ("v1", "v12", ...) from the id.
        # Slicing off a fixed two characters left a stray "v" for versions >= 10.
        all_arxivid.append(entry.id.split('/')[-1].rsplit('v', 1)[0])
        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)
        # Re-join the date (first 10 chars) and the time (from char 11, minus
        # the final character) with a space before parsing.
        temp = entry.published
        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(datetime_object)
        # Age of the paper in whole days.
        all_old.append((reference_now - datetime_object).days)
|
92 |
+
|
93 |
+
def make_time_excess_plot(midage = 0, tolage = 1, onlyolder = False):
    """Plot where papers from a time window are over- or under-dense in the 2-d embedding.

    Papers whose age in days lies within ``tolage*365`` of ``midage*365`` form
    the window. Their normalised 2-d histogram is compared with a reference
    histogram: the whole corpus, or (when ``onlyolder`` equals True) only
    papers older than ``midage*365 + tolage*365/2`` days. The difference is
    Gaussian-smoothed and drawn with a diverging colour map through Streamlit.
    """
    bw = 0.05
    sigma = 4.0
    edges = np.arange(0,17,bw)
    xs = e2d[0:,0]
    ys = e2d[0:,1]
    in_window = (np.abs(np.array(all_old) - midage*365) < tolage*365)

    if onlyolder == True:
        older = (np.array(all_old) > midage*365 + tolage*365/2)
        reference = np.histogram2d(xs[older], ys[older], bins=edges, density=True)
    else:
        reference = np.histogram2d(xs, ys, bins=edges, density=True)
    windowed = np.histogram2d(xs[in_window], ys[in_window], bins=edges, density=True)

    excess = windowed[0].T - reference[0].T
    excess = ndimage.gaussian_filter(excess, sigma, mode='nearest')
    # Symmetric colour scale: half the spread between the 0.5 and 99.5 percentiles.
    vscale = (np.nanpercentile(excess,99.5) - np.nanpercentile(excess,0.5))/2

    x_edges, y_edges = reference[1], reference[2]
    x_centres = x_edges[0:-1] + (x_edges[1]-x_edges[0])/2
    y_centres = y_edges[0:-1] + (y_edges[1]-y_edges[0])/2

    fig = plt.figure(figsize=(11,9))
    plt.pcolor(x_centres, y_centres, excess, cmap='bwr', vmin=-vscale, vmax=vscale)
    plt.colorbar()
    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' %(tolage, midage))
    plt.axis([0,14,1,15])
    plt.axis('off')
    st.pyplot(fig)
    return
|
119 |
+
|
120 |
+
# Page header.
st.title('Research hotspots')
st.markdown('[Includes papers up to: `'+dateval+'`]')

# Both sliders run from 0 to 10; they are passed to make_time_excess_plot as
# the window centre (midage) and tolerance (tolage).
midage = st.slider('Age', 0., 10., 0.)
tolage = st.slider('Period width', 0., 10., 1.)

# First plot: the selected window against the whole corpus.
st.markdown('Compare the research in a given time period to the full manifold.')
make_time_excess_plot(midage, tolage, onlyolder = False)

# Second plot: the selected window against older papers only.
st.markdown('Compare the research in a given time period to research older than that.')
make_time_excess_plot(midage, tolage, onlyolder = True)
|