Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
added embedding plot
Browse files- app.py +40 -3
- pfdr_arxiv_cutoff_distances.npy +3 -0
app.py
CHANGED
@@ -276,12 +276,15 @@ class RetrievalSystem():
|
|
276 |
def return_formatted_df(self, top_results, small_df):
|
277 |
|
278 |
df = pd.DataFrame(small_df)
|
279 |
-
df = df.drop(columns=['
|
280 |
links = ['https://ui.adsabs.harvard.edu/abs/'+i+'/abstract' for i in small_df['bibcode']]
|
281 |
scores = [top_results[i] for i in top_results]
|
|
|
282 |
df.insert(1,'ADS Link',links,True)
|
283 |
df.insert(2,'Relevance',scores,True)
|
284 |
-
df
|
|
|
|
|
285 |
return df
|
286 |
|
287 |
# @st.cache_resource
|
@@ -547,7 +550,39 @@ def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConse
|
|
547 |
|
548 |
return response
|
549 |
|
|
|
550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
|
552 |
|
553 |
# ---------------------------------------
|
@@ -599,7 +634,6 @@ if st.session_state.get('runpfdr'):
|
|
599 |
question_type_gen = question_type_gen.replace('\n',' \n')
|
600 |
st.markdown(question_type_gen)
|
601 |
|
602 |
-
with col2:
|
603 |
with st.spinner("Evaluating abstract consensus"):
|
604 |
with st.expander("Abstract consensus", expanded=True):
|
605 |
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
|
@@ -607,6 +641,9 @@ if st.session_state.get('runpfdr'):
|
|
607 |
st.markdown(consensus_answer.explanation)
|
608 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
609 |
|
|
|
|
|
|
|
610 |
session_vars = {
|
611 |
"runtime": "pathfinder_v1_online",
|
612 |
"query": query,
|
|
|
276 |
def return_formatted_df(self, top_results, small_df):
    """Build a display-ready DataFrame for the retrieved papers.

    Drops internal bookkeeping columns, adds an ADS abstract link, the
    relevance score, and the corpus index for each result, reorders the
    columns for presentation, and switches to 1-based row numbering.

    Parameters
    ----------
    top_results : mapping
        Maps corpus indices to relevance scores (iteration order is kept).
    small_df : dict-like
        Per-paper fields, including 'bibcode' and the columns listed below.

    Returns
    -------
    pandas.DataFrame
        Columns: ADS Link, Relevance, date, cites, title, authors,
        abstract, keywords, ads_id, Indices, embed; index starts at 1.
    """
    frame = pd.DataFrame(small_df).drop(
        columns=['umap_x', 'umap_y', 'cite_bibcodes', 'ref_bibcodes'])

    # One ADS abstract URL per paper, in the same row order as small_df.
    ads_links = []
    for bib in small_df['bibcode']:
        ads_links.append('https://ui.adsabs.harvard.edu/abs/' + bib + '/abstract')

    result_ids = list(top_results)
    relevance = [top_results[rid] for rid in result_ids]

    frame.insert(1, 'ADS Link', ads_links, True)
    frame.insert(2, 'Relevance', relevance, True)
    frame.insert(3, 'Indices', result_ids, True)

    # Presentation order for the UI table.
    frame = frame[['ADS Link', 'Relevance', 'date', 'cites', 'title',
                   'authors', 'abstract', 'keywords', 'ads_id',
                   'Indices', 'embed']]
    frame.index += 1   # human-friendly 1-based numbering
    return frame
|
289 |
|
290 |
# @st.cache_resource
|
|
|
550 |
|
551 |
return response
|
552 |
|
553 |
+
def calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.1,
                      cutoff_file='pfdr_arxiv_cutoff_distances.npy'):
    """Flag retrieved papers whose embeddings lie far from the result centroid.

    Compares each paper's Euclidean distance from the mean embedding of the
    retrieved set against a precomputed per-top_k cutoff distance, tightened
    by ``cutoff_adjust``.

    Parameters
    ----------
    papers_df : pandas.DataFrame
        Must contain an 'embed' column of per-paper embedding vectors.
    top_k : int
        Number of retrieved papers; selects the matching precomputed cutoff
        (entry ``top_k - 1`` of the stored array).
    cutoff_adjust : float, optional
        Subtracted from the stored cutoffs to tighten the outlier threshold.
    cutoff_file : str, optional
        Path to the ``.npy`` file of precomputed cutoff distances
        (previously hard-coded; default preserves the original behavior).

    Returns
    -------
    numpy.ndarray of bool
        True for papers farther from the centroid than the adjusted cutoff.
    """
    # Precomputed distance thresholds, one per possible top_k value.
    cut_dist = np.load(cutoff_file) - cutoff_adjust
    pts = np.array(papers_df['embed'].tolist())
    centroid = np.mean(pts, 0)
    # Euclidean distance of every embedding from the centroid.
    dists = np.sqrt(np.sum((pts - centroid) ** 2, 1))
    outlier_flag = (dists > cut_dist[top_k - 1])

    return outlier_flag
|
562 |
+
|
563 |
+
def make_embedding_plot(papers_df, consensus_answer):
    """Plot the retrieved papers on the precomputed UMAP embedding of the
    arXiv corpus and render the figure in the Streamlit app.

    Outliers (per calc_outlier_flag) are drawn with reduced size/alpha, and
    the title reports the outlier count, the consensus verdict, and its
    relevance score.

    Parameters
    ----------
    papers_df : DataFrame with an 'Indices' column of corpus row positions
        (as produced by return_formatted_df).
    consensus_answer : object exposing .consensus and .relevance_score
        (as returned by evaluate_overall_consensus).
    """
    # Corpus row positions of the retrieved papers.
    plt_indices = np.array(papers_df['Indices'].tolist())

    # Lazily load the corpus once and cache it in the Streamlit session.
    if 'arxiv_corpus' not in st.session_state:
        st.session_state.arxiv_corpus = load_arxiv_corpus()

    # Precomputed 2-D UMAP coordinates for every paper in the corpus.
    xax = np.array(st.session_state.arxiv_corpus['umap_x'])
    yax = np.array(st.session_state.arxiv_corpus['umap_y'])

    # NOTE(review): `top_k` is read from module scope here rather than passed
    # in — confirm it is always set before this function runs.
    outlier_flag = calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.25)
    alphas = np.ones((len(plt_indices),)) * 0.9
    alphas[outlier_flag] = 0.5  # fade the outlier markers

    fig = plt.figure(figsize=(9,12))
    # Full corpus as a faint background point cloud.
    plt.scatter(xax,yax, s=1, alpha=0.01, c='k')
    # Retrieved papers: white halo underneath a blue marker.
    plt.scatter(xax[plt_indices], yax[plt_indices], s=300*alphas**2, alpha=alphas, c='w')
    plt.scatter(xax[plt_indices], yax[plt_indices], s=100*alphas**2, alpha=alphas, c='dodgerblue')
    # plt.scatter(xax[plt_indices][outlier_flag], yax[plt_indices][outlier_flag], s=100, alpha=1., c='firebrick')
    # Fixed viewport; presumably matches the corpus UMAP extent — TODO confirm.
    plt.axis([0,20,-4.2,18])
    plt.axis('off')
    plt.title('Query: '+st.session_state.query+'\n'+r'N$_{\rm outliers}: %.0f/%.0f$, Consensus: ' %(np.sum(outlier_flag), len(outlier_flag)) + consensus_answer.consensus + ' (%.1f)' %consensus_answer.relevance_score)
    st.pyplot(fig)
|
586 |
|
587 |
|
588 |
# ---------------------------------------
|
|
|
634 |
question_type_gen = question_type_gen.replace('\n',' \n')
|
635 |
st.markdown(question_type_gen)
|
636 |
|
|
|
637 |
with st.spinner("Evaluating abstract consensus"):
|
638 |
with st.expander("Abstract consensus", expanded=True):
|
639 |
consensus_answer = evaluate_overall_consensus(query, [papers_df['abstract'][i] for i in range(len(papers_df))])
|
|
|
641 |
st.markdown(consensus_answer.explanation)
|
642 |
st.markdown('Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score)
|
643 |
|
644 |
+
with col2:
|
645 |
+
make_embedding_plot(papers_df, consensus_answer)
|
646 |
+
|
647 |
session_vars = {
|
648 |
"runtime": "pathfinder_v1_online",
|
649 |
"query": query,
|
pfdr_arxiv_cutoff_distances.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64edda3cf9c3cde63a6dc818f0e6df573dc1ce32217acac1e2bcdfe7f3a4e0e3
|
3 |
+
size 928
|