Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
add author search feature
Browse files- pages/4_author_search.py +137 -0
pages/4_author_search.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import datetime
|
3 |
+
import faiss
|
4 |
+
import streamlit as st
|
5 |
+
import feedparser
|
6 |
+
import urllib
|
7 |
+
import cloudpickle as cp
|
8 |
+
import pickle
|
9 |
+
from urllib.request import urlopen
|
10 |
+
from summa import summarizer
|
11 |
+
import numpy as np
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import requests
|
14 |
+
import json
|
15 |
+
|
16 |
+
from langchain_openai import AzureOpenAIEmbeddings
|
17 |
+
from langchain.llms import OpenAI
|
18 |
+
from langchain_openai import AzureChatOpenAI
|
19 |
+
|
20 |
+
# Publish the Azure OpenAI connection settings through the process
# environment (presumably picked up by the langchain clients below -- confirm).
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "AZURE_ENDPOINT": st.secrets["endpoint1"],
        "OPENAI_API_KEY": st.secrets["key1"],
        "OPENAI_API_VERSION": "2023-05-15",
    }
)

# Embedding client: uses the "embedding" deployment on the first endpoint.
_embedding_settings = {
    "deployment": "embedding",
    "model": "text-embedding-ada-002",
    "azure_endpoint": st.secrets["endpoint1"],
}
embeddings = AzureOpenAIEmbeddings(**_embedding_settings)

# Chat client: uses the "gpt4_small" deployment on the second endpoint with
# its own key; temperature is pinned to zero.
_chat_settings = {
    "deployment_name": "gpt4_small",
    "openai_api_version": "2023-12-01-preview",
    "azure_endpoint": st.secrets["endpoint2"],
    "openai_api_key": st.secrets["key2"],
    "openai_api_type": "azure",
    "temperature": 0.0,
}
llm = AzureChatOpenAI(**_chat_settings)
|
39 |
+
|
40 |
+
|
41 |
+
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the local file *url*.

    Despite its name, *url* is opened as a filesystem path (an earlier
    remote-download variant is no longer used).  After loading, a
    "Loaded data" success note is posted to the Streamlit sidebar.  The
    function is wrapped in ``st.cache_data``.
    """
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
|
48 |
+
|
49 |
+
# Remote copies referenced by earlier revisions (kept for reference only):
#   feeds:      https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_
#   embeddings: https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0

# Date tag shared by every local pickle file name used on this page.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

# Both pickles are read through the same cached loader.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
|
56 |
+
|
57 |
+
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the embedding data stored in the local file *url*.

    Args:
        url: Filesystem path of the pickle file (the name is historical; the
            file is opened locally, nothing is downloaded).

    Returns:
        Whatever object the pickle file contains.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.

    The function is wrapped in ``st.cache_data``.
    """
    # NOTE: pickle executes arbitrary code on load; only point this at
    # trusted files shipped with the app.
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data comes from a local file, so the status message must not claim
    # an API fetch.
    st.sidebar.success("Loaded embedding data")
    return data
|
64 |
+
|
65 |
+
# Remote copy referenced by earlier revisions (kept for reference only):
#   https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP
url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
e2d = get_embedding_data(url)

ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []

# One reference instant for every age computation, so all papers are aged
# against the same "now" instead of a clock that advances during the loop.
reference_time = datetime.datetime.now()

# Flatten every feed chunk into parallel per-paper lists (same index == same paper).
for nc in range(num_chunks):
    for entry in gal_feeds[nc].entries:
        # Abstract text with newlines turned into spaces and backslashes removed.
        all_text.append(entry.summary.replace('\n', ' ').replace('\\', ''))
        all_titles.append(entry.title)

        # Last path component of the entry id, minus a trailing "v<digits>"
        # version tag.  Slicing off a fixed two characters would mangle
        # versions >= v10 and ids that carry no version suffix at all.
        raw_id = entry.id.split('/')[-1]
        base_id, sep, version = raw_id.rpartition('v')
        all_arxivid.append(base_id if sep and version.isdigit() else raw_id)

        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)

        # Characters 0-9 hold the date and 11..-1 the time; the separator at
        # index 10 and the final character are skipped.
        published = entry.published
        published_at = datetime.datetime.strptime(
            published[0:10]+' '+published[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(published_at)
        # Paper age in whole days.
        all_old.append((reference_time - published_at).days)
|
91 |
+
|
92 |
+
def _parse_author_aliases(inputstr):
    """Split a comma-separated alias string into clean, non-empty aliases."""
    aliases = []
    for piece in inputstr.split(','):
        # Tolerate stray whitespace and surrounding quote characters.
        alias = piece.strip().strip('\'"').strip()
        if alias:
            aliases.append(alias)
    return aliases


def make_author_plot(inputstr, print_summary=False):
    """Highlight papers by the given author(s) on the 2-D embedding scatter.

    Args:
        inputstr: One or more author-name aliases separated by commas.
            Matching is a case-insensitive substring test against each
            author's ``'name'`` field.
        print_summary: When true, also write one line per matching author
            entry (age, x/y position, name) followed by the paper title.

    Returns:
        None.  The figure and optional summary are rendered via Streamlit.

    Reads the module-level ``all_authors``, ``all_titles``, ``all_old``
    (ages in days) and ``e2d`` (indexed as ``e2d[i, 0]`` / ``e2d[i, 1]`` with
    the same ``i`` as the per-paper lists).
    """
    aliases = _parse_author_aliases(inputstr)
    # Empty aliases are dropped above: an empty string is a substring of every
    # name and would otherwise flag every paper.
    lowered_aliases = [alias.lower() for alias in aliases]

    author_flag = np.zeros(len(all_authors), dtype=bool)
    ctr = 0
    pts = []
    for i, paper_authors in enumerate(all_authors):
        for author in paper_authors:
            name = author['name']
            lowered_name = name.lower()
            # any() stops at the first matching alias, so an author entry is
            # listed once even if several aliases match it.
            if not any(alias in lowered_name for alias in lowered_aliases):
                continue
            author_flag[i] = True
            ctr += 1
            printstr = (str(ctr)
                        + '. [age= %.1f yr, x: %.1f, y: %.1f]'
                        % (all_old[i]/365, e2d[i, 0], e2d[i, 1])
                        + ' name: ' + name)
            pts.append(printstr)
            pts.append('Paper title: ' + all_titles[i])
    print(np.sum(author_flag))

    fig = plt.figure(figsize=(10.8, 9.))
    # Background: every paper as a faint black dot.
    plt.scatter(e2d[0:, 0], e2d[0:, 1], s=1, color='k', alpha=0.3)
    if author_flag.any():
        # Foreground: matching papers, coloured by age in years.
        plt.scatter(e2d[0:, 0][author_flag], e2d[0:, 1][author_flag],
                    s=100, c=np.array(all_old)[author_flag]/365,
                    alpha=1.0, cmap='coolwarm')
        clbr = plt.colorbar()
        clbr.set_label('lookback time [years]', fontsize=18)
    title_names = ', '.join(aliases) if aliases else inputstr.strip()
    plt.title('Author: ' + title_names, fontsize=18, fontweight='bold')
    st.pyplot(fig)
    # Release the figure; otherwise one pyplot figure accumulates per rerun.
    plt.close(fig)

    if not pts:
        st.info('No authors matched the given name(s).')

    if print_summary:
        st.markdown('---')
        for line in pts:
            st.markdown(line)
|
127 |
+
|
128 |
+
|
129 |
+
# Page header and usage notes.
st.title('Author search')
st.markdown('[Includes papers up to: `'+dateval+'`]')
st.markdown('Trace the location and trajectory of a researcher in the astro-ph.GA manifold.')
st.markdown('The current text matching is exact (not case sensitive), so look at the printed summaries below to refine your input string. If you have multiple aliases by which you publish, separate the inputs with a comma followed by a space like in the example below.')

# The example default must not contain literal quote characters: the input is
# split on commas into aliases, and a quote stuck to the first or last alias
# would keep it from matching any author name.
query = st.text_input('Author name:',
                      value="Kartheik Iyer, Kartheik G. Iyer, K. G. Iyer")

# Draw the scatter plot for the current query and list the matching papers.
make_author_plot(query, print_summary=True)
|