kiyer committed on
Commit
27b7558
1 Parent(s): ceb86c6

add author search feature

Browse files
Files changed (1) hide show
  1. pages/4_author_search.py +137 -0
pages/4_author_search.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datetime
3
+ import faiss
4
+ import streamlit as st
5
+ import feedparser
6
+ import urllib
7
+ import cloudpickle as cp
8
+ import pickle
9
+ from urllib.request import urlopen
10
+ from summa import summarizer
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import requests
14
+ import json
15
+
16
+ from langchain_openai import AzureOpenAIEmbeddings
17
+ from langchain.llms import OpenAI
18
+ from langchain_openai import AzureChatOpenAI
19
+
20
# Export the Azure OpenAI settings through the process environment. The
# endpoint and key come from Streamlit's secrets store.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "AZURE_ENDPOINT": st.secrets["endpoint1"],
        "OPENAI_API_KEY": st.secrets["key1"],
        "OPENAI_API_VERSION": "2023-05-15",
    }
)

# Embedding client: the "embedding" deployment of text-embedding-ada-002,
# addressed through the first endpoint.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=st.secrets["endpoint1"],
    deployment="embedding",
    model="text-embedding-ada-002",
)

# Chat client: uses the second endpoint/key pair and its own API version.
# temperature is zero.
llm = AzureChatOpenAI(
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    openai_api_version="2023-12-01-preview",
    deployment_name="gpt4_small",
    temperature=0.0,
)
39
+
40
+
41
@st.cache_data
def get_feeds_data(url):
    """Unpickle the file at ``url`` and return the loaded object.

    Despite its name, ``url`` is handed straight to ``open`` and is therefore
    treated as a filesystem path. After loading, a "Loaded data" success
    notice is written to the sidebar. The function is wrapped in
    ``st.cache_data``.
    """
    # Earlier remote variant, kept for reference: cp.load(urlopen(url))
    with open(url, mode="rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
48
+
49
# Former remote sources, kept for reference:
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"

# Date tag shared by all local snapshot file names.
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

# Both snapshots go through the same cached pickle loader.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
56
+
57
@st.cache_data
def get_embedding_data(url):
    """Unpickle the file at ``url`` and return the loaded object.

    ``url`` is passed directly to ``open``, so it must be a local file path
    (the parameter name is kept for backward compatibility). A success notice
    is written to the sidebar once the file has been read. The function is
    wrapped in ``st.cache_data``.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the file is not a valid pickle.
    """
    # NOTE(security): unpickling executes arbitrary code from the file; only
    # point this at trusted, locally produced snapshots.
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    # The data is read from disk, not fetched from an API, so say so.
    st.sidebar.success("Loaded embedding data")
    return data
64
+
65
# Former remote source, kept for reference:
# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"

# The loaded object is used as-is; an older variant unpacked the first item
# of a 5-tuple instead:
# e2d, _, _, _, _ = get_embedding_data(url)
e2d = get_embedding_data(url)
69
+
70
# Flatten the feed chunks in gal_feeds into parallel per-paper lists; index i
# in every list refers to the same paper.
ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links = [], [], [], []
all_authors, all_pubdates, all_old = [], [], []

# Take "now" once so every paper's age uses the same reference instant.
# NOTE(review): the published stamps appear to be UTC (the trailing character
# that gets dropped below is presumably "Z"), so the reference is naive UTC
# rather than local time -- confirm against the feed source.
now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

for nc in range(num_chunks):
    for entry in gal_feeds[nc].entries:
        # Abstract text with newlines turned into spaces and backslashes removed.
        text = entry.summary.replace('\n', ' ').replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)

        # The id's last path segment looks like "<arxiv id>v<version>". Strip
        # the version suffix whatever its length, and leave the segment
        # untouched when no numeric version suffix is present.
        raw_id = entry.id.split('/')[-1]
        base_id, sep, version = raw_id.rpartition('v')
        all_arxivid.append(base_id if sep and version.isdigit() else raw_id)

        # The second link of each entry is the one that is kept.
        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)

        # Characters 0-9 hold the date, 11 onwards the time; the separator at
        # index 10 and the final character are discarded before parsing.
        temp = entry.published
        datetime_object = datetime.datetime.strptime(
            temp[0:10] + ' ' + temp[11:-1], '%Y-%m-%d %H:%M:%S'
        )
        all_pubdates.append(datetime_object)
        # Paper age in whole days.
        all_old.append((now_utc - datetime_object).days)
91
+
92
def make_author_plot(inputstr, print_summary=False):
    """Plot where an author's papers fall in the 2-D embedding.

    Reads the module-level ``all_authors``, ``all_old``, ``all_titles`` and
    ``e2d``. Every paper is drawn as a small black dot; papers with at least
    one author whose name contains one of the requested aliases
    (case-insensitive substring match) are drawn as large dots coloured by
    age in years. The figure is rendered with ``st.pyplot``.

    Args:
        inputstr: One or more author aliases separated by commas. Whitespace
            and quote characters around each alias are ignored.
        print_summary: When true, also write one line per matching author
            entry, plus the paper title, below the plot.

    Returns:
        None.
    """
    # Split on commas and strip stray spaces/quotes, so that both
    # "A, B" and "'A, B'" (the page's default input) yield clean aliases.
    aliases = [part.strip().strip("'\"").strip() for part in inputstr.split(',')]
    aliases = [alias for alias in aliases if alias]
    if not aliases:
        st.warning('Please enter at least one author name.')
        return

    needles = [alias.lower() for alias in aliases]
    author_flag = np.zeros(len(all_authors), dtype=bool)
    ctr = 0
    pts = []
    for i, paper_authors in enumerate(all_authors):
        for author in paper_authors:
            name = author['name']
            lowered = name.lower()
            # any() keeps an author from being listed once per matching alias.
            if not any(needle in lowered for needle in needles):
                continue
            author_flag[i] = True
            ctr += 1
            pts.append(
                '%d. [age= %.1f yr, x: %.1f, y: %.1f] name: %s'
                % (ctr, all_old[i] / 365, e2d[i, 0], e2d[i, 1], name)
            )
            pts.append('Paper title: ' + all_titles[i])
    # Console diagnostic: number of papers flagged.
    print(int(author_flag.sum()))

    fig = plt.figure(figsize=(10.8, 9.))
    plt.scatter(e2d[:, 0], e2d[:, 1], s=1, color='k', alpha=0.3)
    if author_flag.any():
        plt.scatter(
            e2d[:, 0][author_flag],
            e2d[:, 1][author_flag],
            s=100,
            c=np.array(all_old)[author_flag] / 365,
            alpha=1.0,
            cmap='coolwarm',
        )
        clbr = plt.colorbar()
        clbr.set_label('lookback time [years]', fontsize=18)
    plt.title('Author: ' + ', '.join(aliases), fontsize=18, fontweight='bold')
    st.pyplot(fig)
    # Close the figure so repeated reruns of the page do not pile up figures.
    plt.close(fig)

    if not pts:
        st.warning('No papers found for: ' + ', '.join(aliases))

    if print_summary:
        st.markdown('---')
        for line in pts:
            st.markdown(line)

    return
127
+
128
+
129
# Page header and usage notes.
st.title('Author search')
intro_lines = (
    f'[Includes papers up to: `{dateval}`]',
    'Trace the location and trajectory of a researcher in the astro-ph.GA manifold.',
    'The current text matching is exact (not case sensitive), so look at the printed summaries below to refine your input string. If you have multiple aliases by which you publish, separate the inputs with a comma followed by a space like in the example below.',
)
for intro_line in intro_lines:
    st.markdown(intro_line)

# Free-text author query, pre-filled with an example list of aliases.
query = st.text_input(
    'Author name:',
    value="'Kartheik Iyer, Kartheik G. Iyer, K. G. Iyer'",
)

# Draw the plot and list the matching papers underneath it.
make_author_plot(query, print_summary=True)