Spaces:

kiyer
/

pathfinder

Running on CPU Upgrade

App Files Files Community

pathfinder / pages /4_author_search.py

kiyer

add author search feature

27b7558 verified 12 months ago

raw

history blame

5.06 kB

	import os
	import datetime
	import faiss
	import streamlit as st
	import feedparser
	import urllib
	import cloudpickle as cp
	import pickle
	from urllib.request import urlopen
	from summa import summarizer
	import numpy as np
	import matplotlib.pyplot as plt
	import requests
	import json

	from langchain_openai import AzureOpenAIEmbeddings
	from langchain.llms import OpenAI
	from langchain_openai import AzureChatOpenAI

	# Azure OpenAI settings exported through the process environment. The
	# endpoint and key values come from Streamlit's secrets store.
	os.environ.update({
	    "OPENAI_API_TYPE": "azure",
	    "AZURE_ENDPOINT": st.secrets["endpoint1"],
	    "OPENAI_API_KEY": st.secrets["key1"],
	    "OPENAI_API_VERSION": "2023-05-15",
	})

	# Embedding client bound to the first endpoint. No key is passed here
	# explicitly.
	# NOTE(review): presumably the client picks the key up from the
	# environment variables set above -- confirm.
	embeddings = AzureOpenAIEmbeddings(
	    azure_endpoint=st.secrets["endpoint1"],
	    deployment="embedding",
	    model="text-embedding-ada-002",
	)

	# Chat model bound to the second endpoint/key pair, with its own API
	# version and a temperature of zero.
	llm = AzureChatOpenAI(
	    azure_endpoint=st.secrets["endpoint2"],
	    openai_api_key=st.secrets["key2"],
	    openai_api_type="azure",
	    openai_api_version="2023-12-01-preview",
	    deployment_name="gpt4_small",
	    temperature=0.0,
	)


	@st.cache_data
	def get_feeds_data(url):
	    """Unpickle the file at ``url`` and announce success in the sidebar.

	    Despite the parameter name, ``url`` is handed directly to ``open`` and
	    so must be a path readable from the local filesystem. (A previous
	    variant fetched the pickle remotely via ``cp.load(urlopen(url))``.)

	    The function is wrapped in ``st.cache_data``.

	    NOTE(review): ``pickle.load`` can execute arbitrary code while
	    loading; only point this at trusted files.

	    Returns the unpickled object unchanged.
	    """
	    with open(url, mode="rb") as handle:
	        payload = pickle.load(handle)
	    st.sidebar.success("Loaded data")
	    return payload

	# Previously used remote sources, kept for reference:
	#   feeds:      https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_
	#   embeddings: https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0

	# Snapshot date that is embedded in the names of the local pickle files
	# (and shown to the user further down the page).
	dateval = "27-Jun-2023"
	feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
	embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

	# Both pickles are read through the same loader.
	gal_feeds = get_feeds_data(feeds_link)
	arxiv_ada_embeddings = get_feeds_data(embed_link)

	@st.cache_data
	def get_embedding_data(url):
	    """Unpickle the file at ``url`` and announce success in the sidebar.

	    ``url`` is passed straight to ``open``, so it is treated as a local
	    file path. (A previous variant fetched the pickle remotely via
	    ``cp.load(urlopen(url))``.)

	    The function is wrapped in ``st.cache_data``.

	    NOTE(review): ``pickle.load`` can execute arbitrary code while
	    loading; only point this at trusted files.

	    Returns the unpickled object unchanged.
	    """
	    with open(url, mode="rb") as handle:
	        payload = pickle.load(handle)
	    st.sidebar.success("Fetched data from API!")
	    return payload

	# Previously used remote source, kept for reference:
	#   https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP
	url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"

	# The loaded object is used below as per-paper scatter-plot coordinates
	# (indexed as e2d[i, 0] and e2d[i, 1]).
	# NOTE(review): an older pickle layout apparently held a 5-tuple that was
	# unpacked as ``e2d, _, _, _, _`` -- confirm the current file stores the
	# array directly.
	e2d = get_embedding_data(url)

	# Flatten every feed chunk into parallel per-paper lists: index ``n`` in
	# each ``all_*`` list describes the same entry.
	ctr = -1  # not used in this section; retained as a module-level name
	num_chunks = len(gal_feeds)
	all_text, all_titles, all_arxivid = [], [], []
	all_links, all_authors = [], []
	all_pubdates, all_old = [], []

	for chunk_idx in range(num_chunks):
	    for entry in gal_feeds[chunk_idx].entries:
	        # Abstract with newlines turned into spaces and backslashes removed.
	        abstract = entry.summary.replace('\n', ' ').replace('\\', '')
	        all_text.append(abstract)
	        all_titles.append(entry.title)
	        # Last path segment of the entry id, minus its final two characters
	        # (presumably a version suffix such as "v1" -- confirm).
	        all_arxivid.append(entry.id.split('/')[-1][0:-2])
	        # The second link of the entry is the one that is kept.
	        all_links.append(entry.links[1].href)
	        all_authors.append(entry.authors)
	        # Rebuild "YYYY-MM-DD HH:MM:SS" from the published stamp by dropping
	        # the character at index 10 and the last character (presumably the
	        # ISO "T" separator and trailing "Z" -- confirm).
	        stamp = entry.published
	        published_at = datetime.datetime.strptime(
	            stamp[0:10] + ' ' + stamp[11:-1], '%Y-%m-%d %H:%M:%S')
	        all_pubdates.append(published_at)
	        # Age of the paper in whole days, relative to the current time.
	        all_old.append((datetime.datetime.now() - published_at).days)

	def make_author_plot(inputstr, print_summary = False):
	    """Highlight one author's papers on the 2-D embedding scatter plot.

	    Every paper is drawn as a small grey point at ``e2d[:, 0], e2d[:, 1]``;
	    papers with at least one author whose name contains any of the given
	    aliases (case-insensitive substring match) are over-plotted as large
	    points coloured by age in years (``all_old / 365``). The figure is
	    rendered with ``st.pyplot``.

	    Parameters
	    ----------
	    inputstr : str
	        One or more name aliases separated by ``', '``. Surrounding
	        whitespace and literal quote characters are stripped from each
	        alias, and empty aliases are ignored (an empty string would
	        otherwise match every author).
	    print_summary : bool, optional
	        When truthy, also write one markdown line per matching
	        (paper, author) pair followed by the paper title.

	    Returns
	    -------
	    None

	    Notes
	    -----
	    Reads the module-level ``all_authors``, ``all_old``, ``all_titles`` and
	    ``e2d``; these are assumed to be index-aligned (one row/item per paper).
	    """
	    # Clean the aliases once up front. The loop variable used for matching
	    # was previously never bound, which raised NameError on first use.
	    authr_list = []
	    for piece in inputstr.split(', '):
	        cleaned = piece.strip().strip('\'"').strip()
	        if cleaned:
	            authr_list.append(cleaned)
	    needles = [alias.lower() for alias in authr_list]

	    author_flag = np.zeros((len(all_authors),))
	    ctr = 0
	    pts = []
	    for i, paper_authors in enumerate(all_authors):
	        for author in paper_authors:
	            name = author['name']
	            lowered = name.lower()
	            # One summary entry per matching author, even when several
	            # aliases match the same name.
	            if not any(needle in lowered for needle in needles):
	                continue
	            author_flag[i] = 1
	            ctr = ctr+1
	            printstr = str(ctr)+'. [age= %.1f yr, x: %.1f, y: %.1f]' %(all_old[i]/365,e2d[i,0], e2d[i,1])+' name: '+name
	            pts.append(printstr)
	            pts.append('Paper title: ' + all_titles[i])
	    print(np.sum(author_flag))
	    author_flag = author_flag.astype(bool)

	    # The title shows the last alias supplied; fall back to the raw input
	    # when no usable alias was given.
	    title_name = authr_list[-1] if authr_list else inputstr

	    fig = plt.figure(figsize=(10.8,9.))
	    plt.scatter(e2d[0:,0], e2d[0:,1],s=1,color='k',alpha=0.3)
	    plt.scatter(e2d[0:,0][author_flag], e2d[0:,1][author_flag],
	                s=100,c=np.array(all_old)[author_flag]/365,alpha=1.0, cmap='coolwarm')
	    clbr = plt.colorbar()
	    clbr.set_label('lookback time [years]',fontsize=18)
	    plt.title('Author: '+title_name,fontsize=18,fontweight='bold')
	    st.pyplot(fig)
	    # pyplot keeps a reference to every figure it creates until it is
	    # closed; release it so reruns of the page do not accumulate figures.
	    plt.close(fig)

	    if print_summary:
	        st.markdown('---')
	        for line in pts:
	            st.markdown(line)


	# Page header and usage notes.
	st.title('Author search')
	st.markdown('[Includes papers up to: `'+dateval+'`]')
	st.markdown('Trace the location and trajectory of a researcher in the astro-ph.GA manifold.')
	st.markdown('The current text matching is exact (not case sensitive), so look at the printed summaries below to refine your input string. If you have multiple aliases by which you publish, separate the inputs with a comma followed by a space like in the example below.')

	# The default value doubles as the usage example. It must not contain
	# literal quote characters: make_author_plot splits the text on ', ' and
	# matches each piece as a substring of the author names, so a leading or
	# trailing quote would become part of the first/last alias and prevent it
	# from ever matching.
	query = st.text_input('Author name:',
	                      value="Kartheik Iyer, Kartheik G. Iyer, K. G. Iyer")

	# Draw the plot for the current query and list the matching papers below it.
	make_author_plot(query, print_summary=True)