Spaces:

kiyer
/

pathfinder

Running on CPU Upgrade

App Files Files Community

pathfinder / pages /9_research_hotspots_2024.py

kiyer

possible st.pyplot argument fix

9c7a7db 7 months ago

raw

history blame

4.73 kB

	import os
	import datetime
	import faiss
	import streamlit as st
	import feedparser
	import urllib
	import cloudpickle as cp
	import pickle
	from urllib.request import urlopen
	from summa import summarizer
	import numpy as np
	import matplotlib.pyplot as plt
	import requests
	import json
	from scipy import ndimage

	from langchain_openai import AzureOpenAIEmbeddings
	from langchain.llms import OpenAI
	from langchain_openai import AzureChatOpenAI

	# --- Azure OpenAI configuration -------------------------------------------
	# Credentials and endpoints come from Streamlit secrets. The environment
	# variables below are presumably picked up by the OpenAI/langchain client
	# libraries -- TODO confirm which of them are actually still read.
	os.environ["OPENAI_API_TYPE"] = "azure"
	os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
	os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
	os.environ["OPENAI_API_VERSION"] = "2023-05-15"

	# Embedding client for the "embedding" deployment on the first endpoint.
	# No API key is passed explicitly here.
	embeddings = AzureOpenAIEmbeddings(
	    deployment="embedding",
	    model="text-embedding-ada-002",
	    azure_endpoint=st.secrets["endpoint1"],
	)

	# Chat client for the "gpt4_small" deployment on the second endpoint; it is
	# given its own key, API version and API type instead of relying on the
	# environment variables set above.
	llm = AzureChatOpenAI(
	    deployment_name="gpt4_small",
	    openai_api_version="2023-12-01-preview",
	    azure_endpoint=st.secrets["endpoint2"],
	    openai_api_key=st.secrets["key2"],
	    openai_api_type="azure",
	    temperature=0.0,
	)


	@st.cache_data
	def get_feeds_data(url):
	    """Unpickle and return the object stored in the file at ``url``.

	    Despite the parameter name, ``url`` is passed to ``open`` and is
	    therefore treated as a filesystem path. The function is wrapped in
	    ``st.cache_data`` and posts a success message to the sidebar once the
	    file has been read.

	    NOTE(review): ``pickle.load`` can execute arbitrary code; only point
	    this at files you trust.
	    """
	    # Earlier variant, kept for reference: cp.load(urlopen(url))
	    with open(url, "rb") as handle:
	        payload = pickle.load(handle)

	    st.sidebar.success("Loaded data")
	    return payload

	# Remote locations used by an earlier version, kept for reference:
	# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
	# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"

	# Snapshot date embedded in the names of the local data files; it is also
	# shown to the user further down the page.
	dateval = "16-Jun-2024"
	feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
	embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

	# Both pickles go through the same cached loader.
	gal_feeds = get_feeds_data(feeds_link)
	arxiv_ada_embeddings = get_feeds_data(embed_link)

	@st.cache_data
	def get_embedding_data(url):
	    """Unpickle and return the object stored in the file at ``url``.

	    ``url`` is opened with ``open`` as a local path, not fetched over the
	    network (the sidebar message still says "API"). The function is wrapped
	    in ``st.cache_data`` and reports success in the sidebar after loading.

	    NOTE(review): ``pickle.load`` can execute arbitrary code; only point
	    this at files you trust.
	    """
	    # Earlier variant, kept for reference: cp.load(urlopen(url))
	    with open(url, "rb") as handle:
	        payload = pickle.load(handle)

	    st.sidebar.success("Fetched data from API!")
	    return payload

	# Remote location used by an earlier version, kept for reference:
	# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
	url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"

	# The pickle is used directly as `e2d`; an older file layout was a 5-tuple:
	# e2d, _, _, _, _ = get_embedding_data(url)
	e2d = get_embedding_data(url)

	# Flatten every entry of every feed chunk into parallel, per-paper lists.
	# Index k of each list below refers to the same paper.
	ctr = -1  # not referenced in the visible code; kept so the name stays defined
	num_chunks = len(gal_feeds)
	all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []

	# Take "now" once so every paper's age is measured against the same instant
	# instead of a clock that advances while the loop runs.
	# NOTE(review): this is a naive local timestamp, while the published stamps
	# look like UTC ("...Z") -- confirm whether the offset matters here.
	reference_time = datetime.datetime.now()

	# Chunks are fetched by index because the container type of `gal_feeds` is
	# not visible here; each chunk's entries are iterated directly.
	for nc in range(num_chunks):
	    for entry in gal_feeds[nc].entries:
	        # Abstract text with newlines turned into spaces and backslashes
	        # removed.
	        text = entry.summary
	        text = text.replace('\n', ' ')
	        text = text.replace('\\', '')
	        all_text.append(text)

	        all_titles.append(entry.title)
	        # Last path component of the entry id with its final two characters
	        # dropped (presumably a "vN" version suffix -- TODO confirm for
	        # versions >= 10).
	        all_arxivid.append(entry.id.split('/')[-1][0:-2])
	        all_links.append(entry.links[1].href)
	        all_authors.append(entry.authors)

	        # The stamp is read as "YYYY-MM-DD?HH:MM:SS?": the separator at
	        # index 10 and the trailing character are skipped.
	        stamp = entry.published
	        published_at = datetime.datetime.strptime(
	            stamp[0:10] + ' ' + stamp[11:-1], '%Y-%m-%d %H:%M:%S'
	        )
	        all_pubdates.append(published_at)
	        # Age of the paper in whole days.
	        all_old.append((reference_time - published_at).days)

	def make_time_excess_plot(midage=0, tolage=1, onlyolder=False):
	    """Render a smoothed density-excess map of papers in a time window.

	    Papers whose age (module-level ``all_old``, in days) lies within
	    ``tolage`` years of ``midage`` years are histogrammed over the first two
	    columns of the module-level ``e2d`` array. A reference histogram is
	    subtracted, the difference is Gaussian-smoothed, and the result is drawn
	    with a diverging colour map and sent to Streamlit.

	    Args:
	        midage: Centre of the time window, in years before now.
	        tolage: Half-width of the time window, in years.
	        onlyolder: If true, the reference is limited to papers older than
	            ``midage + tolage / 2`` years; otherwise all papers are used.

	    Returns:
	        None. The figure is shown with ``st.pyplot``. If the window or the
	        reference set contains no papers, a warning is shown instead.

	    Assumes row k of ``e2d`` belongs to the same paper as ``all_old[k]``
	    (the boolean masks built from ``all_old`` index ``e2d``) -- TODO confirm.
	    """
	    bw = 0.05            # histogram bin width, in e2d coordinate units
	    sigma = 4.0          # Gaussian smoothing width, in bins
	    days_per_year = 365  # ages are stored in days, the arguments are in years

	    ages = np.asarray(all_old)
	    mid_days = midage * days_per_year
	    tol_days = tolage * days_per_year
	    bins = np.arange(0, 17, bw)
	    xs, ys = e2d[0:, 0], e2d[0:, 1]

	    # Papers inside the requested window.
	    mask = np.abs(ages - mid_days) < tol_days

	    # Reference population the window is compared against.
	    if onlyolder:
	        older = ages > mid_days + tol_days / 2
	        ref_x, ref_y = xs[older], ys[older]
	    else:
	        ref_x, ref_y = xs, ys

	    # A density-normalised histogram of an empty sample is all NaN, which
	    # would only produce a meaningless plot; tell the user instead.
	    if not mask.any() or len(ref_x) == 0:
	        st.warning('No papers fall in the selected time range; nothing to plot.')
	        return

	    ref_hist, xedges, yedges = np.histogram2d(ref_x, ref_y, bins=bins, density=True)
	    win_hist, _, _ = np.histogram2d(xs[mask], ys[mask], bins=bins, density=True)

	    # histogram2d returns counts indexed [x, y]; transpose so that rows are y
	    # for plotting, then smooth the window-minus-reference difference.
	    excess = ndimage.gaussian_filter(win_hist.T - ref_hist.T, sigma, mode='nearest')

	    # Symmetric colour range taken from the 0.5-99.5 percentile spread.
	    vscale = (np.nanpercentile(excess, 99.5) - np.nanpercentile(excess, 0.5)) / 2

	    # Bin centres: left edges shifted by half a bin width.
	    x_centres = xedges[0:-1] + (xedges[1] - xedges[0]) / 2
	    y_centres = yedges[0:-1] + (yedges[1] - yedges[0]) / 2

	    fig, ax = plt.subplots(1, 1, figsize=(11, 9))
	    mesh = ax.pcolor(x_centres, y_centres, excess, cmap='bwr',
	                     vmin=-vscale, vmax=vscale)
	    fig.colorbar(mesh, ax=ax)
	    # ax.scatter(e2d[0:,0], e2d[0:,1], s=2, color='k', alpha=0.1)
	    ax.set_title('excess research over the last %.1f yrs centered at %.1f yrs' % (tolage, midage))
	    ax.axis([0, 14, 1, 15])
	    ax.axis('off')
	    st.pyplot(fig)
	    # Release the figure: pyplot keeps every open figure alive, and this
	    # function runs on each Streamlit rerun.
	    plt.close(fig)
	    return

	# --- Page layout ----------------------------------------------------------
	st.title('Research hotspots')
	st.markdown(f'[Includes papers up to: `{dateval}`]')

	# Window centre and width chosen by the user; the plot title reports both
	# values in years.
	midage = st.slider('Age', min_value=0., max_value=10., value=0.)
	tolage = st.slider('Period width', min_value=0., max_value=10., value=1.)

	# Same window, two reference populations: everything, then only older work.
	for caption, older_only in (
	    ('Compare the research in a given time period to the full manifold.', False),
	    ('Compare the research in a given time period to research older than that.', True),
	):
	    st.markdown(caption)
	    make_time_excess_plot(midage, tolage, onlyolder=older_only)