# Hugging Face Spaces page header (scrape residue): "Spaces: Running on CPU Upgrade"
# Standard library
import os
import datetime
# Third-party: vector search, web app, feed parsing, serialization
import faiss
import streamlit as st
import feedparser
import urllib
import cloudpickle as cp
import pickle
from urllib.request import urlopen
from summa import summarizer
# Numerics / plotting / HTTP
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
from scipy import ndimage
# LangChain Azure OpenAI clients (embeddings + chat)
from langchain_openai import AzureOpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_openai import AzureChatOpenAI
# Azure OpenAI configuration. Credentials and endpoints come from Streamlit
# secrets (st.secrets); nothing sensitive is hard-coded here.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
# Embedding client: ada-002 deployment on endpoint1 (same endpoint/key as the
# env vars above).
embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)
# Chat client: GPT-4 deployment on a *separate* endpoint/key (endpoint2/key2),
# with its own (newer) API version. temperature=0. for deterministic output.
llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.
)
def get_feeds_data(url):
    """Unpickle feed data from a local file and report success in the sidebar.

    Despite the name, ``url`` is a local filesystem path (remote loading via
    ``cp.load(urlopen(url))`` was the earlier approach).
    """
    with open(url, "rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Loaded data")
    return payload
# Earlier remote sources, kept for reference:
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"

# Snapshot date stamps the local pickle filenames below.
dateval = "16-Jun-2024"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
def get_embedding_data(url):
    """Unpickle the 2-D embedding array from a local file path.

    ``url`` is a local path (a remote ``cp.load(urlopen(url))`` variant was
    used previously); posts a success message to the Streamlit sidebar.
    """
    with open(url, "rb") as handle:
        payload = pickle.load(handle)
    st.sidebar.success("Fetched data from API!")
    return payload
# Earlier remote source, kept for reference:
# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"
e2d = get_embedding_data(url)
# Older pickle layout returned a 5-tuple:
# e2d, _, _, _, _ = get_embedding_data(url)
# Flatten the per-chunk arXiv feeds into parallel lists of paper metadata.
# Fix: the original set `ctr = -1` / `num_chunks = len(gal_feeds)` twice in a
# row (verbatim paste duplicate); the pair now appears once.
ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []
for nc in range(num_chunks):
    # Iterate entries directly instead of indexing by range(len(...)).
    for entry in gal_feeds[nc].entries:
        # Abstract: strip newlines and backslashes for clean downstream text.
        text = entry.summary.replace('\n', ' ').replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # arXiv id: last URL path component minus the 2-char version suffix.
        all_arxivid.append(entry.id.split('/')[-1][0:-2])
        # links[1] is the abstract-page link — assumes feedparser's usual
        # ordering; TODO confirm against the feed source.
        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)
        # Published stamp looks like "YYYY-MM-DDTHH:MM:SSZ"; slice out the
        # date and time pieces and parse them together.
        temp = entry.published
        datetime_object = datetime.datetime.strptime(
            temp[0:10] + ' ' + temp[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(datetime_object)
        # Paper age in days relative to "now" (naive local time).
        all_old.append((datetime.datetime.now() - datetime_object).days)
def make_time_excess_plot(midage = 0, tolage = 1, onlyolder = False, bw = 0.05, sigma = 4.0):
    """Render a smoothed 2-D map of where recent research exceeds the baseline.

    Histograms the 2-D embedding `e2d` for papers within `tolage` years of
    `midage` years old, subtracts a reference histogram (the full corpus, or
    only papers older than the window when `onlyolder` is True), Gaussian-
    smooths the difference, and draws it into the Streamlit app.

    Parameters
    ----------
    midage : float — center of the age window, in years.
    tolage : float — half-width of the age window, in years.
    onlyolder : bool — if True, the baseline is restricted to papers older
        than the window instead of the whole corpus.
    bw : float — histogram bin width (new parameter; default preserves the
        previous hard-coded value).
    sigma : float — Gaussian smoothing width in bins (new parameter; default
        preserves the previous hard-coded value).

    Uses module globals `all_old` (paper ages in days) and `e2d` (N x 2
    embedding); plots via matplotlib and st.pyplot. Returns None.
    """
    ages = np.array(all_old)  # hoisted: was recomputed per mask
    bins = np.arange(0, 17, bw)  # same edges for both axes
    mask = (np.abs(ages - midage * 365) < tolage * 365)
    if onlyolder:  # fix: was `== True`
        # Baseline: only papers strictly older than the window's far edge.
        mask2 = (ages > midage * 365 + tolage * 365 / 2)
        a = np.histogram2d(e2d[0:, 0][mask2], e2d[0:, 1][mask2], bins=bins, density=True)
    else:
        a = np.histogram2d(e2d[0:, 0], e2d[0:, 1], bins=bins, density=True)
    b = np.histogram2d(e2d[0:, 0][mask], e2d[0:, 1][mask], bins=bins, density=True)
    # Excess density (window minus baseline), smoothed to suppress bin noise.
    temp = b[0].T - a[0].T
    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
    # Symmetric color scale from the central 99% of values.
    vscale = (np.nanpercentile(temp, 99.5) - np.nanpercentile(temp, 0.5)) / 2
    fig, ax = plt.subplots(1, 1, figsize=(11, 9))
    # Plot at bin centers (edge + half a bin width).
    plt.pcolor(a[1][0:-1] + (a[1][1] - a[1][0]) / 2,
               a[2][0:-1] + (a[2][1] - a[2][0]) / 2,
               temp, cmap='bwr',
               vmin=-vscale, vmax=vscale)
    plt.colorbar()
    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' % (tolage, midage))
    plt.axis([0, 14, 1, 15])
    plt.axis('off')
    st.pyplot(fig)
    return
# --- Streamlit page: sliders drive two excess-research maps. ---
st.title('Research hotspots')
st.markdown('[Includes papers up to: `' + dateval + '`]')

slider_min, slider_max = 0., 10.
midage = st.slider('Age', slider_min, slider_max, 0.)
tolage = st.slider('Period width', slider_min, slider_max, 1.)

st.markdown('Compare the research in a given time period to the full manifold.')
make_time_excess_plot(midage, tolage, onlyolder = False)

st.markdown('Compare the research in a given time period to research older than that.')
make_time_excess_plot(midage, tolage, onlyolder = True)