import os
import datetime
import pickle

import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import faiss
import feedparser
import urllib
import cloudpickle as cp
from urllib.request import urlopen
from summa import summarizer
import requests
import json
from scipy import ndimage

from langchain_openai import AzureOpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_openai import AzureChatOpenAI

# Azure OpenAI configuration (keys and endpoints come from Streamlit secrets).
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# Embedding client for the ada-002 abstract embeddings.
embeddings = AzureOpenAIEmbeddings(
    deployment="embedding",
    model="text-embedding-ada-002",
    azure_endpoint=st.secrets["endpoint1"],
)

# Chat model client (configured here, used elsewhere in the app).
llm = AzureChatOpenAI(
    deployment_name="gpt4_small",
    openai_api_version="2023-12-01-preview",
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_key=st.secrets["key2"],
    openai_api_type="azure",
    temperature=0.0,
)


@st.cache_data
def get_feeds_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded data")
    return data


# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
dateval = "16-Jun-2024"
feeds_link = "local_files/astro_ph_ga_feeds_upto_" + dateval + ".pkl"
embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_" + dateval + ".pkl"
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)


@st.cache_data
def get_embedding_data(url):
    # data = cp.load(urlopen(url))
    with open(url, "rb") as fp:
        data = pickle.load(fp)
    st.sidebar.success("Loaded embedding data")
    return data


# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = "local_files/astro_ph_ga_embedding_" + dateval + ".pkl"
e2d = get_embedding_data(url)
# e2d, _, _, _, _ = get_embedding_data(url)

# Flatten the chunked arXiv feeds into parallel lists of paper metadata.
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links = [], [], [], []
all_authors, all_pubdates, all_old = [], [], []

for nc in range(num_chunks):
    for entry in gal_feeds[nc].entries:
        # Clean up the abstract text.
        text = entry.summary
        text = text.replace('\n', ' ')
        text = text.replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)
        # Strip the version suffix (e.g. "v1") from the arXiv identifier.
        all_arxivid.append(entry.id.split('/')[-1][0:-2])
        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)
        # Parse the ISO-style timestamp ("YYYY-MM-DDTHH:MM:SSZ") and record
        # the paper's age in days.
        temp = entry.published
        datetime_object = datetime.datetime.strptime(
            temp[0:10] + ' ' + temp[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(datetime_object)
        all_old.append((datetime.datetime.now() - datetime_object).days)


def make_time_excess_plot(midage=0, tolage=1, onlyolder=False):
    """Plot where papers from a given time window are over- or
    under-represented on the 2D embedding relative to a reference set."""
    bw = 0.05     # histogram bin width in embedding units
    sigma = 4.0   # Gaussian smoothing scale, in bins

    # Papers within tolage years of an age of midage years.
    mask = (np.abs(np.array(all_old) - midage * 365) < tolage * 365)

    if onlyolder:
        # Reference set: papers strictly older than the selected window.
        mask2 = (np.array(all_old) > midage * 365 + tolage * 365 / 2)
        a = np.histogram2d(e2d[:, 0][mask2], e2d[:, 1][mask2],
                           bins=np.arange(0, 17, bw), density=True)
    else:
        # Reference set: the full corpus.
        a = np.histogram2d(e2d[:, 0], e2d[:, 1],
                           bins=np.arange(0, 17, bw), density=True)
    b = np.histogram2d(e2d[:, 0][mask], e2d[:, 1][mask],
                       bins=np.arange(0, 17, bw), density=True)

    # Smoothed density difference: selected window minus reference.
    temp = b[0].T - a[0].T
    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
    vscale = (np.nanpercentile(temp, 99.5) - np.nanpercentile(temp, 0.5)) / 2

    fig, ax = plt.subplots(1, 1, figsize=(11, 9))
    plt.pcolor(a[1][0:-1] + (a[1][1] - a[1][0]) / 2,
               a[2][0:-1] + (a[2][1] - a[2][0]) / 2,
               temp, cmap='bwr', vmin=-vscale, vmax=vscale)
    plt.colorbar()
    # plt.scatter(e2d[:, 0], e2d[:, 1], s=2, color='k', alpha=0.1)
    plt.title('Excess research within ±%.1f yr of %.1f yr ago' % (tolage, midage))
    plt.axis([0, 14, 1, 15])
    plt.axis('off')
    st.pyplot(fig)
    return


st.title('Research hotspots')
st.markdown('[Includes papers up to: `' + dateval + '`]')

midage = st.slider('Age', 0., 10., 0.)
tolage = st.slider('Period width', 0., 10., 1.)

st.markdown('Compare the research in a given time period to the full manifold.')
make_time_excess_plot(midage, tolage, onlyolder=False)

st.markdown('Compare the research in a given time period to research older than that.')
make_time_excess_plot(midage, tolage, onlyolder=True)
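
# ---------------------------------------------------------------------------
# Appendix: how the precomputed pickles above could be regenerated. The files
# are loaded ready-made, so the sketches below are assumptions about the
# offline pipeline, not the authors' actual code.
#
# A minimal sketch for rebuilding the abstract embeddings with the
# AzureOpenAIEmbeddings client defined above (assumes the same ada-002
# deployment; embed_documents returns one vector per input string):
#
#   arxiv_ada_embeddings = np.array(embeddings.embed_documents(all_text))
#   with open(embed_link, "wb") as fp:
#       pickle.dump(arxiv_ada_embeddings, fp)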
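#
# A minimal sketch for rebuilding the 2D map `e2d` from those embeddings,
# assuming a UMAP reduction (the choice of UMAP and its parameters here is
# purely illustrative; requires the umap-learn package):
#
#   import umap
#   reducer = umap.UMAP(n_components=2, n_neighbors=50, metric='cosine')
#   e2d = reducer.fit_transform(np.array(arxiv_ada_embeddings))
#   with open(url, "wb") as fp:
#       pickle.dump(e2d, fp)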