kartheikiyer committed
Commit ea7a22d · 1 Parent(s): 888f3cd

new app skeleton
app.py CHANGED
@@ -1,52 +1,202 @@
 import streamlit as st
-from fns import *
-
-st.set_page_config(
-    page_title="Synthesist",
-    page_icon="👋",
-)
-
-# st.write("# Welcome to Pathfinder! 👋")
-st.image('local_files/synth_logo.png')
-
-st.sidebar.success("Select a function above.")
-st.sidebar.markdown("Current functions include visualizing papers in the arxiv embedding, searching for similar papers to an input paper or prompt phrase, or answering quick questions.")
-
-st.markdown("")
-st.markdown(
-    """
-    **Synthesist** (from Peter Watts' [Blindsight](https://scalar.usc.edu/works/network-ecologies/on-peter-watts-blindsight)) is a framework for searching and visualizing papers on the [arXiv](https://arxiv.org/) using the context
-    sensitivity of modern large language models (LLMs) to better parse patterns in paper contexts.
-
-    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
-
-    **👈 Select a tool from the sidebar** to see some examples
-    of what this framework can do!
-
-    ### Tool summary:
-    - Please wait while the initial data loads and compiles; this takes about a minute at first.
-    - `Paper search` looks for relevant papers given an arxiv id or a question.
-
-    This is not meant to be a replacement for existing tools like
-    [ADS](https://ui.adsabs.harvard.edu/),
-    [arxivsorter](https://www.arxivsorter.org/), semantic search, or Google Scholar, but rather a supplement to find papers
-    that might otherwise be missed during a literature survey.
-    It is trained on astro-ph.GA (astrophysics of galaxies) papers up to last-year-ish, mined from arxiv and supplemented with ADS metadata;
-    if you are interested in extending it, please reach out!
-
-    Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
-
-    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
-    using the `Arxiv embedding` page. The papers tend to cluster together by similarity, resulting in an
-    atlas that shows well-studied areas (forests) and currently uncharted ones (water).
-    """
-)
-
-s = time.time()
-st.markdown('Loading data for retrieval system, please wait before jumping to one of the pages....')
-st.session_state.retrieval_system = EmbeddingRetrievalSystem()
-st.session_state.dataset = load_dataset('arxiv_corpus/', split="train")
-st.markdown('Loaded retrieval system, time taken: %.1f sec' % (time.time() - s))
+st.set_page_config(layout="wide")
+
+import numpy as np
+from abc import ABC, abstractmethod
+from typing import List, Dict, Any, Tuple
+from collections import defaultdict, Counter
+from tqdm import tqdm
+import pandas as pd
+from datetime import datetime, date
+from datasets import load_dataset, load_from_disk
+
+import yaml, json, requests, sys, os, time, hickle
+import concurrent.futures
+ts = time.time()
+
+import nltk
+from nltk.corpus import stopwords
+from openai import OpenAI
+import anthropic
+import cohere
+import faiss
+
+import spacy
+from string import punctuation
+import pytextrank
+
+nlp = spacy.load("en_core_web_sm")
+nlp.add_pipe("textrank")
+
+# make sure the NLTK stopword list is available
+try:
+    stopwords.words('english')
+except LookupError:
+    nltk.download('stopwords')
+    stopwords.words('english')
+
+from bokeh.plotting import figure
+from bokeh.models import ColumnDataSource
+from bokeh.palettes import Spectral10
+
+# try to load the data; if that fails, pull it from huggingface and cache it on disk
+
+st.image('local_files/pathfinder_logo.png')
+
+st.expander("About", expanded=False).write(
+    """
+    Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
+    sensitivity of modern large language models (LLMs) to better parse patterns in paper contexts.
+
+    This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
+
+    **👈 Select a tool from the sidebar** to see some examples
+    of what this framework can do!
+
+    ### Tool summary:
+    - Please wait while the initial data loads and compiles; this takes about a minute at first.
+    - `Paper search` looks for relevant papers given an arxiv id or a question.
+
+    This is not meant to be a replacement for existing tools like
+    [ADS](https://ui.adsabs.harvard.edu/),
+    [arxivsorter](https://www.arxivsorter.org/), semantic search, or Google Scholar, but rather a supplement to find papers
+    that might otherwise be missed during a literature survey.
+    It is trained on astro-ph.GA (astrophysics of galaxies) papers up to last-year-ish, mined from arxiv and supplemented with ADS metadata;
+    if you are interested in extending it, please reach out!
+
+    Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
+
+    The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
+    using the `Arxiv embedding` page. The papers tend to cluster together by similarity, resulting in an
+    atlas that shows well-studied areas (forests) and currently uncharted ones (water).
+    """
+)
+
+# cache the raw corpus across Streamlit reruns
+if 'arxiv_corpus' not in st.session_state:
+    with st.spinner('loading data...'):
+        try:
+            arxiv_corpus = load_from_disk('data/')
+        except Exception:
+            st.write('downloading data')
+            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
+            arxiv_corpus.save_to_disk('data/')
+        st.session_state.arxiv_corpus = arxiv_corpus
+    st.toast('loaded arxiv corpus')
+else:
+    arxiv_corpus = st.session_state.arxiv_corpus
+
+# cache the frequently used columns as well
+if 'ids' not in st.session_state:
+    st.session_state.ids = arxiv_corpus['ads_id']
+    st.session_state.titles = arxiv_corpus['title']
+    st.session_state.abstracts = arxiv_corpus['abstract']
+    st.session_state.cites = arxiv_corpus['cites']
+    st.session_state.years = arxiv_corpus['date']
+    st.toast('done caching. time taken: {:.1f} sec'.format(time.time() - ts))
+
+# Function to simulate question answering (replace with actual implementation)
+def answer_question(question, keywords, toggles, method, question_type):
+    # Simulated answer (replace with actual logic)
+    return f"Answer to '{question}' using method {method} for {question_type} question."
+
+# Function to simulate paper retrieval (replace with actual implementation)
+def get_papers():
+    # Simulated paper data (replace with actual data retrieval)
+    return pd.DataFrame({
+        'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
+        'Relevance': [0.9, 0.7, 0.5]
+    })
+
+# Function to create embedding plot (replace with actual implementation)
+def create_embedding_plot():
+    # Simulated embedding data (replace with actual embedding calculation)
+    source = ColumnDataSource(data=dict(
+        x=[1, 2, 3, 4, 5],
+        y=[6, 7, 2, 4, 5],
+        colors=Spectral10[0:5],
+        labels=['A', 'B', 'C', 'D', 'E']
+    ))
+    p = figure(width=400, height=400, title="Embedding Map")
+    p.circle('x', 'y', size=20, source=source, color='colors', alpha=0.6)
+    return p
+
+# Function to simulate keyword extraction (replace with actual implementation)
+def extract_keywords(question):
+    # Simulated keyword extraction (replace with actual logic)
+    return ['keyword1', 'keyword2', 'keyword3']
+
+# Function to estimate consensus (replace with actual implementation)
+def estimate_consensus():
+    # Simulated consensus estimation (replace with actual calculation)
+    return 0.75
+
+# Streamlit app
+def main():
+    # st.title("Question Answering App")
+
+    # Sidebar (Inputs)
+    st.sidebar.header("Inputs")
+    question = st.sidebar.text_input("Enter your question:")
+    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
+
+    st.sidebar.subheader("Toggles")
+    toggle_a = st.sidebar.checkbox("Toggle A")
+    toggle_b = st.sidebar.checkbox("Toggle B")
+    toggle_c = st.sidebar.checkbox("Toggle C")
+
+    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
+    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
+    store_output = st.sidebar.checkbox("Store the output")
+
+    submit_button = st.sidebar.button("Submit")
+
+    # Main page (Outputs)
+    if submit_button:
+        # Process inputs
+        keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
+        toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}
+
+        # Generate outputs
+        answer = answer_question(question, keywords, toggles, method, question_type)
+        papers_df = get_papers()
+        embedding_plot = create_embedding_plot()
+        triggered_keywords = extract_keywords(question)
+        consensus = estimate_consensus()
+
+        # Display outputs
+        st.header("Results")
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.subheader("Answer")
+            st.write(answer)
+
+            st.subheader("Papers Used")
+            st.dataframe(papers_df)
+
+            st.subheader("Triggered Keywords")
+            st.write(", ".join(triggered_keywords))
+
+        with col2:
+            st.subheader("Embedding Map")
+            st.bokeh_chart(embedding_plot)
+
+            st.subheader("Question Type")
+            st.write(question_type)
+
+            st.subheader("Consensus Estimate")
+            st.write(f"{consensus:.2%}")
+
+        if store_output:
+            st.success("Output stored successfully!")
+    else:
+        st.info("Use the sidebar to input parameters and submit to see results.")
+
+if __name__ == "__main__":
+    main()
 
 
app.py:Zone.Identifier ADDED
File without changes
local_files/arxiv_ads_corpus_coordsonly_v3.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:84bf489fe113fa63fbb21def14b97c080d01b91146fc9e867c53012721770780
-size 4348940

local_files/embeddings_matrix.npy DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4be06660e0ce1cf21461c03bc51bca09bc86d5f7d1c60d283eb6b28bbeb10788
-size 3336732800

local_files/index_mapping.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:87bc197008b1faf15a9d64ac42fc4ee03b7f6e5d4bd9ffc6ac64c4156c3a8e24
-size 7656835

local_files/keyword_index.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b100ce52f6bff88576430aae32d4617722deb2168b3740842e96ea265d3aaf59
-size 138660705

local_files/metadata.json DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f5baae76dffec0f2326a6c0d4138e61ef2e81faca762c6f1fb43076c29a61ca3
-size 1180396608

pages/1 retrieval.py DELETED
@@ -1,124 +0,0 @@
-import time
-s = time.time()
-
-import os
-import datetime
-import faiss
-import streamlit as st
-import feedparser
-import urllib
-import cloudpickle as cp
-import pickle
-from urllib.request import urlopen
-from summa import summarizer
-import numpy as np
-import matplotlib.pyplot as plt
-import requests
-import json
-from scipy import ndimage
-
-from langchain_openai import AzureOpenAIEmbeddings
-# from langchain.llms import OpenAI
-from langchain_community.llms import OpenAI
-from langchain_openai import AzureChatOpenAI
-
-from fns import *
-
-st.image('local_files/synth_logo.png')
-st.markdown("")
-
-query = st.text_input('Ask me anything:',
-                      value="What causes galaxy quenching at high redshifts?")
-
-arxiv_id = None
-top_k = st.slider('How many papers should I show?', 1, 30, 6)
-
-retrieval_system = st.session_state.retrieval_system
-results = retrieval_system.retrieve(query, arxiv_id, top_k)
-
-aids = st.session_state.dataset['id']
-titles = st.session_state.dataset['title']
-auths = st.session_state.dataset['author']
-bibcodes = st.session_state.dataset['bibcode']
-all_keywords = st.session_state.dataset['keyword_search']
-allyrs = st.session_state.dataset['year']
-ret_indices = np.array([aids.index(results[i]) for i in range(top_k)])
-
-# expand two-digit years (pivot at 50: below 50 -> 20xx, otherwise 19xx)
-yrs = []
-for i in range(len(ret_indices)):
-    yr = allyrs[ret_indices[i]]
-    if yr < 50:
-        yr = yr + 2000
-    else:
-        yr = yr + 1900
-    yrs.append(yr)
-
-print_titles = [titles[ret_indices[i]][0] for i in range(len(ret_indices))]
-print_auths = [auths[ret_indices[i]][0] + ' et al. ' + str(yrs[i]) for i in range(len(ret_indices))]
-print_links = ['[' + bibcodes[ret_indices[i]] + '](https://ui.adsabs.harvard.edu/abs/' + bibcodes[ret_indices[i]] + '/abstract)' for i in range(len(ret_indices))]
-
-st.divider()
-st.header('top-k papers:')
-
-for i in range(len(ret_indices)):
-    st.subheader(str(i + 1) + '. ' + print_titles[i])
-    st.write(print_auths[i] + ' ' + print_links[i])
-
-st.divider()
-st.header('top-k papers in context:')
-
-gtkws = get_keywords(query, ret_indices, all_keywords)
-
-umap, clbls, all_kws = load_umapcoords('local_files/arxiv_ads_corpus_coordsonly_v3.pkl')
-
-fig = plt.figure(figsize=(12 * 1.8 * 1.2, 9 * 2. * 1.2))
-im = plt.imread('local_files/astro_worldmap.png')
-implot = plt.imshow(im)
-
-# rescale UMAP coordinates into the pixel frame of the background map
-xax = (umap[0:, 1] - np.amin(umap[0:, 1])) + .0
-xax = xax / np.amax(xax)
-xax = xax * 1580 + 170
-yax = (umap[0:, 0] - np.amin(umap[0:, 0])) + .0
-yax = yax / np.amax(yax)
-yax = (np.amax(yax) - yax) * 1700 + 30
-# plt.scatter(xax, yax, s=2, alpha=0.7, c='k')
-
-# label each cluster at the median position of its members
-for i in range(np.amax(clbls)):
-    clust_ids = np.arange(len(clbls))[clbls == i]
-    clust_centroid = (np.median(xax[clust_ids]), np.median(yax[clust_ids]))
-    # plt.text(clust_centroid[1], clust_centroid[0], all_kws[i], fontsize=9, ha="center", va="center",
-    #          bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3', alpha=0.3))
-    plt.text(clust_centroid[0], clust_centroid[1], all_kws[i], fontsize=9, ha="center", va="center",
-             fontfamily='serif', color='w',
-             bbox=dict(facecolor='k', edgecolor='none', boxstyle='round,pad=0.1', alpha=0.3))
-
-# highlight the retrieved papers; the top hit gets a white marker
-plt.scatter(xax[ret_indices], yax[ret_indices], c='k', s=300, zorder=100)
-plt.scatter(xax[ret_indices], yax[ret_indices], c='firebrick', s=100, zorder=101)
-plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='k', s=300, zorder=101)
-plt.scatter(xax[ret_indices[0]], yax[ret_indices[0]], c='w', s=100, zorder=101)
-
-tempx = plt.xlim(); tempy = plt.ylim()
-plt.text(0.012 * tempx[1], (0.012 + 0.03) * tempy[0], 'The world of astronomy literature', fontsize=36, fontfamily='serif')
-plt.text(0.012 * tempx[1], (0.012 + 0.06) * tempy[0], 'Query: ' + query, fontsize=18, fontfamily='serif')
-plt.text(0.012 * tempx[1], (0.012 + 0.08) * tempy[0], gtkws, fontsize=18, fontfamily='serif', va='top')
-plt.axis('off')
-st.pyplot(fig, transparent=True, bbox_inches='tight')
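
The coordinate block above maps UMAP space into the pixel frame of the background map image. A hedged restatement as a standalone function (the extents and offsets are the hard-coded values from the deleted page; the function name is ours):

```python
import numpy as np

def umap_to_pixels(umap, x_extent=1580, x_off=170, y_extent=1700, y_off=30):
    """Normalize UMAP coordinates to [0, 1] and scale them into pixel space.

    The y axis is flipped because image pixel coordinates grow downward.
    """
    xax = umap[:, 1] - np.amin(umap[:, 1])
    xax = xax / np.amax(xax) * x_extent + x_off
    yax = umap[:, 0] - np.amin(umap[:, 0])
    yax = yax / np.amax(yax)               # now in [0, 1]
    yax = (1.0 - yax) * y_extent + y_off   # flip, then scale and offset
    return xax, yax
```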
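
Likewise, the two-digit-year handling in the deleted page pivots at 50; restated as a standalone helper (name ours) with a usage example:

```python
def expand_two_digit_year(yr, pivot=50):
    """Expand a two-digit year: below the pivot -> 20xx, otherwise 19xx."""
    return yr + 2000 if yr < pivot else yr + 1900

assert expand_two_digit_year(7) == 2007
assert expand_two_digit_year(98) == 1998
```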