Spaces:

kiyer
/

synthesist

Sleeping

synthesist / app.py

kartheikiyer

new app skeleton

ea7a22d 12 months ago

7.47 kB

	import streamlit as st
	st.set_page_config(layout="wide")

	import numpy as np
	from abc import ABC, abstractmethod
	from typing import List, Dict, Any, Tuple
	from collections import defaultdict
	from tqdm import tqdm
	import pandas as pd
	from datetime import datetime, date
	from datasets import load_dataset, load_from_disk
	from collections import Counter

	import yaml, json, requests, sys, os, time, hickle
	import concurrent.futures
	ts = time.time()

	from nltk.corpus import stopwords
	import nltk
	from openai import OpenAI
	import anthropic
	import cohere
	import faiss

	import spacy
	from string import punctuation
	import pytextrank

	# Build the spaCy pipeline once at import time and append the "textrank"
	# component (registered by the `pytextrank` import above).
	nlp = spacy.load("en_core_web_sm")
	nlp.add_pipe("textrank")

	# Make sure the NLTK English stopword list is available locally.
	# NLTK raises LookupError when a corpus has not been downloaded yet, so only
	# that case triggers the download; any other failure is left to surface.
	try:
	    stopwords.words('english')
	except LookupError:
	    nltk.download('stopwords')
	    stopwords.words('english')

	from bokeh.plotting import figure
	from bokeh.models import ColumnDataSource
	from bokeh.palettes import Spectral10

	# Data loading (local 'data/' directory first, Hugging Face download as
	# fallback) happens after the page header and About panel below.

	# Page header: the logo is read from a path relative to the working directory.
	st.image('local_files/pathfinder_logo.png')

	# "About" panel, collapsed by default (expanded=False); the text below is
	# passed to .write() as a single string.
	st.expander("About", expanded=False).write(
	"""
	Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
	sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.

	This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.

	👈 Select a tool from the sidebar to see some examples
	of what this framework can do!

	### Tool summary:
	- Please wait while the initial data loads and compiles, this takes about a minute initially.
	- `Paper search` looks for relevant papers given an arxiv id or a question.

	This is not meant to be a replacement to existing tools like the
	[ADS](https://ui.adsabs.harvard.edu/),
	[arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers
	that otherwise might be missed during a literature survey.
	It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
	if you are interested in extending it please reach out!


	Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.

	The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
	using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
	atlas that shows well studied (forests) and currently uncharted areas (water).
	"""
	)



	# Load the arXiv corpus once per session and keep it in st.session_state so
	# that later script reruns reuse it instead of reloading it.
	if 'arxiv_corpus' not in st.session_state:
	    with st.spinner('loading data...'):
	        try:
	            # Fast path: a copy previously saved under data/.
	            arxiv_corpus = load_from_disk('data/')
	        except Exception:
	            # Any ordinary load failure falls back to a fresh download.
	            # `Exception` (not a bare `except`) lets KeyboardInterrupt,
	            # SystemExit and other BaseException-derived signals propagate.
	            st.write('downloading data')
	            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data', split='train')
	            arxiv_corpus.save_to_disk('data/')
	        st.session_state.arxiv_corpus = arxiv_corpus
	        st.toast('loaded arxiv corpus')
	else:
	    # Corpus already cached for this session: rebind the module-level name
	    # so the code below never sees an unbound `arxiv_corpus`.
	    arxiv_corpus = st.session_state.arxiv_corpus

	# Cache the individual columns read from the corpus, once per session.
	if 'ids' not in st.session_state:
	    st.session_state.ids = arxiv_corpus['ads_id']
	    st.session_state.titles = arxiv_corpus['title']
	    st.session_state.abstracts = arxiv_corpus['abstract']
	    st.session_state.cites = arxiv_corpus['cites']
	    st.session_state.years = arxiv_corpus['date']
	    # `ts` is the timestamp taken right after the imports at the top of the file.
	    st.toast('done caching. time:taken: {}'.format(time.time()-ts))
	# Function to simulate question answering (replace with actual implementation)
	def answer_question(question: str, keywords: List[str], toggles: Dict[str, bool],
	                    method: str, question_type: str) -> str:
	    """Return a placeholder answer for *question*.

	    Only ``question``, ``method`` and ``question_type`` are interpolated into
	    the returned text; ``keywords`` and ``toggles`` are accepted but not used
	    by this stub.
	    """
	    # Simulated answer (replace with actual logic)
	    return f"Answer to '{question}' using method {method} for {question_type} question."

	# Function to simulate paper retrieval (replace with actual implementation)
	def get_papers() -> pd.DataFrame:
	    """Return a fixed three-row table of placeholder papers.

	    The frame has a ``Title`` column and a ``Relevance`` column.
	    """
	    # Simulated paper data (replace with actual data retrieval)
	    return pd.DataFrame({
	        'Title': ['Paper 1', 'Paper 2', 'Paper 3'],
	        'Relevance': [0.9, 0.7, 0.5],
	    })

	# Function to create embedding plot (replace with actual implementation)
	def create_embedding_plot():
	    """Build and return a 400x400 Bokeh figure with five placeholder points.

	    Each point takes its colour from the first five entries of ``Spectral10``.
	    """
	    # Simulated embedding data (replace with actual embedding calculation)
	    source = ColumnDataSource(data=dict(
	        x=[1, 2, 3, 4, 5],
	        y=[6, 7, 2, 4, 5],
	        colors=Spectral10[0:5],
	        labels=['A', 'B', 'C', 'D', 'E'],
	    ))

	    p = figure(width=400, height=400, title="Embedding Map")
	    # `circle(size=...)` is deprecated since Bokeh 3.4; `scatter` with a
	    # circle marker draws the same screen-sized glyphs.
	    p.scatter('x', 'y', size=20, source=source, color='colors', alpha=0.6,
	              marker='circle')
	    return p

	# Function to simulate keyword extraction (replace with actual implementation)
	def extract_keywords(question: str) -> List[str]:
	    """Return a fixed list of placeholder keywords; *question* is not inspected."""
	    # Simulated keyword extraction (replace with actual logic)
	    return ['keyword1', 'keyword2', 'keyword3']

	# Function to estimate consensus (replace with actual implementation)
	def estimate_consensus() -> float:
	    """Return a fixed placeholder consensus score of 0.75."""
	    # Simulated consensus estimation (replace with actual calculation)
	    return 0.75

	# Streamlit app
	def main() -> None:
	    """Render the sidebar inputs and, once submitted, the results page.

	    The sidebar collects the question, optional comma-separated keywords,
	    three toggles, a method, a question type and a "store output" flag.
	    When "Submit" is pressed the placeholder helpers are called and their
	    results are laid out in two columns; otherwise a hint is shown.
	    """
	    # st.title("Question Answering App")

	    # Sidebar (Inputs)
	    st.sidebar.header("Inputs")
	    question = st.sidebar.text_input("Enter your question:")
	    extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")

	    st.sidebar.subheader("Toggles")
	    toggle_a = st.sidebar.checkbox("Toggle A")
	    toggle_b = st.sidebar.checkbox("Toggle B")
	    toggle_c = st.sidebar.checkbox("Toggle C")

	    method = st.sidebar.radio("Choose a method:", ["h1", "h2", "h3"])
	    question_type = st.sidebar.selectbox("Select question type:", ["Type 1", "Type 2", "Type 3"])
	    store_output = st.sidebar.checkbox("Store the output")

	    submit_button = st.sidebar.button("Submit")

	    # Main page (Outputs)
	    if submit_button:
	        # Process inputs. Empty entries (blank input, "a,,b", trailing comma)
	        # are dropped so no empty-string keyword reaches the answer step.
	        keywords = [kw.strip() for kw in extra_keywords.split(',') if kw.strip()]
	        toggles = {'A': toggle_a, 'B': toggle_b, 'C': toggle_c}

	        # Generate outputs
	        answer = answer_question(question, keywords, toggles, method, question_type)
	        papers_df = get_papers()
	        embedding_plot = create_embedding_plot()
	        triggered_keywords = extract_keywords(question)
	        consensus = estimate_consensus()

	        # Display outputs
	        st.header("Results")

	        col1, col2 = st.columns(2)

	        with col1:
	            st.subheader("Answer")
	            st.write(answer)

	            st.subheader("Papers Used")
	            st.dataframe(papers_df)

	            st.subheader("Triggered Keywords")
	            st.write(", ".join(triggered_keywords))

	        with col2:
	            st.subheader("Embedding Map")
	            st.bokeh_chart(embedding_plot)

	            st.subheader("Question Type")
	            st.write(question_type)

	            st.subheader("Consensus Estimate")
	            st.write(f"{consensus:.2%}")

	        # NOTE(review): nothing is persisted here yet; only the confirmation
	        # message is shown.
	        if store_output:
	            st.success("Output stored successfully!")
	    else:
	        st.info("Use the sidebar to input parameters and submit to see results.")

	# Run the app when this file is executed as the main script.
	if __name__ == "__main__":
	    main()