"""
"""
import os

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_together import ChatTogether
from langchain_google_genai import ChatGoogleGenerativeAI
import streamlit as st

import utils_mod
import doc_format_mod
import guide_mod
import sidebar_mod
import usage_mod
import vectorstore_mod

st.set_page_config(layout="wide", page_title="LegisQA")
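
# LangSmith tracing configuration; the API key and project name come from Streamlit secrets.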
os.environ["LANGCHAIN_API_KEY"] = st.secrets["langchain_api_key"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = st.secrets["langchain_project"]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
SS = st.session_state
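
# Fixed seed passed to providers that support deterministic sampling.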
SEED = 292764
CONGRESS_NUMBERS = [113, 114, 115, 116, 117, 118]
SPONSOR_PARTIES = ["D", "R", "L", "I"]
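
# Per-provider chat model catalogs. "pmi"/"pmo" are USD per million input/output
# tokens, passed through to usage_mod.display_api_usage for cost estimates.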
OPENAI_CHAT_MODELS = {
"gpt-4o-mini": {"cost": {"pmi": 0.15, "pmo": 0.60}},
"gpt-4o": {"cost": {"pmi": 5.00, "pmo": 15.0}},
}
ANTHROPIC_CHAT_MODELS = {
"claude-3-haiku-20240307": {"cost": {"pmi": 0.25, "pmo": 1.25}},
"claude-3-5-sonnet-20240620": {"cost": {"pmi": 3.00, "pmo": 15.0}},
"claude-3-opus-20240229": {"cost": {"pmi": 15.0, "pmo": 75.0}},
}
TOGETHER_CHAT_MODELS = {
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": {
        "cost": {"pmi": 0.18, "pmo": 0.18}
    },
    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": {
        "cost": {"pmi": 0.88, "pmo": 0.88}
    },
    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {
        "cost": {"pmi": 5.00, "pmo": 5.00}
    },
}
GOOGLE_CHAT_MODELS = {
"gemini-1.5-flash": {"cost": {"pmi": 0.0, "pmo": 0.0}},
"gemini-1.5-pro": {"cost": {"pmi": 0.0, "pmo": 0.0}},
"gemini-1.5-pro-exp-0801": {"cost": {"pmi": 0.0, "pmo": 0.0}},
}
PROVIDER_MODELS = {
"OpenAI": OPENAI_CHAT_MODELS,
"Anthropic": ANTHROPIC_CHAT_MODELS,
"Together": TOGETHER_CHAT_MODELS,
"Google": GOOGLE_CHAT_MODELS,
}


def render_example_queries():
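    """Display a set of sample queries in an expander."""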
with st.expander("Example Queries"):
st.write(
"""
```
What are the themes around artificial intelligence?
```
```
Write a well-cited 3-paragraph essay on food insecurity.
```
```
Create a table summarizing major climate change ideas with columns legis_id, title, idea.
```
```
Write an action plan to keep social security solvent.
```
```
Suggest reforms that would benefit the Medicaid program.
```
"""
)


def get_generative_config(key_prefix: str) -> dict:
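    """Render generation widgets (provider, model, sampling params) and return their values."""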
output = {}
key = "provider"
output[key] = st.selectbox(
label=key, options=PROVIDER_MODELS.keys(), key=f"{key_prefix}|{key}"
)
key = "model_name"
output[key] = st.selectbox(
label=key,
options=PROVIDER_MODELS[output["provider"]],
key=f"{key_prefix}|{key}",
)
key = "temperature"
output[key] = st.slider(
key,
min_value=0.0,
max_value=2.0,
value=0.0,
key=f"{key_prefix}|{key}",
)
key = "max_output_tokens"
output[key] = st.slider(
key,
min_value=1024,
max_value=2048,
key=f"{key_prefix}|{key}",
)
key = "top_p"
output[key] = st.slider(
key, min_value=0.0, max_value=1.0, value=0.9, key=f"{key_prefix}|{key}"
)
key = "should_escape_markdown"
output[key] = st.checkbox(
key,
value=False,
key=f"{key_prefix}|{key}",
)
key = "should_add_legis_urls"
output[key] = st.checkbox(
key,
value=True,
key=f"{key_prefix}|{key}",
)
return output


def get_retrieval_config(key_prefix: str) -> dict:
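    """Render retrieval widgets (chunk count and metadata filters) and return their values."""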
output = {}
key = "n_ret_docs"
output[key] = st.slider(
"Number of chunks to retrieve",
min_value=1,
max_value=32,
value=8,
key=f"{key_prefix}|{key}",
)
key = "filter_legis_id"
output[key] = st.text_input("Bill ID (e.g. 118-s-2293)", key=f"{key_prefix}|{key}")
key = "filter_bioguide_id"
output[key] = st.text_input("Bioguide ID (e.g. R000595)", key=f"{key_prefix}|{key}")
key = "filter_congress_nums"
output[key] = st.multiselect(
"Congress Numbers",
CONGRESS_NUMBERS,
default=CONGRESS_NUMBERS,
key=f"{key_prefix}|{key}",
)
key = "filter_sponsor_parties"
output[key] = st.multiselect(
"Sponsor Party",
SPONSOR_PARTIES,
default=SPONSOR_PARTIES,
key=f"{key_prefix}|{key}",
)
return output


def get_llm(gen_config: dict):
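    """Construct a chat model client for the selected provider."""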
match gen_config["provider"]:
case "OpenAI":
llm = ChatOpenAI(
model=gen_config["model_name"],
temperature=gen_config["temperature"],
api_key=st.secrets["openai_api_key"],
top_p=gen_config["top_p"],
seed=SEED,
max_tokens=gen_config["max_output_tokens"],
)
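        # Note: the Anthropic API does not expose a sampling seed.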
case "Anthropic":
llm = ChatAnthropic(
model_name=gen_config["model_name"],
temperature=gen_config["temperature"],
api_key=st.secrets["anthropic_api_key"],
top_p=gen_config["top_p"],
max_tokens_to_sample=gen_config["max_output_tokens"],
)
case "Together":
llm = ChatTogether(
model=gen_config["model_name"],
temperature=gen_config["temperature"],
max_tokens=gen_config["max_output_tokens"],
top_p=gen_config["top_p"],
seed=SEED,
api_key=st.secrets["together_api_key"],
)
case "Google":
llm = ChatGoogleGenerativeAI(
model=gen_config["model_name"],
temperature=gen_config["temperature"],
api_key=st.secrets["google_api_key"],
max_output_tokens=gen_config["max_output_tokens"],
top_p=gen_config["top_p"],
)
        case _:
            raise ValueError(f"Unknown provider: {gen_config['provider']}")
return llm


def create_rag_chain(llm, retriever):
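    """Build a RAG chain: retrieve chunks, format them as JSON context, prompt the LLM."""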
QUERY_RAG_TEMPLATE = """You are an expert legislative analyst. Use the following excerpts from US congressional legislation to respond to the user's query. The excerpts are formatted as a JSON list. Each JSON object has "legis_id", "title", "introduced_date", "sponsor", and "snippets" keys. If a snippet is useful in writing part of your response, then cite the "legis_id", "title", "introduced_date", and "sponsor" in the response. When citing legis_id, use the same format as the excerpts (e.g. "116-hr-125"). If you don't know how to respond, just tell the user.
---
Congressional Legislation Excerpts:
{context}
---
Query: {query}"""
prompt = ChatPromptTemplate.from_messages(
[
("human", QUERY_RAG_TEMPLATE),
]
)
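    # Fan out: run the retriever and pass the raw query through in parallel,
    # then attach the formatted context and the LLM reply to the output dict.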
rag_chain = (
RunnableParallel(
{
"docs": retriever,
"query": RunnablePassthrough(),
}
)
.assign(context=lambda x: doc_format_mod.format_docs(x["docs"]))
.assign(aimessage=prompt | llm)
)
return rag_chain


def process_query(gen_config: dict, ret_config: dict, query: str):
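    """Run a single query through the vectorstore retriever and the RAG chain."""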
vectorstore = vectorstore_mod.load_pinecone_vectorstore()
llm = get_llm(gen_config)
vs_filter = vectorstore_mod.get_vectorstore_filter(ret_config)
retriever = vectorstore.as_retriever(
search_kwargs={"k": ret_config["n_ret_docs"], "filter": vs_filter},
)
rag_chain = create_rag_chain(llm, retriever)
response = rag_chain.invoke(query)
return response


def render_response(
response: dict,
model_info: dict,
provider: str,
should_escape_markdown: bool,
should_add_legis_urls: bool,
tag: str | None = None,
):
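    """Render the model's answer, API usage stats, and the retrieved chunks."""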
response_text = response["aimessage"].content
if should_escape_markdown:
response_text = utils_mod.escape_markdown(response_text)
if should_add_legis_urls:
response_text = utils_mod.replace_legis_ids_with_urls(response_text)
with st.container(border=True):
if tag is None:
st.write("Response")
else:
st.write(f"Response ({tag})")
st.info(response_text)
usage_mod.display_api_usage(response["aimessage"], model_info, provider, tag=tag)
doc_format_mod.render_retrieved_chunks(response["docs"], tag=tag)


def render_query_rag_tab():
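    """Render the single-model RAG tab: query form, configs, and response."""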
key_prefix = "query_rag"
render_example_queries()
with st.form(f"{key_prefix}|query_form"):
query = st.text_area(
"Enter a query that can be answered with congressional legislation:"
)
cols = st.columns(2)
with cols[0]:
query_submitted = st.form_submit_button("Submit")
with cols[1]:
status_placeholder = st.empty()
col1, col2 = st.columns(2)
with col1:
with st.expander("Generative Config"):
gen_config = get_generative_config(key_prefix)
with col2:
with st.expander("Retrieval Config"):
ret_config = get_retrieval_config(key_prefix)
rkey = f"{key_prefix}|response"
if query_submitted:
with status_placeholder:
with st.spinner("generating response"):
SS[rkey] = process_query(gen_config, ret_config, query)
if response := SS.get(rkey):
model_info = PROVIDER_MODELS[gen_config["provider"]][gen_config["model_name"]]
render_response(
response,
model_info,
gen_config["provider"],
gen_config["should_escape_markdown"],
gen_config["should_add_legis_urls"],
)
with st.expander("Debug"):
st.write(response)


def render_query_rag_sbs_tab():
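    """Render the side-by-side RAG tab comparing two generative/retrieval configs."""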
base_key_prefix = "query_rag_sbs"
with st.form(f"{base_key_prefix}|query_form"):
query = st.text_area(
"Enter a query that can be answered with congressional legislation:"
)
cols = st.columns(2)
with cols[0]:
query_submitted = st.form_submit_button("Submit")
with cols[1]:
status_placeholder = st.empty()
grp1a, grp2a = st.columns(2)
gen_configs = {}
ret_configs = {}
with grp1a:
st.header("Group 1")
key_prefix = f"{base_key_prefix}|grp1"
with st.expander("Generative Config"):
gen_configs["grp1"] = get_generative_config(key_prefix)
with st.expander("Retrieval Config"):
ret_configs["grp1"] = get_retrieval_config(key_prefix)
with grp2a:
st.header("Group 2")
key_prefix = f"{base_key_prefix}|grp2"
with st.expander("Generative Config"):
gen_configs["grp2"] = get_generative_config(key_prefix)
with st.expander("Retrieval Config"):
ret_configs["grp2"] = get_retrieval_config(key_prefix)
grp1b, grp2b = st.columns(2)
sbs_cols = {"grp1": grp1b, "grp2": grp2b}
grp_names = {"grp1": "Group 1", "grp2": "Group 2"}
for post_key_prefix in ["grp1", "grp2"]:
with sbs_cols[post_key_prefix]:
key_prefix = f"{base_key_prefix}|{post_key_prefix}"
rkey = f"{key_prefix}|response"
if query_submitted:
with status_placeholder:
                    with st.spinner(
                        f"generating response for {grp_names[post_key_prefix]}"
                    ):
SS[rkey] = process_query(
gen_configs[post_key_prefix],
ret_configs[post_key_prefix],
query,
)
if response := SS.get(rkey):
model_info = PROVIDER_MODELS[gen_configs[post_key_prefix]["provider"]][
gen_configs[post_key_prefix]["model_name"]
]
render_response(
response,
model_info,
gen_configs[post_key_prefix]["provider"],
gen_configs[post_key_prefix]["should_escape_markdown"],
gen_configs[post_key_prefix]["should_add_legis_urls"],
tag=grp_names[post_key_prefix],
)


def main():
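    """Lay out the page: title, sidebar, and the RAG, side-by-side, and guide tabs."""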
st.title(":classical_building: LegisQA :classical_building:")
st.header("Query Congressional Bills")
with st.sidebar:
sidebar_mod.render_sidebar()
query_rag_tab, query_rag_sbs_tab, guide_tab = st.tabs(
[
"RAG",
"RAG (side-by-side)",
"Guide",
]
)
with query_rag_tab:
render_query_rag_tab()
with query_rag_sbs_tab:
render_query_rag_sbs_tab()
with guide_tab:
guide_mod.render_guide()
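

# Launch locally with: streamlit run app.py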
if __name__ == "__main__":
main()