Spaces:
Sleeping
Sleeping
File size: 8,928 Bytes
0ccd99e c1f42ca 0ccd99e bf691dd 0ccd99e 7106162 0ccd99e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import math
import altair as alt
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline
import pandas as pd
import streamlit as st
@st.cache_data
def load_data(file):
    """Read the inventory TSV and tidy it for display.

    Reorders the columns, strips the stray carriage return from the last
    header, and trims trailing whitespace from agency names (which otherwise
    produced a duplicate "DOC" category).
    """
    table = pd.read_csv(file, sep="\t", lineterminator='\n')
    # Present the columns in a reader-friendly order.
    ordered_cols = ['Agency', 'Name of Inventory Item',
                    'Primary Type of AI',
                    'Purpose of AI', 'Length of Usage',
                    'Does it directly impact the public?',
                    'Vendor System',
                    'Description of Inventory Item',
                    'Other Notes\r']
    table = table[ordered_cols]
    # The final header carries a '\r' left over from Windows line endings.
    table = table.rename(columns={'Other Notes\r': 'Other Notes'})
    # Trailing spaces in agency names split one agency into two categories.
    table['Agency'] = table['Agency'].apply(lambda name: name.rstrip())
    return table
@st.cache_data
def plot_impact_by_agency(df):
    """Build a horizontal stacked bar chart of public-impact counts per agency.

    Returns an Altair chart with one bar per agency, segmented and colored by
    the 'Does it directly impact the public?' category.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()
    df = df.rename(columns={'Does it directly impact the public?': 'Impact on public'})
    impact_counts = df.groupby('Agency')['Impact on public'].value_counts()
    impact_counts = impact_counts.sort_index(level="Agency", ascending=False)
    # NOTE(review): on pandas >= 2.0 the value_counts column is named 'count',
    # making this rename a no-op — which matches the lowercase "count" field
    # used in alt.X below. On pre-2.0 pandas the column would be named 'Count'
    # and the chart's x encoding would not resolve; confirm the pinned pandas
    # version before changing either side.
    impact_count_df = pd.DataFrame(impact_counts).rename(columns={'Impact on public' : "Count"}).reset_index()
    # Fixed color scale: impact severity maps to a consistent palette.
    domain = ['Direct impact', 'Indirect impact', 'No impact']
    range_ = ['red', 'darkorange', 'steelblue']
    chart = (alt.Chart(impact_count_df).mark_bar(align="right").encode(
        x=alt.X("count", type="quantitative", title="Number of entries", axis=alt.Axis()),
        y=alt.Y("Agency", type="nominal", title="Agency", axis=alt.Axis(labelLimit=300, labelFlushOffset=5)),
        color=alt.Color("Impact on public", scale=alt.Scale(domain=domain, range=range_)),
        )
    )
    return chart
@st.cache_data
def plot_count_by_category(column, df):
    """Horizontal bar chart of entry counts for one categorical column.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` whose values are counted.
    df : pandas.DataFrame
        The inventory table.
    """
    counts = df[column].value_counts().sort_values(ascending=True)
    table = pd.DataFrame(counts).reset_index()
    table.columns = [column, "Count"]
    # Bars sorted longest-first ("-x"); wide label limit keeps long category
    # names readable.
    chart = (
        alt.Chart(table)
        .mark_bar(align="right")
        .encode(
            x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
            y=alt.Y(column, type="nominal", title="", sort="-x",
                    axis=alt.Axis(labelLimit=300, labelFlushOffset=5)))
    )
    return chart
@st.cache_data
def filter_table(df, choices):
    """Restrict ``df`` to the rows selected in the dropdown menus.

    Parameters
    ----------
    df : pandas.DataFrame
        The inventory table.
    choices : dict
        Maps a column name to the list of values the user picked,
        e.g. ``{"Agency": ["USDA", "USDOC"]}``. A selection containing
        "Select all" leaves that column unfiltered.
    """
    for column, desired_values in choices.items():
        if "Select all" in desired_values:
            continue  # this menu imposes no restriction
        df = df[df[column].isin(desired_values)]
    return df
@st.cache_resource
def create_search_pipeline(df, col_list):
    """Index the text of ``col_list`` columns into an in-memory BM25 pipeline.

    Each cell becomes one Haystack Document whose metadata records the
    originating row index and column header, so search hits can be mapped
    back to specific table cells.

    Parameters
    ----------
    df : pandas.DataFrame
        The inventory table.
    col_list : list[str]
        Columns whose text should be searchable.

    Returns
    -------
    DocumentSearchPipeline
        A BM25 retrieval pipeline over the indexed cells.

    Note: cached with ``st.cache_resource`` rather than ``st.cache_data``
    because the returned pipeline is an unserializable global resource;
    ``cache_data`` would try to serialize it on every call.
    """
    document_store = InMemoryDocumentStore(use_bm25=True)
    docs = []
    for col in col_list:
        # Pair each cell value with its row label so hits can be traced back.
        for row_idx, value in zip(df.index.values, df[col].tolist()):
            docs.append(Document.from_dict({
                'content': value,
                'meta': {"index": row_idx, "column_header": col},
            }))
    document_store.write_documents(docs)
    retriever = BM25Retriever(document_store=document_store)
    pipeline = DocumentSearchPipeline(retriever)
    return pipeline
@st.cache_data
def run_search(text, _pipeline):
    """Run a BM25 query and return matching (row, column) cell locations.

    Parameters
    ----------
    text : str
        The user's query; an empty string short-circuits to None.
    _pipeline : DocumentSearchPipeline
        Search pipeline; the leading underscore tells Streamlit's cache not
        to hash this argument.

    Returns
    -------
    tuple[list, list] | None
        Parallel lists of row indices and column headers for hits scoring
        above 0.5, or None for an empty query.
    """
    if text == "":
        return None
    # Bug fix: use the _pipeline argument. The original called the
    # module-level `pipeline` global, silently ignoring its parameter.
    res = _pipeline.run(query=text, params={"Retriever": {"top_k": 10}})
    relevant_results = [r for r in res['documents'] if r.score > 0.5]
    result_rows = [doc.meta['index'] for doc in relevant_results]
    result_cols = [doc.meta['column_header'] for doc in relevant_results]
    return (result_rows, result_cols)
@st.cache_data
def produce_table(df, table_indices):
    """Assemble the search-result rows plus a matching cell-highlight frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The full inventory table.
    table_indices : tuple[list, list] | None
        Parallel (row indices, column headers) lists from ``run_search``.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame] | None
        The de-duplicated hit rows and a same-shaped frame of CSS strings
        for Styler.apply, or None when ``table_indices`` is falsy.
    """
    if not table_indices:
        return None
    hit_rows, hit_cols = table_indices[0], table_indices[1]
    # NOTE(review): iloc is positional while the .loc writes below use index
    # labels — these coincide only because load_data leaves the default
    # RangeIndex in place; confirm if the index ever changes.
    result_df = df.iloc[hit_rows, :].drop_duplicates()
    # Start from an all-empty style frame, then mark the matched cells.
    color_df = result_df.copy()
    color_df.loc[:, :] = ''
    for row, col in zip(hit_rows, hit_cols):
        color_df.loc[row, col] = 'background-color: yellow'
    return result_df, color_df
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 tab-separated bytes for download."""
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode('utf-8')
if __name__ == "__main__":
    input_file = "Agency Inventory AI Usage - Sheet1.tsv"

    # --- Intro ---
    st.markdown("# U.S. Federal Government Use of AI")
    main_text = """
The data visualized here come from a report by Anna Blue, a 2023 Social Impact Fellow
at the [Responsible AI Institute](https://www.responsible.ai). The report was released in May 2023. Some agencies have
released updated inventories since then, which are not reflected here.
Anna's report consolidated and annotated data released by individual government agencies in compliance with
Executive Order 13960, which requires federal agencies to produce an annual inventory of their AI usage.
See her [blog post](https://www.responsible.ai/post/federal-government-ai-use-cases) for additional details,
including links to the original data sources.
"""
    st.markdown(main_text)
    df = load_data(input_file)

    # --- Stacked bar chart of impact on the public, by agency ---
    st.subheader("Impact of systems on the public, by agency")
    stacked_bar_chart = plot_impact_by_agency(df)
    st.altair_chart(stacked_bar_chart, use_container_width=True)

    # --- Counts by user-selected category ---
    st.subheader("Number of entries by category")
    # Free-text columns make poor categorical axes, so exclude them.
    no_filter_cols = ['Name of Inventory Item', 'Description of Inventory Item', "Other Notes"]
    filter_cols = [c for c in df.columns.unique() if c not in no_filter_cols]
    column = st.selectbox("Choose what to plot", filter_cols)
    count_chart = plot_count_by_category(column, df)
    st.altair_chart(count_chart, use_container_width=True)

    # --- Table with filters for user browsing ---
    st.subheader("Explore the entries")
    st.write("Use the menus to filter the table. You can download the filtered table below.")
    filter_names = ["Agency", "Primary Type of AI", "Purpose of AI", "Length of Usage",
                    "Does it directly impact the public?", "Vendor System"]
    c1, c2 = st.columns((1, 1))
    filter_dict = {}
    # Lay the six filter menus out across two columns, three apiece.
    with c1:
        for filter_name in filter_names[:3]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    with c2:
        for filter_name in filter_names[3:]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    filtered_df = filter_table(df, filter_dict)
    st.write(filtered_df)

    # --- Download the filtered table ---
    st.write("Download current table as TSV (tab-separated values) file")
    table_output_file = st.text_input("Enter name for the file to be downloaded", value="table.tsv")
    # Fix: st.text_input returns a string, never None, so the original
    # `is not None` check was always true. Guard on truthiness so an empty
    # filename does not produce a broken download button.
    if table_output_file:
        filtered_tsv = convert_df(filtered_df)
        st.download_button("Download", filtered_tsv, file_name=table_output_file)

    # --- Text search ---
    st.subheader("Search the data")
    st.markdown("""
This will search text in the following columns:
* Name of Inventory Item
* Primary Type of AI
* Purpose of AI
* Description of Inventory Item
* Other Notes
This is a keyword search based on the BM25 algorithm.
Yellow highlighting indicates text retrieved in the search.
A download button will appear after you run a search.
""")
    searchable_cols = ['Name of Inventory Item',
                       'Primary Type of AI',
                       'Purpose of AI',
                       'Description of Inventory Item',
                       'Other Notes']
    pipeline = create_search_pipeline(df, searchable_cols)
    input_text = st.text_input("Enter text", "")
    if input_text:
        result_rows, result_cols = run_search(input_text, pipeline)
        result_df, color_df = produce_table(df, (result_rows, result_cols))
        # Highlight the matched cells in yellow via the parallel style frame.
        st.dataframe(result_df.style.apply(lambda x: color_df, axis=None))
        st.write("Download search results as TSV (tab-separated values) file")
        search_output_file = st.text_input("Enter name for the file to be downloaded", value="search_results.tsv")
        # result_df already holds only the de-duplicated search hits.
        search_tsv = convert_df(result_df)
        st.download_button("Download", search_tsv, file_name=search_output_file)
|