Spaces:
Sleeping
Sleeping
Commit
·
0ccd99e
1
Parent(s):
5c799df
add app.py
Browse files- app.py +221 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import altair as alt
|
4 |
+
from haystack import Document
|
5 |
+
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
|
6 |
+
from haystack.nodes import BM25Retriever
|
7 |
+
from haystack.pipelines import DocumentSearchPipeline
|
8 |
+
import pandas as pd
|
9 |
+
import streamlit as st
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
@st.cache_data
def load_data(file):
    """Load the AI-inventory TSV into a cleaned, display-ordered DataFrame.

    Parameters
    ----------
    file : str or file-like
        Path to (or handle for) the tab-separated inventory file.

    Returns
    -------
    pd.DataFrame
        Columns reordered for display, the stray ``\\r`` removed from the
        'Other Notes' header, and agency names stripped of trailing spaces.
    """
    df = pd.read_csv(file, sep="\t", lineterminator='\n')

    # rearrange column order
    col_list = ['Agency', 'Name of Inventory Item',
                'Primary Type of AI',
                'Purpose of AI', 'Length of Usage',
                'Does it directly impact the public?',
                'Vendor System',
                'Description of Inventory Item',
                'Other Notes\r']
    df = df[col_list]

    # remove trailing \r from 'Other Notes' header (CRLF artifact of the export)
    df = df.rename(columns={'Other Notes\r': 'Other Notes'})

    # remove trailing spaces from agency names (caused duplicate instance of "DOC").
    # .str.rstrip() propagates NaN instead of raising AttributeError on a missing
    # agency value, unlike the element-wise lambda it replaces.
    df['Agency'] = df['Agency'].str.rstrip()
    return df
|
33 |
+
|
34 |
+
|
35 |
+
@st.cache_data
def plot_impact_by_agency(df):
    """Build a horizontal stacked bar chart of public-impact counts per agency.

    Returns an altair Chart; colors are pinned per impact category so they
    stay stable across reruns.
    """
    frame = df.copy()
    frame = frame.rename(columns={'Does it directly impact the public?': 'Impact on public'})
    counts = frame.groupby('Agency')['Impact on public'].value_counts()
    counts = counts.sort_index(level="Agency", ascending=False)
    count_frame = pd.DataFrame(counts).rename(columns={'Impact on public': "Count"}).reset_index()
    # fixed domain/range so each impact category always gets the same hue
    impact_domain = ['Direct impact', 'Indirect impact', 'No impact']
    impact_colors = ['red', 'darkorange', 'steelblue']
    return (
        alt.Chart(count_frame)
        .mark_bar(align="right")
        .encode(
            x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
            y=alt.Y("Agency", type="nominal", title="Agency", axis=alt.Axis(labelLimit=300, labelFlushOffset=5)),
            color=alt.Color("Impact on public", scale=alt.Scale(domain=impact_domain, range=impact_colors)),
        )
    )
|
51 |
+
|
52 |
+
|
53 |
+
@st.cache_data
def plot_count_by_category(column, df):
    """Bar chart of the number of entries for each distinct value in `column`."""
    counts = df[column].value_counts().sort_values(ascending=True)
    table = pd.DataFrame(counts).reset_index()
    table.columns = [column, "Count"]
    chart = alt.Chart(table).mark_bar(align="right").encode(
        x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
        y=alt.Y(column, type="nominal", title="", axis=alt.Axis(labelLimit=300, labelFlushOffset=5), sort="-x"),
    )
    return chart
|
66 |
+
|
67 |
+
|
68 |
+
@st.cache_data
def filter_table(df, choices):
    """
    Filter the table row-wise according to the user's dropdown selections.

    choices: dict mapping a column name to the list of values selected for
    it, e.g. {"Agency": ["USDA", "USDOC"]}. Any column whose selection
    includes "Select all" is left unfiltered.
    """
    for column, desired_values in choices.items():
        if "Select all" in desired_values:
            continue
        df = df[df[column].isin(desired_values)]
    return df
|
80 |
+
|
81 |
+
|
82 |
+
@st.cache_data
def create_search_pipeline(df, col_list):
    """Index the text of `col_list` columns into an in-memory BM25 pipeline.

    Every cell becomes one Document whose meta records the originating row
    index and column header, so search hits can be mapped back to cells.
    """
    store = InMemoryDocumentStore(use_bm25=True)
    row_labels = list(df.index.values)
    documents = []
    for header in col_list:
        cell_values = df[header].tolist()
        assert len(row_labels) == len(cell_values)
        for label, cell in zip(row_labels, cell_values):
            documents.append(Document.from_dict({
                'content': cell,
                'meta': {"index": label, "column_header": header},
            }))
    store.write_documents(documents)
    retriever = BM25Retriever(document_store=store)
    return DocumentSearchPipeline(retriever)
|
99 |
+
|
100 |
+
|
101 |
+
@st.cache_data
def run_search(text, _pipeline):
    """Run a BM25 keyword search and return the matching cell coordinates.

    Parameters
    ----------
    text : str
        The user's query; an empty string short-circuits to None.
    _pipeline : DocumentSearchPipeline
        The search pipeline. The leading underscore tells st.cache_data not
        to hash this (unhashable) argument.

    Returns
    -------
    tuple[list, list] | None
        Parallel lists of row indices and column headers for documents
        scoring above 0.5, or None for an empty query.
    """
    if text == "":
        return None
    # Bug fix: call the _pipeline argument rather than the module-level
    # `pipeline` global the original accidentally referenced — that only
    # worked by coincidence inside this script's __main__ block and would
    # be a NameError for any other caller.
    res = _pipeline.run(query=text, params={"Retriever": {"top_k": 10}})
    relevant_results = [r for r in res['documents'] if r.score > 0.5]
    result_rows = [doc.meta['index'] for doc in relevant_results]
    result_cols = [doc.meta['column_header'] for doc in relevant_results]
    return (result_rows, result_cols)
|
110 |
+
|
111 |
+
|
112 |
+
@st.cache_data
def produce_table(df, table_indices):
    """Build the search-result table and a matching per-cell style frame.

    table_indices: (row_indices, column_headers) parallel lists as returned
    by run_search, or a falsy value when there was no query. Returns
    (result_df, color_df) where color_df carries the CSS that highlights
    the cells the search matched, or None when table_indices is falsy.
    """
    if not table_indices:
        return None
    hit_rows = table_indices[0]
    hit_cols = table_indices[1]
    result_df = df.iloc[hit_rows, :]
    # a row matched in several columns is selected once per hit; keep one copy
    result_df = result_df.drop_duplicates()
    # start from an all-empty style frame, then mark the matched cells
    color_df = result_df.copy()
    color_df.loc[:, :] = ''
    for row_label, col_header in zip(hit_rows, hit_cols):
        color_df.loc[row_label, col_header] = 'background-color: yellow'
    return result_df, color_df
|
124 |
+
|
125 |
+
|
126 |
+
@st.cache_data
def convert_df(df):
    """Serialize the table as UTF-8 tab-separated bytes for st.download_button."""
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode('utf-8')
|
129 |
+
|
130 |
+
|
131 |
+
|
132 |
+
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path to the author's machine — the app
    # only runs locally as-is; consider st.file_uploader or a relative path.
    input_file = "/Users/carolanderson/Dropbox/repos/miscellany/webapps/Agency Inventory AI Usage - Sheet1.tsv"

    st.markdown("# U.S. Federal Government Use of AI")

    main_text = """
    The data visualized here come from a report by Anna Blue, a 2023 Social Impact Fellow
    at the [Responsible AI Institute](https://www.responsible.ai). The report was released in May 2023. Some agencies have
    released updated inventories since then, which are not reflected here.

    Anna's report consolidated and annotated data released by individual government agencies in compliance with
    Executive Order 13960, which requires federal agencies to produce an annual inventory of their AI usage.
    See her [blog post](https://www.responsible.ai/post/federal-government-ai-use-cases) for additional details,
    including links to the original data sources.
    """

    st.markdown(main_text)

    df = load_data(input_file)

    # Plot stacked bar chart of impact on the public by agency
    st.subheader("Impact of systems on the public, by agency")
    stacked_bar_chart = plot_impact_by_agency(df)
    st.altair_chart(stacked_bar_chart, use_container_width=True)

    # Plot counts by category, allowing user to select category
    st.subheader("Number of entries by category")
    # free-text columns are excluded from the category dropdown
    no_filter_cols = ['Name of Inventory Item', 'Description of Inventory Item', "Other Notes"]
    filter_cols = [c for c in df.columns.unique() if c not in no_filter_cols]
    column = st.selectbox("Choose what to plot", filter_cols)
    count_chart = plot_count_by_category(column, df)
    st.altair_chart(count_chart, use_container_width=True)

    # Table with filters for user browsing
    st.subheader("Explore the entries")
    st.write("Use the menus to filter the table. You can download the filtered table below.")
    filter_names = ["Agency", "Primary Type of AI", "Purpose of AI", "Length of Usage",
                    "Does it directly impact the public?", "Vendor System"]
    # two side-by-side columns of multiselect widgets, three filters each
    c1, c2 = st.columns((1, 1))
    filter_dict = {}
    with c1:
        for filter_name in filter_names[:3]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    with c2:
        for filter_name in filter_names[3:]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    filtered_df = filter_table(df, filter_dict)
    st.write(filtered_df)

    # Download filtered table
    st.write("Download current table as TSV (tab-separated values) file")
    table_output_file = st.text_input("Enter name for the file to be downloaded", value="table.tsv")
    # NOTE(review): st.text_input returns a str, so this check is always true;
    # presumably intended to guard an empty filename — confirm.
    if table_output_file is not None:
        csv = convert_df(filtered_df)
        st.download_button("Download", csv, file_name=table_output_file)


    # Text search
    st.subheader("Search the data")
    st.markdown("""
    This will search text in the following columns:
    * Name of Inventory Item
    * Primary Type of AI
    * Purpose of AI
    * Description of Inventory Item
    * Other Notes

    This is a keyword search based on the BM25 algorithm as implemented in the Haystack python library.
    Yellow highlighting indicates text retrieved in the search.
    """)
    searchable_cols = ['Name of Inventory Item',
                       'Primary Type of AI',
                       'Purpose of AI',
                       'Description of Inventory Item',
                       'Other Notes']
    pipeline = create_search_pipeline(df, searchable_cols)
    input_text = st.text_input("Enter text", "")
    if input_text:
        result_rows, result_cols = run_search(input_text, pipeline)
        result_df, color_df = produce_table(df, (result_rows, result_cols))
        # axis=None lets the precomputed color frame style every cell at once
        st.dataframe(result_df.style.apply(lambda x: color_df, axis=None))
        st.write("Download search results as TSV (tab-separated values) file")
        search_output_file = st.text_input("Enter name for the file to be downloaded", value="search_results.tsv")
        csv = convert_df(result_df) #TODO: change to search results
        st.download_button("Download", csv, file_name=search_output_file)
|
219 |
+
|
220 |
+
|
221 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
altair
farm-haystack[inference]
pandas
streamlit
|