carolanderson committed on
Commit
0ccd99e
·
1 Parent(s): 5c799df

add app.py

Browse files
Files changed (2) hide show
  1. app.py +221 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import altair as alt
4
+ from haystack import Document
5
+ from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
6
+ from haystack.nodes import BM25Retriever
7
+ from haystack.pipelines import DocumentSearchPipeline
8
+ import pandas as pd
9
+ import streamlit as st
10
+
11
+
12
+
13
@st.cache_data
def load_data(file):
    """Load the agency AI-inventory TSV into a tidied DataFrame.

    Parameters
    ----------
    file : str or file-like
        Path to the tab-separated inventory file.

    Returns
    -------
    pd.DataFrame
        Columns reordered into display order, the stray carriage-return
        stripped from the 'Other Notes' header, and trailing whitespace
        removed from agency names.
    """
    df = pd.read_csv(file, sep="\t", lineterminator='\n')

    # rearrange column order
    col_list = ['Agency', 'Name of Inventory Item',
                'Primary Type of AI',
                'Purpose of AI', 'Length of Usage',
                'Does it directly impact the public?',
                'Vendor System',
                'Description of Inventory Item',
                'Other Notes\r']
    df = df[col_list]

    # remove trailing \r from 'Other Notes' header (CRLF artifact of the export)
    df = df.rename(columns={'Other Notes\r': 'Other Notes'})

    # remove trailing spaces from agency names (caused duplicate instance of "DOC");
    # vectorized .str.rstrip() replaces the slower per-element apply(lambda ...)
    df['Agency'] = df['Agency'].str.rstrip()
    return df
33
+
34
+
35
@st.cache_data
def plot_impact_by_agency(df):
    """Build a horizontal stacked bar chart of public-impact counts per agency.

    Counts the values of 'Does it directly impact the public?' within each
    agency and colors the stacked segments by impact category.
    """
    data = df.copy()
    data = data.rename(columns={'Does it directly impact the public?': 'Impact on public'})

    # per-agency counts of each impact category, agencies in reverse name order
    counts = data.groupby('Agency')['Impact on public'].value_counts()
    counts = counts.sort_index(level="Agency", ascending=False)
    plot_df = pd.DataFrame(counts).rename(columns={'Impact on public': "Count"}).reset_index()

    # fixed category -> color mapping so the legend order is stable
    impact_levels = ['Direct impact', 'Indirect impact', 'No impact']
    impact_colors = ['red', 'darkorange', 'steelblue']

    return (
        alt.Chart(plot_df)
        .mark_bar(align="right")
        .encode(
            x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
            y=alt.Y("Agency", type="nominal", title="Agency",
                    axis=alt.Axis(labelLimit=300, labelFlushOffset=5)),
            color=alt.Color("Impact on public",
                            scale=alt.Scale(domain=impact_levels, range=impact_colors)),
        )
    )
51
+
52
+
53
@st.cache_data
def plot_count_by_category(column, df):
    """Build a horizontal bar chart of entry counts for each value of `column`."""
    # value counts, smallest first; the chart's sort="-x" orders bars by count
    counts = df[column].value_counts().sort_values(ascending=True)
    table = pd.DataFrame(counts).reset_index()
    table.columns = [column, "Count"]

    chart = alt.Chart(table).mark_bar(align="right").encode(
        x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
        y=alt.Y(column, type="nominal", title="",
                axis=alt.Axis(labelLimit=300, labelFlushOffset=5),
                sort="-x"),
    )
    return chart
66
+
67
+
68
@st.cache_data
def filter_table(df, choices):
    """Filter the table according to the user's dropdown selections.

    Parameters
    ----------
    df : pd.DataFrame
        The table to filter.
    choices : dict
        Maps a column name to the list of values selected for it,
        e.g. {"Agency": ["USDA", "USDOC"]}. A column whose selection
        includes "Select all" is left unfiltered.

    Returns
    -------
    pd.DataFrame
        Rows matching every active filter.
    """
    for column, desired_values in choices.items():
        if "Select all" in desired_values:
            continue  # this column is unfiltered
        df = df[df[column].isin(desired_values)]
    return df
80
+
81
+
82
@st.cache_data
def create_search_pipeline(df, col_list):
    """Index the text of `col_list` columns into a BM25 search pipeline.

    Every cell becomes one haystack Document whose metadata records the
    originating row index and column header, so search hits can later be
    mapped back to specific table cells.
    """
    document_store = InMemoryDocumentStore(use_bm25=True)
    row_indices = list(df.index.values)

    docs = []
    for col in col_list:
        cell_values = df[col].tolist()
        assert len(row_indices) == len(cell_values)
        for row_idx, cell_text in zip(row_indices, cell_values):
            docs.append(Document.from_dict({
                'content': cell_text,
                'meta': {"index": row_idx, "column_header": col},
            }))

    document_store.write_documents(docs)
    retriever = BM25Retriever(document_store=document_store)
    return DocumentSearchPipeline(retriever)
99
+
100
+
101
@st.cache_data
def run_search(text, _pipeline):
    """Run a BM25 keyword search and return the matching table cells.

    Parameters
    ----------
    text : str
        The user's query; an empty string short-circuits to None.
    _pipeline : DocumentSearchPipeline
        Search pipeline. The leading underscore tells st.cache_data not
        to hash this argument.

    Returns
    -------
    tuple[list, list] or None
        Parallel lists of row indices and column headers for documents
        scoring above 0.5, or None for an empty query.
    """
    if text == "":
        return None
    # BUG FIX: the body previously called the module-level `pipeline`
    # variable instead of the `_pipeline` argument, silently ignoring
    # whatever pipeline the caller passed in.
    res = _pipeline.run(query=text, params={"Retriever": {"top_k": 10}})
    relevant_results = [r for r in res['documents'] if r.score > 0.5]
    result_rows = [doc.meta['index'] for doc in relevant_results]
    result_cols = [doc.meta['column_header'] for doc in relevant_results]
    return (result_rows, result_cols)
110
+
111
+
112
@st.cache_data
def produce_table(df, table_indices):
    """Build the search-results table plus a parallel style table.

    table_indices is the (rows, columns) tuple returned by run_search.
    Returns (result_df, color_df) where color_df carries per-cell CSS
    for highlighting the matched cells, or None when there are no hits.
    """
    if not table_indices:
        return None
    # NOTE(review): table_indices[0] holds values from df.index, but iloc is
    # positional — this is only correct while df keeps its default RangeIndex;
    # confirm if the index ever changes upstream.
    result_df = df.iloc[table_indices[0], :]
    # a row can be retrieved once per matching column; keep it only once
    result_df = result_df.drop_duplicates()
    # highlight the cells found in search
    color_df = result_df.copy()
    color_df.loc[:,:] = ''  # start with no styling anywhere
    for row, col in zip(table_indices[0], table_indices[1]):
        color_df.loc[row, col] = 'background-color: yellow'
    return result_df, color_df
124
+
125
+
126
@st.cache_data
def convert_df(df):
    """Serialize the DataFrame as UTF-8-encoded TSV bytes (no index column)."""
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode('utf-8')
129
+
130
+
131
+
132
if __name__ == "__main__":
    # NOTE(review): hardcoded local path — this will break on any other
    # machine or deployment; consider a relative path or an upload widget.
    input_file = "/Users/carolanderson/Dropbox/repos/miscellany/webapps/Agency Inventory AI Usage - Sheet1.tsv"

    st.markdown("# U.S. Federal Government Use of AI")

    main_text = """
    The data visualized here come from a report by Anna Blue, a 2023 Social Impact Fellow
    at the [Responsible AI Institute](https://www.responsible.ai). The report was released in May 2023. Some agencies have
    released updated inventories since then, which are not reflected here.

    Anna's report consolidated and annotated data released by individual government agencies in compliance with
    Executive Order 13960, which requires federal agencies to produce an annual inventory of their AI usage.
    See her [blog post](https://www.responsible.ai/post/federal-government-ai-use-cases) for additional details,
    including links to the original data sources.
    """

    st.markdown(main_text)

    df = load_data(input_file)

    # Plot stacked bar chart of impact on the public by agency
    st.subheader("Impact of systems on the public, by agency")
    stacked_bar_chart = plot_impact_by_agency(df)
    st.altair_chart(stacked_bar_chart, use_container_width=True)

    # Plot counts by category, allowing user to select category
    st.subheader("Number of entries by category")
    # free-text columns are excluded from the category dropdown
    no_filter_cols = ['Name of Inventory Item', 'Description of Inventory Item', "Other Notes"]
    filter_cols = [c for c in df.columns.unique() if c not in no_filter_cols]
    column = st.selectbox("Choose what to plot", filter_cols)
    count_chart = plot_count_by_category(column, df)
    st.altair_chart(count_chart, use_container_width=True)

    # Table with filters for user browsing
    st.subheader("Explore the entries")
    st.write("Use the menus to filter the table. You can download the filtered table below.")
    filter_names = ["Agency", "Primary Type of AI", "Purpose of AI", "Length of Usage",
                    "Does it directly impact the public?", "Vendor System"]
    c1, c2 = st.columns((1, 1))
    filter_dict = {}
    # first three filter menus in the left column, remaining three in the right
    with c1:
        for filter_name in filter_names[:3]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    with c2:
        for filter_name in filter_names[3:]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    filtered_df = filter_table(df, filter_dict)
    st.write(filtered_df)

    # Download filtered table
    st.write("Download current table as TSV (tab-separated values) file")
    # NOTE(review): this label is reused by the search download input below;
    # if Streamlit reports a DuplicateWidgetID, give each widget a unique key=.
    table_output_file = st.text_input("Enter name for the file to be downloaded", value="table.tsv")
    # text_input returns a string (default "table.tsv"), so this branch always runs
    if table_output_file is not None:
        csv = convert_df(filtered_df)
        st.download_button("Download", csv, file_name=table_output_file)


    # Text search
    st.subheader("Search the data")
    st.markdown("""
    This will search text in the following columns:
    * Name of Inventory Item
    * Primary Type of AI
    * Purpose of AI
    * Description of Inventory Item
    * Other Notes

    This is a keyword search based on the BM25 algorithm as implemented in the Haystack python library.
    Yellow highlighting indicates text retrieved in the search.
    """)
    searchable_cols = ['Name of Inventory Item',
                       'Primary Type of AI',
                       'Purpose of AI',
                       'Description of Inventory Item',
                       'Other Notes']
    pipeline = create_search_pipeline(df, searchable_cols)
    input_text = st.text_input("Enter text", "")
    if input_text:
        result_rows, result_cols = run_search(input_text, pipeline)
        result_df, color_df = produce_table(df, (result_rows, result_cols))
        # apply per-cell CSS from color_df to highlight matched cells
        st.dataframe(result_df.style.apply(lambda x: color_df, axis=None))
        st.write("Download search results as TSV (tab-separated values) file")
        search_output_file = st.text_input("Enter name for the file to be downloaded", value="search_results.tsv")
        csv = convert_df(result_df) #TODO: change to search results
        st.download_button("Download", csv, file_name=search_output_file)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ altair
2
+ farm-haystack[inference]
3
+ pandas