File size: 8,928 Bytes
0ccd99e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1f42ca
0ccd99e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf691dd
0ccd99e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7106162
 
 
0ccd99e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import math

import altair as alt
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline
import pandas as pd
import streamlit as st



@st.cache_data
def load_data(file):
    """Read the inventory TSV and normalize it for display.

    Parameters
    ----------
    file : str or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        Columns reordered, the ``'Other Notes'`` header cleaned, and agency
        names stripped of trailing whitespace.
    """
    df = pd.read_csv(file, sep="\t", lineterminator='\n')

    # rearrange column order
    col_list = ['Agency', 'Name of Inventory Item',
         'Primary Type of AI',
           'Purpose of AI', 'Length of Usage',
           'Does it directly impact the public?',
            'Vendor  System', 
            'Description of Inventory Item',
           'Other Notes\r']
    df = df[col_list]

    # remove trailing \r from 'Other Notes' header (left behind because the
    # file has \r\n line endings but lineterminator='\n')
    df = df.rename(columns={'Other Notes\r': 'Other Notes'})

    # Remove trailing spaces from agency names (caused duplicate instance of
    # "DOC"). The vectorized .str accessor is NaN-safe, unlike
    # apply(lambda x: x.rstrip()), which raises AttributeError on missing values.
    df['Agency'] = df['Agency'].str.rstrip()
    return df


@st.cache_data
def plot_impact_by_agency(df):
    """Build a horizontal stacked bar chart of public-impact counts per agency.

    Parameters
    ----------
    df : pd.DataFrame
        Inventory table containing 'Agency' and
        'Does it directly impact the public?' columns.

    Returns
    -------
    alt.Chart
        Stacked bars, one row per agency, colored by impact category.
    """
    df = df.copy()
    df = df.rename(columns={'Does it directly impact the public?': 'Impact on public'})
    impact_counts = df.groupby('Agency')['Impact on public'].value_counts()
    impact_counts = impact_counts.sort_index(level="Agency", ascending=False)
    # Name the count column explicitly. The previous
    # rename(columns={'Impact on public': 'Count'}) silently no-oped because
    # groupby().value_counts() names its result 'count' (pandas >= 2.0), so the
    # chart depended on that incidental lowercase name.
    impact_count_df = impact_counts.reset_index(name="Count")
    domain = ['Direct impact', 'Indirect impact', 'No impact']
    range_ = ['red', 'darkorange', 'steelblue']
    chart = (
        alt.Chart(impact_count_df)
        .mark_bar(align="right")
        .encode(
            x=alt.X("Count", type="quantitative", title="Number of entries", axis=alt.Axis()),
            y=alt.Y("Agency", type="nominal", title="Agency",
                    axis=alt.Axis(labelLimit=300, labelFlushOffset=5)),
            color=alt.Color("Impact on public", scale=alt.Scale(domain=domain, range=range_)),
        )
    )
    return chart


@st.cache_data
def plot_count_by_category(column, df):
    """Build a horizontal bar chart of entry counts for one categorical column.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` whose value frequencies are plotted.
    df : pd.DataFrame
        Inventory table.

    Returns
    -------
    alt.Chart
        Bars sorted by descending count; category labels on the y-axis.
    """
    counts = df[column].value_counts().sort_values(ascending=True)
    count_df = pd.DataFrame(counts).reset_index()
    count_df.columns = [column, "Count"]
    x_axis = alt.X("Count", type="quantitative", title="Number of entries",
                   axis=alt.Axis())
    y_axis = alt.Y(column, type="nominal", title="",
                   axis=alt.Axis(labelLimit=300, labelFlushOffset=5),
                   sort="-x")
    return alt.Chart(count_df).mark_bar(align="right").encode(x=x_axis, y=y_axis)
    
    
@st.cache_data
def filter_table(df, choices):
    """Filter the table based on the user's dropdown selections.

    Parameters
    ----------
    df : pd.DataFrame
        Inventory table.
    choices : dict
        Maps a column name to the list of values selected for it,
        e.g. ``{"Agency": ["USDA", "USDOC"]}``. A list containing
        "Select all" leaves that column unfiltered.

    Returns
    -------
    pd.DataFrame
        Rows matching every per-column selection.
    """
    for column, desired_values in choices.items():
        if "Select all" in desired_values:
            continue  # no restriction on this column
        df = df[df[column].isin(desired_values)]
    return df


# cache_resource, not cache_data: cache_data serializes (pickles) the return
# value, and a live haystack pipeline (in-memory document store + retriever)
# is a global resource, not serializable data — Streamlit's docs prescribe
# st.cache_resource for objects like this.
@st.cache_resource
def create_search_pipeline(df, col_list):
    """Index every cell of the given columns into an in-memory BM25 search pipeline.

    Parameters
    ----------
    df : pd.DataFrame
        Inventory table.
    col_list : list[str]
        Columns whose cell text should be searchable.

    Returns
    -------
    DocumentSearchPipeline
        Pipeline whose retriever is a BM25Retriever over one Document per cell.
    """
    document_store = InMemoryDocumentStore(use_bm25=True)
    docs = []
    indices = list(df.index.values)
    for col in col_list:
        # One Document per cell, tagged with its row index and column header
        # so search hits can be mapped back to individual table cells.
        for i, val in zip(indices, df[col].tolist()):
            docs.append(Document.from_dict({
                'content': val,
                'meta': {"index": i, "column_header": col},
            }))
    document_store.write_documents(docs)
    retriever = BM25Retriever(document_store=document_store)
    pipeline = DocumentSearchPipeline(retriever)
    return pipeline


@st.cache_data
def run_search(text, _pipeline):
    """Run a BM25 keyword search and return the matching cells.

    Parameters
    ----------
    text : str
        Query string; an empty string short-circuits to None.
    _pipeline : DocumentSearchPipeline
        Search pipeline. The leading underscore tells Streamlit's cache not
        to hash this argument.

    Returns
    -------
    tuple[list, list] | None
        ([row indices], [column headers]) of hits scoring above 0.5,
        or None for an empty query.
    """
    if text == "":
        return None
    # BUG FIX: the original called the module-global `pipeline` instead of the
    # `_pipeline` parameter, silently coupling this function to the script body
    # (and breaking it if the global were renamed or absent).
    res = _pipeline.run(query=text, params={"Retriever": {"top_k": 10}})
    relevant_results = [r for r in res['documents'] if r.score > 0.5]
    result_rows = [doc.meta['index'] for doc in relevant_results]
    result_cols = [doc.meta['column_header'] for doc in relevant_results]
    return (result_rows, result_cols)


@st.cache_data
def produce_table(df, table_indices):
    """Assemble the search-result table plus a matching cell-highlight frame.

    Parameters
    ----------
    df : pd.DataFrame
        Full inventory table.
    table_indices : tuple | None
        ([row indices], [column headers]) pair from run_search; falsy values
        yield None.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame] | None
        (result rows with duplicates removed, same-shaped frame of CSS
        strings — 'background-color: yellow' on cells found in the search,
        '' elsewhere).
    """
    if not table_indices:
        return None
    rows, cols = table_indices
    result_df = df.iloc[rows, :].drop_duplicates()
    # Start from an all-empty CSS frame, then mark each matched cell.
    color_df = result_df.copy()
    color_df.loc[:, :] = ''
    for row_label, col_name in zip(rows, cols):
        color_df.loc[row_label, col_name] = 'background-color: yellow'
    return result_df, color_df


@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 TSV bytes (no index) for download."""
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode('utf-8')



if __name__ == "__main__":
    # Streamlit app entry point: load the inventory TSV, render charts,
    # a filterable table, and a BM25 text search over selected columns.
    input_file = "Agency Inventory AI Usage - Sheet1.tsv"
    
    st.markdown("# U.S. Federal Government Use of AI")
    
    main_text = """
    The data visualized here come from a report by Anna Blue, a 2023 Social Impact Fellow
    at the [Responsible AI Institute](https://www.responsible.ai). The report was released in May 2023. Some agencies have 
    released updated inventories since then, which are not reflected here.

    Anna's report consolidated and annotated data released by individual government agencies in compliance with 
    Executive Order 13960, which requires federal agencies to produce an annual inventory of their AI usage. 
    See her [blog post](https://www.responsible.ai/post/federal-government-ai-use-cases) for additional details,
     including links to the original data sources.
    """
    
    st.markdown(main_text)

    df = load_data(input_file)
    
    # Plot stacked bar chart of impact on the public by agency
    st.subheader("Impact of systems on the public, by agency")
    stacked_bar_chart = plot_impact_by_agency(df)
    st.altair_chart(stacked_bar_chart, use_container_width=True)

    # Plot counts by category, allowing user to select category
    st.subheader("Number of entries by category")
    # Free-text columns are excluded: counting them per unique value is meaningless.
    no_filter_cols = ['Name of Inventory Item', 'Description of Inventory Item', "Other Notes"]
    filter_cols = [c for c in df.columns.unique() if c not in no_filter_cols]
    column = st.selectbox("Choose what to plot", filter_cols)
    count_chart = plot_count_by_category(column, df)
    st.altair_chart(count_chart, use_container_width=True)

    # Table with filters for user browsing
    st.subheader("Explore the entries")
    st.write("Use the menus to filter the table. You can download the filtered table below.")
    filter_names = ["Agency", "Primary Type of AI", "Purpose of AI", "Length of Usage",
                    "Does it directly impact the public?", "Vendor  System"]
    # Two equal-width columns, three multiselect menus in each; every menu
    # defaults to "Select all" (i.e. no filtering) — see filter_table.
    c1, c2 = st.columns((1, 1))
    filter_dict = {}
    with c1:
        for filter_name in filter_names[:3]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    with c2:
        for filter_name in filter_names[3:]:
            menu = st.multiselect(filter_name, ["Select all"] + list(df[filter_name].unique()), default="Select all")
            filter_dict[filter_name] = menu
    filtered_df = filter_table(df, filter_dict)
    st.write(filtered_df)

    # Download filtered table
    st.write("Download current table as TSV (tab-separated values) file")
    table_output_file = st.text_input("Enter name for the file to be downloaded", value="table.tsv")
    # NOTE(review): st.text_input returns a string (possibly empty), not None,
    # when a default value is given — this guard looks always-true; confirm.
    if table_output_file is not None:
        csv = convert_df(filtered_df)
        st.download_button("Download", csv, file_name=table_output_file)
    
    
    # Text search
    st.subheader("Search the data")
    st.markdown("""
            This will search text in the following columns: 
            * Name of Inventory Item
            * Primary Type of AI
            * Purpose of AI
            * Description of Inventory Item
            * Other Notes  
            
            This is a keyword search based on the BM25 algorithm.  
            Yellow highlighting indicates text retrieved in the search.   
            A download button will appear after you run a search.
            """)
    searchable_cols = ['Name of Inventory Item',
     'Primary Type of AI',
       'Purpose of AI', 
        'Description of Inventory Item',
       'Other Notes']
    pipeline = create_search_pipeline(df, searchable_cols)
    input_text = st.text_input("Enter text", "")
    if input_text:
        # Map search hits back to table cells and render with yellow highlights.
        result_rows, result_cols = run_search(input_text, pipeline)
        result_df, color_df = produce_table(df, (result_rows, result_cols))
        st.dataframe(result_df.style.apply(lambda x: color_df, axis=None))
        st.write("Download search results as TSV (tab-separated values) file")
        search_output_file = st.text_input("Enter name for the file to be downloaded", value="search_results.tsv")
        csv = convert_df(result_df)   # result_df already holds the search results
        st.download_button("Download", csv, file_name=search_output_file)