File size: 6,158 Bytes
9cd5d9e
317f161
11067be
 
68b374e
 
256f7c8
 
68b374e
317f161
 
68b374e
317f161
68b374e
 
1fd02a5
68b374e
 
 
 
 
 
 
1fd02a5
68b374e
 
 
 
 
 
 
 
256f7c8
68b374e
 
317f161
 
68b374e
11067be
 
 
68b374e
 
256f7c8
68b374e
 
 
256f7c8
 
 
 
 
 
68b374e
256f7c8
68b374e
 
256f7c8
68b374e
256f7c8
68b374e
 
 
256f7c8
68b374e
256f7c8
68b374e
256f7c8
 
 
 
 
 
 
 
 
 
 
 
68b374e
 
256f7c8
68b374e
 
 
256f7c8
 
 
 
 
 
68b374e
256f7c8
11067be
68b374e
11067be
 
 
256f7c8
68b374e
 
256f7c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68b374e
256f7c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11067be
256f7c8
 
 
 
 
68b374e
9cd5d9e
f239c56
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
from collections import Counter
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

# Load the dataset and convert it to a Pandas dataframe
sotu_dataset = "jsulz/state-of-the-union-addresses"
dataset = load_dataset(sotu_dataset)
df = dataset["train"].to_pandas()
# NOTE: an earlier revision decoded the "tokens-nostop" column from a UTF-8
# byte string into a list of tokens at this point. That step is disabled and
# the column is used exactly as loaded.


def _automated_readability_index(text):
    """Return the Automated Readability Index (ARI) of ``text``.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    The counts are simple approximations: characters are everything except
    spaces, words are whitespace-separated tokens, and sentences are the
    pieces produced by splitting on ".".

    Returns NaN when ``text`` contains no words, so an empty address cannot
    raise ``ZeroDivisionError``.
    """
    words = text.split()
    if not words:
        return float("nan")
    n_words = len(words)
    n_chars = len(text.replace(" ", ""))
    n_sentences = len(text.split("."))  # always >= 1, never divides by zero
    return (4.71 * (n_chars / n_words)) + (0.5 * (n_words / n_sentences)) - 21.43


# Number of whitespace-separated tokens in each address
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
# Automated readability index for each address
df["ari"] = df["no-contractions"].apply(_automated_readability_index)
# Everything below relies on the addresses being in date order
df = df.sort_values(by="date")
# Per-category views of the sorted frame
written = df[df["categories"] == "Written"]
spoken = df[df["categories"] == "Spoken"]

# Create a Gradio interface with blocks: dataset-wide charts first, then the
# controls that drive the per-president charts.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # A Dashboard to Analyze the State of the Union Addresses
        """
    )
    # Word count of every address over time
    fig1 = px.line(
        df,
        x="date",
        y="word_count",
        title="Total Number of Words in Addresses",
        line_shape="spline",
    )
    fig1.update_layout(
        xaxis=dict(title="Date of Address"),
        yaxis=dict(title="Word Count"),
    )
    gr.Plot(fig1)
    # Unique president names in order of first appearance. df is sorted by
    # date at this point, so this is the order in which they gave addresses.
    presidents = df["potus"].unique().tolist()
    # group by president and category and calculate the average word count
    avg_word_count = (
        df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
    )
    fig2 = px.bar(
        avg_word_count,
        x="potus",
        y="word_count",
        title="Average Number of Words in Addresses by President",
        color="categories",
        barmode="group",
        # groupby orders its keys alphabetically; pin the x axis to the
        # date-based order so the chart is sorted by date as intended.
        category_orders={"potus": presidents},
    )
    fig2.update_layout(
        xaxis=dict(
            title="President",
            tickangle=-45,  # Rotate labels 45 degrees counterclockwise
        ),
        yaxis=dict(
            title="Average Word Count",
            tickangle=0,  # Default label angle (horizontal)
        ),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    gr.Plot(fig2)
    with gr.Row():
        # Readability score of every address over time
        ari = df[["potus", "date", "ari", "categories"]]
        fig3 = px.line(
            ari,
            x="date",
            y="ari",
            title="Automated Readability Index in each Address",
            line_shape="spline",
        )
        fig3.update_layout(
            xaxis=dict(title="Date of Address"),
            yaxis=dict(title="ARI Score"),
        )
        gr.Plot(fig3)
    # Controls for the per-president charts: a dropdown to select a president
    # and a slider to select the n-gram size.
    president = gr.Dropdown(label="Select a President", choices=presidents)
    grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)

    def plotly_bar(n_grams, potus):
        """Build a horizontal bar chart of a president's most frequent n-grams.

        Args:
            n_grams: Size of the n-grams to count (the "N-grams" slider value).
            potus: President name as it appears in ``df["potus"]``, or ``None``
                when nothing is selected.

        Returns:
            A Plotly figure showing up to the ten most common n-grams across
            the president's addresses, or ``None`` when no president is
            selected or no n-grams could be formed.
        """
        if potus is None:
            return None
        # Coerce defensively in case the slider delivers a float; the n-gram
        # size must be an integer.
        n = int(n_grams)
        potus_df = df[df["potus"] == potus]
        # Accumulate counts across all of the president's addresses.
        # NOTE(review): assumes each "tokens-nostop" cell is a sequence of
        # string tokens - confirm against the dataset schema.
        totals = Counter()
        for tokens in potus_df["tokens-nostop"]:
            totals.update(ngrams(tokens, n))
        top = totals.most_common(10)
        if not top:
            # Unknown president or addresses shorter than n tokens: nothing
            # to plot.
            return None
        ngram_df = pd.DataFrame(
            {
                # each n-gram is a tuple of tokens; join it into one label
                "ngram": [" ".join(gram) for gram, _ in top],
                "count": [count for _, count in top],
            }
        )
        return px.bar(
            ngram_df,
            x="count",
            y="ngram",
            title=f"Top {n}-grams",
            orientation="h",
            height=400,
            labels={"ngram": f"{n}-gram", "count": "Count"},
        )

    # `president` here is the Dropdown component itself, not its selected
    # value, so it is never None or "All"; register the plot unconditionally.
    # plotly_bar receives the selected value and handles the unselected case.
    gr.Plot(plotly_bar, inputs=[grams, president])

    def plotly_line(president):
        """Plot one president's word count and ARI score over time.

        Args:
            president: President name as it appears in ``df["potus"]``.
                ``None`` or ``"All"`` means no single president is selected.

        Returns:
            A Plotly figure with word count on the primary y axis and ARI on
            the secondary y axis, or ``None`` when no single president is
            selected.
        """
        if president is None or president == "All":
            return None
        potus_df = df[df["potus"] == president]
        # One subplot with two y axes so both metrics share the date axis
        fig5 = make_subplots(specs=[[{"secondary_y": True}]])
        fig5.add_trace(
            go.Scatter(
                x=potus_df["date"],
                y=potus_df["word_count"],
                name="Word Count",
            ),
            secondary_y=False,
        )
        fig5.add_trace(
            go.Scatter(
                x=potus_df["date"],
                y=potus_df["ari"],
                name="ARI",
            ),
            secondary_y=True,
        )
        # Titles match the dataset-wide word count and ARI charts
        fig5.update_layout(title_text=f"Word Count and ARI for {president}")
        fig5.update_xaxes(title_text="Date of Address")
        fig5.update_yaxes(title_text="Word Count", secondary_y=False)
        fig5.update_yaxes(title_text="ARI Score", secondary_y=True)
        return fig5

    # Show the word count / ARI chart for the president selected in the
    # dropdown. `president` here is the Dropdown component itself, not its
    # selected value, so it is never None or "All"; register the plot
    # unconditionally. plotly_line receives the selected value and returns
    # None when there is nothing to plot.
    gr.Plot(plotly_line, inputs=[president])


# Start the Gradio app. share=True is passed to request a shareable link.
# NOTE(review): confirm sharing is wanted/supported where this app is hosted.
demo.launch(share=True)