jsulz HF staff committed on
Commit
b831af7
·
1 Parent(s): afdf108

making plots better at handling initial state

Browse files
Files changed (1) hide show
  1. app.py +92 -80
app.py CHANGED
@@ -9,34 +9,46 @@ from plotly.subplots import make_subplots
9
  from matplotlib import pyplot as plt
10
  from wordcloud import WordCloud
11
 
12
- # Load the dataset and convert it to a Pandas dataframe
13
- sotu_dataset = "jsulz/state-of-the-union-addresses"
14
- dataset = load_dataset(sotu_dataset)
15
- df = dataset["train"].to_pandas()
16
- # Do some on-the-fly calculations
17
- # calculate the number of words in each address
18
- df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
19
- # calculate the automated readability index reading ease score for each address
20
- # automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
21
- df["ari"] = df["no-contractions"].apply(
22
- lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
23
- + (0.5 * (len(x.split()) / len(x.split("."))))
24
- - 21.43
25
- )
26
- # Sort the dataframe by date because Plotly doesn't do any of this automatically
27
- df = df.sort_values(by="date")
28
- written = df[df["categories"] == "Written"]
29
- spoken = df[df["categories"] == "Spoken"]
 
 
 
 
30
 
31
  """
32
  Helper functions for Plotly charts
33
  """
34
 
35
 
36
- def plotly_ngrams(n_grams, potus):
37
- if potus is not None:
38
  # Filter on the potus
39
- potus_df = df[df["potus"] == potus]
 
 
 
 
 
 
 
 
40
  # Create a counter generator for the n-grams
41
  trigrams = (
42
  potus_df["tokens-nostop"]
@@ -63,59 +75,58 @@ def plotly_ngrams(n_grams, potus):
63
  return fig4
64
 
65
 
66
- def plotly_word_and_ari(president):
67
- if president != "All" and president is not None:
68
- potus_df = df[df["potus"] == president]
69
- fig5 = make_subplots(specs=[[{"secondary_y": True}]])
70
- fig5.add_trace(
71
- go.Scatter(
72
- x=potus_df["date"],
73
- y=potus_df["word_count"],
74
- name="Word Count",
75
- ),
76
- secondary_y=False,
77
- )
78
- fig5.add_trace(
79
- go.Scatter(
80
- x=potus_df["date"],
81
- y=potus_df["ari"],
82
- name="ARI",
83
- ),
84
- secondary_y=True,
85
- )
86
- # Add figure title
87
- fig5.update_layout(title_text="Address Word Count and ARI")
88
-
89
- # Set x-axis title
90
- fig5.update_xaxes(title_text="Date of Address")
91
-
92
- # Set y-axes titles
93
- fig5.update_yaxes(title_text="Word Count", secondary_y=False)
94
- fig5.update_yaxes(title_text="ARI", secondary_y=True)
95
- return fig5
96
-
97
-
98
- def plt_wordcloud(president):
99
- if president != "All" and president is not None:
100
- potus_df = df[df["potus"] == president]
101
- lemmatized = potus_df["lemmatized"].apply(lambda x: " ".join(x))
102
- # build a single string from lemmatized
103
- lemmatized = " ".join(lemmatized)
104
- # create a wordcloud from the lemmatized column of the dataframe
105
- wordcloud = WordCloud(background_color="white", width=800, height=400).generate(
106
- lemmatized
107
- )
108
- # create a matplotlib figure
109
- fig6 = plt.figure(figsize=(8, 4))
110
- # add the wordcloud to the figure
111
- plt.tight_layout()
112
- plt.imshow(wordcloud, interpolation="bilinear")
113
- plt.axis("off")
114
- return fig6
115
 
116
 
117
  # Create a Gradio interface with blocks
118
  with gr.Blocks() as demo:
 
119
  # Build out the top level static charts and content
120
  gr.Markdown(
121
  """
@@ -214,23 +225,24 @@ with gr.Blocks() as demo:
214
  )
215
  # get all unique president names
216
  presidents = df["potus"].unique()
217
- # convert presidents to a list
218
  presidents = presidents.tolist()
 
219
  # create a dropdown to select a president
220
- president = gr.Dropdown(label="Select a President", choices=presidents)
221
  # create a slider for number of word grams
222
- grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
 
 
 
 
223
 
224
  # show a bar chart of the top n-grams for a selected president
225
- if president != "All" and president is not None:
226
- gr.Plot(plotly_ngrams, inputs=[grams, president])
227
 
228
- if president != "All" and president is not None:
229
- gr.Plot(plt_wordcloud, scale=2, inputs=[president])
230
 
231
  # show a line chart of word count and ARI for a selected president
232
- if president != "All" and president is not None:
233
- gr.Plot(plotly_word_and_ari, inputs=[president])
234
 
235
 
236
- demo.launch(share=True)
 
9
  from matplotlib import pyplot as plt
10
  from wordcloud import WordCloud
11
 
12
+
13
+ def load_transform_dataset():
14
+ # Load the dataset and convert it to a Pandas dataframe
15
+ sotu_dataset = "jsulz/state-of-the-union-addresses"
16
+ dataset = load_dataset(sotu_dataset)
17
+ df = dataset["train"].to_pandas()
18
+ # Do some on-the-fly calculations
19
+ # calculate the number of words in each address
20
+ df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
21
+ # calculate the automated readability index reading ease score for each address
22
+ # automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
23
+ df["ari"] = df["no-contractions"].apply(
24
+ lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
25
+ + (0.5 * (len(x.split()) / len(x.split("."))))
26
+ - 21.43
27
+ )
28
+ # Sort the dataframe by date because Plotly doesn't do any of this automatically
29
+ df = df.sort_values(by="date")
30
+ written = df[df["categories"] == "Written"]
31
+ spoken = df[df["categories"] == "Spoken"]
32
+ return df, written, spoken
33
+
34
 
35
  """
36
  Helper functions for Plotly charts
37
  """
38
 
39
 
40
+ def filter_potus(potus, _df):
41
+ if potus != "All":
42
  # Filter on the potus
43
+ potus_df = _df[_df["potus"] == potus]
44
+ else:
45
+ potus_df = _df
46
+ return potus_df
47
+
48
+
49
+ def plotly_ngrams(n_grams, potus, _df):
50
+ if potus is not None:
51
+ potus_df = filter_potus(potus, _df)
52
  # Create a counter generator for the n-grams
53
  trigrams = (
54
  potus_df["tokens-nostop"]
 
75
  return fig4
76
 
77
 
78
+ def plotly_word_and_ari(president, _df):
79
+ potus_df = filter_potus(president, _df)
80
+ fig5 = make_subplots(specs=[[{"secondary_y": True}]])
81
+ fig5.add_trace(
82
+ go.Scatter(
83
+ x=potus_df["date"],
84
+ y=potus_df["word_count"],
85
+ name="Word Count",
86
+ ),
87
+ secondary_y=False,
88
+ )
89
+ fig5.add_trace(
90
+ go.Scatter(
91
+ x=potus_df["date"],
92
+ y=potus_df["ari"],
93
+ name="ARI",
94
+ ),
95
+ secondary_y=True,
96
+ )
97
+ # Add figure title
98
+ fig5.update_layout(title_text="Address Word Count and ARI")
99
+
100
+ # Set x-axis title
101
+ fig5.update_xaxes(title_text="Date of Address")
102
+
103
+ # Set y-axes titles
104
+ fig5.update_yaxes(title_text="Word Count", secondary_y=False)
105
+ fig5.update_yaxes(title_text="ARI", secondary_y=True)
106
+ return fig5
107
+
108
+
109
+ def plt_wordcloud(president, _df):
110
+ potus_df = filter_potus(president, _df)
111
+ lemmatized = potus_df["lemmatized"].apply(lambda x: " ".join(x))
112
+ # build a single string from lemmatized
113
+ lemmatized = " ".join(lemmatized)
114
+ # create a wordcloud from the lemmatized column of the dataframe
115
+ wordcloud = WordCloud(background_color="white", width=800, height=400).generate(
116
+ lemmatized
117
+ )
118
+ # create a matplotlib figure
119
+ fig6 = plt.figure(figsize=(8, 4))
120
+ # add the wordcloud to the figure
121
+ plt.tight_layout()
122
+ plt.imshow(wordcloud, interpolation="bilinear")
123
+ plt.axis("off")
124
+ return fig6
 
 
125
 
126
 
127
  # Create a Gradio interface with blocks
128
  with gr.Blocks() as demo:
129
+ df, written, spoken = load_transform_dataset()
130
  # Build out the top level static charts and content
131
  gr.Markdown(
132
  """
 
225
  )
226
  # get all unique president names
227
  presidents = df["potus"].unique()
 
228
  presidents = presidents.tolist()
229
+ presidents.append("All")
230
  # create a dropdown to select a president
231
+ president = gr.Dropdown(label="Select a President", choices=presidents, value="All")
232
  # create a slider for number of word grams
233
+ grams = gr.Slider(
234
+ minimum=1, maximum=4, step=1, label="N-grams", interactive=True, value=1
235
+ )
236
+
237
+ df_state = gr.State(df)
238
 
239
  # show a bar chart of the top n-grams for a selected president
240
+ gr.Plot(plotly_ngrams, inputs=[grams, president, df_state])
 
241
 
242
+ gr.Plot(plt_wordcloud, scale=2, inputs=[president, df_state])
 
243
 
244
  # show a line chart of word count and ARI for a selected president
245
+ gr.Plot(plotly_word_and_ari, inputs=[president, df_state])
 
246
 
247
 
248
+ demo.launch()