phitoduck commited on
Commit
6142d0c
·
1 Parent(s): f4d315a

added prose and sequencing to the article

Browse files
Files changed (3) hide show
  1. app.py +182 -65
  2. concattedfiles.py +0 -381
  3. utils.py +24 -4
app.py CHANGED
@@ -1,64 +1,168 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from datetime import time, date
4
- from utils import generate_random_data, evaluate_alarm_state, aggregate_data, re_aggregate_data
5
  from textwrap import dedent
6
  from matplotlib import pyplot as plt
7
 
8
  # Constants
9
- HARD_CODED_DATE = date(2024, 7, 26)
10
 
11
  def main():
12
- st.title("Streamlit App for Data Generation and Analysis")
 
 
 
 
 
 
 
 
13
 
14
  # Initialize session state
15
  initialize_session_state()
16
 
17
  # Section 1 - Generate random data
18
- st.header("Section 1 - Generate Random Data")
 
 
 
 
 
 
 
 
19
  generate_data_form()
20
 
21
  if not st.session_state.df.empty:
22
- display_dataframe("Raw Event Data", st.session_state.df)
23
- st.line_chart(st.session_state.df.set_index("Timestamp"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # Section 2 - Calculate Aggregations
26
- st.header("Section 2 - Calculate Aggregations")
27
- aggregation_form()
28
 
29
- if not st.session_state.aggregated_df.empty:
30
- display_dataframe("Aggregated Summary Data (Storage)", st.session_state.aggregated_df)
31
- aggregation_function_input__storage = st.selectbox(
32
- "Aggregation Function (Storage)",
33
- ['p50', 'p95', 'p99', 'max', 'min', 'average'],
34
- key='aggregation_function_input__storage',
35
- help="Select the aggregation function for visualizing the data."
36
- )
37
- st.line_chart(st.session_state.aggregated_df.set_index("Timestamp")[aggregation_function_input__storage])
38
-
39
- # Section 3 - Summary Data Aggregated by Period
40
- st.header("Section 3 - Summary Data Aggregated by Period")
41
- summary_by_period_form()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- if not st.session_state.summary_by_period_df.empty:
44
- display_dataframe("Summary Data Aggregated by Period (for Alarm)", st.session_state.summary_by_period_df)
45
- aggregation_function_input__alarm = st.selectbox(
46
- "Aggregation Function (Alarm)",
47
- ['p50', 'p95', 'p99', 'max', 'min', 'average'],
48
- key='aggregation_function_input__alarm',
49
- help="Select the aggregation function for visualizing the data."
50
- )
51
- st.line_chart(st.session_state.summary_by_period_df.set_index("Timestamp")[aggregation_function_input__alarm])
52
-
53
- # Section 4 - Evaluate Alarm State
54
- st.header("Section 4 - Evaluate Alarm State")
55
- alarm_state_form()
56
-
57
- if not st.session_state.alarm_state_df.empty:
58
  plot_time_series(st.session_state.summary_by_period_df, st.session_state.threshold_input, st.session_state.alarm_condition_input, st.session_state.evaluation_range_input)
 
 
 
59
  display_alarm_state_evaluation(st.session_state.alarm_state_df)
60
 
61
- display_key_tables()
62
 
63
  def initialize_session_state() -> None:
64
  if 'df' not in st.session_state:
@@ -72,16 +176,16 @@ def initialize_session_state() -> None:
72
 
73
  def generate_data_form() -> None:
74
  with st.form(key='generate_data_form'):
75
- start_time_input = st.time_input("Start Time", time(12, 0), help="Select the start time for generating random data.")
76
- end_time_input = st.time_input("End Time", time(12, 30), help="Select the end time for generating random data.")
77
- count_input = st.slider("Count", min_value=1, max_value=200, value=60, help="Specify the number of data points to generate.")
78
- response_time_range_input = st.slider("Response Time Range (ms)", min_value=50, max_value=300, value=(100, 250), help="Select the range of response times in milliseconds.")
79
- null_percentage_input = st.slider("Null Percentage", min_value=0.0, max_value=1.0, value=0.5, help="Select the percentage of null values in the generated data.")
80
  submit_button = st.form_submit_button(label='Generate Data')
81
 
82
  if submit_button:
83
  st.session_state.df = generate_random_data(
84
- date=HARD_CODED_DATE,
85
  start_time=start_time_input,
86
  end_time=end_time_input,
87
  count=count_input,
@@ -90,41 +194,54 @@ def generate_data_form() -> None:
90
  )
91
 
92
  def aggregation_form() -> None:
93
- freq_input = st.selectbox("Period (bin)", ['1min', '5min', '15min'], key='freq_input', help="Select the frequency for aggregating the data.")
94
  if not st.session_state.df.empty:
95
  st.session_state.aggregated_df = aggregate_data(st.session_state.df, freq_input)
96
 
97
  def summary_by_period_form() -> None:
98
- period_length_input = st.selectbox("Period Length", ['1min', '5min', '15min'], key='period_length_input', help="Select the period length for aggregating the summary data.")
99
  if not st.session_state.aggregated_df.empty:
100
- st.session_state.summary_by_period_df = re_aggregate_data(st.session_state.aggregated_df, period_length_input)
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  def alarm_state_form() -> None:
103
- threshold_input = st.slider("Threshold (ms)", min_value=50, max_value=300, value=150, key='threshold_input', help="Specify the threshold value for evaluating the alarm state.")
104
- datapoints_to_alarm_input = st.number_input("Datapoints to Alarm", min_value=1, value=3, key='datapoints_to_alarm_input', help="Specify the number of data points required to trigger an alarm.")
105
- evaluation_range_input = st.number_input("Evaluation Range", min_value=1, value=5, key='evaluation_range_input', help="Specify the range of data points to evaluate for alarm state.")
106
  alarm_condition_input = st.selectbox(
107
  "Alarm Condition",
108
  ['>', '>=', '<', '<='],
109
  key='alarm_condition_input',
110
  help="Select the condition for evaluating the alarm state."
111
  )
 
 
 
 
112
  if not st.session_state.summary_by_period_df.empty:
113
  st.session_state.alarm_state_df = evaluate_alarm_state(
114
  summary_df=st.session_state.summary_by_period_df,
115
- threshold=threshold_input,
116
- datapoints_to_alarm=datapoints_to_alarm_input,
117
- evaluation_range=evaluation_range_input,
118
  aggregation_function=st.session_state.aggregation_function_input__alarm,
119
- alarm_condition=alarm_condition_input
120
- )
121
 
122
  def display_dataframe(title: str, df: pd.DataFrame) -> None:
123
  st.write(title)
124
  st.dataframe(df)
125
 
126
  def plot_time_series(df: pd.DataFrame, threshold: int, alarm_condition: str, evaluation_range: int) -> None:
127
- timestamps = df['Timestamp']
128
  response_times = df[st.session_state.aggregation_function_input__alarm]
129
 
130
  segments = []
@@ -145,7 +262,7 @@ def plot_time_series(df: pd.DataFrame, threshold: int, alarm_condition: str, eva
145
  fig, ax1 = plt.subplots()
146
 
147
  color = 'tab:blue'
148
- ax1.set_xlabel('Timestamp')
149
  ax1.set_ylabel('Response Time (ms)', color=color)
150
 
151
  for segment in segments:
@@ -170,11 +287,11 @@ def plot_time_series(df: pd.DataFrame, threshold: int, alarm_condition: str, eva
170
 
171
  for idx in period_indices:
172
  if idx % evaluation_range == 0:
173
- ax1.axvline(x=df['Timestamp'].iloc[idx], color='green', linestyle='-', alpha=0.3)
174
  max_value = max(filter(lambda x: x is not None, df[st.session_state.aggregation_function_input__alarm]))
175
- ax1.text(df['Timestamp'].iloc[idx], max_value * 0.95, f"[{idx // evaluation_range}]", rotation=90, verticalalignment='bottom', color='grey', alpha=0.7, fontsize=8)
176
  else:
177
- ax1.axvline(x=df['Timestamp'].iloc[idx], color='grey', linestyle='--', alpha=0.3)
178
 
179
  ax1.annotate('Alarm threshold', xy=(0.98, threshold), xycoords=('axes fraction', 'data'), ha='right', va='bottom', fontsize=8, color='red', backgroundcolor='none')
180
 
@@ -193,16 +310,17 @@ def display_key_tables() -> None:
193
  symbol_data = {
194
  "Symbol": ["🔴", "⚫️", "🟢"],
195
  "Meaning": [
196
- "Breaching data point: This data point exceeds the threshold.",
197
- "Missing data point: This data point is missing or not reported.",
198
- "Non-breaching data point: This data point is within the threshold."
199
  ]
200
  }
201
  symbol_df = pd.DataFrame(symbol_data)
202
  st.table(symbol_df)
203
 
204
  # Columns
205
- st.write(dedent(""" #### Columns: Strategies for handling missing data points [docs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html#alarms-and-missing-data)
 
206
 
207
  Sometimes, no metric events may have been reported during a given time period. In this case,
208
  you must decide how you will treat missing data points. Ignore it? Or consider it a failure.
@@ -238,4 +356,3 @@ def display_key_tables() -> None:
238
 
239
  if __name__ == "__main__":
240
  main()
241
-
 
1
  import streamlit as st
2
  import pandas as pd
3
  from datetime import time, date
4
+ from utils import generate_random_data, evaluate_alarm_state, aggregate_data, re_aggregate_data, downsample
5
  from textwrap import dedent
6
  from matplotlib import pyplot as plt
7
 
8
  # Constants
9
+ TODAYS_DATE = date.today()
10
 
11
  def main():
12
+ st.title("AWS CloudWatch Simulator")
13
+ st.markdown(dedent("""\
14
+ Monitoring and alerting can be confusing to learn. There is some theory you need to understand first.
15
+
16
+ This app is an interative tutorial to help you understand how to record metrics describing the performance
17
+ of an app, and build alerts off of them using AWS CloudWatch.
18
+
19
+ Lets get started! 🎉
20
+ """))
21
 
22
  # Initialize session state
23
  initialize_session_state()
24
 
25
  # Section 1 - Generate random data
26
+ st.header("1 - Generate a series of measurements")
27
+ st.markdown(dedent("""\
28
+ Suppose we have a REST API with a ✨very popular✨ `GET /greeting?name=...` endpoint.
29
+
30
+ Each time someone calls the endpoint, we can record how long it takes to respond, aka the ***response latency***.
31
+
32
+ Use this form to generate a random dataset of response times.
33
+ """))
34
+
35
  generate_data_form()
36
 
37
  if not st.session_state.df.empty:
38
+ st.markdown("### Recorded request latencies")
39
+ display_dataframe("Raw timeseries events", st.session_state.df)
40
+ st.scatter_chart(st.session_state.df.set_index("timestamp"))
41
+
42
+ st.markdown(dedent("""\
43
+ #### 🚚 ➡ ☁️
44
+ We can ship these metrics to a time series database such as AWS CloudWatch in a few ways.
45
+ """))
46
+ st.warning("In the CloudWatch Metrics database, data points are organized into [Namespaces, Metrics, and Dimensions](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_concepts.html#Namespace). Think of a Metric as a dedicated table in a database for a single timeseries, e.g. response latency measurements.", icon="💡")
47
+ st.markdown(dedent("""\
48
+ #### Option 1 - AWS SDK (good)
49
+
50
+ Our application could use the AWS SDK to upload the data points using the AWS CloudWatch endpoints, e.g.
51
+
52
+ ```python
53
+ import boto3
54
+ cloudwatch = boto3.client('cloudwatch')
55
+ cloudwatch.put_metric_data(
56
+ Namespace='MyApp',
57
+ MetricData=[
58
+ {
59
+ 'MetricName': 'Latency',
60
+ 'timestamp': '2021-08-01T12:00:00',
61
+ 'Value': 102,
62
+ 'Unit': 'Milliseconds'
63
+ },
64
+ ... # more metrics data points, recorded at different times
65
+ ]
66
+ )
67
+ ```
68
+
69
+ It is more cost effective to send data points in a batch, but they can be sent individually as well.
70
+
71
+ ---
72
+
73
+ #### Option 2 - Structured Logs (better)
74
+
75
+ Our application could write metrics to stdout in AWS's [Embedded Metric Format (EMF)](https://www.youtube.com/watch?v=HdopVzW6pX0) (structured JSON) and sent to CloudWatch Logs.
76
+
77
+ CloudWatch logs automatically extracts metrics from EMF-formatted logs and sends them to CloudWatch Metrics.
78
+
79
+ That is great because it is
80
+
81
+ 1. 💰 **cheaper**: you are not charged for calls to CloudWatch's PutMetric endpoint, and
82
+ 2. ⚡️ **faster**: logging to stdout is WAY faster than making a network call--especially a 2-way, synchonous HTTP call. And a side process can batch and send our logs without our app having to slow down or worry about that.
83
+
84
+ ---
85
+
86
+ ### Option 3 - Built-in Metrics (best)
87
+
88
+ Some common metrics, such as API Gateway response latency or Lambda runtime can actually be recorded
89
+ in CloudWatch Metrics automatically. No code required!
90
+
91
+ This is ideal, but not all metrics are automatically captured, such as application-specific metrics like "how many OpenAI tokens have we used?"
92
+
93
+ ---
94
+ """))
95
 
96
+ if not st.session_state.df.empty:
 
 
97
 
98
+ # Section 2 - Calculate Aggregations
99
+ st.header("2 - AWS aggregates the metrics")
100
+ st.markdown(dedent("""\
101
+ This step represents our metrics data after AWS CloudWatch processes and stores it.
102
+
103
+ Storing raw metrics data can be expensive 💰 (see [CloudWatch Metrics pricing](https://aws.amazon.com/cloudwatch/pricing/)). If your app has high traffic, or bad code, you could send 100s, 1,000s, or 1,000,000s+ of measurement
104
+ data points per second to AWS CloudWatch.
105
+
106
+ This metrics data is meant to be analyzed with queries that power visualizations and alerts--which requires compute--which costs more money the more metrics data you have stored.
107
+
108
+ AWS CloudWatch generally aggregates data into a ***resolution*** of 5 minute intervals.
109
+
110
+ In other words, CloudWatch bins data, genrally into ***periods*** of 5 minutes
111
+ and only stores aggregate statistics for each period. This decreases the amount of data stored and queried by orders of magnitude. ✅
112
+
113
+ You can pay more for AWS to aggregate data at a "higher" (or "finer") resolution, e.g. 1-minute or even 1-second periods.
114
+ """))
115
+ st.info("Use this form to aggregate the raw data points into periods of different lengths and plot some of [the many statistics that CloudWatch computes](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Statistics-definitions.html) over aggregated periods.", icon="📌")
116
+ aggregation_form()
117
+
118
+ if not st.session_state.aggregated_df.empty:
119
+ display_dataframe("Aggregated Statistics over Periods", st.session_state.aggregated_df)
120
+ aggregation_function_input__storage = st.selectbox(
121
+ "Aggregation Statistic (just for exploration; does not affect downstream steps)",
122
+ ['p50', 'p95', 'p99', 'max', 'min', 'average'],
123
+ key='aggregation_function_input__storage',
124
+ help="Select the aggregation function for visualizing the data."
125
+ )
126
+ st.line_chart(st.session_state.aggregated_df.set_index("timestamp")[aggregation_function_input__storage])
127
+
128
+ # Section 3 - Summary Data Aggregated by Period
129
+ st.header("3 - Optionally aggregate metrics further for Alarms and Dashboards")
130
+ st.markdown(dedent("""\
131
+ You can plot metrics in a CloudWatch dashboard.
132
+
133
+ When doing this, you can choose to aggregate the data further or run additional queries on it to analyze it and answer particular questions.
134
+
135
+ We will skip discussing dashboards and focus on ***alerts*** (or ***alarms*** in CloudWatch terms).
136
+
137
+ Suppose we want an alert that triggers if our endpoint starts to take longer than usual to respond.
138
+
139
+ CloudWatch's concept of alarms can alert you when a metric, such as response latency, "breaches" a certain *threshold* for a certain *number of periods*.
140
+ """))
141
+ st.info("Use this form to bin the data into periods (optionally of shorter length than the previous step).\n\nThis will set the period length used to create an alarm in the next step.", icon="📌")
142
+ summary_by_period_form()
143
+
144
+ if not st.session_state.summary_by_period_df.empty:
145
+ display_dataframe("Summary Data Aggregated by Period (for Alarm)", st.session_state.summary_by_period_df)
146
+ aggregation_function_input__alarm = st.selectbox(
147
+ "Aggregation Statistic (used for alarm evaluation in next step)",
148
+ ['p50', 'p95', 'p99', 'max', 'min', 'average'],
149
+ key='aggregation_function_input__alarm',
150
+ help="Select the aggregation function for visualizing the data."
151
+ )
152
+ st.line_chart(st.session_state.summary_by_period_df.set_index("timestamp")[aggregation_function_input__alarm])
153
 
154
+ # Section 4 - Evaluate Alarm State
155
+ st.header("4 - Configure and evaluate an alarm")
156
+
157
+ # define what "breaching" means (threshold and condition) and evaluate the data
158
+ alarm_state_form()
 
 
 
 
 
 
 
 
 
 
159
  plot_time_series(st.session_state.summary_by_period_df, st.session_state.threshold_input, st.session_state.alarm_condition_input, st.session_state.evaluation_range_input)
160
+ datapoints_to_alarm_input = st.number_input("Datapoints to Alarm", min_value=1, value=3, key='datapoints_to_alarm_input', help="Specify the number of data points with in the overall evaluation range that must be breaching in order to trigger an alarm.")
161
+ evaluate_breaching_data_points()
162
+ st.write("%d out of %d data points must be breaching to trigger an alarm." % (st.session_state.datapoints_to_alarm_input, st.session_state.evaluation_range_input))
163
  display_alarm_state_evaluation(st.session_state.alarm_state_df)
164
 
165
+ display_key_tables()
166
 
167
  def initialize_session_state() -> None:
168
  if 'df' not in st.session_state:
 
176
 
177
  def generate_data_form() -> None:
178
  with st.form(key='generate_data_form'):
179
+ start_time_input = st.time_input("Start Time", time(12, 0), help="No generated data points will have earlier timestamps than this.")
180
+ end_time_input = st.time_input("End Time", time(12, 30), help="No generated data points will have later timestamps than this.")
181
+ count_input = st.slider("Number of requests", min_value=1, max_value=200, value=20, help="Specify the number of data points to generate.")
182
+ response_time_range_input = st.slider("Response Time Range (ms)", min_value=50, max_value=300, value=(140, 180), help="Select the range of response times in milliseconds. The generated response latencies will be in this range.")
183
+ null_percentage_input = st.slider("Percentage of null data points", min_value=0.0, max_value=1.0, value=0., help="Select the percentage of null values in the generated data. We will use this to simulate 'missing data'--or time periods where no requests were recorded.\n\nCloudWatch does not actually have a concept of data points with null values.")
184
  submit_button = st.form_submit_button(label='Generate Data')
185
 
186
  if submit_button:
187
  st.session_state.df = generate_random_data(
188
+ date=TODAYS_DATE,
189
  start_time=start_time_input,
190
  end_time=end_time_input,
191
  count=count_input,
 
194
  )
195
 
196
  def aggregation_form() -> None:
197
+ freq_input = st.selectbox("Storage resolution for metric", ['1min', '2min', '3min', '5min', '10min', '15min'], key='freq_input', help="Select the frequency for aggregating the data.")
198
  if not st.session_state.df.empty:
199
  st.session_state.aggregated_df = aggregate_data(st.session_state.df, freq_input)
200
 
201
  def summary_by_period_form() -> None:
202
+ period_length_input = st.selectbox("Period Length", ['1min', '2min', '3min', '5min', '10min', '15min'], key='period_length_input', help="Select the period length for aggregating the summary data.")
203
  if not st.session_state.aggregated_df.empty:
204
+ agg_period = int(st.session_state.freq_input.replace('min', ''))
205
+ new_period = int(period_length_input.replace('min', ''))
206
+
207
+ if new_period < agg_period:
208
+ st.warning(f"The data from Step 2 was downsampled from a {agg_period}-minute resolution to a {new_period}-minute resolution.\n\nRepresentative values for each finer-resolution period were interpolated.", icon="📌")
209
+ elif new_period > agg_period:
210
+ st.warning(f"The data from Step 2 was re-aggregated to a lower resolution (longer period) of {new_period} minutes.\n\nThe resulting values for min, max, and average reflect the values of the collected metrics, but p50, p95, and p99 are merely estimates.", icon="📌")
211
+
212
+ if new_period < agg_period:
213
+ st.session_state.summary_by_period_df = downsample(st.session_state.aggregated_df, new_period)
214
+ else:
215
+ st.session_state.summary_by_period_df = re_aggregate_data(st.session_state.aggregated_df, period_length_input)
216
 
217
  def alarm_state_form() -> None:
218
+ threshold_input = st.slider("Threshold (ms)", min_value=50, max_value=300, value=160, key='threshold_input', help="Specify the threshold value for evaluating the alarm state.")
 
 
219
  alarm_condition_input = st.selectbox(
220
  "Alarm Condition",
221
  ['>', '>=', '<', '<='],
222
  key='alarm_condition_input',
223
  help="Select the condition for evaluating the alarm state."
224
  )
225
+
226
+ evaluation_range_input = st.number_input("Evaluation Range (# periods btw green bars)", min_value=1, value=5, key='evaluation_range_input', help="Specify the number of consecutive data points to evaluate for alarm state.")
227
+
228
+ def evaluate_breaching_data_points() -> None:
229
  if not st.session_state.summary_by_period_df.empty:
230
  st.session_state.alarm_state_df = evaluate_alarm_state(
231
  summary_df=st.session_state.summary_by_period_df,
232
+ threshold=st.session_state.threshold_input,
233
+ datapoints_to_alarm=st.session_state.datapoints_to_alarm_input,
234
+ evaluation_range=st.session_state.evaluation_range_input,
235
  aggregation_function=st.session_state.aggregation_function_input__alarm,
236
+ alarm_condition=st.session_state.alarm_condition_input
237
+ )
238
 
239
  def display_dataframe(title: str, df: pd.DataFrame) -> None:
240
  st.write(title)
241
  st.dataframe(df)
242
 
243
  def plot_time_series(df: pd.DataFrame, threshold: int, alarm_condition: str, evaluation_range: int) -> None:
244
+ timestamps = df['timestamp']
245
  response_times = df[st.session_state.aggregation_function_input__alarm]
246
 
247
  segments = []
 
262
  fig, ax1 = plt.subplots()
263
 
264
  color = 'tab:blue'
265
+ ax1.set_xlabel('timestamp')
266
  ax1.set_ylabel('Response Time (ms)', color=color)
267
 
268
  for segment in segments:
 
287
 
288
  for idx in period_indices:
289
  if idx % evaluation_range == 0:
290
+ ax1.axvline(x=df['timestamp'].iloc[idx], color='green', linestyle='-', alpha=0.3)
291
  max_value = max(filter(lambda x: x is not None, df[st.session_state.aggregation_function_input__alarm]))
292
+ ax1.text(df['timestamp'].iloc[idx], max_value * 0.95, f"[{idx // evaluation_range}]", rotation=90, verticalalignment='bottom', color='grey', alpha=0.7, fontsize=8)
293
  else:
294
+ ax1.axvline(x=df['timestamp'].iloc[idx], color='grey', linestyle='--', alpha=0.3)
295
 
296
  ax1.annotate('Alarm threshold', xy=(0.98, threshold), xycoords=('axes fraction', 'data'), ha='right', va='bottom', fontsize=8, color='red', backgroundcolor='none')
297
 
 
310
  symbol_data = {
311
  "Symbol": ["🔴", "⚫️", "🟢"],
312
  "Meaning": [
313
+ "Breaching data point: This data point breaches the threshold and alarm condition (<, <=, >=, >)",
314
+ "Missing data point: This data point is missing or not reported",
315
+ "Non-breaching data point: This data point is does not breach the threshold and alarm condition (<, <=, >=, >)"
316
  ]
317
  }
318
  symbol_df = pd.DataFrame(symbol_data)
319
  st.table(symbol_df)
320
 
321
  # Columns
322
+ st.write(dedent("""\
323
+ #### Columns: [The 4 Strategies](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html#alarms-and-missing-data) for handling missing data points
324
 
325
  Sometimes, no metric events may have been reported during a given time period. In this case,
326
  you must decide how you will treat missing data points. Ignore it? Or consider it a failure.
 
356
 
357
  if __name__ == "__main__":
358
  main()
 
concattedfiles.py DELETED
@@ -1,381 +0,0 @@
1
- .
2
- ├── streamlit_app.py
3
- └── utils.py
4
-
5
- 1 directory, 2 files
6
-
7
-
8
-
9
- # File: ./streamlit_app.py
10
- import streamlit as st
11
- import pandas as pd
12
- import matplotlib.pyplot as plt
13
- from datetime import datetime, time, date
14
- from typing import List, Dict, Any, Tuple
15
- from utils import generate_random_data, calculate_percentile, evaluate_alarm_state, aggregate_data
16
-
17
- # Constants
18
- HARD_CODED_DATE = date(2024, 7, 26)
19
-
20
- def main():
21
- st.title("Streamlit App for Data Generation and Analysis")
22
-
23
- # Initialize session state
24
- initialize_session_state()
25
-
26
- # Section 1 - Generate random data
27
- st.header("Section 1 - Generate Random Data")
28
- generate_data_form()
29
-
30
- if not st.session_state.df.empty:
31
- display_dataframe("Raw Event Data", st.session_state.df)
32
-
33
- # Section 2 - Calculate Percentile
34
- st.header("Section 2 - Calculate Percentile")
35
- percentile_form()
36
-
37
- if not st.session_state.percentile_df.empty:
38
- display_dataframe("Aggregated Summary Data", st.session_state.percentile_df)
39
-
40
- # Section 3 - Summary Data Aggregated by Period
41
- st.header("Section 3 - Summary Data Aggregated by Period")
42
- summary_by_period_form()
43
-
44
- if not st.session_state.summary_by_period_df.empty:
45
- display_dataframe("Summary Data Aggregated by Period", st.session_state.summary_by_period_df)
46
-
47
- # Section 4 - Evaluate Alarm State
48
- st.header("Section 4 - Evaluate Alarm State")
49
- alarm_state_form()
50
-
51
- if not st.session_state.alarm_state_df.empty:
52
- plot_time_series(st.session_state.summary_by_period_df, st.session_state.threshold_input, st.session_state.alarm_condition_input, st.session_state.evaluation_range_input)
53
- display_alarm_state_evaluation(st.session_state.alarm_state_df)
54
-
55
- display_key_tables()
56
-
57
- def initialize_session_state() -> None:
58
- if 'df' not in st.session_state:
59
- st.session_state.df = pd.DataFrame()
60
- if 'percentile_df' not in st.session_state:
61
- st.session_state.percentile_df = pd.DataFrame()
62
- if 'summary_by_period_df' not in st.session_state:
63
- st.session_state.summary_by_period_df = pd.DataFrame()
64
- if 'alarm_state_df' not in st.session_state:
65
- st.session_state.alarm_state_df = pd.DataFrame()
66
-
67
- def generate_data_form() -> None:
68
- with st.form(key='generate_data_form'):
69
- start_time_input = st.time_input("Start Time", time(12, 0), help="Select the start time for generating random data.")
70
- end_time_input = st.time_input("End Time", time(12, 30), help="Select the end time for generating random data.")
71
- count_input = st.slider("Count", min_value=1, max_value=200, value=60, help="Specify the number of data points to generate.")
72
- response_time_range_input = st.slider("Response Time Range (ms)", min_value=50, max_value=300, value=(100, 250), help="Select the range of response times in milliseconds.")
73
- null_percentage_input = st.slider("Null Percentage", min_value=0.0, max_value=1.0, value=0.5, help="Select the percentage of null values in the generated data.")
74
- submit_button = st.form_submit_button(label='Generate Data')
75
-
76
- if submit_button:
77
- st.session_state.df = generate_random_data(
78
- date=HARD_CODED_DATE,
79
- start_time=start_time_input,
80
- end_time=end_time_input,
81
- count=count_input,
82
- response_time_range=response_time_range_input,
83
- null_percentage=null_percentage_input
84
- )
85
-
86
- def percentile_form() -> None:
87
- freq_input = st.selectbox("Period (bin)", ['1min', '5min', '15min'], key='freq_input', help="Select the frequency for aggregating the data.")
88
- percentile_input = st.slider("Percentile", min_value=0.0, max_value=1.0, value=0.95, key='percentile_input', help="Select the percentile for calculating the aggregated summary data.")
89
- if not st.session_state.df.empty:
90
- st.session_state.percentile_df = calculate_percentile(st.session_state.df, freq_input, percentile_input)
91
-
92
- def summary_by_period_form() -> None:
93
- period_length_input = st.selectbox("Period Length", ['1min', '5min', '15min'], key='period_length_input', help="Select the period length for aggregating the summary data.")
94
- if not st.session_state.df.empty:
95
- st.session_state.summary_by_period_df = aggregate_data(st.session_state.df, period_length_input)
96
-
97
- def alarm_state_form() -> None:
98
- threshold_input = st.number_input("Threshold (ms)", min_value=50, max_value=300, value=150, key='threshold_input', help="Specify the threshold value for evaluating the alarm state.")
99
- datapoints_to_alarm_input = st.number_input("Datapoints to Alarm", min_value=1, value=3, key='datapoints_to_alarm_input', help="Specify the number of data points required to trigger an alarm.")
100
- evaluation_range_input = st.number_input("Evaluation Range", min_value=1, value=5, key='evaluation_range_input', help="Specify the range of data points to evaluate for alarm state.")
101
- aggregation_function_input = st.selectbox(
102
- "Aggregation Function",
103
- ['p50', 'p95', 'p99', 'max', 'min', 'average'],
104
- key='aggregation_function_input',
105
- help="Select the aggregation function for visualizing the data and computing alarms."
106
- )
107
- alarm_condition_input = st.selectbox(
108
- "Alarm Condition",
109
- ['>', '>=', '<', '<='],
110
- key='alarm_condition_input',
111
- help="Select the condition for evaluating the alarm state."
112
- )
113
- if not st.session_state.summary_by_period_df.empty:
114
- st.session_state.alarm_state_df = evaluate_alarm_state(
115
- summary_df=st.session_state.summary_by_period_df,
116
- threshold=threshold_input,
117
- datapoints_to_alarm=datapoints_to_alarm_input,
118
- evaluation_range=evaluation_range_input,
119
- aggregation_function=aggregation_function_input,
120
- alarm_condition=alarm_condition_input
121
- )
122
-
123
- def display_dataframe(title: str, df: pd.DataFrame) -> None:
124
- st.write(title)
125
- st.dataframe(df)
126
-
127
- def plot_time_series(df: pd.DataFrame, threshold: int, alarm_condition: str, evaluation_range: int) -> None:
128
- timestamps = df['Timestamp']
129
- response_times = df[st.session_state.aggregation_function_input]
130
-
131
- segments = []
132
- current_segment = {'timestamps': [], 'values': []}
133
-
134
- for timestamp, value in zip(timestamps, response_times):
135
- if pd.isna(value):
136
- if current_segment['timestamps']:
137
- segments.append(current_segment)
138
- current_segment = {'timestamps': [], 'values': []}
139
- else:
140
- current_segment['timestamps'].append(timestamp)
141
- current_segment['values'].append(value)
142
-
143
- if current_segment['timestamps']:
144
- segments.append(current_segment)
145
-
146
- fig, ax1 = plt.subplots()
147
-
148
- color = 'tab:blue'
149
- ax1.set_xlabel('Timestamp')
150
- ax1.set_ylabel('Response Time (ms)', color=color)
151
-
152
- for segment in segments:
153
- ax1.plot(segment['timestamps'], segment['values'], color=color, linewidth=0.5)
154
- ax1.scatter(segment['timestamps'], segment['values'], color=color, s=10)
155
-
156
- line_style = '--' if alarm_condition in ['<', '>'] else '-'
157
- ax1.axhline(y=threshold, color='r', linestyle=line_style, linewidth=0.8, label='Threshold')
158
- ax1.tick_params(axis='y', labelcolor=color)
159
-
160
- if alarm_condition in ['<=', '<']:
161
- ax1.fill_between(timestamps, 0, threshold, color='pink', alpha=0.3)
162
- else:
163
- ax1.fill_between(timestamps, threshold, response_times.max(), color='pink', alpha=0.3)
164
-
165
- period_indices = range(len(df))
166
- ax2 = ax1.twiny()
167
- ax2.set_xticks(period_indices)
168
- ax2.set_xticklabels(period_indices, fontsize=8)
169
- ax2.set_xlabel('Time Periods', fontsize=8)
170
- ax2.xaxis.set_tick_params(width=0.5)
171
-
172
- for idx in period_indices:
173
- if idx % evaluation_range == 0:
174
- ax1.axvline(x=df['Timestamp'].iloc[idx], color='green', linestyle='-', alpha=0.3)
175
- max_value = max(filter(lambda x: x is not None, df[st.session_state.aggregation_function_input]))
176
- ax1.text(df['Timestamp'].iloc[idx], max_value * 0.95, f"[{idx // evaluation_range}]", rotation=90, verticalalignment='bottom', color='grey', alpha=0.7, fontsize=8)
177
- else:
178
- ax1.axvline(x=df['Timestamp'].iloc[idx], color='grey', linestyle='--', alpha=0.3)
179
-
180
- ax1.annotate('Alarm threshold', xy=(0.98, threshold), xycoords=('axes fraction', 'data'), ha='right', va='bottom', fontsize=8, color='red', backgroundcolor='none')
181
-
182
- fig.tight_layout()
183
- st.pyplot(fig)
184
-
185
- def display_alarm_state_evaluation(df: pd.DataFrame) -> None:
186
- st.write("Alarm State Evaluation")
187
- st.dataframe(df)
188
-
189
- def display_key_tables() -> None:
190
- st.write("### Key")
191
-
192
- # Symbols
193
- st.write("#### Symbols")
194
- symbol_data = {
195
- "Symbol": ["X", "-", "0"],
196
- "Meaning": [
197
- "Breaching data point: This data point exceeds the threshold.",
198
- "Missing data point: This data point is missing or not reported.",
199
- "Non-breaching data point: This data point is within the threshold."
200
- ]
201
- }
202
- symbol_df = pd.DataFrame(symbol_data)
203
- st.table(symbol_df)
204
-
205
- # Columns
206
- st.write("#### Columns")
207
- column_data = {
208
- "Column": ["MISSING", "IGNORE", "BREACHING", "NOT BREACHING"],
209
- "Meaning": [
210
- "Action to take when all data points are missing. Possible values: INSUFFICIENT_DATA, Retain current state, ALARM, OK.",
211
- "Action to take when data points are missing but ignored. Possible values: Retain current state, ALARM, OK.",
212
- "Action to take when missing data points are treated as breaching. Possible values: ALARM, OK.",
213
- "Action to take when missing data points are treated as not breaching. Possible values: ALARM, OK."
214
- ]
215
- }
216
- column_df = pd.DataFrame(column_data)
217
- st.table(column_df)
218
-
219
- # States
220
- st.write("#### States")
221
- state_data = {
222
- "State": ["ALARM", "OK", "Retain current state", "INSUFFICIENT_DATA"],
223
- "Description": [
224
- "Alarm state is triggered.",
225
- "Everything is within the threshold.",
226
- "The current alarm state is maintained.",
227
- "Not enough data to make a determination."
228
- ]
229
- }
230
- state_df = pd.DataFrame(state_data)
231
- st.table(state_df)
232
-
233
- if __name__ == "__main__":
234
- main()
235
-
236
-
237
-
238
- # File: ./utils.py
239
- import random
240
- from datetime import datetime, timedelta, date, time
241
- import pandas as pd
242
- import numpy as np
243
- from typing import List, Iterator, Dict, Any, Optional
244
-
245
- def generate_random_data(
246
- date: date,
247
- start_time: time,
248
- end_time: time,
249
- count: int,
250
- response_time_range: (int, int),
251
- null_percentage: float
252
- ) -> pd.DataFrame:
253
- start_datetime: datetime = datetime.combine(date, start_time)
254
- end_datetime: datetime = datetime.combine(date, end_time)
255
-
256
- random_timestamps: List[datetime] = [
257
- start_datetime + timedelta(seconds=random.randint(0, int((end_datetime - start_datetime).total_seconds())))
258
- for _ in range(count)
259
- ]
260
- random_timestamps.sort()
261
-
262
- random_response_times: List[Optional[int]] = [
263
- random.randint(response_time_range[0], response_time_range[1]) for _ in range(count)
264
- ]
265
-
266
- null_count: int = int(null_percentage * count)
267
- null_indices: List[int] = random.sample(range(count), null_count)
268
- for idx in null_indices:
269
- random_response_times[idx] = None
270
-
271
- data: Dict[str, Any] = {
272
- 'Timestamp': random_timestamps,
273
- 'ResponseTime(ms)': random_response_times
274
- }
275
- df: pd.DataFrame = pd.DataFrame(data)
276
- return df
277
-
278
- def calculate_percentile(
279
- df: pd.DataFrame,
280
- freq: str,
281
- percentile: float
282
- ) -> pd.DataFrame:
283
- percentile_df: pd.DataFrame = df.groupby(pd.Grouper(key='Timestamp', freq=freq))["ResponseTime(ms)"]\
284
- .quantile(percentile).reset_index(name=f"p{int(percentile * 100)}_ResponseTime(ms)")
285
- percentile_df.replace(to_replace=np.nan, value=None, inplace=True)
286
- return percentile_df
287
-
288
- def aggregate_data(
289
- df: pd.DataFrame,
290
- period_length: str
291
- ) -> pd.DataFrame:
292
- aggregation_funcs = {
293
- 'p50': lambda x: np.percentile(x.dropna(), 50),
294
- 'p95': lambda x: np.percentile(x.dropna(), 95),
295
- 'p99': lambda x: np.percentile(x.dropna(), 99),
296
- 'max': lambda x: np.max(x.dropna()),
297
- 'min': lambda x: np.min(x.dropna()),
298
- 'average': lambda x: np.mean(x.dropna())
299
- }
300
-
301
- summary_df = df.groupby(pd.Grouper(key='Timestamp', freq=period_length)).agg(
302
- p50=('ResponseTime(ms)', aggregation_funcs['p50']),
303
- p95=('ResponseTime(ms)', aggregation_funcs['p95']),
304
- p99=('ResponseTime(ms)', aggregation_funcs['p99']),
305
- max=('ResponseTime(ms)', aggregation_funcs['max']),
306
- min=('ResponseTime(ms)', aggregation_funcs['min']),
307
- average=('ResponseTime(ms)', aggregation_funcs['average']),
308
- ).reset_index()
309
- return summary_df
310
-
311
- def chunk_list(input_list: List[Any], size: int = 3) -> Iterator[List[Any]]:
312
- while input_list:
313
- chunk: List[Any] = input_list[:size]
314
- yield chunk
315
- input_list = input_list[size:]
316
-
317
- def evaluate_alarm_state(
318
- summary_df: pd.DataFrame,
319
- threshold: int,
320
- datapoints_to_alarm: int,
321
- evaluation_range: int,
322
- aggregation_function: str,
323
- alarm_condition: str
324
- ) -> pd.DataFrame:
325
- data_points: List[Optional[float]] = list(summary_df[aggregation_function].values)
326
-
327
- data_table_dict: Dict[str, List[Any]] = {
328
- "DataPoints": [],
329
- "# of data points that must be filled": [],
330
- "MISSING": [],
331
- "IGNORE": [],
332
- "BREACHING": [],
333
- "NOT BREACHING": []
334
- }
335
-
336
- def check_condition(value, threshold, condition):
337
- if condition == '>':
338
- return value > threshold
339
- elif condition == '>=':
340
- return value >= threshold
341
- elif condition == '<':
342
- return value < threshold
343
- elif condition == '<=':
344
- return value <= threshold
345
-
346
- for chunk in chunk_list(input_list=data_points, size=evaluation_range):
347
- data_point_repr: str = ''
348
- num_dp_that_must_be_filled: int = 0
349
-
350
- for dp in chunk:
351
- if dp is None:
352
- data_point_repr += '-'
353
- elif check_condition(dp, threshold, alarm_condition):
354
- data_point_repr += 'X'
355
- else:
356
- data_point_repr += '0'
357
-
358
- if len(chunk) < evaluation_range:
359
- data_point_repr += '-' * (evaluation_range - len(chunk))
360
-
361
- if data_point_repr.count('-') > (evaluation_range - datapoints_to_alarm):
362
- num_dp_that_must_be_filled = datapoints_to_alarm - sum([data_point_repr.count('0'), data_point_repr.count('X')])
363
-
364
- data_table_dict["DataPoints"].append(data_point_repr)
365
- data_table_dict["# of data points that must be filled"].append(num_dp_that_must_be_filled)
366
-
367
- if num_dp_that_must_be_filled > 0:
368
- data_table_dict["MISSING"].append("INSUFFICIENT_DATA" if data_point_repr.count('-') == evaluation_range else "Retain current state")
369
- data_table_dict["IGNORE"].append("Retain current state")
370
- data_table_dict["BREACHING"].append("ALARM")
371
- data_table_dict["NOT BREACHING"].append("OK")
372
- else:
373
- data_table_dict["MISSING"].append("OK")
374
- data_table_dict["IGNORE"].append("Retain current state")
375
- data_table_dict["BREACHING"].append("ALARM" if 'X' * datapoints_to_alarm in data_point_repr else "OK")
376
- data_table_dict["NOT BREACHING"].append("ALARM" if '0' * datapoints_to_alarm not in data_point_repr else "OK")
377
-
378
- return pd.DataFrame(data_table_dict)
379
-
380
-
381
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils.py CHANGED
@@ -31,7 +31,7 @@ def generate_random_data(
31
  random_response_times[idx] = None
32
 
33
  data: Dict[str, Any] = {
34
- 'Timestamp': random_timestamps,
35
  'ResponseTime(ms)': random_response_times
36
  }
37
  df: pd.DataFrame = pd.DataFrame(data)
@@ -42,7 +42,7 @@ def calculate_percentile(
42
  freq: str,
43
  percentile: float
44
  ) -> pd.DataFrame:
45
- percentile_df: pd.DataFrame = df.groupby(pd.Grouper(key='Timestamp', freq=freq))["ResponseTime(ms)"]\
46
  .quantile(percentile).reset_index(name=f"p{int(percentile * 100)}_ResponseTime(ms)")
47
  percentile_df.replace(to_replace=np.nan, value=None, inplace=True)
48
  return percentile_df
@@ -63,7 +63,7 @@ def aggregate_data(
63
  'average': lambda x: np.mean(x.dropna()) if not x.dropna().empty else np.nan
64
  }
65
 
66
- summary_df = df.groupby(pd.Grouper(key='Timestamp', freq=period_length)).agg(
67
  p50=('ResponseTime(ms)', aggregation_funcs['p50']),
68
  p95=('ResponseTime(ms)', aggregation_funcs['p95']),
69
  p99=('ResponseTime(ms)', aggregation_funcs['p99']),
@@ -89,7 +89,7 @@ def re_aggregate_data(
89
  'average': lambda x: np.mean(x.dropna()) if not x.dropna().empty else np.nan
90
  }
91
 
92
- summary_df = df.groupby(pd.Grouper(key='Timestamp', freq=period_length)).agg(
93
  p50=('p50', aggregation_funcs['p50']),
94
  p95=('p95', aggregation_funcs['p95']),
95
  p99=('p99', aggregation_funcs['p99']),
@@ -99,6 +99,26 @@ def re_aggregate_data(
99
  ).reset_index()
100
  return summary_df
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def chunk_list(input_list: List[Any], size: int = 3) -> Iterator[List[Any]]:
104
  while input_list:
 
31
  random_response_times[idx] = None
32
 
33
  data: Dict[str, Any] = {
34
+ 'timestamp': random_timestamps,
35
  'ResponseTime(ms)': random_response_times
36
  }
37
  df: pd.DataFrame = pd.DataFrame(data)
 
42
  freq: str,
43
  percentile: float
44
  ) -> pd.DataFrame:
45
+ percentile_df: pd.DataFrame = df.groupby(pd.Grouper(key='timestamp', freq=freq))["ResponseTime(ms)"]\
46
  .quantile(percentile).reset_index(name=f"p{int(percentile * 100)}_ResponseTime(ms)")
47
  percentile_df.replace(to_replace=np.nan, value=None, inplace=True)
48
  return percentile_df
 
63
  'average': lambda x: np.mean(x.dropna()) if not x.dropna().empty else np.nan
64
  }
65
 
66
+ summary_df = df.groupby(pd.Grouper(key='timestamp', freq=period_length)).agg(
67
  p50=('ResponseTime(ms)', aggregation_funcs['p50']),
68
  p95=('ResponseTime(ms)', aggregation_funcs['p95']),
69
  p99=('ResponseTime(ms)', aggregation_funcs['p99']),
 
89
  'average': lambda x: np.mean(x.dropna()) if not x.dropna().empty else np.nan
90
  }
91
 
92
+ summary_df = df.groupby(pd.Grouper(key='timestamp', freq=period_length)).agg(
93
  p50=('p50', aggregation_funcs['p50']),
94
  p95=('p95', aggregation_funcs['p95']),
95
  p99=('p99', aggregation_funcs['p99']),
 
99
  ).reset_index()
100
  return summary_df
101
 
102
+ def downsample(df, period_minutes):
103
+ # Create a new datetime index at specified intervals
104
+ freq_str = f'{period_minutes}T'
105
+ new_index = pd.date_range(start=df['timestamp'].min(), end=df['timestamp'].max(), freq=freq_str)
106
+
107
+ # Create an empty DataFrame with the new index
108
+ df_downsampled = pd.DataFrame(index=new_index)
109
+
110
+ # Set the original DataFrame's index to the timestamp column
111
+ df.set_index('timestamp', inplace=True)
112
+
113
+ # Interpolate the values for each column
114
+ for column in df.columns:
115
+ df_downsampled[column] = df[column].resample(freq_str).interpolate(method='linear')
116
+
117
+ # Reset index to have timestamp as a column again
118
+ df_downsampled.reset_index(inplace=True)
119
+ df_downsampled.rename(columns={'index': 'timestamp'}, inplace=True)
120
+
121
+ return df_downsampled
122
 
123
  def chunk_list(input_list: List[Any], size: int = 3) -> Iterator[List[Any]]:
124
  while input_list: