svenwey committed on
Commit 37ee0fe
1 Parent(s): 502c950

decompose metrics into components

Files changed (1):
  1. logmetric.py +132 -64
logmetric.py CHANGED
@@ -107,73 +107,80 @@ class LogMetric(evaluate.Metric):
         return len(intersection) / len(union)
 
     # A score depending on the difference in length of two sentences
-    def get_length_score(self, sentence1, sentence2):
-        s1len = len(sentence1)
-        s2len = len(sentence2)
-
-        return 1 - (abs(s1len - s2len) / max(s1len, s2len))
-
-    # Use minimum edit distance between two sentences
-    def get_overall_similarity(self, sentence1, sentence2):
-        return self.sacrebleu_metric.compute(predictions=sentence1, references=sentence2)["score"]
-
-        # s1split = sentence1.split()
-        # s2split = sentence2.split()
-
-        # jaccard_score = self.get_jaccard_similarity(set(s1split), set(s2split))
-        # length_score = self.get_length_score(s1split, s2split)
-
-        # return (jaccard_score * 0.7 + length_score * 0.3) * 100.0
-
-    def getLogMetric(self, pred : str, ref : str):
-        ref = ref.strip(' \t\n\r')
-        pred = pred.strip(' \t\n\r')
-
-        # Split log on timestamps
-        pred_split_log = self.timestamp_pattern.split(pred)
-        ref_split_log = self.timestamp_pattern.split(ref)
-
-        # One logentry always consists of timestamp + log-message
-        pred_logentries = []
-        ref_logentries = []
-
-        # reorganize log into logentry-tuples, consisting of timestamp + log-message
-        for i in range(1, len(pred_split_log), 2):
-            pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
-
-        for i in range(1, len(ref_split_log), 2):
-            ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
-
-        # The number of logentries of the reference/prediction which has more/less entries/timestamps
-        max_logentries = max(len(pred_logentries), len(ref_logentries))
-        min_logentries = min(len(pred_logentries), len(ref_logentries))
-
-        # Case there are no timestamps in reference and none in prediction
-        # we can compute bleu directly from original prediction (ref will be empty, but we offload this to the bleu metric)
-        if (len(pred_logentries) == 0 and len(ref_logentries) == 0):
-            # any sensible log reference is empty if there is no timestamp, hence it suffices to check exact match
-            logmsg_score = 100.0 if pred == ref else 0.0
-            return 0.3 * 100.0 + 0.7 * logmsg_score
-
-        # Case one has 0 timestamps, other has >0 timestamps
-        if (len(pred_logentries) == 0 or len(ref_logentries) == 0):
-            # It is nonsensical to compare something in this case
-            return 0.0
+    def get_length_score(self, preds_split, refs_split):
+        pred_content_lengths = np.vectorize(len)(preds_split)
+        ref_content_lengths = np.vectorize(len)(refs_split)
+
+        return self.smapeScore(pred_content_lengths, ref_content_lengths)
+
+    # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
+    def smapeScore(self, P, R):
+        P_isnumber = isinstance(P, (int, float))
+        R_isnumber = isinstance(R, (int, float))
+
+        # either both must be numbers or neither may be
+        assert P_isnumber == R_isnumber
+
+        if not P_isnumber:
+            assert(len(P) == len(R))
+
+        if P_isnumber and R_isnumber:
+            if P == 0 and R == 0: return 1.0  # since this leads to (|R| + |P|) = 0
+            n = 1
+        else:
+            if P == [] and R == []: return 1.0  # since this leads to n = 0
+            n = len(P)
+
+        return 1/n * np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))
+
+    # splits both strings at \n and then computes the smape_score of their line counts
+    def getLineCountScore(self, pred, ref):
+        pred_lines_amt = len(pred.splitlines())
+        ref_lines_amt = len(ref.splitlines())
+
+        # print("#pred_lines:", pred_lines_amt)
+        # print("#ref_lines:", ref_lines_amt)
+
+        return self.smapeScore(pred_lines_amt, ref_lines_amt)
+
+    # Get different scores regarding the content of a log-message
+    def getLineContentScore(self, pred_logMessages, ref_logMessages):
+        sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_logMessages, references=ref_logMessages)["score"] / 100.0
+
+        smape_length_score = self.get_length_score(pred_logMessages, ref_logMessages)
+
+        # Split the single log-messages (for jaccard)
+        preds_split = [p.split() for p in pred_logMessages]
+        refs_split = [r.split() for r in ref_logMessages]
+
+        # compute jaccard pair-wise, then take the mean of all results
+        mean_jaccard_score = np.mean([self.get_jaccard_similarity(set(p_split), set(r_split)) for p_split, r_split in zip(preds_split, refs_split)])
+
+        return sacrebleu_score, mean_jaccard_score, smape_length_score
+
+    # get different scores regarding the timestamps
+    def getTimestampsScore(self, pred_timestamps, ref_timestamps):
+        timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
+
+        # if there are no predicted timestamps, return early. It is still consistent and monotonic.
+        if (len(pred_timestamps) == 0):
+            return timestamp_amt_score, 1.0, 1.0
 
         # replace all digits in the reference timestamp (first timestamp) with '\d' to get
         # a regex that describes the format
-        pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_logentries[0][0]))
-
-        matchesPatternScore = 100.0
-        monotonicallyIncreasingScore = 100.0
+        pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0]))
+
+        matchesPatternScore = 1.0
+        monotonicallyIncreasingScore = 1.0
 
         # A variable to save the previous timestamp (as datetime obj) to check monotonicity
         prev_datetime = None
         # Convert matches to datetime objects
-        # TODO TODO TODO fix this:
-        for i in range(min_logentries):
-            ts = pred_logentries[i][0]
+        for i in range(len(pred_timestamps)):
+            ts = pred_timestamps[i]
             try:
                 # Check if the format matches with the format of the first timestamp
                 # TODO!! Check this later, maybe it is too restricting for training a llm
@@ -192,18 +199,69 @@ class LogMetric(evaluate.Metric):
                 # e.g. date format not parsable by dateutil.parser
                 matchesPatternScore = 0.0
                 monotonicallyIncreasingScore = 0.0
-
-        # apply jaccard-similarity to every pred-ref pair and then take mean score * 100
-        local_score = self.get_overall_similarity(
-            list(map(lambda t: t[1], pred_logentries))[:min_logentries],
-            list(map(lambda t: t[1], ref_logentries))[:min_logentries]
-        )
-
-        # we aggregate the bleu scores where we weight the difference in logentries with a score of 0
-        logmessage_aggregated_score = ((min_logentries / max_logentries) * local_score)
+
+        # matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
+        return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore
+
+    def getLogMetric(self, pred : str, ref : str):
+        ref = ref.strip(' \t\n\r')
+        pred = pred.strip(' \t\n\r')
+
+        linecount_difference_SMAPE = self.getLineCountScore(pred, ref)
+
+        # Split log on timestamps
+        pred_split_log = self.timestamp_pattern.split(pred)
+        ref_split_log = self.timestamp_pattern.split(ref)
+
+        # One logentry always consists of timestamp + log-message
+        # pred_logentries = []
+        # ref_logentries = []
+
+        pred_timestamps = []
+        pred_logMessages = []
+
+        ref_timestamps = []
+        ref_logMessages = []
+
+        # reorganize log into timestamps and log-messages
+        for i in range(1, len(pred_split_log), 2):
+            # pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
+            pred_timestamps.append(pred_split_log[i])
+            pred_logMessages.append(pred_split_log[i+1])
+
+        for i in range(1, len(ref_split_log), 2):
+            # ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
+            ref_timestamps.append(ref_split_log[i])
+            ref_logMessages.append(ref_split_log[i+1])
+
+        # We cut off the longer list, since we only want to compute the content-scores for actually predicted lines
+        # -> hence, the content score can be good even if the amount of predicted lines is wrong
+        min_logentries = min(len(pred_logMessages), len(ref_logMessages))
+        # print("min_logentries:", min_logentries)
+        pred_logMessages = pred_logMessages[:min_logentries]
+        ref_logMessages = ref_logMessages[:min_logentries]
+
+        linecontent_sacrebleu, linecontent_jaccard, linecontentlength_difference_SMAPE = self.getLineContentScore(pred_logMessages, ref_logMessages)
+
+        timestamps_difference_SMAPE, timestamps_formatConsistency_absolute, timestamps_monotinicity_absolute = self.getTimestampsScore(pred_timestamps, ref_timestamps)
+
+        # TODO:
+        # linecontentordering_permutations = getLineContentOrderingScore(pred_logMessages, ref_logMessages)
+
         # return weighted overall score of all the different scores
-        return 0.2 * monotonicallyIncreasingScore + 0.1 * matchesPatternScore + 0.7 * logmessage_aggregated_score
+        return {"linecount_difference_SMAPE_score": linecount_difference_SMAPE,
+                "linecontentlength_difference_SMAPE_score": linecontentlength_difference_SMAPE,
+                "linecontent_sacrebleu_score": linecontent_sacrebleu,
+                "linecontent_jaccard_score": linecontent_jaccard,
+                "timestamps_SMAPE_difference_score": timestamps_difference_SMAPE,
+                "timestamps_formatConsistency_score": timestamps_formatConsistency_absolute,
+                "timestamps_monotinicity_score": timestamps_monotinicity_absolute
+                }
 
     def _compute(self, predictions, references):
         """Returns the scores"""
@@ -211,13 +269,23 @@ class LogMetric(evaluate.Metric):
         # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
 
         t_before_logmetric = time.perf_counter()
-        timestamp_score = np.mean([self.getLogMetric(p,r) for p,r in zip(predictions,references)])
-        t_after_logmetric = time.perf_counter()
+        metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
+        # Extract keys (assuming all dictionaries have the same keys)
+        keys = metric_dicts[0].keys()
+
+        # Convert list of dictionaries into a 2D numpy array
+        values = np.array([list(d.values()) for d in metric_dicts])
+
+        # Calculate the mean along the vertical axis (axis=0)
+        mean_values = np.mean(values, axis=0)
+
+        # a dictionary, matching the keys with their corresponding mean values
+        metric_result = dict(zip(keys, mean_values))
 
+        t_after_logmetric = time.perf_counter()
         logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
 
-        return {
-            "score": timestamp_score,
-            "duration": logmetric_duration
-        }
+        metric_result["duration"] = logmetric_duration
+
+        return metric_result
 
 
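The new smapeScore helper is what all the *_SMAPE_score components are built on: it returns 0.0 when prediction and reference agree exactly and approaches 1.0 as they diverge. A minimal standalone sketch of the same formula (the name smape and the array coercion are mine, not part of the commit):

import numpy as np

def smape(P, R):
    # mean(|R - P| / (|R| + |P|)), scaled to [0, 1]; 0.0 means identical
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    if P.size == 0 and R.size == 0:
        return 1.0  # degenerate case, handled the same way as in the commit
    return float(np.mean(np.abs(R - P) / (np.abs(R) + np.abs(P))))

print(smape([3, 5], [3, 5]))    # 0.0 -> line lengths agree exactly
print(smape([0, 10], [4, 10]))  # 0.5 -> first pair disagrees completely

Note that 0.0 is the best value for the SMAPE components, while 1.0 is the best value for the jaccard, format-consistency and monotonicity components; the commit reports the components raw instead of folding them into one weighted number, which is exactly the decomposition named in the commit message.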
199
  # e.g. date format not parsable by dateutil.parser
200
  matchesPatternScore = 0.0
201
  monotonicallyIncreasingScore = 0.0
202
+
203
+ # matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
204
+ return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore
205
+
206
+
207
+
208
+ def getLogMetric(self, pred : str, ref : str):
209
+ ref = ref.strip(' \t\n\r')
210
+ pred = pred.strip(' \t\n\r')
211
+
212
+ linecount_difference_SMAPE = self.getLineCountScore(pred, ref)
213
+
214
+
215
+ # Split log on timestamps
216
+ pred_split_log = self.timestamp_pattern.split(pred)
217
+ ref_split_log = self.timestamp_pattern.split(ref)
218
 
219
+ # One logentry always consists of timestamp + log-message
220
+ # pred_logentries = []
221
+ # ref_logentries = []
222
+
223
+ pred_timestamps = []
224
+ pred_logMessages = []
225
+
226
+ ref_timestamps = []
227
+ ref_logMessages = []
228
+ # reorganize log into logentry-tuples, consisting of timestamp + log-message
229
+ for i in range(1, len(pred_split_log), 2):
230
+ # pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
231
+ pred_timestamps.append(pred_split_log[i])
232
+ pred_logMessages.append(pred_split_log[i+1])
233
+
234
+
235
+ for i in range(1, len(ref_split_log), 2):
236
+ # ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
237
+ ref_timestamps.append(ref_split_log[i])
238
+ ref_logMessages.append(ref_split_log[i+1])
239
+
240
+ # We cut off the longer list, since we only want to compute the content-scores for actually predicted lines
241
+ # -> hence, the content score can be good, even if the amount of predicted lines are wrong
242
+ min_logentries = min(len(pred_logMessages), len(ref_logMessages))
243
+ # print("min_logentries:", min_logentries)
244
+ pred_logMessages = pred_logMessages[:min_logentries]
245
+ ref_logMessages = ref_logMessages[:min_logentries]
246
+
247
+
248
+ linecontent_sacrebleu, linecontent_jaccard, linecontentlength_difference_SMAPE = self.getLineContentScore(pred_logMessages, ref_logMessages)
249
+
250
+ timestamps_difference_SMAPE, timestamps_formatConsistency_absolute, timestamps_monotinicity_absolute = self.getTimestampsScore(pred_timestamps, ref_timestamps)
251
+
252
+ #TODO:
253
+ # linecontentordering_permutations = getLineContentOrderingScore(pred_logMessages, ref_logMessages)
254
+
255
 
 
 
256
  # return weighted overall score of all the different scores
257
+ return {"linecount_difference_SMAPE_score": linecount_difference_SMAPE,
258
+ "linecontentlength_difference_SMAPE_score": linecontentlength_difference_SMAPE,
259
+ "linecontent_sacrebleu_score": linecontent_sacrebleu,
260
+ "linecontent_jaccard_score": linecontent_jaccard,
261
+ "timestamps_SMAPE_difference_score": timestamps_difference_SMAPE,
262
+ "timestamps_formatConsistency_score": timestamps_formatConsistency_absolute,
263
+ "timestamps_monotinicity_score": timestamps_monotinicity_absolute
264
+ }
265
 
266
  def _compute(self, predictions, references):
267
  """Returns the scores"""
 
269
  # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
270
 
271
  t_before_logmetric = time.perf_counter()
272
+ metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
273
+ # Extract keys (assuming all dictionaries have the same keys)
274
+ keys = metric_dicts[0].keys()
275
+
276
+ # Convert list of dictionaries into a 2D numpy array
277
+ values = np.array([list(d.values()) for d in metric_dicts])
278
+
279
+ # Calculate the mean along the vertical axis (axis=0)
280
+ mean_values = np.mean(values, axis=0)
281
+
282
+ # a dictionary, matching the keys with their corresponding mean values
283
+ metric_result = dict(zip(keys, mean_values))
284
 
285
+ t_after_logmetric = time.perf_counter()
286
  logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
287
 
288
+ metric_result["duration"] = logmetric_duration
289
+
290
+ return metric_result
 
291
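The reworked _compute no longer averages one scalar; it averages the per-pair score dictionaries key by key. This relies on every dictionary listing its keys in the same insertion order, which holds because getLogMetric always builds the dict the same way. A self-contained sketch of that aggregation with made-up numbers:

import numpy as np

metric_dicts = [
    {"linecount_difference_SMAPE_score": 0.0, "linecontent_jaccard_score": 1.0},
    {"linecount_difference_SMAPE_score": 0.2, "linecontent_jaccard_score": 0.5},
]

keys = metric_dicts[0].keys()
values = np.array([list(d.values()) for d in metric_dicts])
mean_values = np.mean(values, axis=0)  # column-wise mean, one value per key

print({k: float(v) for k, v in zip(keys, mean_values)})
# {'linecount_difference_SMAPE_score': 0.1, 'linecontent_jaccard_score': 0.75}

# Loading and running the metric script itself would look roughly like this
# (the path is illustrative, not taken from the commit):
# import evaluate
# logmetric = evaluate.load("logmetric.py")
# result = logmetric.compute(predictions=["..."], references=["..."])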