decompose metrics into components
Browse files- logmetric.py +132 -64
logmetric.py
CHANGED
@@ -107,73 +107,80 @@ class LogMetric(evaluate.Metric):
|
|
107 |
return len(intersection) / len(union)
|
108 |
|
109 |
# A score depending on the difference in length of two sentences
|
110 |
-
def get_length_score(self,
|
111 |
-
s1len = len(sentence1)
|
112 |
-
s2len = len(sentence2)
|
113 |
|
114 |
-
|
|
|
115 |
|
116 |
-
|
117 |
-
def get_overall_similarity(self, sentence1, sentence2):
|
118 |
-
return self.sacrebleu_metric.compute(predictions=sentence1, references=sentence2)["score"]
|
119 |
-
|
120 |
-
# s1split = sentence1.split()
|
121 |
-
# s2split = sentence2.split()
|
122 |
|
123 |
-
|
124 |
-
|
|
|
|
|
125 |
|
126 |
-
#
|
|
|
127 |
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
-
|
133 |
-
pred_split_log = self.timestamp_pattern.split(pred)
|
134 |
-
ref_split_log = self.timestamp_pattern.split(ref)
|
135 |
|
136 |
-
#
|
137 |
-
|
138 |
-
|
139 |
|
140 |
-
#
|
141 |
-
|
142 |
-
pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
min_logentries = min(len(pred_logentries), len(ref_logentries))
|
150 |
-
|
151 |
-
# Case there are no timestamps in reference and none in prediction
|
152 |
-
# we can compute bleu directly from original prediction (ref will be empty, but we offload this to the bleu metric)
|
153 |
-
if (len(pred_logentries) == 0 and len(ref_logentries) == 0):
|
154 |
-
# any sensible log reference is empty if there is no timestamp, hence it suffices to check exact match
|
155 |
-
logmsg_score = 100.0 if pred == ref else 0.0
|
156 |
-
return 0.3 * 100.0 + 0.7 * logmsg_score
|
157 |
|
158 |
-
#
|
159 |
-
if (len(
|
160 |
-
|
161 |
-
|
162 |
|
163 |
-
|
164 |
# replace all digits in the reference timestamp (first timestamp) with '\d' to get
|
165 |
# a regex that describes the format
|
166 |
-
pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(
|
167 |
|
168 |
-
matchesPatternScore =
|
169 |
-
monotonicallyIncreasingScore =
|
170 |
|
171 |
# A variable to save the previous timestamp (as datetime obj) to check monotonicity
|
172 |
prev_datetime = None
|
173 |
# Convert matches to datetime objects
|
174 |
-
|
175 |
-
|
176 |
-
ts = pred_logentries[i][0]
|
177 |
try:
|
178 |
# Check if the format matches with the format of the first timestamp
|
179 |
# TODO!! Check this later, maybe it is too restricting for training a llm
|
@@ -192,18 +199,69 @@ class LogMetric(evaluate.Metric):
|
|
192 |
# e.g. date format not parsable by dateutil.parser
|
193 |
matchesPatternScore = 0.0
|
194 |
monotonicallyIncreasingScore = 0.0
|
195 |
-
|
196 |
-
#
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
-
# we aggregate the bleu scores where we weight the difference in logentries with a score of 0
|
204 |
-
logmessage_aggregated_score = ((min_logentries / max_logentries) * local_score)
|
205 |
# return weighted overall score of all the different scores
|
206 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
def _compute(self, predictions, references):
|
209 |
"""Returns the scores"""
|
@@ -211,13 +269,23 @@ class LogMetric(evaluate.Metric):
|
|
211 |
# TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
|
212 |
|
213 |
t_before_logmetric = time.perf_counter()
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
|
|
|
217 |
logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
|
218 |
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
}
|
223 |
|
|
|
107 |
return len(intersection) / len(union)
|
108 |
|
109 |
# A score depending on the difference in length of two sentences
|
110 |
+
def get_length_score(self, preds_split, refs_split):
    """Score how closely the lengths of paired items agree.

    Takes ``len()`` of every item in ``preds_split`` and in ``refs_split``
    and hands the two length arrays to ``self.smapeScore``.

    Args:
        preds_split: sequence of sized items (the visible caller passes a
            list of log-message strings).
        refs_split: sequence of sized items, paired by position with
            ``preds_split``.

    Returns:
        ``1.0`` when both inputs are empty (nothing to compare), otherwise
        whatever ``self.smapeScore`` returns for the two length arrays.
    """
    # A plain comprehension is used instead of np.vectorize(len): vectorize
    # raises ValueError on size-0 input unless `otypes` is given, which made
    # the "no log entries at all" case crash.
    pred_content_lengths = np.array([len(p) for p in preds_split], dtype=int)
    ref_content_lengths = np.array([len(r) for r in refs_split], dtype=int)

    # Nothing to compare on either side: report perfect agreement, the same
    # value smapeScore documents for its empty case.
    if pred_content_lengths.size == 0 and ref_content_lengths.size == 0:
        return 1.0

    return self.smapeScore(pred_content_lengths, ref_content_lengths)
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
+
# helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
|
118 |
+
def smapeScore(self, P, R):
    """Return a similarity score in [0, 1] derived from SMAPE.

    The score is ``1 - mean(|R - P| / (|R| + |P|))``, so identical inputs
    score 1.0 and maximally different inputs (one side zero) score 0.0.

    Args:
        P: predicted value(s); either a single number or a sequence/array
            of numbers.
        R: reference value(s); must be of the same kind as ``P`` (both
            single numbers, or both sequences of the same length).

    Returns:
        A Python ``float``. Two empty sequences score 1.0.

    Raises:
        ValueError: if one argument is a single number and the other is
            not, or if two sequences differ in length.
    """
    # np.ndim also recognises NumPy scalars (np.int64, ...), which a plain
    # isinstance(..., (int, float)) check would misclassify.
    P_isnumber = np.ndim(P) == 0
    R_isnumber = np.ndim(R) == 0

    # either both must be numbers or both must be no number
    if P_isnumber != R_isnumber:
        raise ValueError("P and R must both be numbers or both be sequences")

    p = np.atleast_1d(np.asarray(P, dtype=float))
    r = np.atleast_1d(np.asarray(R, dtype=float))
    if p.shape != r.shape:
        raise ValueError("P and R must have the same length")

    n = p.size
    if n == 0:
        return 1.0  # nothing to compare; avoids dividing by n = 0

    denominator = np.abs(r) + np.abs(p)
    # The denominator is 0 only where both values are 0; the numerator is 0
    # there as well, so substituting 1 yields a per-element error of 0
    # instead of nan.
    denominator[denominator == 0] = 1.0

    smape = np.sum(np.abs(r - p) / denominator) / n
    # Convert the error into a score so that every branch agrees on
    # "1.0 means identical".
    return float(1.0 - smape)
|
136 |
+
|
137 |
+
# splits both strings at \n and then computes the smape_score of their lengths
|
138 |
+
def getLineCountScore(self, pred, ref):
    """Compare how many lines the prediction and the reference contain.

    Args:
        pred: predicted log text.
        ref: reference log text.

    Returns:
        The result of ``self.smapeScore`` applied to the two line counts,
        where a line count is ``len(text.splitlines())``.
    """
    # Count lines for the prediction first, then the reference.
    line_counts = [len(text.splitlines()) for text in (pred, ref)]
    return self.smapeScore(*line_counts)
|
146 |
+
|
147 |
+
|
148 |
+
# Get different scores regarding the content of a log message
|
149 |
+
def getLineContentScore(self, pred_logMessages, ref_logMessages):
    """Compute content scores for position-paired log messages.

    Args:
        pred_logMessages: list of predicted log-message strings.
        ref_logMessages: list of reference log-message strings, paired by
            position with ``pred_logMessages``.

    Returns:
        A tuple ``(sacrebleu_score, mean_jaccard_score, smape_length_score)``:
        the ``"score"`` entry of ``self.sacrebleu_metric.compute`` divided
        by 100, the mean of ``self.get_jaccard_similarity`` over the
        whitespace-token sets of each pair, and the result of
        ``self.get_length_score`` on the two lists. When both lists are
        empty the tuple is ``(1.0, 1.0, 1.0)``.
    """
    # With no message pairs there is nothing to score: np.mean([]) would be
    # nan and the BLEU call has no sentences to work on. Report the vacuous
    # case as perfect, matching how empty input is treated by the length
    # and timestamp scores in this file.
    if len(pred_logMessages) == 0 and len(ref_logMessages) == 0:
        return 1.0, 1.0, 1.0

    bleu_result = self.sacrebleu_metric.compute(
        predictions=pred_logMessages, references=ref_logMessages
    )
    sacrebleu_score = bleu_result["score"] / 100.0

    smape_length_score = self.get_length_score(pred_logMessages, ref_logMessages)

    # Jaccard works on token sets: split every message on whitespace and
    # compare pair-wise, then average over all pairs.
    jaccard_scores = [
        self.get_jaccard_similarity(set(p.split()), set(r.split()))
        for p, r in zip(pred_logMessages, ref_logMessages)
    ]
    mean_jaccard_score = np.mean(jaccard_scores)

    return sacrebleu_score, mean_jaccard_score, smape_length_score
|
162 |
+
|
163 |
+
# get different scores regarding the timestamp
|
164 |
+
def getTimestampsScore(self, pred_timestamps, ref_timestamps):
|
165 |
+
timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
+
# if there are no predicted timestamps, return early. It is still consistent and monotonic.
|
168 |
+
if (len(pred_timestamps) == 0):
|
169 |
+
return timestamp_amt_score, 1.0, 1.0
|
170 |
+
|
171 |
|
|
|
172 |
# replace all digits in the first predicted timestamp with '\d' to get
|
173 |
# a regex that describes the format
|
174 |
+
pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0]))
|
175 |
|
176 |
+
matchesPatternScore = 1.0
|
177 |
+
monotonicallyIncreasingScore = 1.0
|
178 |
|
179 |
# A variable to save the previous timestamp (as datetime obj) to check monotonicity
|
180 |
prev_datetime = None
|
181 |
# Convert matches to datetime objects
|
182 |
+
for i in range(len(pred_timestamps)):
|
183 |
+
ts = pred_timestamps[i]
|
|
|
184 |
try:
|
185 |
# Check if the format matches with the format of the first timestamp
|
186 |
# TODO!! Check this later, maybe it is too restricting for training a llm
|
|
|
199 |
# e.g. date format not parsable by dateutil.parser
|
200 |
matchesPatternScore = 0.0
|
201 |
monotonicallyIncreasingScore = 0.0
|
202 |
+
|
203 |
+
# matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
|
204 |
+
return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore
|
205 |
+
|
206 |
+
|
207 |
+
|
208 |
+
def getLogMetric(self, pred: str, ref: str):
    """Score one predicted log against its reference, component by component.

    Both texts are stripped of surrounding whitespace, split on
    ``self.timestamp_pattern`` into timestamps and log messages, and then
    passed to the individual scoring helpers.

    Args:
        pred: predicted log text.
        ref: reference log text.

    Returns:
        A dict with one entry per component score: line count, message
        length, message sacrebleu, message jaccard, timestamp count,
        timestamp format consistency and timestamp monotonicity.
    """
    surrounding = ' \t\n\r'
    ref = ref.strip(surrounding)
    pred = pred.strip(surrounding)

    def split_entries(text):
        # Split the log on timestamps. The odd positions of the result are
        # read as timestamps and the position after each as its message
        # (presumably the pattern has exactly one capturing group -- TODO
        # confirm against the pattern definition).
        parts = self.timestamp_pattern.split(text)
        stamps, messages = [], []
        for idx in range(1, len(parts), 2):
            stamps.append(parts[idx])
            messages.append(parts[idx + 1])
        return stamps, messages

    line_count_score = self.getLineCountScore(pred, ref)

    # One log entry always consists of timestamp + log-message.
    pred_timestamps, pred_logMessages = split_entries(pred)
    ref_timestamps, ref_logMessages = split_entries(ref)

    # Only the entries present on both sides are compared for content, so
    # the content scores can be good even if the number of predicted lines
    # is wrong; the longer list is cut off.
    shared_entries = min(len(pred_logMessages), len(ref_logMessages))
    content_bleu, content_jaccard, content_length = self.getLineContentScore(
        pred_logMessages[:shared_entries],
        ref_logMessages[:shared_entries],
    )

    stamp_count, stamp_format, stamp_monotonic = self.getTimestampsScore(
        pred_timestamps, ref_timestamps
    )

    # TODO:
    # linecontentordering_permutations = getLineContentOrderingScore(pred_logMessages, ref_logMessages)

    # Return every component score under its own key (no weighting is applied here).
    return {
        "linecount_difference_SMAPE_score": line_count_score,
        "linecontentlength_difference_SMAPE_score": content_length,
        "linecontent_sacrebleu_score": content_bleu,
        "linecontent_jaccard_score": content_jaccard,
        "timestamps_SMAPE_difference_score": stamp_count,
        "timestamps_formatConsistency_score": stamp_format,
        "timestamps_monotinicity_score": stamp_monotonic,
    }
|
265 |
|
266 |
def _compute(self, predictions, references):
|
267 |
"""Returns the scores"""
|
|
|
269 |
# TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
|
270 |
|
271 |
t_before_logmetric = time.perf_counter()
|
272 |
+
metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
|
273 |
+
# Extract keys (assuming all dictionaries have the same keys)
|
274 |
+
keys = metric_dicts[0].keys()
|
275 |
+
|
276 |
+
# Convert list of dictionaries into a 2D numpy array
|
277 |
+
values = np.array([list(d.values()) for d in metric_dicts])
|
278 |
+
|
279 |
+
# Calculate the mean along the vertical axis (axis=0)
|
280 |
+
mean_values = np.mean(values, axis=0)
|
281 |
+
|
282 |
+
# a dictionary, matching the keys with their corresponding mean values
|
283 |
+
metric_result = dict(zip(keys, mean_values))
|
284 |
|
285 |
+
t_after_logmetric = time.perf_counter()
|
286 |
logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
|
287 |
|
288 |
+
metric_result["duration"] = logmetric_duration
|
289 |
+
|
290 |
+
return metric_result
|
|
|
291 |
|