Spaces:

svenwey
/

logmetric

Sleeping

App Files Files Community

svenwey committed on Sep 20, 2024

Commit

37ee0fe

1 Parent(s): 502c950

decompose metrics into components

Browse files

Files changed (1) hide show

logmetric.py +132 -64

logmetric.py CHANGED Viewed

@@ -107,73 +107,80 @@ class LogMetric(evaluate.Metric):
         return len(intersection) / len(union)
     # A score depending on the difference in length of two sentences
-    def get_length_score(self, sentence1, sentence2):
-        s1len = len(sentence1)
-        s2len = len(sentence2)
-        return 1 - (abs(s1len - s2len) / max(s1len, s2len))
-    # Use minimum edit distance between two sentences
-    def get_overall_similarity(self, sentence1, sentence2):
-        return self.sacrebleu_metric.compute(predictions=sentence1, references=sentence2)["score"]
-        # s1split = sentence1.split()
-        # s2split = sentence2.split()
-        # jaccard_score = self.get_jaccard_similarity(set(s1split), set(s2split))
-        # length_score = self.get_length_score(s1split, s2split)
-        # return (jaccard_score * 0.7 + length_score * 0.3) * 100.0
-    def getLogMetric(self, pred : str, ref : str):
-        ref = ref.strip(' \t\n\r')
-        pred = pred.strip(' \t\n\r')
-        # Split log on timestamps
-        pred_split_log = self.timestamp_pattern.split(pred)
-        ref_split_log = self.timestamp_pattern.split(ref)
-        # One logentry always consists of timestamp + log-message
-        pred_logentries = []
-        ref_logentries = []
-        # reorganize log into logentry-tuples, consisting of timestamp + log-message
-        for i in range(1, len(pred_split_log), 2):
-            pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
-        for i in range(1, len(ref_split_log), 2):
-            ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
-        # The number of logentries of the reference/prediction which has more/less entries/timestamps
-        max_logentries = max(len(pred_logentries), len(ref_logentries))
-        min_logentries = min(len(pred_logentries), len(ref_logentries))
-        # Case there are no timestamps in reference and none in prediction
-        # we can compute bleu directly from original prediction (ref will be empty, but we offload this to the bleu metric)
-        if (len(pred_logentries) == 0 and len(ref_logentries) == 0):
-            # any sensible log reference is empty if there is no timestamp, hence it suffices to check exact match
-            logmsg_score = 100.0 if pred == ref else 0.0
-            return 0.3 * 100.0 + 0.7 * logmsg_score
-        # Case one has 0 timestamps, other has >0 timestamps
-        if (len(pred_logentries) == 0 or len(ref_logentries) == 0):
-            # It is nonsensical to compare something in this case
-            return 0.0
         # replace all digits in the first predicted timestamp with '\d' to get
         # a regex that describes its format
-        pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_logentries[0][0]))
-        matchesPatternScore = 100.0
-        monotonicallyIncreasingScore = 100.0
         # A variable to save the previous timestamp (as datetime obj) to check monotonicity
         prev_datetime = None
         # Convert matches to datetime objects
-        # TODO TODO TODO fix this:
-        for i in range(min_logentries):
-            ts = pred_logentries[i][0]
             try:
                 # Check if the format matches with the format of the first timestamp
                 # TODO!! Check this later, maybe it is too restrictive for training an LLM
@@ -192,18 +199,69 @@ class LogMetric(evaluate.Metric):
                 # e.g. date format not parsable by dateutil.parser
                 matchesPatternScore = 0.0
                 monotonicallyIncreasingScore = 0.0
-        # apply jaccard-similarity to every pred-ref pair and then take mean score * 100
-        local_score = self.get_overall_similarity(
-                                        list(map(lambda t: t[1], pred_logentries))[:min_logentries],
-                                        list(map(lambda t: t[1], ref_logentries))[:min_logentries]
-                                        )
-        # we aggregate the bleu scores where we weight the difference in logentries with a score of 0
-        logmessage_aggregated_score = ((min_logentries / max_logentries) * local_score)
         # return weighted overall score of all the different scores
-        return 0.2 * monotonicallyIncreasingScore + 0.1 * matchesPatternScore + 0.7 * logmessage_aggregated_score
     def _compute(self, predictions, references):
         """Returns the scores"""
@@ -211,13 +269,23 @@ class LogMetric(evaluate.Metric):
         # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
         t_before_logmetric = time.perf_counter()
-        timestamp_score = np.mean([self.getLogMetric(p,r) for p,r in zip(predictions,references)])
-        t_after_logmetric = time.perf_counter()
         logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
-        return {
-            "score": timestamp_score,
-            "duration": logmetric_duration
-        }

         return len(intersection) / len(union)
     # A score depending on the difference in length of two sentences
+    def get_length_score(self, preds_split, refs_split):
+        pred_content_lengths = np.vectorize(len)(preds_split)
+        ref_content_lengths = np.vectorize(len)(refs_split)
+        return self.smapeScore(pred_content_lengths, ref_content_lengths)
+    # helper function that computes the smape_score either between two numbers or two lists of numbers (must be the same length)
+    def smapeScore(self, P, R):
+        P_isnumber = isinstance(P, (int, float))
+        R_isnumber = isinstance(R, (int, float))
+        # either both must be numbers or both must be no number
+        assert P_isnumber == R_isnumber
+        if not P_isnumber:
+            assert(len(P) == len(R))
+        if P_isnumber and R_isnumber:
+            if P == 0 and R == 0: return 1.0      # since this leads to (|R| + |P|) = 0
+            n = 1
+        else:
+            if P == [] and R == []: return 1.0     # since this leads to n = 0
+            n = len(P)
+        return 1/n * np.sum(np.abs(R - P) / (np.abs(R) + np.abs(P)))
+    # splits both strings into lines and then computes the smape_score of their line counts
+    def getLineCountScore(self, pred, ref):
+        pred_lines_amt = len(pred.splitlines())
+        ref_lines_amt = len(ref.splitlines())
+        # print("#pred_lines:", pred_lines_amt)
+        # print("#ref_lines:", ref_lines_amt)
+        return self.smapeScore(pred_lines_amt, ref_lines_amt)
+    # Get different scores regarding the content of a log-message
+    def getLineContentScore(self, pred_logMessages, ref_logMessages):
+        sacrebleu_score = self.sacrebleu_metric.compute(predictions=pred_logMessages, references=ref_logMessages)["score"]/ 100.0
+        smape_length_score = self.get_length_score(pred_logMessages, ref_logMessages)
+        # Split the single log-messages (for jaccard)
+        preds_split = [p.split() for p in pred_logMessages]
+        refs_split = [r.split() for r in ref_logMessages]
+        # compute jaccard pair-wise, then take mean of all results
+        mean_jaccard_score = np.mean([self.get_jaccard_similarity(set(p_split), set(r_split)) for p_split, r_split in zip(preds_split,refs_split)])
+        return sacrebleu_score, mean_jaccard_score, smape_length_score
+    # get different scores regarding the timestamp
+    def getTimestampsScore(self, pred_timestamps, ref_timestamps):
+        timestamp_amt_score = self.smapeScore(len(pred_timestamps), len(ref_timestamps))
+        # if there are no predicted timestamps, return early. It is still consistent and monotonic.
+        if (len(pred_timestamps) == 0):
+            return timestamp_amt_score, 1.0, 1.0
         # replace all digits in the first predicted timestamp with '\d' to get
         # a regex that describes its format
+        pred_timestring_pattern = re.sub(r'\d', r'\\d', re.escape(pred_timestamps[0]))
+        matchesPatternScore = 1.0
+        monotonicallyIncreasingScore = 1.0
         # A variable to save the previous timestamp (as datetime obj) to check monotonicity
         prev_datetime = None
         # Convert matches to datetime objects
+        for i in range(len(pred_timestamps)):
+            ts = pred_timestamps[i]
             try:
                 # Check if the format matches with the format of the first timestamp
                 # TODO!! Check this later, maybe it is too restrictive for training an LLM
                 # e.g. date format not parsable by dateutil.parser
                 matchesPatternScore = 0.0
                 monotonicallyIncreasingScore = 0.0
+        # matchesPatternScore and monotonicallyIncreasingScore are in {0,1}
+        return timestamp_amt_score, matchesPatternScore, monotonicallyIncreasingScore
+    def getLogMetric(self, pred : str, ref : str):
+        ref = ref.strip(' \t\n\r')
+        pred = pred.strip(' \t\n\r')
+        linecount_difference_SMAPE = self.getLineCountScore(pred, ref)
+        # Split log on timestamps
+        pred_split_log = self.timestamp_pattern.split(pred)
+        ref_split_log = self.timestamp_pattern.split(ref)
+        # One logentry always consists of timestamp + log-message
+        # pred_logentries = []
+        # ref_logentries = []
+        pred_timestamps = []
+        pred_logMessages = []
+        ref_timestamps = []
+        ref_logMessages = []
+        # reorganize the split log into two parallel lists: timestamps and log-messages
+        for i in range(1, len(pred_split_log), 2):
+            # pred_logentries.append((pred_split_log[i],pred_split_log[i+1]))
+            pred_timestamps.append(pred_split_log[i])
+            pred_logMessages.append(pred_split_log[i+1])
+        for i in range(1, len(ref_split_log), 2):
+            # ref_logentries.append((ref_split_log[i],ref_split_log[i+1]))
+            ref_timestamps.append(ref_split_log[i])
+            ref_logMessages.append(ref_split_log[i+1])
+        # We truncate the longer list, since we only want to compute the content-scores for lines present in both prediction and reference
+        # -> hence, the content score can be good, even if the number of predicted lines is wrong
+        min_logentries = min(len(pred_logMessages), len(ref_logMessages))
+        # print("min_logentries:", min_logentries)
+        pred_logMessages = pred_logMessages[:min_logentries]
+        ref_logMessages = ref_logMessages[:min_logentries]
+        linecontent_sacrebleu, linecontent_jaccard, linecontentlength_difference_SMAPE = self.getLineContentScore(pred_logMessages, ref_logMessages)
+        timestamps_difference_SMAPE, timestamps_formatConsistency_absolute, timestamps_monotinicity_absolute = self.getTimestampsScore(pred_timestamps, ref_timestamps)
+        #TODO:
+        # linecontentordering_permutations = getLineContentOrderingScore(pred_logMessages, ref_logMessages)
         # return weighted overall score of all the different scores
+        return {"linecount_difference_SMAPE_score": linecount_difference_SMAPE,
+                "linecontentlength_difference_SMAPE_score": linecontentlength_difference_SMAPE,
+                "linecontent_sacrebleu_score": linecontent_sacrebleu,
+                "linecontent_jaccard_score": linecontent_jaccard,
+                "timestamps_SMAPE_difference_score": timestamps_difference_SMAPE,
+                "timestamps_formatConsistency_score": timestamps_formatConsistency_absolute,
+                "timestamps_monotinicity_score": timestamps_monotinicity_absolute
+                }
     def _compute(self, predictions, references):
         """Returns the scores"""
         # TODO: get separate log entries (split before timestamps), replace timestamps with token and compare the log entry with BLEU
         t_before_logmetric = time.perf_counter()
+        metric_dicts = [self.getLogMetric(p,r) for p,r in zip(predictions,references)]
+        # Extract keys (assuming all dictionaries have the same keys)
+        keys = metric_dicts[0].keys()
+        # Convert list of dictionaries into a 2D numpy array
+        values = np.array([list(d.values()) for d in metric_dicts])
+        # Calculate the mean along the vertical axis (axis=0)
+        mean_values = np.mean(values, axis=0)
+        # a dictionary, matching the keys with their corresponding mean values
+        metric_result = dict(zip(keys, mean_values))
+        t_after_logmetric = time.perf_counter()
         logmetric_duration = f"{t_after_logmetric - t_before_logmetric:0.10f}"
+        metric_result["duration"] = logmetric_duration
+        return metric_result