#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Compare two text chunks with edit-distance and difflib based scores."""
import difflib
from typing import List, Tuple, Union

try:
    import editdistance
except ImportError:
    # The C extension is optional: without it the module stays importable
    # and falls back to the pure-Python implementation below.
    editdistance = None


def _levenshtein(left: str, right: str) -> int:
    """Return the Levenshtein distance between *left* and *right*.

    Pure-Python fallback used only when the ``editdistance`` package cannot
    be imported. Runs in O(len(left) * len(right)) time and keeps two rows
    of the dynamic-programming table in memory.
    """
    # Iterate over the longer string so the stored rows are the short ones.
    if len(left) < len(right):
        left, right = right, left

    previous = list(range(len(right) + 1))
    for row, left_char in enumerate(left, start=1):
        current = [row]
        for col, right_char in enumerate(right, start=1):
            substitution = previous[col - 1] + (left_char != right_char)
            current.append(
                min(previous[col] + 1, current[col - 1] + 1, substitution)
            )
        previous = current
    return previous[-1]


class ChunkSimilarity(object):
    """Compute similarity metrics between a chunk and a candidate text.

    Every metric method returns a list of ``(metric, score, note)`` rows so
    the results of several methods can be concatenated into one table.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Return the edit distance and two normalized scores derived from it.

        Args:
            chunk: The reference text.
            match_content: The text compared against ``chunk``.

        Returns:
            Three rows: the raw distance ``d``, ``ed_score`` computed as
            ``1 - d / (l1 + l2)`` and ``ed_score2`` computed as
            ``1 - 2*d / (l1 + l2)``, both rounded to 4 decimals. When both
            inputs are empty the two scores are ``1.0``.
        """
        if editdistance is not None:
            distance = editdistance.distance(chunk, match_content)
        else:
            distance = _levenshtein(chunk, match_content)

        total_length = len(chunk) + len(match_content)
        if total_length == 0:
            # Two empty strings are identical; the formulas would divide by
            # zero, so report the same score identical non-empty strings get.
            ed_score = ed_score2 = 1.0
        else:
            ed_score = round(1 - distance / total_length, 4)
            ed_score2 = round(1 - 2 * distance / total_length, 4)

        return [
            ("edit_distance", distance, ""),
            ("ed_score", ed_score, "1 - d / (l1 + l2)"),
            ("ed_score2", ed_score2, "1 - 2*d / (l1 + l2)"),
        ]

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Return the ``difflib.SequenceMatcher`` ratio of the two texts.

        Args:
            chunk: The reference text.
            match_content: The text compared against ``chunk``.

        Returns:
            A single ``seq_match`` row with the ratio rounded to 4 decimals.
        """
        matcher = difflib.SequenceMatcher()
        matcher.set_seqs(chunk, match_content)
        return [
            (
                "seq_match",
                round(matcher.ratio(), 4),
                "(2.0*M / T) similar to edit_distance",
            ),
        ]

    def similar(self, chunk: str, match_content: str) -> List[Tuple[str, Union[str, float], str]]:
        """Return a table with every metric computed for the two texts.

        The first row is the header ``("metric", "score", "note")``; it is
        followed by the rows of :meth:`edit_distance` and :meth:`seq_match`.
        """
        result: List[Tuple[str, Union[str, float], str]] = [
            ("metric", "score", "note"),
        ]
        result.extend(self.edit_distance(chunk, match_content))
        result.extend(self.seq_match(chunk, match_content))
        return result


# Sample inputs used by main().
CHUNK_TRUE = " 2023年人工智能发展现状:生成式AI的突破之年 "

CHUNK_EDIT = (
    " Among these PEFT methods, the reparameterization-based method "
    "low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of "
    "the most efficient and effective methods at present. LoRA is "
    "especially popular after open-sourced LLMs become ubiquitous "
    "(Dettmers et al., 2023). LoRA assumes that the change of the "
    "model’s parameters for adaptation is intrinsically low-dimensional "
    "and performs adaptation by optimizing the matrix obtained from "
    "low-rank decomposition. Since it is in the form of weight matrix "
    "reparameterization, LoRA parameters can be merged with the original "
    "LLMs and cause no forward propagation latency. \n"
)


def main():
    """Print every metric row (header included) for the two sample chunks."""
    chunk_similarity = ChunkSimilarity()
    rows = chunk_similarity.similar(CHUNK_TRUE, CHUNK_EDIT)
    # Unpack in the loop header so no name is rebound inside the body.
    for metric, value, _ in rows:
        print(f"{metric}: {value}")


if __name__ == "__main__":
    main()