# NOTE(review): removed non-Python web-page extraction residue (hosting-page
# header, byte count, and a line-number gutter) that made this file unparseable.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
from typing import List, Tuple
import editdistance
class ChunkSimilarity(object):
    """Compute string-similarity metrics between a text chunk and a candidate match.

    Each metric method returns report rows of the form
    ``(metric_name, value, note)`` so the caller can print them as a table.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score similarity via Levenshtein edit distance.

        Args:
            chunk: Reference text.
            match_content: Candidate text to compare against.

        Returns:
            Three rows: the raw distance, and two length-normalized similarity
            scores (higher means more similar; 1.0 for identical strings).
        """
        # `editdistance` is a third-party Levenshtein-distance extension.
        distance = editdistance.distance(chunk, match_content)
        total_length = len(chunk) + len(match_content)
        if total_length == 0:
            # Both strings empty: identical by definition. Guards against the
            # ZeroDivisionError the normalized scores would otherwise raise.
            return [
                ("edit_distance", 0, ""),
                ("ed_score", 1.0, "1 - d / (l1 + l2)"),
                ("ed_score2", 1.0, "1 - 2*d / (l1 + l2)"),
            ]
        # Two normalizations: d/(l1+l2) is gentler; 2d/(l1+l2) reaches 0.0 for
        # completely different equal-length strings.
        return [
            ("edit_distance", distance, ""),
            (
                "ed_score", round(1 - distance / total_length, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - 2 * distance / total_length, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score similarity via difflib.SequenceMatcher.

        Note: the original annotation claimed ``Tuple[str, str, str]`` but the
        score element is a float; the annotation is corrected here.
        """
        matcher = difflib.SequenceMatcher()
        matcher.set_seqs(chunk, match_content)
        # ratio() = 2.0 * matches / total_length, in [0, 1].
        return [
            ("seq_match", round(matcher.ratio(), 4), "(2.0*M / T) similar to edit_distance"),
        ]

    def similar(self, chunk: str, match_content: str) -> List[Tuple]:
        """Return all metric rows for (chunk, match_content), preceded by a header row."""
        result: List[Tuple] = [
            ("metric", "score", "note")
        ]
        result.extend(self.edit_distance(chunk, match_content))
        result.extend(self.seq_match(chunk, match_content))
        return result
# Demo fixture: "ground truth" chunk (a Chinese headline) fed to main().
CHUNK_TRUE = """
2023年人工智能发展现状:生成式AI的突破之年
"""
# Demo fixture: candidate chunk (an English LoRA excerpt) — presumably chosen
# to be unrelated to CHUNK_TRUE so the demo prints low similarity scores.
CHUNK_EDIT = """
Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.
LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).
LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.
Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.
"""
def main():
    """Run the demo: print every similarity metric for the fixture chunk pair."""
    rows = ChunkSimilarity().similar(CHUNK_TRUE, CHUNK_EDIT)
    # First row is the ("metric", "score", "note") header; it prints like any
    # other row, exactly as in the original implementation.
    for metric, value, _note in rows:
        print(f"{metric}: {value}")
# Script entry point: run the similarity demo when executed directly.
if __name__ == "__main__":
    main()