Spaces:

intelli-zen
/

document_loaders

Running

File size: 2,617 Bytes

e94100d

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
from typing import List, Tuple

import editdistance


class ChunkSimilarity(object):
    """Compute string-similarity metrics between a text chunk and a candidate text.

    Every method returns a list of ``(metric, score, note)`` rows so the
    results of several metrics can be concatenated into one table.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Return edit-distance based scores for ``chunk`` vs ``match_content``.

        Rows returned, in order:

        * ``edit_distance`` - raw distance ``d`` from ``editdistance.distance``.
        * ``ed_score``  - ``1 - d / (l1 + l2)``, rounded to 4 decimals.
        * ``ed_score2`` - ``1 - 2*d / (l1 + l2)``, rounded to 4 decimals.
          This one can drop below zero when the lengths differ a lot
          (e.g. one side empty gives ``-1.0``).

        ``l1`` and ``l2`` are the lengths of the two strings. When both
        strings are empty the normalized distance is taken as ``0`` (the
        strings are identical), so both scores are ``1.0``.
        """
        distance = editdistance.distance(chunk, match_content)
        total_length = len(chunk) + len(match_content)

        if total_length == 0:
            # Two empty strings: nothing to normalize by, and nothing differs.
            normalized = 0.0
            normalized2 = 0.0
        else:
            normalized = distance / total_length
            normalized2 = 2 * distance / total_length

        return [
            ("edit_distance", distance, ""),
            ("ed_score", round(1 - normalized, 4), "1 - d / (l1 + l2)"),
            ("ed_score2", round(1 - normalized2, 4), "1 - 2*d / (l1 + l2)"),
        ]

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Return the ``difflib.SequenceMatcher`` ratio of the two strings.

        The single row is ``("seq_match", ratio, note)`` with the ratio
        rounded to 4 decimals.
        """
        # autojunk=False: the default heuristic ignores characters that are
        # frequent in a second sequence of 200+ items, which under-scores long
        # text chunks (identical long strings could even score 0.0).
        matcher = difflib.SequenceMatcher(None, chunk, match_content, autojunk=False)
        score = matcher.ratio()

        return [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]

    def similar(self, chunk: str, match_content: str) -> List[Tuple[str, object, str]]:
        """Return a table of all metrics for ``chunk`` vs ``match_content``.

        The first row is the header ``("metric", "score", "note")``; it is
        followed by the rows of ``edit_distance`` and then ``seq_match``.
        """
        result = [
            ("metric", "score", "note")
        ]
        result.extend(self.edit_distance(chunk, match_content))
        result.extend(self.seq_match(chunk, match_content))
        return result


# Sample text used by main() as the `chunk` argument of
# ChunkSimilarity.similar. The literal is a Chinese headline (roughly:
# "The state of AI development in 2023: a breakthrough year for generative AI").
# The leading and trailing newlines are part of the string and count
# toward its length.
CHUNK_TRUE = """
2023年人工智能发展现状：生成式AI的突破之年
"""


# Sample text used by main() as the `match_content` argument of
# ChunkSimilarity.similar: an English paragraph about LoRA.
# Whitespace inside the literal (surrounding newlines, trailing spaces at
# line ends) is part of the string and affects the computed scores.
CHUNK_EDIT = """
Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present. 
LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023). 
LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition. 
Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.
"""


def main():
    """Print each ``metric: score`` row for the two sample chunks.

    Every row returned by ``ChunkSimilarity.similar`` is printed, including
    its leading header row; the note column is ignored.
    """
    rows = ChunkSimilarity().similar(CHUNK_TRUE, CHUNK_EDIT)
    for metric, score, _ in rows:
        print(f"{metric}: {score}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()