#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
from typing import List, Tuple

import editdistance  # third-party Levenshtein package: pip install editdistance

from project_settings import project_path
from toolbox.string.tokenization import FastTokenizer


class ChunkSearcher(object):
    def __init__(self,
                 vocab_file: str = (project_path / "data/vocab.txt").as_posix()
                 ):
        # A multilingual tokenizer is needed (built from a multilingual BERT vocabulary).
        self.tokenizer = self.init_tokenizer(vocab_file)

    @staticmethod
    def init_tokenizer(vocab_file):
        tokenizer = FastTokenizer()
        with open(vocab_file, "r", encoding="utf-8") as f:
            for row in f:
                token = str(row).strip()
                tokenizer.insert(token)
        return tokenizer

    def chunk_search(self, chunk: str, content: str, win_size_ratio: float = 1.5):
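        """
        Locate the span of ``content`` that best matches ``chunk``.

        Both strings are tokenized; each content token that also occurs in
        the chunk is marked, and a window of int(len(chunk_tokens) *
        win_size_ratio) tokens is slid over the content. The window with the
        most marked tokens wins, and its text is trimmed at both ends
        against the chunk before being returned.
        """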
        chunk_tokens, _ = self.tokenizer.tokenize(chunk)
        content_tokens, _ = self.tokenizer.tokenize(content)

        counter = [0] * len(content_tokens)
        win_score = [0] * len(content_tokens)

        for token1 in chunk_tokens:
            if len(token1.strip()) == 0:
                continue
            for idx, token2 in enumerate(content_tokens):
                if token1 == token2:
                    counter[idx] = 1

        win_size = int(len(chunk_tokens) * win_size_ratio)
        # Score every window position; the +1 keeps the final window in range.
        for begin in range(0, max(1, len(content_tokens) - win_size + 1)):
            win = counter[begin: begin + win_size]
            win_score[begin] = sum(win)

        idx = win_score.index(max(win_score))

        match = content_tokens[idx: idx+win_size]
        match_content = "".join(match)

        # Trim trailing characters that the chunk lacks, then trim leading
        # characters the same way by running the trim on reversed strings.
        match_content = self.rstrip_match_content(chunk, match_content)
        match_content = self.rstrip_match_content(chunk[::-1], match_content[::-1])
        match_content = match_content[::-1]

        return match_content

    def rstrip_match_content(self, chunk: str, match_content: str):
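        """
        Drop the trailing run of characters that a character-level diff
        marks as present in ``match_content`` but absent from ``chunk``.
        """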
        differ = difflib.Differ()
        diff = differ.compare(match_content, chunk)

        # The first character of each diff line is the opcode:
        # "-" = only in match_content, "+" = only in chunk, " " = common.
        operation_list = [d[0] for d in diff]

        r_strip_count = 0
        for operation in reversed(operation_list):
            if operation != "-":
                break
            r_strip_count += 1

        if r_strip_count != 0:
            match_content = match_content[:-r_strip_count].strip()
        return match_content


class ChunkSimilarity(object):

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
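        """
        Return the raw Levenshtein distance between ``chunk`` and
        ``match_content`` together with two length-normalized scores.
        """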
        edit_distance = editdistance.distance(chunk, match_content)

        chunk_length = len(chunk)
        content_length = len(match_content)

        normalized_edit_distance = edit_distance / (chunk_length + content_length)
        normalized_edit_distance2 = 2 * edit_distance / (chunk_length + content_length)

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
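        """
        Score with difflib.SequenceMatcher.ratio(), i.e. 2.0 * M / T, where
        M is the number of matched characters and T the combined length of
        both strings.
        """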
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str):
        result = [
            ("metric", "score", "note")
        ]
        scores = self.edit_distance(chunk, match_content)
        result.extend(scores)
        scores = self.seq_match(chunk, match_content)
        result.extend(scores)

        return result
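
# Worked example (illustrative values, not from the original module): for
# chunk = "kitten" and match_content = "sitting", the Levenshtein distance
# is d = 3 with lengths l1 = 6 and l2 = 7, so
#   ed_score  = 1 - 3 / (6 + 7)      ≈ 0.7692
#   ed_score2 = 1 - 2 * 3 / (6 + 7)  ≈ 0.5385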


# Test fixture: a PDF-extracted excerpt (with OCR-style spacing artifacts)
# from the McKinsey China Financial Industry CEO Quarterly, Autumn 2023.
PAGE_CONTENT = """
40
麦肯锡中国金融业 CEO季刊        2023年秋季刊
2023年人工智能发展现状:  
生成式 AI的突破之年
Michael Chui ,Eric Hazan ,Lareina Yee ,Bryce Hall ,Alex Singla  
和Alexander Sukharevsky如 今 ,生 成 式 AI工具遍地开花, 各组织均在快速部署; 麦肯锡调查的
受访者们预计, 该技术将对自己所在行业及就业产生重大影响。

41
2023年 人 工 智 能 发 展 现 状 :生 成 式 AI的突破之年
麦肯锡针对人工智能发展现状的最新年度全球调研结果证实, 生
成式人工智能 (简称 GenAI )工 具 已 出 现 爆 炸 式 增 长 。许 多 此 类 工
具 至 今 推 出 尚 不 满 一 年 ,但 已 有 1/3的 受 访 者 表 示 ,其 所 在 组 织 会 在
至少一项业务职能中经常使 用 GenAI 。 随着这些最新进展, 人工智
能 已 经 从 一 个 技 术 话 题 上 升 为 企 业 领 导 的 关 注 焦 点 :近 1/4受访高
管 表 示 ,他 们 会 在 工 作 中 使 用 GenAI 工具; 而在已应用人工智能的
企 业 中,有 超 过 1/4的受访者表示 GenAI 已 被 列 入 董 事 会 议 程 。此 外 ,
40% 的受访者表示, 其所在组织将会因 GenAI 的最新进 展而增加对
人工智能的整体投入。 调查结果表明, GenAI 相关风险管理仍处于
早期阶段: 即便是针对受访者眼中最常见的不准确问题, 也只有不
"""


# The query chunk: the article title as it should read, without the
# spacing artifacts present in PAGE_CONTENT.
CHUNK = """2023年人工智能发展现状:生成式AI的突破之年"""


# A second, English-language sample chunk (not used by main()).
CHUNK1 = """
Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present. 
LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023). 
LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition. 
Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.
"""


def main():
    searcher = ChunkSearcher()
    match_content = searcher.chunk_search(
        CHUNK,
        PAGE_CONTENT,
        win_size_ratio=1.6,
    )
    print(match_content)

    chunk_similarity = ChunkSimilarity()
    scores = chunk_similarity.similar(CHUNK, match_content)
    print(scores)
    return


if __name__ == "__main__":
    main()