#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
from typing import List, Tuple

import editdistance  # third-party Levenshtein package: pip install editdistance

from project_settings import project_path
from toolbox.string.tokenization import FastTokenizer


class ChunkSearcher(object):
    def __init__(self,
                 vocab_file: str = (project_path / "data/vocab.txt").as_posix()
                 ):
        # A multilingual tokenizer is needed (built from a multilingual BERT vocabulary).
        self.tokenizer = self.init_tokenizer(vocab_file)

    @staticmethod
    def init_tokenizer(vocab_file):
        tokenizer = FastTokenizer()
        with open(vocab_file, "r", encoding="utf-8") as f:
            for row in f:
                token = str(row).strip()
                tokenizer.insert(token)
        return tokenizer

    def chunk_search(self, chunk: str, content: str, win_size_ratio: float = 1.5):
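        """
        Locate the span of ``content`` that best matches ``chunk``.

        Both strings are tokenized; each content token that also occurs in
        the chunk is marked, and a window of int(len(chunk_tokens) *
        win_size_ratio) tokens is slid over the content. The window with the
        most marked tokens wins, and its text is trimmed at both ends
        against the chunk before being returned.
        """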
        chunk_tokens, _ = self.tokenizer.tokenize(chunk)
        content_tokens, _ = self.tokenizer.tokenize(content)

        counter = [0] * len(content_tokens)
        win_score = [0] * len(content_tokens)

        for token1 in chunk_tokens:
            if len(token1.strip()) == 0:
                continue
            for idx, token2 in enumerate(content_tokens):
                if token1 == token2:
                    counter[idx] = 1

        win_size = int(len(chunk_tokens) * win_size_ratio)
        # Score every window position; the +1 keeps the final window in range.
        for begin in range(0, max(1, len(content_tokens) - win_size + 1)):
            win = counter[begin: begin + win_size]
            win_score[begin] = sum(win)

        idx = win_score.index(max(win_score))

        match = content_tokens[idx: idx+win_size]
        match_content = "".join(match)

        # Trim trailing characters that the chunk lacks, then trim leading
        # characters the same way by running the trim on reversed strings.
        match_content = self.rstrip_match_content(chunk, match_content)
        match_content = self.rstrip_match_content(chunk[::-1], match_content[::-1])
        match_content = match_content[::-1]

        return match_content

    def rstrip_match_content(self, chunk: str, match_content: str):
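        """
        Drop the trailing run of characters that a character-level diff
        marks as present in ``match_content`` but absent from ``chunk``.
        """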
        differ = difflib.Differ()
        diff = differ.compare(match_content, chunk)

        # The first character of each diff line is the opcode:
        # "-" = only in match_content, "+" = only in chunk, " " = common.
        operation_list = [d[0] for d in diff]

        r_strip_count = 0
        for operation in reversed(operation_list):
            if operation != "-":
                break
            r_strip_count += 1

        if r_strip_count != 0:
            match_content = match_content[:-r_strip_count].strip()
        return match_content


class ChunkSimilarity(object):

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
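        """
        Return the raw Levenshtein distance between ``chunk`` and
        ``match_content`` together with two length-normalized scores.
        """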
        edit_distance = editdistance.distance(chunk, match_content)

        chunk_length = len(chunk)
        content_length = len(match_content)

        normalized_edit_distance = edit_distance / (chunk_length + content_length)
        normalized_edit_distance2 = 2 * edit_distance / (chunk_length + content_length)

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
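        """
        Score with difflib.SequenceMatcher.ratio(), i.e. 2.0 * M / T, where
        M is the number of matched characters and T the combined length of
        both strings.
        """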
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str):
        result = [
            ("metric", "score", "note")
        ]
        scores = self.edit_distance(chunk, match_content)
        result.extend(scores)
        scores = self.seq_match(chunk, match_content)
        result.extend(scores)

        return result
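
# Worked example (illustrative values, not from the original module): for
# chunk = "kitten" and match_content = "sitting", the Levenshtein distance
# is d = 3 with lengths l1 = 6 and l2 = 7, so
#   ed_score  = 1 - 3 / (6 + 7)      ≈ 0.7692
#   ed_score2 = 1 - 2 * 3 / (6 + 7)  ≈ 0.5385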


# Test fixture: a PDF-extracted excerpt (with OCR-style spacing artifacts)
# from the McKinsey China Financial Industry CEO Quarterly, Autumn 2023.
PAGE_CONTENT = """
40
麦肯锡中国金融业 CEO季刊        2023年秋季刊
2023年人工智能发展现状:  
生成式 AI的突破之年
Michael Chui ,Eric Hazan ,Lareina Yee ,Bryce Hall ,Alex Singla  
和Alexander Sukharevsky如 今 ,生 成 式 AI工具遍地开花, 各组织均在快速部署; 麦肯锡调查的
受访者们预计, 该技术将对自己所在行业及就业产生重大影响。

41
2023年 人 工 智 能 发 展 现 状 :生 成 式 AI的突破之年
麦肯锡针对人工智能发展现状的最新年度全球调研结果证实, 生
成式人工智能 (简称 GenAI )工 具 已 出 现 爆 炸 式 增 长 。许 多 此 类 工
具 至 今 推 出 尚 不 满 一 年 ,但 已 有 1/3的 受 访 者 表 示 ,其 所 在 组 织 会 在
至少一项业务职能中经常使 用 GenAI 。 随着这些最新进展, 人工智
能 已 经 从 一 个 技 术 话 题 上 升 为 企 业 领 导 的 关 注 焦 点 :近 1/4受访高
管 表 示 ,他 们 会 在 工 作 中 使 用 GenAI 工具; 而在已应用人工智能的
企 业 中,有 超 过 1/4的受访者表示 GenAI 已 被 列 入 董 事 会 议 程 。此 外 ,
40% 的受访者表示, 其所在组织将会因 GenAI 的最新进 展而增加对
人工智能的整体投入。 调查结果表明, GenAI 相关风险管理仍处于
早期阶段: 即便是针对受访者眼中最常见的不准确问题, 也只有不
"""


# The query chunk: the article title as it should read, without the
# spacing artifacts present in PAGE_CONTENT.
CHUNK = """2023年人工智能发展现状:生成式AI的突破之年"""


# A second, English-language sample chunk (not used by main()).
CHUNK1 = """
Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present. 
LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023). 
LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition. 
Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.
"""


def main():
    searcher = ChunkSearcher()
    match_content = searcher.chunk_search(
        CHUNK,
        PAGE_CONTENT,
        win_size_ratio=1.6,
    )
    print(match_content)

    chunk_similarity = ChunkSimilarity()
    scores = chunk_similarity.similar(CHUNK, match_content)
    print(scores)
    return


if __name__ == "__main__":
    main()