# NOTE(review): the lines that preceded this comment were extraction artifacts
# from a code-hosting page (UI labels "Spaces"/"Running", file size 6,612 bytes,
# commit e94100d, and a line-number gutter) — not part of the program. Removed
# so the module parses.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import difflib
from typing import List, Tuple
import editdistance
from project_settings import project_path
from toolbox.string.tokenization import FastTokenizer
class ChunkSearcher(object):
    """Locate the span of `content` that best matches a given `chunk`.

    Both texts are tokenized with a multilingual tokenizer; a fixed-size
    window slides over the content tokens and the window with the highest
    token-overlap count wins. The winning span is then trimmed on both ends
    via a character-level diff against the chunk.
    """

    def __init__(self,
                 vocab_file: str = (project_path / "data/vocab.txt").as_posix()
                 ):
        # A multilingual tokenizer is required (e.g. a multilingual BERT
        # vocabulary), so chunks and contents in any language tokenize alike.
        self.tokenizer = self.init_tokenizer(vocab_file)

    @staticmethod
    def init_tokenizer(vocab_file: str):
        """Build a FastTokenizer from a newline-delimited vocabulary file.

        :param vocab_file: path to a UTF-8 text file, one token per line.
        :return: a populated FastTokenizer.
        """
        tokenizer = FastTokenizer()
        with open(vocab_file, "r", encoding="utf-8") as f:
            for row in f:
                token = row.strip()
                tokenizer.insert(token)
        return tokenizer

    def chunk_search(self, chunk: str, content: str, win_size_radio: float = 1.5) -> str:
        """Return the substring of `content` most similar to `chunk`.

        :param chunk: the text to look for.
        :param content: the text to search in.
        :param win_size_radio: window size as a multiple of the chunk's token
            count (sic: "radio" kept for backward compatibility with callers).
        :return: the best-matching, boundary-trimmed substring of `content`.
        """
        chunk_tokens, _ = self.tokenizer.tokenize(chunk)
        content_tokens, _ = self.tokenizer.tokenize(content)

        if len(content_tokens) == 0:
            # Nothing to search in; avoid max() on an empty score list.
            return ""

        # Mark content positions whose token also occurs in the chunk.
        # A set lookup replaces the original O(n*m) nested scan.
        chunk_token_set = {t for t in chunk_tokens if len(t.strip()) != 0}
        counter = [1 if token in chunk_token_set else 0
                   for token in content_tokens]

        win_size = int(len(chunk_tokens) * win_size_radio)

        # Score every window start. The upper bound is N - w + 1 so the final
        # valid window (starting exactly at N - w) is scored too — the original
        # range(0, N - w) silently skipped it (off-by-one). max(..., 1) keeps
        # at least one window when the content is shorter than the window.
        win_score = [0] * len(content_tokens)
        n_starts = max(len(content_tokens) - win_size + 1, 1)
        for begin in range(n_starts):
            win_score[begin] = sum(counter[begin: begin + win_size])

        idx = win_score.index(max(win_score))
        match = content_tokens[idx: idx + win_size]
        match_content = "".join(match)
        # Trim unmatched characters from the right end, then (by reversing
        # both strings) from the left end.
        match_content = self.rstrip_match_content(chunk, match_content)
        match_content = self.rstrip_match_content(chunk[::-1], match_content[::-1])
        match_content = match_content[::-1]
        return match_content

    def rstrip_match_content(self, chunk: str, match_content: str) -> str:
        """Strip trailing characters of `match_content` not present in `chunk`.

        A character-level diff is taken; a trailing run of "-" operations
        marks characters that exist only in `match_content`, which are cut
        off and the remainder whitespace-stripped.
        """
        differ = difflib.Differ()
        diff = differ.compare(match_content, chunk)
        operation_list = [d[0] for d in diff]

        r_strip_count = 0
        for operation in reversed(operation_list):
            if operation != "-":
                break
            r_strip_count += 1
        if r_strip_count != 0:
            match_content = match_content[:-r_strip_count].strip()
        return match_content
class ChunkSimilarity(object):
    """Compute similarity metrics between a chunk and its matched content.

    Each metric method returns rows of (metric_name, score, note) so the
    results can be tabulated together by `similar`.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Edit-distance-based similarity scores.

        :return: rows (metric_name, score, note) for the raw distance and two
            length-normalized scores.
        """
        edit_distance = editdistance.distance(chunk, match_content)
        chunk_length = len(chunk)
        content_length = len(match_content)
        total_length = chunk_length + content_length
        if total_length == 0:
            # Two empty strings are identical; the original divided by zero here.
            normalized_edit_distance = 0.0
            normalized_edit_distance2 = 0.0
        else:
            normalized_edit_distance = edit_distance / total_length
            normalized_edit_distance2 = 2 * edit_distance / total_length
        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """difflib.SequenceMatcher ratio as a similarity score.

        Return annotation fixed: the middle element is a float, not a str.
        """
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()
        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str) -> List[Tuple[str, object, str]]:
        """Collect all metrics into one table, headed by a column-name row."""
        result = [
            ("metric", "score", "note")
        ]
        scores = self.edit_distance(chunk, match_content)
        result.extend(scores)
        scores = self.seq_match(chunk, match_content)
        result.extend(scores)
        return result
# Sample corpus used by main(): text extracted from a Chinese-language
# McKinsey quarterly article on the 2023 state of AI. Kept verbatim
# (including page numbers and PDF-extraction spacing) as realistic input.
PAGE_CONTENT = """
40
麦肯锡中国金融业 CEO季刊 2023年秋季刊
2023年人工智能发展现状:
生成式 AI的突破之年
Michael Chui ,Eric Hazan ,Lareina Yee ,Bryce Hall ,Alex Singla
和Alexander Sukharevsky如 今 ,生 成 式 AI工具遍地开花, 各组织均在快速部署; 麦肯锡调查的
受访者们预计, 该技术将对自己所在行业及就业产生重大影响。
41
2023年 人 工 智 能 发 展 现 状 :生 成 式 AI的突破之年
麦肯锡针对人工智能发展现状的最新年度全球调研结果证实, 生
成式人工智能 (简称 GenAI )工 具 已 出 现 爆 炸 式 增 长 。许 多 此 类 工
具 至 今 推 出 尚 不 满 一 年 ,但 已 有 1/3的 受 访 者 表 示 ,其 所 在 组 织 会 在
至少一项业务职能中经常使 用 GenAI 。 随着这些最新进展, 人工智
能 已 经 从 一 个 技 术 话 题 上 升 为 企 业 领 导 的 关 注 焦 点 :近 1/4受访高
管 表 示 ,他 们 会 在 工 作 中 使 用 GenAI 工具; 而在已应用人工智能的
企 业 中,有 超 过 1/4的受访者表示 GenAI 已 被 列 入 董 事 会 议 程 。此 外 ,
40% 的受访者表示, 其所在组织将会因 GenAI 的最新进 展而增加对
人工智能的整体投入。 调查结果表明, GenAI 相关风险管理仍处于
早期阶段: 即便是针对受访者眼中最常见的不准确问题, 也只有不
"""
# The query text whose best-matching span in PAGE_CONTENT is searched for
# by main() ("2023 State of AI: generative AI's breakout year").
CHUNK = """2023年人工智能发展现状:生成式AI的突破之年"""
# NOTE(review): CHUNK1 is not referenced anywhere in this file — presumably a
# second (English) test fixture; confirm before removing.
CHUNK1 = """
Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.
LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).
LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.
Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.
"""
def main():
    """Demo: find the span of PAGE_CONTENT matching CHUNK, print the match
    and its similarity scores."""
    # Original re-imported project_path here, shadowing the module-level
    # import without using it — removed.
    searcher = ChunkSearcher()
    match_content = searcher.chunk_search(
        CHUNK,
        PAGE_CONTENT,
        win_size_radio=1.6,
    )
    print(match_content)

    chunk_similarity = ChunkSimilarity()
    scores = chunk_similarity.similar(CHUNK, match_content)
    print(scores)
    return


if __name__ == "__main__":
    main()
|