TintinMeimei committed
Commit d938037
1 Parent(s): 2ac7b58
Upload 3 files
- tmp/algo.py +143 -0
- tmp/demo_ai_search.py +36 -0
- tmp/lvchan.xlsx +0 -0
tmp/algo.py
ADDED
@@ -0,0 +1,143 @@
import re

import jieba
import pandas as pd
from sentence_transformers import SentenceTransformer, util


class AlgoRule:

    def __init__(self) -> None:
        # Load the green-catalogue keyword sheet; the first data row holds
        # the real column names.
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        sep = r'[,、]'
        # Three inverted indexes mapping keyword -> catalogue items
        # ('三级标题' = third-level heading).
        self.dict_rule_index = {
            'kuan': {},              # '宽口径(复核)' = broad-scope keywords (reviewed)
            'wuxiang': {},           # '物象关键词(复核)' = object keywords (reviewed)
            'wuxiang_xianding': {},  # object keyword + '限定词(复核)' qualifier pairs
        }
        for _, row in df_lvchan.iterrows():
            item = row['三级标题']
            for word in re.split(sep, row['宽口径(复核)']):
                self.dict_rule_index['kuan'].setdefault(word, []).append(item)
            for word in re.split(sep, row['物象关键词(复核)']):
                self.dict_rule_index['wuxiang'].setdefault(word, []).append(item)
                # Pair every object keyword with every qualifier of the row.
                for word2 in re.split(sep, row['限定词(复核)']):
                    self.dict_rule_index['wuxiang_xianding'].setdefault(
                        '_'.join([word, word2]), []).append(item)
        # De-duplicate the item lists.
        for k in self.dict_rule_index.keys():
            for key in self.dict_rule_index[k].keys():
                self.dict_rule_index[k][key] = list(set(self.dict_rule_index[k][key]))

    def _tokenize(self, text):
        return list(jieba.cut(text))

    def _is_match(self, word, query):
        # Exact token match, not substring containment.
        return word in self._tokenize(query)

    def _match(self, query):
        result = {}
        # Route 1: both the object keyword and its qualifier appear in the query.
        flag = False
        for key, items in self.dict_rule_index['wuxiang_xianding'].items():
            wuxiang, xianding = key.split('_')
            if self._is_match(wuxiang, query) and self._is_match(xianding, query):
                for item in items:
                    r = result.setdefault(item, {})
                    # '限定词+物项关键词' = qualifier + object keyword
                    r.setdefault('限定词+物项关键词', []).append('+'.join([xianding, wuxiang]))
                flag = True
        if flag:
            # Join matched keywords into display strings and stop here.
            for key1 in result.keys():
                for key2 in result[key1].keys():
                    result[key1][key2] = ' ; '.join(result[key1][key2])
            return result
        # Route 2: object keyword alone.
        for key, items in self.dict_rule_index['wuxiang'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('物项关键词', []).append(key)
        # Route 3: broad-scope keyword.
        for key, items in self.dict_rule_index['kuan'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('宽口径', []).append(key)
        # Join matched keywords into display strings.
        for key1 in result.keys():
            for key2 in result[key1].keys():
                result[key1][key2] = ' ; '.join(result[key1][key2])
        return result

    def algo(self, query):
        result = self._match(query)
        return [item.strip() for item in result.keys()]


class AlgoAI:

    def __init__(self) -> None:
        # self.model = SentenceTransformer('DMetaSoul/sbert-chinese-general-v2')
        self.model = SentenceTransformer('TintinMeimei/menglang_yongtulv_aimatch_v1')
        # Originally read '../lvchan.xlsx'; the sheet sits next to this script,
        # matching AlgoRule above.
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Embed each catalogue item together with its explanation ('解释说明').
        dict_lvchan = dict(
            (row['三级标题'].strip(), '\n'.join([row['三级标题'].strip(), row['解释说明']]))
            for _, row in df_lvchan.iterrows()
        )
        self.dict_lvchan_vectors = dict(
            (key, self.model.encode(dict_lvchan[key], convert_to_tensor=True))
            for key in dict_lvchan.keys()
        )
        self.thres = 0.25

    def _sim(self, query, item):
        emb1 = self.model.encode(query, convert_to_tensor=True)
        return util.cos_sim(emb1, item).item()  # plain float

    def _match(self, query):
        # Return every catalogue item whose similarity clears the threshold.
        result = []
        for key in self.dict_lvchan_vectors.keys():
            score = self._sim(query, self.dict_lvchan_vectors[key])
            if score > self.thres:
                result.append(key)
        return result

    def algo(self, query):
        return self._match(query)


if __name__ == '__main__':
    algo = AlgoRule()
    query = '无害生活垃圾'  # "harmless household waste"
    print(algo.algo(query))

    algo2 = AlgoAI()
    print(algo2.algo(query))
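Note that _is_match requires an exact jieba token, not a substring, so whether route 1 fires depends entirely on how a query segments. A quick standalone way to inspect that (a sketch; the printed segmentation is illustrative and may vary with the jieba version and dictionary):

import jieba

# Inspect how the sample query ("harmless household waste") is segmented;
# _is_match only fires when a keyword equals one of these tokens exactly.
print(list(jieba.cut('无害生活垃圾')))  # e.g. ['无害', '生活', '垃圾']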
tmp/demo_ai_search.py
ADDED
@@ -0,0 +1,36 @@
import gradio as gr

import algo

algo_rule = algo.AlgoRule()
algo_ai = algo.AlgoAI()


def process(query):
    # Rule-based matches, sorted and rendered as a bullet list.
    r1 = sorted(algo_rule.algo(query))
    text_r1 = ''
    for item in r1:
        text_r1 += '\n' + '- ' + item

    # Embedding-based matches.
    r2 = algo_ai.algo(query)
    text_r2 = ''
    for item in r2:
        text_r2 += '\n' + '- ' + item

    # Section headers: '绿产目录匹配结果' = "green catalogue match results";
    # '关键词规则' = "keyword rules"; 'AI匹配' = "AI matching".
    output = f'''
绿产目录匹配结果 - 关键词规则:
{text_r1}


绿产目录匹配结果 - AI匹配:
{text_r2}
'''
    return output


# We instantiate the Textbox class ('输入' = "input";
# '绿产目录匹配' = "green catalogue match").
textbox_input = gr.Textbox(label="输入", placeholder="", lines=2)
textbox_output = gr.Textbox(label="绿产目录匹配", placeholder="", lines=15)

demo = gr.Interface(fn=process, inputs=textbox_input, outputs=textbox_output)
demo.launch(share=False, server_name='0.0.0.0', server_port=8001)
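Once launched, the demo serves on port 8001. Besides the browser UI, the endpoint can be exercised programmatically; a minimal sketch using the separate gradio_client package (assumes the demo is reachable locally and exposes the default /predict endpoint that gr.Interface creates; the query string is just an example):

from gradio_client import Client

# Call the running demo over HTTP.
client = Client('http://127.0.0.1:8001/')
result = client.predict('无害生活垃圾', api_name='/predict')  # "harmless household waste"
print(result)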
tmp/lvchan.xlsx
ADDED
Binary file (79.3 kB)
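Both AlgoRule and AlgoAI read this spreadsheet and treat its first data row as the real header. A minimal stand-in for testing without the original file (column names come from algo.py: third-level heading, broad scope, object keywords, qualifiers, explanation; the sample row contents are hypothetical):

import pandas as pd

# Column names used by algo.py; the row below is made up for illustration.
header = ['三级标题', '宽口径(复核)', '物象关键词(复核)', '限定词(复核)', '解释说明']
rows = [
    ['垃圾处理', '垃圾,固废', '生活垃圾、厨余垃圾', '无害、分类', '城乡生活垃圾的无害化处理'],
]
# algo.py re-reads the header from the first data row, so it is written twice:
# once as the sheet header, once as row 0.
df = pd.DataFrame([header] + rows, columns=header)
df.to_excel('lvchan.xlsx', sheet_name='Sheet1', index=False)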