Spaces:
Runtime error
Runtime error
File size: 2,397 Bytes
d825710 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from __future__ import absolute_import, unicode_literals
import re
import os
import sys
import pickle
from .._compat import *
MIN_FLOAT = -3.14e100
PROB_START_P = "prob_start.p"
PROB_TRANS_P = "prob_trans.p"
PROB_EMIT_P = "prob_emit.p"
PrevStatus = {
'B': 'ES',
'M': 'MB',
'S': 'SE',
'E': 'BM'
}
def load_model():
start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
return start_p, trans_p, emit_p
if sys.platform.startswith("java"):
start_P, trans_P, emit_P = load_model()
else:
from .prob_start import P as start_P
from .prob_trans import P as trans_P
from .prob_emit import P as emit_P
def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}] # tabular
path = {}
for y in states: # init
V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
path[y] = [y]
for t in xrange(1, len(obs)):
V.append({})
newpath = {}
for y in states:
em_p = emit_p[y].get(obs[t], MIN_FLOAT)
(prob, state) = max(
[(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
V[t][y] = prob
newpath[y] = path[state] + [y]
path = newpath
(prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
return (prob, path[state])
def __cut(sentence):
global emit_P
prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
begin, nexti = 0, 0
# print pos_list, sentence
for i, char in enumerate(sentence):
pos = pos_list[i]
if pos == 'B':
begin = i
elif pos == 'E':
yield sentence[begin:i + 1]
nexti = i + 1
elif pos == 'S':
yield char
nexti = i + 1
if nexti < len(sentence):
yield sentence[nexti:]
re_han = re.compile("([\u4E00-\u9FD5]+)")
re_skip = re.compile("(\d+\.\d+|[a-zA-Z0-9]+)")
def cut(sentence):
sentence = strdecode(sentence)
blocks = re_han.split(sentence)
for blk in blocks:
if re_han.match(blk):
for word in __cut(blk):
yield word
else:
tmp = re_skip.split(blk)
for x in tmp:
if x:
yield x
|