File size: 2,361 Bytes
e7a412f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import io
import jaconv
import spacy
import re

# Search as a plain string (substring match).
def find_string(text, wordlist):
    """Return the entries of ``wordlist`` that occur in ``text`` as substrings.

    Each word is matched literally, not as a regular expression, so entries
    containing characters such as ``+``, ``.``, ``(`` or ``*`` are handled
    safely: they neither raise a pattern error nor match unintended text.

    Args:
        text: The string to search in.
        wordlist: Iterable of strings to look for.

    Returns:
        A list of the matching words, in the order they appear in
        ``wordlist``.
    """
    return [word for word in wordlist if word in text]

def _load_ginza():
    """Return the ``ja_ginza`` pipeline, loading it only on first use."""
    pipeline = getattr(_load_ginza, "_pipeline", None)
    if pipeline is None:
        pipeline = spacy.load("ja_ginza")
        # Cache on the function object so later calls skip the costly load.
        _load_ginza._pipeline = pipeline
    return pipeline


# Search as a word (token-level match).
def find_word(text, wordlist):
    """Return the tokens of ``text`` whose lemma is listed in ``wordlist``.

    ``text`` is tokenized with the ``ja_ginza`` spaCy pipeline. A token is
    reported when its ``lemma_`` equals one of the entries in ``wordlist``;
    the reported value is ``str(token)``, not the lemma.

    Args:
        text: The string to analyse.
        wordlist: Iterable of hashable lemma strings to look for.

    Returns:
        A list of ``str(token)`` values, in document order. A token that
        matches several times appears several times.
    """
    # Build the lookup set once so each token test is a constant-time check.
    targets = set(wordlist)
    doc = _load_ginza()(text)
    return [str(token) for token in doc if token.lemma_ in targets]

# Single-word entries are searched as words; entries made of two or more
# words are searched as strings.
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
    """Collect the distinct NG-word hits found in ``input_text``.

    Args:
        input_text: The string to check.
        ng_wordlist_1: Entries passed to ``find_word``.
        ng_wordlist_2: Entries passed to ``find_string``.

    Returns:
        A list of the unique hits from both searches. The list is built
        from a set, so its order is not guaranteed.
    """
    token_hits = find_word(input_text, ng_wordlist_1)
    substring_hits = find_string(input_text, ng_wordlist_2)
    unique_hits = set(token_hits + substring_hits)
    return list(unique_hits)

def get_ng_wordlist(wordlist_path, discrepancies=False):
    """Load an NG-word file and split its entries by token count.

    The file is read as UTF-8 and split on ``"\\n"``; empty lines are
    dropped. Each remaining entry is tokenized with the ``ja_ginza`` spaCy
    pipeline and sorted into one of two lists.

    Args:
        wordlist_path: Path of the text file holding one entry per line.
        discrepancies: When true, extra spellings of every entry are added
            (``jaconv.kata2hira``, ``jaconv.hira2kata`` and the lower- and
            upper-cased ``hira2kata`` result) and duplicates are removed
            through a set, so the entry order is no longer the file order.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)``: entries that tokenize
        to exactly one token, and entries that tokenize to two or more.
        Entries that produce no tokens appear in neither list.
    """
    with io.open(wordlist_path, "r", encoding="utf-8") as stream:
        raw_lines = stream.read().split("\n")
    entries = [line for line in raw_lines if line]

    # Provide hiragana and katakana versions to cope with spelling variation.
    if discrepancies:
        variants = []
        for base in entries:
            hiragana = jaconv.kata2hira(base)
            katakana = jaconv.hira2kata(base)
            variants.extend(
                (hiragana, katakana, katakana.lower(), katakana.upper())
            )
        entries = list(set(entries + variants))

    # Classify NG words into single-word entries and multi-word entries.
    nlp = spacy.load("ja_ginza")
    single_token = []
    multi_token = []

    for entry in entries:
        token_count = len(nlp(entry))
        if token_count == 1:
            single_token.append(entry)
        elif token_count > 1:
            multi_token.append(entry)

    return single_token, multi_token

def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
    """Read two previously saved NG-word lists from disk.

    Both files are read as UTF-8 and split on ``"\\n"``; empty lines are
    discarded. No tokenization or other processing is applied.

    Args:
        wordlist_1_path: Path of the file holding the first list.
        wordlist_2_path: Path of the file holding the second list.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)`` of lists of strings, each
        in file order.
    """
    wordlists = []
    for path in (wordlist_1_path, wordlist_2_path):
        with io.open(path, "r", encoding="utf-8") as stream:
            lines = stream.read().split("\n")
        wordlists.append([line for line in lines if line])

    first_list, second_list = wordlists
    return first_list, second_list