NGword-detector-ver2 / utils /find_ng_word.py
koheibaba
upload files
e7a412f
raw
history blame
2.36 kB
import io
import jaconv
import spacy
import re
# Search as a plain string (literal substring match).
def find_string(text, wordlist):
    """Return the entries of ``wordlist`` that occur in ``text`` as substrings.

    Every entry is matched literally. NG words that contain regular-expression
    metacharacters (``+``, ``.``, ``(`` ...) are therefore found exactly as
    written instead of being interpreted as a pattern or raising ``re.error``.

    Args:
        text: The text to scan.
        wordlist: Iterable of NG words (strings) to look for.

    Returns:
        A list with the matching entries, in the order they appear in
        ``wordlist``.
    """
    return [word for word in wordlist if word in text]
# Shared GiNZA pipeline, created on first use so it is loaded once, not per call.
_GINZA_NLP = None


def _get_ginza_nlp():
    """Return the shared ``ja_ginza`` pipeline, loading it on first use."""
    global _GINZA_NLP
    if _GINZA_NLP is None:
        _GINZA_NLP = spacy.load("ja_ginza")
    return _GINZA_NLP


# Search as a word (token-level match).
def find_word(text, wordlist):
    """Return the tokens of ``text`` whose lemma is listed in ``wordlist``.

    ``text`` is parsed with the ``ja_ginza`` spaCy pipeline and each token's
    ``lemma_`` is compared against ``wordlist``.

    Args:
        text: The text to analyse.
        wordlist: Iterable of NG words (strings) to compare lemmas against.

    Returns:
        A list with the string form of every matching token, in document
        order. A token that matches several times is listed several times.
    """
    targets = set(wordlist)
    if not targets:
        # Nothing can match, so skip loading the model and parsing the text.
        return []
    doc = _get_ginza_nlp()(text)
    return [str(token) for token in doc if token.lemma_ in targets]
# Entries made of a single word are searched as words; entries made of two or
# more words are searched as strings.
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
    """Collect the distinct NG-word hits found in ``input_text``.

    Args:
        input_text: The text to check.
        ng_wordlist_1: Single-word entries, passed to ``find_word``.
        ng_wordlist_2: Multi-word entries, passed to ``find_string``.

    Returns:
        A list of the hits from both searches with duplicates removed. The
        list is built from a ``set``, so its order is unspecified.
    """
    word_hits = find_word(input_text, ng_wordlist_1)
    string_hits = find_string(input_text, ng_wordlist_2)
    unique_hits = set(word_hits + string_hits)
    return list(unique_hits)
def get_ng_wordlist(wordlist_path, discrepancies=False):
    """Load an NG-word file and split it into single- and multi-word entries.

    The file is read as UTF-8 with one entry per line; empty lines are
    skipped. Duplicate entries are removed while keeping first-seen order.

    Args:
        wordlist_path: Path of the word-list file.
        discrepancies: When true, also add the ``jaconv.kata2hira`` and
            ``jaconv.hira2kata`` forms of every entry, plus the lower- and
            upper-cased ``hira2kata`` form, to cope with orthographic
            variation.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)``: entries that the
        ``ja_ginza`` pipeline parses into exactly one token, and entries it
        parses into two or more tokens. Entries that produce no token are
        dropped.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay glued to the first word and stop it matching.
    with io.open(wordlist_path, "r", encoding="utf-8-sig") as f:
        ng_wordlist = [word for word in f.read().split("\n") if word]

    # Prepare hiragana and katakana versions to cope with orthographic variation.
    if discrepancies:
        variants = []
        for word in ng_wordlist:
            katakana = jaconv.hira2kata(word)
            variants.extend(
                (
                    jaconv.kata2hira(word),
                    katakana,
                    katakana.lower(),
                    katakana.upper(),
                )
            )
        ng_wordlist.extend(variants)

    # Deduplicate while keeping first-seen order so the result is the same on
    # every run (iterating a set of strings is not stable across processes).
    ng_wordlist = list(dict.fromkeys(ng_wordlist))

    # Classify NG words into single-word entries and entries of two or more words.
    nlp = spacy.load("ja_ginza")
    ng_wordlist_1 = []
    ng_wordlist_2 = []
    for word in ng_wordlist:
        token_count = len(nlp(word))
        if token_count == 1:
            ng_wordlist_1.append(word)
        elif token_count >= 2:
            ng_wordlist_2.append(word)
    return ng_wordlist_1, ng_wordlist_2
def _read_saved_wordlist(path):
    """Read one entry per line from ``path``, skipping empty lines."""
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay glued to the first word and stop it matching.
    with io.open(path, "r", encoding="utf-8-sig") as f:
        return [word for word in f.read().split("\n") if word]


def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
    """Load two previously saved NG-word lists.

    Each file is read as UTF-8 with one entry per line; empty lines are
    skipped and a leading byte-order mark, if present, is ignored.

    Args:
        wordlist_1_path: Path of the first word-list file.
        wordlist_2_path: Path of the second word-list file.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)`` holding the entries of the
        first and second file respectively, in file order.
    """
    ng_wordlist_1 = _read_saved_wordlist(wordlist_1_path)
    ng_wordlist_2 = _read_saved_wordlist(wordlist_2_path)
    return ng_wordlist_1, ng_wordlist_2