Spaces:
Sleeping
Sleeping
import io | |
import jaconv | |
import spacy | |
import re | |
# Search as a string
def find_string(text, wordlist):
    """Return the words from *wordlist* that occur in *text* as substrings.

    Words are matched literally. Regular-expression metacharacters inside a
    word (``+``, ``.``, ``(`` ...) have no special meaning, so such words can
    neither match unrelated text nor raise ``re.error``.

    Args:
        text: The string to search in.
        wordlist: Iterable of strings to look for.

    Returns:
        A list of the matching words, in the order they appear in
        *wordlist*.
    """
    # A plain containment test is a literal search; the words are data read
    # from a list, not regex patterns.
    return [word for word in wordlist if word in text]
# Search as a word
_NLP_PIPELINES = {}


def _load_pipeline(model_name):
    """Return the spaCy pipeline for *model_name*, loading it on first use only."""
    if model_name not in _NLP_PIPELINES:
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def find_word(text, wordlist):
    """Return the tokens of *text* whose lemma is listed in *wordlist*.

    The text is tokenized with the ``ja_ginza`` spaCy pipeline. The pipeline
    is loaded once per process and reused, because loading it on every call
    is far more expensive than the search itself.

    Args:
        text: The string to analyze.
        wordlist: Iterable of lemma strings to look for.

    Returns:
        A list with the surface form (``str(token)``) of every matching
        token, in document order. A word that occurs several times is
        returned several times.
    """
    # Build the lookup set once so each token costs a constant-time check.
    targets = set(wordlist)
    doc = _load_pipeline("ja_ginza")(text)
    return [str(token) for token in doc if token.lemma_ in targets]
# Entries consisting of one word are searched as words; entries consisting of two or more words are searched as strings
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
    """Find the NG words contained in *input_text*.

    Args:
        input_text: The string to check.
        ng_wordlist_1: Words passed to ``find_word``.
        ng_wordlist_2: Words passed to ``find_string``.

    Returns:
        A list of the distinct hits: results of ``find_word`` first, then
        results of ``find_string``, each in first-seen order.
    """
    hits = find_word(input_text, ng_wordlist_1) + find_string(input_text, ng_wordlist_2)
    # dict.fromkeys drops duplicates but keeps first-seen order, so the
    # result is the same on every run (a set does not guarantee an order).
    return list(dict.fromkeys(hits))
def get_ng_wordlist(wordlist_path, discrepancies=False):
    """Load an NG-word file and split it into one-token and multi-token words.

    The file is read as UTF-8 with one word per line; empty lines are
    ignored.

    Args:
        wordlist_path: Path of the word list file.
        discrepancies: When true, the ``jaconv.kata2hira`` and
            ``jaconv.hira2kata`` forms of every word, plus the lower-cased
            and upper-cased ``hira2kata`` form, are added to cover spelling
            variations, and the combined list is de-duplicated.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)``. The first list holds the
        words that the ``ja_ginza`` pipeline splits into exactly one token,
        the second those it splits into two or more. Words that produce no
        token at all are dropped.
    """
    with io.open(wordlist_path, "r", encoding="utf-8") as f:
        ng_wordlist = [word for word in f.read().split("\n") if word]

    # Prepare hiragana and katakana versions to cope with spelling variations.
    if discrepancies:
        variants = []
        for word in ng_wordlist:
            katakana = jaconv.hira2kata(word)
            variants.extend(
                (
                    jaconv.kata2hira(word),
                    katakana,
                    katakana.lower(),
                    katakana.upper(),
                )
            )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # returned lists are identical from run to run (a set has no
        # guaranteed order).
        ng_wordlist = list(dict.fromkeys(ng_wordlist + variants))

    # Classify NG words into single-token and multi-token entries.
    nlp = spacy.load("ja_ginza")
    ng_wordlist_1 = []
    ng_wordlist_2 = []
    for word in ng_wordlist:
        token_count = len(nlp(word))
        if token_count == 1:
            ng_wordlist_1.append(word)
        elif token_count >= 2:
            ng_wordlist_2.append(word)
    return ng_wordlist_1, ng_wordlist_2
def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
    """Read two previously saved NG-word lists from disk.

    Each file is read as UTF-8 and split on ``"\\n"``; empty entries are
    discarded.

    Args:
        wordlist_1_path: Path of the first word list file.
        wordlist_2_path: Path of the second word list file.

    Returns:
        A tuple ``(ng_wordlist_1, ng_wordlist_2)`` with the words of the
        first and second file respectively, in file order.
    """
    loaded = []
    # The files are read one after the other, first path first.
    for path in (wordlist_1_path, wordlist_2_path):
        with io.open(path, "r", encoding="utf-8") as handle:
            entries = handle.read().split("\n")
        loaded.append([entry for entry in entries if entry])
    first_list, second_list = loaded
    return first_list, second_list