Spaces:
Sleeping
Sleeping
File size: 2,361 Bytes
e7a412f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import functools
import io
import re

import jaconv
import spacy
# 文字列としての検索
def find_string(text, wordlist):
rtn = []
for word in wordlist:
if re.search(word, text):
rtn.append(word)
return rtn
# 単語としての検索
def find_word(text, wordlist):
nlp = spacy.load("ja_ginza")
doc = nlp(text)
rtn = []
for token in doc:
if token.lemma_ in wordlist:
rtn.append(str(token))
return rtn
# 「1単語からなるもの」は単語として検索、「2単語以上からなるもの」は文字列として検索
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
rtn = find_word(input_text, ng_wordlist_1) + find_string(input_text, ng_wordlist_2)
rtn = list(set(rtn))
return rtn
def get_ng_wordlist(wordlist_path, discrepancies=False):
with io.open(wordlist_path, "r", encoding="utf-8") as f:
ng_wordlist = f.read().split("\n")
ng_wordlist = [word for word in ng_wordlist if len(word) > 0]
# 表記揺れに対応するためひらがな版とカタカナ版を用意
if discrepancies:
l = len(ng_wordlist)
for i in range(l):
ng_wordlist.append(jaconv.kata2hira(ng_wordlist[i]))
ng_wordlist.append(jaconv.hira2kata(ng_wordlist[i]))
ng_wordlist.append(jaconv.hira2kata(ng_wordlist[i]).lower())
ng_wordlist.append(jaconv.hira2kata(ng_wordlist[i]).upper())
ng_wordlist = list(set(ng_wordlist))
# NGワードを「1単語からなるもの」と「2単語以上からなるもの」に分類
nlp = spacy.load("ja_ginza")
ng_wordlist_1 = []
ng_wordlist_2 = []
for word in ng_wordlist:
doc = nlp(word)
if len(doc) == 1:
ng_wordlist_1.append(word)
elif len(doc) >= 2:
ng_wordlist_2.append(word)
return ng_wordlist_1, ng_wordlist_2
def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
with io.open(wordlist_1_path, "r", encoding="utf-8") as f:
ng_wordlist_1 = f.read().split("\n")
ng_wordlist_1 = [word for word in ng_wordlist_1 if len(word) > 0]
with io.open(wordlist_2_path, "r", encoding="utf-8") as f:
ng_wordlist_2 = f.read().split("\n")
ng_wordlist_2 = [word for word in ng_wordlist_2 if len(word) > 0]
return ng_wordlist_1, ng_wordlist_2 |