import re
from functools import lru_cache

import jaconv
import spacy


@lru_cache(maxsize=1)
def _get_nlp():
    """Load and cache the GiNZA spaCy model.

    ``spacy.load`` is expensive (reads the full model from disk), so it must
    run at most once per process instead of once per call.
    """
    return spacy.load("ja_ginza")


def _read_wordlist(path):
    """Read a newline-separated word list from *path*, dropping empty lines."""
    with open(path, "r", encoding="utf-8") as f:
        return [word for word in f.read().split("\n") if word]


# Search as a literal string (substring match).
def find_string(text, wordlist):
    """Return the words from *wordlist* that occur in *text* as substrings.

    Bug fix: the original passed each word to ``re.search`` unescaped, so
    regex metacharacters in an NG word (``.``, ``*``, ``(`` ...) were treated
    as a pattern — causing false matches or ``re.error``. A plain substring
    test matches the intended literal semantics.
    """
    return [word for word in wordlist if word in text]


# Search as a word (lemma match on tokenized text).
def find_word(text, wordlist):
    """Return surface forms of tokens in *text* whose lemma is in *wordlist*."""
    targets = set(wordlist)  # O(1) membership instead of O(len(wordlist)) per token
    doc = _get_nlp()(text)
    return [str(token) for token in doc if token.lemma_ in targets]


# Single-word entries are matched as words; multi-word entries as strings.
def search_ng_word(input_text, ng_wordlist_1, ng_wordlist_2):
    """Return the deduplicated NG words found in *input_text*.

    *ng_wordlist_1* holds single-token words (matched by lemma);
    *ng_wordlist_2* holds multi-token phrases (matched as substrings).
    Note: result order is unspecified (set-based deduplication).
    """
    hits = find_word(input_text, ng_wordlist_1) + find_string(input_text, ng_wordlist_2)
    return list(set(hits))


def get_ng_wordlist(wordlist_path, discrepancies=False):
    """Load an NG-word list and split it into single- and multi-token lists.

    Parameters
    ----------
    wordlist_path : str
        Path to a UTF-8, newline-separated word list.
    discrepancies : bool
        When True, also generate hiragana/katakana (and lower/upper-cased
        katakana) variants of each word to absorb orthographic variation.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(single_token_words, multi_token_words)``.
    """
    ng_wordlist = _read_wordlist(wordlist_path)

    # Add hiragana and katakana variants to handle orthographic variation.
    if discrepancies:
        variants = set(ng_wordlist)
        for word in ng_wordlist:
            variants.add(jaconv.kata2hira(word))
            kata = jaconv.hira2kata(word)
            variants.add(kata)
            variants.add(kata.lower())
            variants.add(kata.upper())
        ng_wordlist = list(variants)

    # Classify NG words by token count (single token vs. two or more).
    nlp = _get_nlp()
    ng_wordlist_1 = []
    ng_wordlist_2 = []
    for word in ng_wordlist:
        n_tokens = len(nlp(word))
        if n_tokens == 1:
            ng_wordlist_1.append(word)
        elif n_tokens >= 2:
            ng_wordlist_2.append(word)
    return ng_wordlist_1, ng_wordlist_2


def get_ng_wordlist_from_saved(wordlist_1_path, wordlist_2_path):
    """Load pre-classified NG-word lists saved by :func:`get_ng_wordlist`.

    Returns ``(single_token_words, multi_token_words)`` read from the two
    UTF-8, newline-separated files.
    """
    return _read_wordlist(wordlist_1_path), _read_wordlist(wordlist_2_path)