import nltk |
import re |
import string |
import polars as pl |
from nltk.stem import WordNetLemmatizer |
from nltk.stem import PorterStemmer |
from nltk.corpus import wordnet as wn |
from nltk import word_tokenize |
import calendar |
from tqdm import tqdm |
import gradio as gr |
# Shared Porter stemmer instance used by stem_sentence().
stemmer = PorterStemmer()

# Fetch the NLTK corpora this module reads. 'names' is required by the
# nltk.corpus.names lookup directly below; without it a fresh environment
# raises LookupError at import time.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('names')

# Lower-cased personal names from the NLTK names corpus.
all_names = [name.lower() for name in nltk.corpus.names.words()]

# custom_words and my_stop_words refer to the same list object, so the month
# names appended below are visible through both names.
custom_words = []
my_stop_words = custom_words

# calendar.month_name begins with an empty placeholder entry; skip it.
cal_month = [month.lower() for month in calendar.month_name if month]
custom_words.extend(cal_month)
''' |
# + |
# Remove all html elements from the text. Inspired by this: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string |
def remove_email_start(text): |
cleanr = re.compile('.*importance:|.*subject:') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
def remove_email_end(text): |
cleanr = re.compile('kind regards.*|many thanks.*|sincerely.*') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
def cleanhtml(text): |
cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
## The pattern above does not match a tag that is missing its closing > character to pair with the initial <. A candidate replacement is <[^>]+> but it still needs work: https://stackoverflow.com/questions/2013124/regex-matching-up-to-the-first-occurrence-of-a-character
# Remove all email addresses and numbers from the text |
def cleanemail(text): |
cleanr = re.compile('\S*@\S*\s?|\xa0') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
def cleannum(text): |
cleanr = re.compile(r'[0-9]+') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
def cleanpostcode(text): |
cleanr = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
def cleanwarning(text): |
cleanr = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.') |
cleantext = re.sub(cleanr, '', text) |
return cleantext |
# - |
def initial_clean(texts): |
clean_texts = [] |
for text in texts: |
text = remove_email_start(text) |
text = remove_email_end(text) |
text = cleanpostcode(text) |
text = remove_hyphens(text) |
text = cleanhtml(text) |
text = cleanemail(text) |
#text = cleannum(text) |
clean_texts.append(text) |
return clean_texts |
''' |
# Regex sources are kept as plain strings because initial_clean() hands them
# to polars' str.replace_all; the compiled objects further down are for use
# with the standard `re` module.
email_start_pattern_regex = r'.*importance:|.*subject:'
email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
# The trailing alternatives name the non-breaking space explicitly (escape and
# entity form) instead of embedding a literal whitespace character, which is
# indistinguishable from an ordinary space in source and would strip every
# space from the text if it were one.
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
email_pattern_regex = r'\S*@\S*\s?'
num_pattern_regex = r'[0-9]+'
# Case-sensitive: must be applied before any lower-casing step.
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
nbsp_pattern_regex = r'\xa0|&nbsp;'

# Compiled counterparts, one per regex string above.
email_start_pattern = re.compile(email_start_pattern_regex)
email_end_pattern = re.compile(email_end_pattern_regex)
html_pattern = re.compile(html_pattern_regex)
# Previously compiled from email_end_pattern_regex, which made this pattern
# strip sign-offs instead of email addresses.
email_pattern = re.compile(email_pattern_regex)
num_pattern = re.compile(num_pattern_regex)
postcode_pattern = re.compile(postcode_pattern_regex)
warning_pattern = re.compile(warning_pattern_regex)
nbsp_pattern = re.compile(nbsp_pattern_regex)
def stem_sentence(sentence):
    """Split a sentence on whitespace and stem each word.

    Each word is passed through the module-level ``stemmer``, lower-cased,
    and has any trailing apostrophes stripped.

    Returns:
        A list of stems, one per whitespace-separated word.
    """
    stems = []
    for token in sentence.split():
        stem = stemmer.stem(token)
        stems.append(stem.lower().rstrip("'"))
    return stems
def stem_sentences(sentences, progress=gr.Progress()):
    """Stem every sentence in ``sentences`` with stem_sentence().

    Args:
        sentences: Iterable of sentence strings.
        progress: Gradio progress tracker; its ``tqdm`` wrapper is used to
            iterate over ``sentences``.

    Returns:
        A list with one list of stems per input sentence.
    """
    stemmed = []
    for sentence in progress.tqdm(sentences):
        stemmed.append(stem_sentence(sentence))
    return stemmed
def get_lemma_text(text):
    """Tokenize ``text`` and reduce each token to its WordNet base form.

    The text is split with NLTK's ``word_tokenize`` and the tokens are handed
    to get_lemma_tokens(), which holds the lemmatization rule (tokens longer
    than three characters are looked up with ``wn.morphy``; anything else, or
    any token without a match, is kept unchanged). Delegating keeps the two
    entry points from drifting apart.

    Args:
        text: The string to tokenize and lemmatize.

    Returns:
        A list of tokens / lemmas in their original order.
    """
    return get_lemma_tokens(word_tokenize(text))
def get_lemma_tokens(tokens):
    """Map each token to its WordNet base form where one is found.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list the same length as ``tokens``. A token longer than three
        characters is replaced by ``wn.morphy(token)`` when that lookup
        returns a value; every other token is kept as-is.
    """
    def _base_form(token):
        # Tokens of three characters or fewer are never looked up.
        if len(token) <= 3:
            return token
        found = wn.morphy(token)
        return token if found is None else found

    return [_base_form(token) for token in tokens]
def initial_clean(texts, progress=gr.Progress()):
    """Strip email boilerplate, HTML and email addresses from ``texts``.

    The texts are loaded into a polars Series and the module-level regex
    strings are removed one after another, in this order: email start,
    email end, HTML, email address.

    Args:
        texts: Sequence of strings accepted by ``pl.Series``.
        progress: Gradio progress tracker; accepted but not used in the body.

    Returns:
        The cleaned texts as a plain Python list.
    """
    removal_order = (
        email_start_pattern_regex,
        email_end_pattern_regex,
        html_pattern_regex,
        email_pattern_regex,
    )
    cleaned = pl.Series(texts)
    for pattern in removal_order:
        cleaned = cleaned.str.replace_all(pattern, '')
    return cleaned.to_list()
# Lower-cased NLTK names corpus (rebinds the same value assigned near the top
# of the module).
all_names = [name.lower() for name in nltk.corpus.names.words()]
def remove_hyphens(text_text):
    """Replace hyphens that join two word characters with a single space.

    Only a hyphen with a word character on both sides is touched, so
    compounds of any length are split ("state-of-the-art" becomes
    "state of the art") and no extra whitespace is introduced. The earlier
    three-group pattern always emitted a separator for its optional third
    group, leaving a stray space after two-part compounds
    ("well-known." became "well known .").

    Args:
        text_text: The string to process.

    Returns:
        The string with intra-word hyphens replaced by spaces. Free-standing,
        doubled, leading and trailing hyphens are left unchanged.
    """
    return re.sub(r'(?<=\w)-(?=\w)', ' ', text_text)
def tokenize_text(text_text):
    """Split ``text_text`` into tokens on runs of whitespace.

    Uses an NLTK ``RegexpTokenizer`` in gap mode, i.e. the ``\\s+`` pattern
    describes the separators rather than the tokens.

    Returns:
        The list of tokens produced by the tokenizer.
    """
    whitespace_splitter = nltk.RegexpTokenizer(pattern=r'\s+', gaps=True)
    return whitespace_splitter.tokenize(text_text)
def remove_characters_after_tokenization(tokens):
    """Strip ``string.punctuation`` characters from every token.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A lazy ``filter`` iterator (not a list) over the stripped tokens,
        skipping any token that became empty.
    """
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    strip_punctuation = re.compile(punctuation_class).sub
    # The stripped tokens are materialised immediately; only the removal of
    # empty strings is deferred to the returned iterator.
    stripped = [strip_punctuation('', token) for token in tokens]
    return filter(None, stripped)
def convert_to_lowercase(tokens):
    """Lower-case the purely alphabetic tokens and discard the rest.

    Besides lower-casing, this also filters: any token for which
    ``str.isalpha()`` is false (digits, punctuation, empty strings) is
    dropped.

    Returns:
        A list of lower-cased alphabetic tokens in their original order.
    """
    lowered = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered.append(token.lower())
    return lowered
def remove_stopwords(tokens, custom_stopwords):
    """Drop English stop words and user-supplied stop words from ``tokens``.

    The stop-word set is the union of NLTK's English stop words, the
    module-level ``my_stop_words`` list, and ``custom_stopwords``. The
    ``custom_stopwords`` argument used to be ignored; it is now honoured in
    addition to the module-level list, so existing callers keep at least the
    filtering they had before.

    Args:
        tokens: Iterable of token strings.
        custom_stopwords: Extra words to remove. May be ``None`` or empty; a
            single string is treated as one word rather than as characters.

    Returns:
        A list of the tokens that are not stop words, in original order.
    """
    if isinstance(custom_stopwords, str):
        custom_stopwords = [custom_stopwords]

    # A set gives constant-time membership tests for every token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    stopword_set.update(my_stop_words)
    stopword_set.update(custom_stopwords or ())

    return [token for token in tokens if token not in stopword_set]
def remove_names(tokens):
    """Drop tokens that appear in the lower-cased NLTK names corpus.

    Uses the module-level ``all_names`` list (already lower-cased at import)
    instead of re-reading and re-lowering the corpus on every call, and tests
    membership against a set rather than scanning a list per token.

    Args:
        tokens: Iterable of token strings; compared as given, so they should
            already be lower-case to match.

    Returns:
        A list of the tokens that are not names, in original order.
    """
    name_set = set(all_names)
    return [token for token in tokens if token not in name_set]
def remove_short_tokens(tokens):
    """Keep only the tokens that are longer than three characters.

    Returns:
        A list of the surviving tokens in their original order.
    """
    return list(filter(lambda token: len(token) > 3, tokens))
def keep_only_words_in_wordnet(tokens):
    """Keep the tokens for which WordNet returns at least one synset.

    Returns:
        A list of the tokens whose ``wn.synsets`` lookup is non-empty, in
        original order.
    """
    known = []
    for token in tokens:
        if wn.synsets(token):
            known.append(token)
    return known
def apply_lemmatize(tokens, wnl=WordNetLemmatizer()):
    """Lemmatize every token that is longer than three characters.

    Args:
        tokens: Iterable of token strings.
        wnl: Object exposing ``lemmatize(word)``. The default is a single
            WordNetLemmatizer created once, when the function is defined.

    Returns:
        A list the same length as ``tokens``; tokens of three characters or
        fewer are passed through unchanged.
    """
    return [
        wnl.lemmatize(token) if len(token) > 3 else token
        for token in tokens
    ]
def cleanTexttexts(texts):
    """Clean and tokenize each text into a list of lemmatized WordNet words.

    For every text: intra-word hyphens are split, then HTML, email addresses,
    postcodes and digits are removed using the module-level regex strings,
    and the result is tokenized, stripped of punctuation, lower-cased,
    lemmatized and restricted to words known to WordNet.

    The previous body called cleanhtml/cleanemail/cleanpostcode/cleannum and
    get_lemma, none of which are defined in this module (the first four only
    exist inside a string literal), so it failed with NameError on first use.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list with one list of cleaned tokens per input text.
    """
    # Removed in this order before tokenizing. The postcode pattern relies on
    # upper-case letters, so it has to run before convert_to_lowercase().
    strip_patterns = (
        html_pattern_regex,
        email_pattern_regex,
        postcode_pattern_regex,
        num_pattern_regex,
    )

    clean_texts = []
    for text in texts:
        text = remove_hyphens(text)
        for pattern in strip_patterns:
            text = re.sub(pattern, '', text)

        tokens = tokenize_text(text)
        tokens = remove_characters_after_tokenization(tokens)
        tokens = convert_to_lowercase(tokens)
        tokens = get_lemma_tokens(tokens)
        tokens = keep_only_words_in_wordnet(tokens)
        tokens = apply_lemmatize(tokens)
        clean_texts.append(tokens)
    return clean_texts
def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
    """Remove entries whose ``data_samples_ready`` value occurs more than once.

    The three lists are treated as parallel: every index at which a repeated
    ``data_samples_ready`` value occurs is deleted from all three.

    NOTE(review): every occurrence of a repeated value is removed, including
    the first one, so no copy survives. Confirm this is intended.
    NOTE(review): falsy entries are then filtered from each list
    independently, so the returned lists can end up with different lengths.

    Args:
        data_samples_ready: List of hashable items used to detect repeats.
            Modified in place by the deletions.
        data_samples_clean: Parallel list; modified in place.
        data_samples: Parallel list; modified in place.

    Returns:
        A 4-tuple ``(data_samples_ready, data_samples_clean, flat_list_dups,
        data_samples)`` where the three lists are new, falsy-filtered lists
        and ``flat_list_dups`` holds the deleted indices grouped by value in
        first-seen order.
    """
    # Group the positions of each distinct value, preserving first-seen order.
    positions_by_sample = {}
    for position, sample in enumerate(data_samples_ready):
        positions_by_sample.setdefault(sample, []).append(position)

    flat_list_dups = [
        position
        for positions in positions_by_sample.values()
        if len(positions) > 1
        for position in positions
    ]

    # Delete from the highest index down so that earlier deletions do not
    # shift the positions still waiting to be removed.
    for position in sorted(flat_list_dups, reverse=True):
        del data_samples_ready[position]
        del data_samples_clean[position]
        del data_samples[position]

    data_samples_ready = list(filter(None, data_samples_ready))
    data_samples_clean = list(filter(None, data_samples_clean))
    data_samples = list(filter(None, data_samples))

    return data_samples_ready, data_samples_clean, flat_list_dups, data_samples