# ## Some functions to clean text

# ### Some other suggested cleaning approaches

# #### From here: https://shravan-kuchkula.github.io/topic-modeling/#interactive-plot-showing-results-of-k-means-clustering-lda-topic-modeling-and-sentiment-analysis
#
# - remove_hyphens
# - tokenize_text
# - remove_special_characters
# - convert to lower case
# - remove stopwords
# - lemmatize the token
# - remove short tokens
# - keep only words in wordnet
# - I ADDED ON - creating custom stopwords list

# +
# Create a custom stop words list
import calendar
import re
import string

import gradio as gr
import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tqdm import tqdm

stemmer = PorterStemmer()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')      # used by word_tokenize below
nltk.download('punkt_tab')  # newer NLTK versions look for this instead of 'punkt'
nltk.download('names')      # used for the common-names stopword list below
#nltk.download('words')
#nltk.corpus.words.words('en')

#from sklearn.feature_extraction import text

# Adding common names to the stopwords
all_names = [x.lower() for x in nltk.corpus.names.words()]

# Adding custom words to the stopwords
custom_words = []
my_stop_words = custom_words  # note: this aliases custom_words, so it also picks up the months added below

# Add calendar months onto the stop words
cal_month = [x.lower() for x in calendar.month_name]
# Remove blanks (calendar.month_name starts with an empty string)
cal_month = [i for i in cal_month if i]
#print(cal_month)
custom_words.extend(cal_month)

#my_stop_words = frozenset(text.ENGLISH_STOP_WORDS.union(custom_words).union(all_names))
#custom_stopwords = my_stop_words
# -

# #### Some of my cleaning functions

'''
# +
# Remove all html elements from the text. Inspired by this: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

def remove_email_start(text):
    cleanr = re.compile('.*importance:|.*subject:')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def remove_email_end(text):
    cleanr = re.compile('kind regards.*|many thanks.*|sincerely.*')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanhtml(text):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

## The above doesn't work when there is no > at the end of the string to match the initial <.
## Trying this: <[^>]+> but needs work: https://stackoverflow.com/questions/2013124/regex-matching-up-to-the-first-occurrence-of-a-character

# Remove all email addresses and numbers from the text
def cleanemail(text):
    cleanr = re.compile(r'\S*@\S*\s?|\xa0')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleannum(text):
    cleanr = re.compile(r'[0-9]+')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanpostcode(text):
    cleanr = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanwarning(text):
    cleanr = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
    cleantext = re.sub(cleanr, '', text)
    return cleantext
# -

def initial_clean(texts):
    clean_texts = []
    for text in texts:
        text = remove_email_start(text)
        text = remove_email_end(text)
        text = cleanpostcode(text)
        text = remove_hyphens(text)
        text = cleanhtml(text)
        text = cleanemail(text)
        #text = cleannum(text)
        clean_texts.append(text)
    return clean_texts
'''
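# #### remove_hyphens
#
# `remove_hyphens` is listed in the steps above and called by `initial_clean`
# below, but it is never defined in this notebook. The sketch below is an
# assumption based on the linked tutorial's step list: it splits hyphenated
# compounds (e.g. "e-mail" -> "e mail") so the tokenizer sees the parts as
# separate words.

def remove_hyphens(text):
    # Assumed behaviour: replace a hyphen joining two word characters with a space
    return re.sub(r'(?<=\w)-(?=\w)', ' ', text)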
# Pre-compiling the regular expressions for efficiency
email_start_pattern = re.compile('.*importance:|.*subject:')
email_end_pattern = re.compile('kind regards.*|many thanks.*|sincerely.*')
html_pattern = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
email_pattern = re.compile(r'\S*@\S*\s?')
num_pattern = re.compile(r'[0-9]+')
postcode_pattern = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
warning_pattern = re.compile(
    'caution: this email originated from outside of the organization. '
    'do not click links or open attachments unless you recognize the sender '
    'and know the content is safe.'
)
# Assumption: the original pattern was the literal '&nbsp;' entity, which
# rendered as a plain space in the exported copy of this notebook
nbsp_pattern = re.compile(r'&nbsp;')


def stem_sentence(sentence):
    words = sentence.split()
    stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
    return stemmed_words


def stem_sentences(sentences, progress=gr.Progress()):
    """Stem each sentence in a list of sentences."""
    stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
    return stemmed_sentences


def get_lemma_text(text):
    # Tokenize the input string into words
    tokens = word_tokenize(text)

    lemmas = []
    for word in tokens:
        # Only look up words longer than three characters; keep short words as-is
        if len(word) > 3:
            lemma = wn.morphy(word)
        else:
            lemma = None

        if lemma is None:
            lemmas.append(word)
        else:
            lemmas.append(lemma)
    return lemmas


def get_lemma_tokens(tokens):
    # The input is already tokenized, so lemmatize each token directly
    lemmas = []
    for word in tokens:
        if len(word) > 3:
            lemma = wn.morphy(word)
        else:
            lemma = None

        if lemma is None:
            lemmas.append(word)
        else:
            lemmas.append(lemma)
    return lemmas


def initial_clean(texts, progress=gr.Progress()):
    clean_texts = []
    i = 1
    #progress(0, desc="Cleaning texts")
    for text in progress.tqdm(texts, desc="Cleaning data", unit="rows"):
        #print("Cleaning row: ", i)
        text = re.sub(email_start_pattern, '', text)
        text = re.sub(email_end_pattern, '', text)
        text = re.sub(postcode_pattern, '', text)
        text = remove_hyphens(text)
        text = re.sub(html_pattern, '', text)
        text = re.sub(email_pattern, '', text)
        text = re.sub(nbsp_pattern, '', text)
        #text = re.sub(warning_pattern, '', text)
        #text = stem_sentence(text)
        text = get_lemma_text(text)
        text = ' '.join(text)
        # Uncomment the next line if you want to remove numbers as well
        # text = re.sub(num_pattern, '', text)
        clean_texts.append(text)
        i += 1
    return clean_texts


# Sample execution
#sample_texts = [
#    "Hello, this is a test email. kind regards, John",
#    "
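# #### Quick sanity check
#
# A minimal, hedged check of the stemming and lemmatising helpers. These run
# without a Gradio event context; the sample sentence is made up for
# illustration. `initial_clean` takes a plain list of strings in the same way,
# but its `gr.Progress()` default assumes it is normally driven from a Gradio
# app.

sample = "The meetings were cancelled and the buildings closed"
print(stem_sentence(sample))   # Porter stems, e.g. "meetings" -> "meet"
print(get_lemma_text(sample))  # WordNet lemmas, e.g. "buildings" -> "building"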