# reference: https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
import re
import json
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')  # required by word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordsegment import load, segment
from DefaultPackages import openFile, saveFile

class cleanGenText():
    def __init__(self):
        load()  # load the wordsegment corpus once, at construction time

    def removePunct(self, text, KeepPeriod=False):
        # strip punctuation and special characters, optionally keeping periods
        punctuation = r'[^\w\s]'
        if KeepPeriod:
            punctuation = r'[^\w\s\.]'
        return re.sub(punctuation, '', text)

    def removeURL(self, text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def removeHTMLTag(self, text):
        html_tags_pattern = r'<.*?>'
        return re.sub(html_tags_pattern, '', text)

    def removeTabWhiteSpaceNewLine(self, text):
        # remove \n or \t and unnecessary white space
        cleanText = text.replace("\n\n", "")
        cleanText = cleanText.replace("\n", "")  # was text.replace(...), which discarded the step above
        cleanText = cleanText.replace("\t", "")
        return cleanText.strip()

    def removeExtraSpaceBetweenWords(self, text):
        return re.sub(r'\s+', ' ', text).strip()

    def removeStopWords(self, text):
        #extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","also","fig","shows","respectively"]
        stopWords = set(stopwords.words('english'))  # + extraUnwantedWords
        filteredWord = []
        for word in word_tokenize(text):
            if word.lower() not in stopWords:  # optionally also require word.isalpha()
                filteredWord.append(word)
        return filteredWord

    def removeLowercaseBetweenUppercase(self, segment):
        # Group consecutive capitalized words and drop lowercase words between them.
        # e.g. "Myanmar (formerly Burma)" -> ["Myanmar", "Burma"], not "myanmar formerly burma"
        #      "Viet Nam"                 -> ["Viet Nam"], not ["Viet", "Nam"]
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w in range(len(segments)):
            cleanWord = self.removeTabWhiteSpaceNewLine(self.removePunct(segments[w]))
            if not cleanWord:  # guard: punctuation-only tokens become empty after cleaning
                continue
            cleanPreWord = ""
            if w > 0:
                cleanPreWord = self.removeTabWhiteSpaceNewLine(self.removePunct(segments[w - 1]))
            if cleanWord[0].isupper():  # first letter of the word is capital
                if cleanPreWord and cleanPreWord[0].isupper() and outputUp:
                    # previous word was capitalized too: part of the same name
                    outputUp[-1] += " " + cleanWord
                else:
                    outputUp.append(cleanWord)
        return outputUp

    def textPreprocessing(self, text, keepPeriod=False):
        # removal of URLs first, while "://" is still intact
        cleanText = self.removeURL(text)
        # removal of HTML tags, before their angle brackets are stripped as punctuation
        cleanText = self.removeHTMLTag(cleanText)
        # remove punctuation & special characters
        cleanText = self.removePunct(cleanText, KeepPeriod=keepPeriod)
        # remove \n or \t and unnecessary white space
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        # stop-words removal
        filteredWord = self.removeStopWords(cleanText)
        # a sentence or the capital word behind a period "."
        return cleanText, filteredWord
        #generateNewChar = textPreprocessing("/content/drive/MyDrive/CollectData/NER/CountriesNameNCBI.json")
        #saveFile.saveFile("/content/drive/MyDrive/CollectData/NER/NewCharCountriesNameNCBI.json", json.dumps(generateNewChar))

    def splitStickWords(self, word):
        # Split stuck-together words (e.g. "VietNam" -> ["Viet", "Nam"]) with wordsegment.
        # The loop below completes the truncated original: a minimal sketch that
        # re-capitalizes a lowercase segment when the matching position in the original
        # word was uppercase, tracking the search position so repeated segments stay aligned.
        output = []
        split_words = segment(word)  # wordsegment returns lowercase segments
        searchFrom = 0
        for w in split_words:
            pos = word.lower().find(w, searchFrom)
            if pos >= 0 and word[pos].isupper():
                output.append(w[0].upper() + w[1:])
            else:
                output.append(w)
            if pos >= 0:
                searchFrom = pos + len(w)
        return output
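
# Example usage: a minimal sketch, assuming the nltk "stopwords" and "punkt" data
# have been downloaded and the wordsegment package is installed; the sample text
# and the expected outputs in the comments are illustrative, not from the module.
if __name__ == "__main__":
    cleaner = cleanGenText()
    raw = "Visit <b>Myanmar</b> (formerly Burma) at https://example.org!\nSee you there."
    cleanText, filteredWord = cleaner.textPreprocessing(raw)
    print(cleanText)     # URL, HTML tags, punctuation and stray whitespace removed
    print(filteredWord)  # remaining tokens with English stop words filtered out
    print(cleaner.splitStickWords("VietNam"))                                   # ['Viet', 'Nam']
    print(cleaner.removeLowercaseBetweenUppercase("Myanmar (formerly Burma)"))  # ['Myanmar', 'Burma']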