# reference:
# https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
import re
import json

import nltk
# Run once to fetch the NLTK data required below:
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from DefaultPackages import openFile, saveFile
from wordsegment import load, segment
class cleanGenText:
    def __init__(self):
        # load the wordsegment corpus once; required before calling segment()
        load()
    def removePunct(self, text, KeepPeriod=False):
        # strip punctuation and special characters; optionally keep periods
        punctuation = r'[^\w\s]'
        if KeepPeriod:
            punctuation = r'[^\w\s\.]'
        return re.sub(punctuation, '', text)
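    # Illustrative examples:
    #   removePunct("Hello, world!") -> "Hello world"
    #   removePunct("E. coli strains.", KeepPeriod=True) -> "E. coli strains."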
def removeURL(self,text):
url_pattern = re.compile(r'https?://\S+|www\.\S+')
return url_pattern.sub(r'', text)
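    # Illustrative example:
    #   removeURL("see https://example.com for details") -> "see  for details"
    # (the leftover double space can be collapsed with removeExtraSpaceBetweenWords below)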
def removeHTMLTag(self,text):
html_tags_pattern = r'<.*?>'
return re.sub(html_tags_pattern, '', text)
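    # Illustrative example:
    #   removeHTMLTag("<p>Hello <b>world</b></p>") -> "Hello world"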
    def removeTabWhiteSpaceNewLine(self, text):
        # remove \n and \t and trim surrounding whitespace
        cleanText = text.replace("\n", "")
        cleanText = cleanText.replace("\t", "")
        cleanText = cleanText.strip()
        return cleanText
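    # Illustrative example:
    #   removeTabWhiteSpaceNewLine(" line1\nline2\t ") -> "line1line2"
    # note: newlines are deleted rather than replaced with spaces, so words can
    # stick together; splitStickWords below can re-separate them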
def removeExtraSpaceBetweenWords(self,text):
return re.sub(r'\s+', ' ',text).strip()
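    # Illustrative example:
    #   removeExtraSpaceBetweenWords("too   many  spaces") -> "too many spaces"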
    def removeStopWords(self, text):
        # drop common English stop words; matching is case-insensitive
        #extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","aslo","fig","shows","respectively"]
        stopWords = set(stopwords.words('english'))# + extraUnwantedWords)
        filteredWord = []
        for word in word_tokenize(text):
            if word.lower() not in stopWords:
                filteredWord.append(word)
        return filteredWord
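    # Illustrative example:
    #   removeStopWords("This is a test of the system") -> ["test", "system"]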
    def removeLowercaseBetweenUppercase(self, segment):
        # keep only capitalized words, merging consecutive capitalized words
        # into one phrase. Special cases:
        #   when a lowercase word sits between capitalized words:
        #     "Myanmar (formerly Burma)" -> ["Myanmar", "Burma"], not "myanmar formerly burma"
        #   when capitalized words appear in a row:
        #     "Viet Nam" -> ["Viet Nam"], not ["Viet", "Nam"]
        outputUp = []
        segment = self.removeTabWhiteSpaceNewLine(segment)
        segments = segment.split(" ")
        for w in range(len(segments)):
            word = segments[w]
            cleanWord = self.removePunct(word)
            cleanWord = self.removeTabWhiteSpaceNewLine(cleanWord)
            prevWord = segments[w-1] if w > 0 else ""
            # guard against empty tokens (e.g. pure punctuation) before indexing
            if len(cleanWord) > 0 and cleanWord[0].isupper():
                if len(prevWord) > 0 and prevWord[0].isupper() and len(outputUp) > 0:
                    # previous word was also capitalized: extend the current phrase
                    outputUp[-1] += " " + cleanWord
                else:
                    outputUp.append(cleanWord)
        return outputUp
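    # Illustrative examples:
    #   removeLowercaseBetweenUppercase("Myanmar (formerly Burma)") -> ["Myanmar", "Burma"]
    #   removeLowercaseBetweenUppercase("Viet Nam") -> ["Viet Nam"]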
    def textPreprocessing(self, text, keepPeriod=False):
        # removal of URLs (must happen before punctuation is stripped,
        # otherwise the URL and HTML patterns no longer match)
        cleanText = self.removeURL(text)
        # removal of HTML tags
        cleanText = self.removeHTMLTag(cleanText)
        # remove punctuation & special characters
        cleanText = self.removePunct(cleanText, KeepPeriod=keepPeriod)
        # remove \n or \t and unnecessary white space
        cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
        cleanText = self.removeExtraSpaceBetweenWords(cleanText)
        # stop-word removal
        filteredWord = self.removeStopWords(cleanText)
        return cleanText, filteredWord
#generateNewChar = textPreprocessing("/content/drive/MyDrive/CollectData/NER/CountriesNameNCBI.json")
#saveFile.saveFile("/content/drive/MyDrive/CollectData/NER/NewCharCountriesNameNCBI.json", json.dumps(generateNewChar))
    def splitStickWords(self, word):
        # split concatenated words; note that segment() from wordsegment
        # returns lowercase tokens, so any original capitalization is lost
        split_words = segment(word)
        return " ".join(split_words)
    def removeDOI(self, word, doiLink=None):
        # drop a "DOI" marker fused into the word after general cleaning,
        # e.g. "1368598DOI" -> "1368598"
        if "DOI" in word:
            word = word.replace("DOI", "")
        # if the word contains the DOI link itself,
        # e.g. "10.1007s004390161742yORIGINAL", split the word and remove the link part
        if doiLink is not None:
            w = self.splitStickWords(word)
            cleanDOI = self.removePunct(doiLink)
            if cleanDOI in w:
                word = w.replace(cleanDOI, "")
        return word
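
# A minimal usage sketch, assuming the NLTK 'stopwords' and 'punkt' data and the
# wordsegment corpus are available (the sample string below is illustrative only).
if __name__ == "__main__":
    cleaner = cleanGenText()
    raw = "Myanmar (formerly Burma) is described at https://example.com.\nSee <b>Table 1</b>."
    cleanText, filteredWord = cleaner.textPreprocessing(raw, keepPeriod=True)
    print(cleanText)      # cleaned text with URL, HTML tags, and punctuation removed
    print(filteredWord)   # the same text as tokens, with stop words dropped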