# reference:
# https://ayselaydin.medium.com/1-text-preprocessing-techniques-for-nlp-37544483c007
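# cleanGenText: a small helper class that cleans raw text for downstream NLP:
# punctuation, URL, HTML-tag, whitespace and stop-word removal, plus helpers
# for splitting concatenated words and dropping leftover DOI fragments.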
import re
import nltk
# nltk.download('stopwords')  # needed once for stopwords
# nltk.download('punkt')      # needed once for word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordsegment import load, segment
class cleanGenText:
  def __init__(self):
    # load wordsegment's corpus once so segment() can be used later
    load()
  def removePunct(self, text, KeepPeriod=False):
    # strip every character that is neither a word character nor whitespace,
    # optionally keeping periods
    punctuation = r'[^\w\s]'
    if KeepPeriod:
      punctuation = r'[^\w\s\.]'
    return re.sub(punctuation, '', text)
  def removeURL(self,text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
  def removeHTMLTag(self,text):
    html_tags_pattern = r'<.*?>'
    return re.sub(html_tags_pattern, '', text)
  def removeTabWhiteSpaceNewLine(self, text):
    # remove \n and \t and trim surrounding white space
    cleanText = text.replace("\n", "")
    cleanText = cleanText.replace("\t", "")
    return cleanText.strip()
  def removeExtraSpaceBetweenWords(self,text):
    return re.sub(r'\s+', ' ',text).strip()  
  def removeStopWords(self, text):
    # extraUnwantedWords = ["resource","groups","https","table","online","figure","frequency","also","fig","shows","respectively"]
    stopWords = set(stopwords.words('english'))  # + extraUnwantedWords
    textWords = word_tokenize(text)
    return [word for word in textWords if word.lower() not in stopWords]
  def removeLowercaseBetweenUppercase(self, segment):
    # extract capitalized phrases from a segment:
    # consecutive capitalized words stay together, lowercase words split them.
    # e.g. "Myanmar (formerly Burma)" -> ["Myanmar", "Burma"], not "myanmar formerly burma"
    # e.g. "Viet Nam" -> ["Viet Nam"], not ["Viet", "Nam"]
    outputUp = []
    segment = self.removeTabWhiteSpaceNewLine(segment)
    segments = segment.split(" ")
    for w in range(len(segments)):
      cleanWord = self.removePunct(segments[w])
      cleanWord = self.removeTabWhiteSpaceNewLine(cleanWord)
      if not cleanWord:  # the token was pure punctuation
        continue
      cleanPreWord = ""
      if w > 0:
        cleanPreWord = self.removePunct(segments[w - 1])
        cleanPreWord = self.removeTabWhiteSpaceNewLine(cleanPreWord)
      if cleanWord[0].isupper():  # first letter of the word is capitalized
        if cleanPreWord and cleanPreWord[0].isupper() and outputUp:
          # previous word was capitalized too: extend the current phrase
          outputUp[-1] += " " + cleanWord
        else:
          outputUp.append(cleanWord)  # start a new phrase
    return outputUp
  def textPreprocessing(self, text, keepPeriod=False):
    # removal of URLs first: removePunct would destroy the "://" the pattern needs
    cleanText = self.removeURL(text)
    # removal of HTML tags, also before punctuation removal for the same reason
    cleanText = self.removeHTMLTag(cleanText)
    # remove punctuation & special characters
    cleanText = self.removePunct(cleanText, KeepPeriod=keepPeriod)
    # remove \n or \t and unnecessary white space
    cleanText = self.removeTabWhiteSpaceNewLine(cleanText)
    # stop-words removal
    filteredWord = self.removeStopWords(cleanText)
    return cleanText, filteredWord
  def splitStickWords(self, word):
    # split concatenated words into lowercase tokens, e.g. "thisisatest" -> "this is a test"
    split_words = segment(word)
    # earlier attempt to restore each segment's original capitalization and
    # trailing period (left disabled):
    # output = []
    # for w in split_words:
    #   pos = word.lower().find(w)
    #   if word[pos].isupper():
    #     output.append(w[0].upper() + w[1:])
    #   else:
    #     output.append(w)
    #   if 0 <= pos and pos + len(w) < len(word) and word[pos + len(w)] == ".":
    #     output[-1] += "."
    return " ".join(split_words)
  def removeDOI(self, word, doiLink=None):
    # drop tokens that still contain "DOI" after the general clean, e.g. "1368598DOI"
    if "DOI" in word:
      word = ""
    # if the DOI link itself is stuck inside the word, e.g. "10.1007s004390161742yORIGINAL",
    # split the word first and then cut the DOI part out
    if doiLink is not None:
      w = self.splitStickWords(word)
      cleanDOI = self.removePunct(doiLink)
      if cleanDOI in w:
        word = w.replace(cleanDOI, "")
    return word
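
if __name__ == "__main__":
  # Minimal usage sketch; the sample strings below are made up for illustration.
  cleaner = cleanGenText()

  text = "Myanmar (formerly Burma) is in Asia. See <b>details</b> at https://example.org"
  cleanText, filteredWord = cleaner.textPreprocessing(text, keepPeriod=True)
  print(cleanText)     # text with URL, HTML tags, punctuation and extra whitespace removed
  print(filteredWord)  # the cleaned text tokenized, stop words dropped

  # consecutive capitalized words stay together; lowercase words split phrases
  print(cleaner.removeLowercaseBetweenUppercase("Myanmar (formerly Burma)"))  # ['Myanmar', 'Burma']
  print(cleaner.removeLowercaseBetweenUppercase("Viet Nam"))                  # ['Viet Nam']

  # wordsegment splits concatenated tokens into lowercase words
  print(cleaner.splitStickWords("thisisatest"))  # 'this is a test'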