'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import sys
import subprocess
import os
# can try multiprocessing to run quicker
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# create word2vec model
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model:
When building the corpus, unnecessary adverbs can surface as "similar"
to the word we are querying. Two mitigations: preprocess the text so the
corpus keeps only the important words before training, or post-filter
the similarity results to drop stopwords and other unnecessary words.'''
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
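
# The notes above suggest post-filtering the similarity list. A minimal
# module-level sketch of that idea (an illustrative helper, not used by
# the class below): drop the query word itself, its trivial inflections,
# any stopword, and anything under a cosine threshold. STOPWORDS is
# gensim's built-in frozen set; the 0.7 default mirrors genSimilar's
# cos_thres.
from gensim.parsing.preprocessing import STOPWORDS

def filterSimilarWords(word, results, cos_thres=0.7):
  # results: list of (word, cosine) pairs as returned by most_similar
  inflections = {word + suffix for suffix in ("s", "es", "ing", "ed")}
  return [(w, score) for w, score in results
          if w != word and w not in inflections
          and w not in STOPWORDS and score >= cos_thres]
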
class word2Vec():
  def __init__(self, nameFile=None, modelName=None):
    self.nameFile = nameFile
    self.modelName = modelName
    #self.nlp = spacy.load("en_core_web_lg")
    self.cl = cleanText.cleanGenText()
  def spacy_similarity(self, word):
    # for word vectors, the spaCy medium or large models work better
    # (the small model ships without static word vectors)
    # maybe try doc-level similarity too?
    # lazy-load the pipeline, since __init__ no longer loads it
    if not hasattr(self, "nlp"):
      self.nlp = spacy.load("en_core_web_lg")
    doc = self.nlp(word)
    for token1 in doc:
      for token2 in doc:
        print(token1.text, token2.text, token1.similarity(token2))
  # clean text before transform to corpus
  def cleanTextBeforeCorpus(self,oriText, doi=None):
    #cl = cleanText.cleanGenText()
    #cl = cleanGenText()
    output = ""
    alreadyRemoveDoi = False
    for word in oriText.split(" "):
      # remove DOI
      if doi is not None and doi in oriText:
        if not alreadyRemoveDoi:
          newWord = self.cl.removeDOI(word,doi)
          if len(newWord) > 0 and newWord != word:
            alreadyRemoveDoi = True
            word = newWord
      # remove punctuation
      # split the sticked words
      #word = cl.splitStickWords(word)
      # remove punctuation
      word = self.cl.removePunct(word,True)
      # remove URL
      word = self.cl.removeURL(word)
      # remove HTMLTag
      word = self.cl.removeHTMLTag(word)
      # remove tab, white space, newline
      word = self.cl.removeTabWhiteSpaceNewLine(word)
      # optional: remove stopwords
      #word = cl.removeStopWords(word)
      if len(word)>0:
        output += word + " "
    return output
  def cleanAllTextBeforeCorpus(self, allText, doi=None):
    cleanOutput = ""
    remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
    if len(allText) > 0:
      corpusText = allText.split("\n\n")
      for pos in range(len(corpusText)):
        lines = corpusText[pos]
        if len(lines) > 0:
          for line in lines.split("\n"):
            if remove in line:
              line = line.replace(remove, "")
            clean_text = self.cleanTextBeforeCorpus(line, doi)
            cleanOutput += clean_text + "\n"
          cleanOutput += "\n\n"
    return cleanOutput

  def tableTransformToCorpusText(self, df, excelFile=None):
    # PDF, Excel, WordDoc
    #cl = cleanText.cleanGenText()
    corpus = {}
    # PDF or df
    if excelFile is None:
      if len(df) > 0:
        try:
          for i in range(len(df)):
            # each df (e.g. one PDF page) becomes its own corpus entry;
            # each row becomes one token list
            outputDF = []
            text = df[i].values.tolist()
            if len(text) > 0:
              outputRowDF = self.helperRowTableToCorpus(text)
              #outputColDF = self.helperColTableToCorpus(text)
              outputDF.extend(outputRowDF)
              #outputDF.extend(outputColDF)
            if len(outputDF) > 0:
              corpus["corpus" + str(i)] = outputDF
        except Exception:
          outputDF = []
          text = df.values.tolist()
          if len(text) > 0:
            outputRowDF = self.helperRowTableToCorpus(text)
            #outputColDF = self.helperColTableToCorpus(text)
            outputDF.extend(outputRowDF)
            #outputDF.extend(outputColDF)
          if len(outputDF) > 0:
            corpus["corpus0"] = outputDF
    else:
      try:
        df = pd.ExcelFile(excelFile)
      except Exception:
        # fall back to an explicit engine; keep an ExcelFile object so
        # sheet_names below still works
        engine = 'xlrd' if excelFile.endswith('.xls') else 'openpyxl'
        df = pd.ExcelFile(excelFile, engine=engine)
      sheetNames = df.sheet_names
      if len(sheetNames) > 0:
        for s in range(len(sheetNames)):
          outputDF = []
          data = df.parse(sheetNames[s])
          if sheetNames[s] != 'Evaluation Warning':
            text = data.values.tolist()
            if len(text) > 0:
              outputRowDF = self.helperRowTableToCorpus(text)
              #outputColDF = self.helperColTableToCorpus(text)
              outputDF.extend(outputRowDF)
              #outputDF.extend(outputColDF)
          if len(outputDF) > 0:
            corpus["corpus" + str(s)] = outputDF
    return corpus
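  # Resulting shape (illustrative): one entry per df/sheet, e.g.
  # {"corpus0": [["sample", "tissue"], ["liver", "control"]], "corpus1": [...]}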
  def helperRowTableToCorpus(self, textList):
    #cl = cleanGenText()
    #cl = cleanText.cleanGenText()
    stopWords = ["NaN","Unnamed:","nan"]
    outputDF = []
    for line in textList:
      outputLine = []
      for words in line:
        words = str(words)
        if len(words) > 0:
          for word in words.split(" "):
            # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
            if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
              #word = cl.splitStickWords(word)
              word = self.cl.removePunct(word)
              word = " ".join(self.cl.removeStopWords(word))
              word = self.cl.removeTabWhiteSpaceNewLine(word)
              if len(word) > 1:
                if len(word.split(" ")) > 1:
                  for x in word.split(" "):
                    if len(x) > 1 and not x.isnumeric():
                      outputLine.append(x.lower())
                else:
                  if not word.isnumeric():
                    outputLine.append(word.lower())
      if len(outputLine) > 0:
        outputDF.append(outputLine)
    return outputDF
  def helperColTableToCorpus(self, dfList):
    #cl = cleanGenText()
    #cl = cleanText.cleanGenText()
    stopWords = ["NaN","Unnamed:","nan"]
    outputDF = []
    # use the length of the first row as the column reference
    for pos in range(len(dfList[0])):
      outputLine = []
      for line in dfList:
        if pos < len(line):
          words = line[pos]
          words = str(words)
        else:
          words = ""
        if len(words) > 0:
          for word in words.split(" "):
            # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
            if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
              #word = cl.splitStickWords(word)
              word = self.cl.removePunct(word)
              word = " ".join(self.cl.removeStopWords(word))
              word = self.cl.removeTabWhiteSpaceNewLine(word)
              if len(word) > 1:
                if len(word.split(" ")) > 1:
                  for x in word.split(" "):
                    if len(x) > 1 and not x.isnumeric():
                      outputLine.append(x.lower())
                else:
                  if not word.isnumeric():
                    outputLine.append(word.lower())
      if len(outputLine) > 0:
        outputDF.append(outputLine)
    return outputDF
  # create a corpus
  def createCorpusText(self, corpusText):
    '''ex: "Tom is cat. Jerry is mouse."
    corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
    # the output should look like this:
    '''texts = {
      "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
      "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
    }
    '''
    # paragraphs are separated by blank lines, e.g.:
    '''Ex: Cat is an animal. Tom is cat.

    Mouse is an animal.
    Jerry is mouse.'''
    texts = {}
    #cl = cleanText.cleanGenText()
    #cl = cleanGenText()
    corpus = corpusText.split("\n\n")
    for pos in range(len(corpus)):
      if len(corpus[pos]) > 0:
        texts["Paragraph "+str(pos)] = []
        lines = corpus[pos]
        for line in lines.split("\n"):
          for l in line.split("."):
            if len(l) > 0:
              l = self.cl.removeTabWhiteSpaceNewLine(l)
              l = l.lower()
              newL = []
              for word in l.split(" "):
                if len(word) > 0:
                  word = self.cl.removeStopWords(word)
                  for w in word:
                    if len(w) > 0 and not w.isnumeric():
                      newL.append(w)
              if len(newL)>0:
                texts["Paragraph "+str(pos)].append(newL)
        if len(texts["Paragraph "+str(pos)]) == 0:
          del texts["Paragraph "+str(pos)]
    return texts

  def selectParaForWC(self, corpus):
    """
    corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
    Heuristically choose Word2Vec parameters from the corpus size.
    """
    corSize = len(corpus)
    
    if corSize == 0:
        return None, None, None, None, None, None

    # Adjust parameters based on corpus size
    if corSize < 2000:
        # Small corpus — need high generalization
        window = 3
        vector_size = 100
        sample = 1e-3
        negative = 5
        epochs = 20
        sg = 1  # Skip-gram preferred for rare words
    elif corSize < 10000:
        window = 5
        vector_size = 150
        sample = 1e-4
        negative = 10
        epochs = 20
        sg = 1
    elif corSize < 100000:
        window = 7
        vector_size = 200
        sample = 1e-5
        negative = 15
        epochs = 15
        sg = 1
    elif corSize < 500000:
        window = 10
        vector_size = 250
        sample = 1e-5
        negative = 15
        epochs = 10
        sg = 0  # CBOW is okay when data is large
    else:
        # Very large corpus
        window = 12
        vector_size = 300
        sample = 1e-6
        negative = 20
        epochs = 5
        sg = 0

    return window, vector_size, sample, negative, epochs, sg
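  # Illustrative check of the buckets above: a corpus of 1,500 sentences
  # falls in the "< 2000" branch, so selectParaForWC returns
  # (window=3, vector_size=100, sample=1e-3, negative=5, epochs=20, sg=1).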
  

  def trainWord2Vec(self, nameFile, modelName, saveFolder, window=None,
                    vector_size=None, sample=None, negative=None, epochs=None, sg=None):
    jsonFile = openFile.openJsonFile(nameFile)  # a corpus json file from an article
    if not jsonFile:
        print("No corpus to train")
        return
    cores = multiprocessing.cpu_count()
    combinedCorpus = []
    for key in jsonFile:
      combinedCorpus.extend(jsonFile[key])
    # detect phrase before choosing parameters
    phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
    bigram = Phraser(phrases)
    combinedCorpus = [bigram[sent] for sent in combinedCorpus]
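    # e.g. if "gene" and "expression" co-occur often enough to pass
    # min_count/threshold, ["gene", "expression", "level"] becomes
    # ["gene_expression", "level"] (Phrases joins pairs with "_")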

    if window is None and vector_size is None and sample is None and negative is None and epochs is None and sg is None:
      window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
    # # min_count=1 ensures all words are included
    #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
    accept = False
    # add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
    retries = 0
    while not accept and retries < 3:
      if window is not None and vector_size is not None and sample is not None and negative is not None and epochs is not None and sg is not None:
        try:
          w2vModel = Word2Vec(
                          min_count=1,
                          window=window,
                          vector_size=vector_size,
                          sample=sample,
                          alpha=0.03,
                          min_alpha=0.0007,
                          negative=negative,
                          workers=max(1, cores - 1),
                          epochs = epochs,
                          sg=sg)
          w2vModel.build_vocab(combinedCorpus)
          w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
          accept = True
        except Exception as e:
          print(f"Retry #{retries+1} failed: {e}")
          retries +=1
      else:
        print("no parameter to train")
        break
    #w2vModel.build_vocab(combinedCorpus)
    #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
    #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
    #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
    if not accept:
      print("training failed; nothing to save")
      return
    w2vModel.save(saveFolder+"/"+modelName+".model")
    w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
    print("done w2v")
    #return combinedCorpus
  def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
    # might not be a meaningful keyword
    #stopWords = ["show"]
    # same word but just plural nouns, tense
    simWords = [word+"s",word+"es",word+"ing",word+"ed"]
    model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in txt format
    results = model.most_similar(positive=[word],topn=n)
    #removeIndex = []
    #currN = copy.deepcopy(n)
    '''for r in range(len(results)):
      if len(results[r][0]) < 2:
        removeIndex.append(results[r])
      # remove the same word (plural/singular, tense) and matches below cos_thres
      elif results[r][0] == word:
        removeIndex.append(results[r])
      elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
        removeIndex.append(results[r])
    for rem in removeIndex:
      results.remove(rem)
    while len(results)!=n and len(results) != 0:
      moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
      if moreNewResult not in results and len(moreNewResult[0])>1:
        if moreNewResult[0] not in stopWords and results[0] != word:
          results.append(moreNewResult)
      currN +=1'''
    return results
  # add more data to existing word2vec model
  def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
    if not newCorpus:
        raise ValueError("New corpus is empty!")

    model = Word2Vec.load(modelPath)

    # Phrase detection on new data
    phrases = Phrases(newCorpus, min_count=2, threshold=10)
    bigram = Phraser(phrases)
    newCorpus = [bigram[sent] for sent in newCorpus]

    # Update vocab & retrain
    model.build_vocab(newCorpus, update=True)
    model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)

    # Save updated model
    if saveFolder:
        os.makedirs(saveFolder, exist_ok=True)
        name = os.path.basename(modelPath).replace(".model", "_updated.model")
        model.save(f"{saveFolder}/{name}")
        print(f"🔁 Model updated and saved to {saveFolder}/{name}")
    else:
        model.save(modelPath)
        print(f"🔁 Model updated and overwritten at {modelPath}")
  
  # adding our model into spacy
  # this wraps the spaCy command line; instead of running it in a shell, we invoke it from Python via subprocess
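  # note: "init-model" is the spaCy v2 CLI. On spaCy v3+ the equivalent is
  # "python -m spacy init vectors en <vectors-loc> <output-dir>"; adjust to
  # whichever spaCy version is installed.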
  def loadWordVec(self,modelName,wordVec):
    # modelName is the name you want to save into spacy
    # wordVec is the trained word2vec in txt format
    subprocess.run([sys.executable,
                    "-m",
                    "spacy",
                    "init-model",
                    "en",
                    modelName, # this modelName comes from the saved modelName of function trainWord2Vec
                    "--vectors-loc",
                    wordVec])
    print("done")