VyLala committed
Commit b5d574a · verified · 1 Parent(s): 0a98cc3

Update NER/word2Vec/word2vec.py

Files changed (1)
  1. NER/word2Vec/word2vec.py +102 -541
NER/word2Vec/word2vec.py CHANGED
@@ -1,374 +1,3 @@
- <<<<<<< HEAD
- '''WORD TO VECTOR'''
- import pandas as pd
- import json
- import gensim
- import spacy
- from DefaultPackages import openFile, saveFile
- from NER import cleanText
- from gensim.models.keyedvectors import KeyedVectors
- from gensim.test.utils import common_texts
- from gensim.models.word2vec import Word2Vec
- from gensim.scripts.glove2word2vec import glove2word2vec
- from gensim.test.utils import datapath, get_tmpfile
- import sys
- import subprocess
- # can try multiprocessing to run quicker
- import multiprocessing
- import copy
- sys.setrecursionlimit(1000)
- # creat folder word2Vec
- #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
- # create word2vec model
- #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
- '''Some notes for this model
- sometimes when we do the corpus, there are some adverbs which are unnecessary but might be seen as
- a similar word to the word we are finding, so can we try to preprocess text so that
- we make the corpus more effective and only contains the important words. Then when we
- train the model, the important words will be seen as important. Or
- when we already have the similar list of words, we can remove the words in there
- that are stopwords/unnecessary words.'''
- ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
- class word2Vec():
-     def __init__(self, nameFile=None, modelName=None):
-         self.nameFile = nameFile
-         self.modelName = modelName
-     def spacy_similarity(self, word):
-         # when use word2vec, try medium or large is better
-         # maybe try odc similarity?
-         nlp = spacy.load("en_core_web_lg")
-         doc = nlp(word)
-         for token1 in doc:
-             for token2 in doc:
-                 print(token1.text, token2.text, token1.similarity(token2))
-         pass
-     # clean text before transform to corpus
-     def cleanTextBeforeCorpus(self,oriText, doi=None):
-         cl = cleanText.cleanGenText()
-         #cl = cleanGenText()
-         output = ""
-         alreadyRemoveDoi = False
-         for word in oriText.split(" "):
-             # remove DOI
-             if doi != None and doi in oriText:
-                 if alreadyRemoveDoi == False:
-                     newWord = cl.removeDOI(word,doi)
-                     if len(newWord) > 0 and newWord != word:
-                         alreadyRemoveDoi = True
-                         word = newWord
-             # remove punctuation
-             # split the sticked words
-             #word = cl.splitStickWords(word)
-             # remove punctuation
-             word = cl.removePunct(word,True)
-             # remove URL
-             word = cl.removeURL(word)
-             # remove HTMLTag
-             word = cl.removeHTMLTag(word)
-             # remove tab, white space, newline
-             word = cl.removeTabWhiteSpaceNewLine(word)
-             # optional: remove stopwords
-             #word = cl.removeStopWords(word)
-             if len(word)>0:
-                 output += word + " "
-         return output
-     def cleanAllTextBeforeCorpus(self, allText, doi=None):
-         cleanOutput = ""
-         remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
-         if len(allText) > 0:
-             corpusText = allText
-             for pos in range(len(corpusText.split("\n\n"))):
-                 if len(corpusText.split("\n\n")[pos]) > 0:
-                     lines = corpusText.split("\n\n")[pos]
-                     for line in lines.split("\n"):
-                         if remove in line: line = line.replace(remove, "")
-                         clean_text = self.cleanTextBeforeCorpus(line, doi)
-                         cleanOutput += clean_text + "\n"
-                 cleanOutput += "\n\n"
-         return cleanOutput
-     def tableTransformToCorpusText(self, df, excelFile=None):
-         # PDF, Excel, WordDoc
-         #cl = cleanText.cleanGenText()
-         corpus = {}
-         # PDF or df
-         if excelFile == None:
-             if len(df) > 0:
-                 try:
-                     for i in range(len(df)):
-                         # each new dimension/page is considered to be a sentence which ends with the period.
-                         # each new line is a new list, and each new df is a new corpus
-                         outputDF = []
-                         text = df[i].values.tolist()
-                         if len(text) > 0:
-                             outputRowDF = self.helperRowTableToCorpus(text)
-                             #outputColDF = self.helperColTableToCorpus(text)
-                             outputDF.extend(outputRowDF)
-                             #outputDF.extend(outputColDF)
-                         if len(outputDF) > 0:
-                             corpus["corpus" + str(i)] = outputDF
-                 except:
-                     outputDF = []
-                     text = df.values.tolist()
-                     if len(text) > 0:
-                         outputRowDF = self.helperRowTableToCorpus(text)
-                         #outputColDF = self.helperColTableToCorpus(text)
-                         outputDF.extend(outputRowDF)
-                         #outputDF.extend(outputColDF)
-                     if len(outputDF) > 0:
-                         corpus["corpus0"] = outputDF
-         else:
-             try:
-                 df = pd.ExcelFile(excelFile)
-             except:
-                 if filepath.endswith('.xls'):
-                     df = pd.read_excel(filepath, engine='xlrd')
-                 else:
-                     df = pd.read_excel(filepath, engine='openpyxl')
-             sheetNames = df.sheet_names
-             output = []
-             if len(sheetNames) > 0:
-                 for s in range(len(sheetNames)):
-                     outputDF = []
-                     with pd.ExcelFile(excelFile) as xls:
-                         data = pd.read_excel(xls, sheetNames[s])
-                     if sheetNames[s] != 'Evaluation Warning':
-                         text = data.values.tolist()
-                         if len(text) > 0:
-                             outputRowDF = self.helperRowTableToCorpus(text)
-                             #outputColDF = self.helperColTableToCorpus(text)
-                             outputDF.extend(outputRowDF)
-                             #outputDF.extend(outputColDF)
-                     if len(outputDF) > 0:
-                         corpus["corpus" + str(s)] = outputDF
-         return corpus
-     def helperRowTableToCorpus(self, textList):
-         #cl = cleanGenText()
-         cl = cleanText.cleanGenText()
-         stopWords = ["NaN","Unnamed:","nan"]
-         outputDF = []
-         for line in textList:
-             outputLine = []
-             for words in line:
-                 words = str(words)
-                 if len(words) > 0:
-                     for word in words.split(" "):
-                         # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
-                         if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
-                             #word = cl.splitStickWords(word)
-                             word = cl.removePunct(word)
-                             word = " ".join(cl.removeStopWords(word))
-                             word = cl.removeTabWhiteSpaceNewLine(word)
-                             if len(word) > 1:
-                                 if len(word.split(" ")) > 1:
-                                     for x in word.split(" "):
-                                         if len(x) > 1 and x.isnumeric()==False:
-                                             outputLine.append(x.lower())
-                                 else:
-                                     if word.isnumeric() == False:
-                                         outputLine.append(word.lower())
-             if len(outputLine) > 0:
-                 outputDF.append(outputLine)
-         return outputDF
-     def helperColTableToCorpus(self, dfList):
-         #cl = cleanGenText()
-         cl = cleanText.cleanGenText()
-         stopWords = ["NaN","Unnamed:","nan"]
-         outputDF = []
-         # use the first length line as the column ref
-         for pos in range(len(dfList[0])):
-             outputLine = []
-             for line in dfList:
-                 if pos < len(line):
-                     words = line[pos]
-                     words = str(words)
-                 else: words = ""
-                 if len(words) > 0:
-                     for word in words.split(" "):
-                         # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
-                         if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
-                             #word = cl.splitStickWords(word)
-                             word = cl.removePunct(word)
-                             word = " ".join(cl.removeStopWords(word))
-                             word = cl.removeTabWhiteSpaceNewLine(word)
-                             if len(word) > 1:
-                                 if len(word.split(" ")) > 1:
-                                     for x in word.split(" "):
-                                         if len(x) > 1 and x.isnumeric()==False:
-                                             outputLine.append(x.lower())
-                                 else:
-                                     if word.isnumeric() == False:
-                                         outputLine.append(word.lower())
-             if len(outputLine) > 0:
-                 outputDF.append(outputLine)
-         return outputDF
-     # create a corpus
-     def createCorpusText(self, corpusText):
-         '''ex: "Tom is cat. Jerry is mouse."
-         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
-         # the output should be like this:
-         '''texts = {
-         "Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
-         "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
-         }
-         '''
-         # separate paragraph
-         '''Ex: Cat is an animal. Tom is cat.
-
-         Mouse is an animal.
-         Jerry is mouse.'''
-         texts = {}
-         cl = cleanText.cleanGenText()
-         #cl = cleanGenText()
-         for pos in range(len(corpusText.split("\n\n"))):
-             if len(corpusText.split("\n\n")[pos]) > 0:
-                 texts["Paragraph "+str(pos)] = []
-                 lines = corpusText.split("\n\n")[pos]
-                 for line in lines.split("\n"):
-                     for l in line.split("."):
-                         if len(l) > 0:
-                             cl.removeTabWhiteSpaceNewLine(l)
-                             l = l.lower()
-                             newL = []
-                             for word in l.split(" "):
-                                 if len(word) > 0:
-                                     word = cl.removeStopWords(word)
-                                     for w in word:
-                                         if len(w) > 0 and w.isnumeric()==False:
-                                             newL.append(w)
-                             if len(newL)>0:
-                                 texts["Paragraph "+str(pos)].append(newL)
-                 if len(texts["Paragraph "+str(pos)]) == 0:
-                     del texts["Paragraph "+str(pos)]
-         return texts
-     def selectParaForWC(self,corpus):
-         ''' corpus should be in the format:
-         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
-         corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
-         corSize = len(corpus)
-         # less than 2000
-         if 0 < corSize < 2000:
-             window=3.5
-             vector_size=75
-             sample=1e-3
-             negative=10
-             epochs=10
-             sg=1
-         # 2000 - 100000
-         elif 2000 <= corSize < 100000:
-             window=3.5
-             vector_size=75
-             sample=1e-5
-             negative=10
-             epochs=10
-             sg=1
-         elif 100000 <=corSize < 1000000:
-             window=7.5
-             vector_size=150
-             sample=1e-5
-             negative=10
-             epochs=6
-             sg=0
-         return window, vector_size, sample, negative, epochs, sg
-     def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
-                       vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
-         # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
-         jsonFile = ""
-         jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
-         cores = multiprocessing.cpu_count()
-         combinedCorpus = []
-         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
-         if len(jsonFile) > 0:
-             for key in jsonFile:
-                 combinedCorpus.extend(jsonFile[key])
-             window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
-             # # min_count=1 ensures all words are included
-             '''w2vModel = Word2Vec(
-                         min_count=1,
-                         window=window,
-                         vector_size=vector_size,
-                         sample=sample,
-                         alpha=0.03,
-                         min_alpha=0.0007,
-                         negative=negative,
-                         workers=cores-1,
-                         epochs = epochs,
-                         sg=sg)'''
-             #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
-             accept = False
-             while not accept:
-                 if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
-                     try:
-                         w2vModel = Word2Vec(
-                             min_count=1,
-                             window=window,
-                             vector_size=vector_size,
-                             sample=sample,
-                             alpha=0.03,
-                             min_alpha=0.0007,
-                             negative=negative,
-                             workers=cores-1,
-                             epochs = epochs,
-                             sg=sg)
-                         w2vModel.build_vocab(combinedCorpus)
-                         w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
-                         accept = True
-                     except:
-                         for key in jsonFile:
-                             combinedCorpus.extend(jsonFile[key])
-                         window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
-                         print("next is " + str(len(combinedCorpus)))
-                 else:
-                     print("no parameter to train")
-                     break
-             #w2vModel.build_vocab(combinedCorpus)
-             #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
-             #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
-             #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
-             w2vModel.save(saveFolder+"/"+modelName+".model")
-             w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
-             print("done w2v")
-         else: print("no corpus to train")
-         #return combinedCorpus
-     def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
-         # might not be a meaningful keyword
-         #stopWords = ["show"]
-         # same word but just plural nouns, tense
-         simWords = [word+"s",word+"es",word+"ing",word+"ed"]
-         model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
-         results = model.most_similar(positive=[word],topn=n)
-         #removeIndex = []
-         #currN = copy.deepcopy(n)
-         '''for r in range(len(results)):
-             if len(results[r][0]) < 2:
-                 removeIndex.append(results[r])
-             # remove the same word but just plural and singular noun and lower than the cos_thres
-             elif results[r][0] == word:
-                 removeIndex.append(results[r])
-             elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
-                 removeIndex.append(results[r])
-         for rem in removeIndex:
-             results.remove(rem)
-         while len(results)!=n and len(results) != 0:
-             moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
-             if moreNewResult not in results and len(moreNewResult[0])>1:
-                 if moreNewResult[0] not in stopWords and results[0] != word:
-                     results.append(moreNewResult)
-                     currN +=1'''
-         return results
-     # adding our model into spacy
-     # this deals with command line; but instead of using it, we write python script to run command line
-     def loadWordVec(self,modelName,wordVec):
-         # modelName is the name you want to save into spacy
-         # wordVec is the trained word2vec in txt format
-         subprocess.run([sys.executable,
-                         "-m",
-                         "spacy",
-                         "init-model",
-                         "en",
-                         modelName, # this modelName comes from the saved modelName of function trainWord2Vec
-                         "--vectors-loc",
-                         wordVec])
- =======
  '''WORD TO VECTOR'''
  import pandas as pd
  import json
@@ -381,11 +10,8 @@ from gensim.test.utils import common_texts
  from gensim.models.word2vec import Word2Vec
  from gensim.scripts.glove2word2vec import glove2word2vec
  from gensim.test.utils import datapath, get_tmpfile
- from gensim.models import Phrases
- from gensim.models.phrases import Phraser
  import sys
  import subprocess
- import os
  # can try multiprocessing to run quicker
  import multiprocessing
  import copy
@@ -406,19 +32,18 @@ class word2Vec():
      def __init__(self, nameFile=None, modelName=None):
          self.nameFile = nameFile
          self.modelName = modelName
-         #self.nlp = spacy.load("en_core_web_lg")
-         self.cl = cleanText.cleanGenText()
      def spacy_similarity(self, word):
          # when use word2vec, try medium or large is better
          # maybe try odc similarity?
-         doc = self.nlp(word)
+         nlp = spacy.load("en_core_web_lg")
+         doc = nlp(word)
          for token1 in doc:
              for token2 in doc:
                  print(token1.text, token2.text, token1.similarity(token2))
          pass
      # clean text before transform to corpus
      def cleanTextBeforeCorpus(self,oriText, doi=None):
-         #cl = cleanText.cleanGenText()
+         cl = cleanText.cleanGenText()
          #cl = cleanGenText()
          output = ""
          alreadyRemoveDoi = False
@@ -426,7 +51,7 @@ class word2Vec():
              # remove DOI
              if doi != None and doi in oriText:
                  if alreadyRemoveDoi == False:
-                     newWord = self.cl.removeDOI(word,doi)
+                     newWord = cl.removeDOI(word,doi)
                      if len(newWord) > 0 and newWord != word:
                          alreadyRemoveDoi = True
                          word = newWord
@@ -434,13 +59,13 @@ class word2Vec():
              # split the sticked words
              #word = cl.splitStickWords(word)
              # remove punctuation
-             word = self.cl.removePunct(word,True)
+             word = cl.removePunct(word,True)
              # remove URL
-             word = self.cl.removeURL(word)
+             word = cl.removeURL(word)
              # remove HTMLTag
-             word = self.cl.removeHTMLTag(word)
+             word = cl.removeHTMLTag(word)
              # remove tab, white space, newline
-             word = self.cl.removeTabWhiteSpaceNewLine(word)
+             word = cl.removeTabWhiteSpaceNewLine(word)
              # optional: remove stopwords
              #word = cl.removeStopWords(word)
              if len(word)>0:
@@ -450,18 +75,16 @@ class word2Vec():
          cleanOutput = ""
          remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
          if len(allText) > 0:
-             corpusText = allText.split("\n\n")
-             for pos in range(len(corpusText)):
-                 lines = corpusText[pos]
-                 if len(lines) > 0:
+             corpusText = allText
+             for pos in range(len(corpusText.split("\n\n"))):
+                 if len(corpusText.split("\n\n")[pos]) > 0:
+                     lines = corpusText.split("\n\n")[pos]
                      for line in lines.split("\n"):
                          if remove in line: line = line.replace(remove, "")
                          clean_text = self.cleanTextBeforeCorpus(line, doi)
                          cleanOutput += clean_text + "\n"
                  cleanOutput += "\n\n"
          return cleanOutput
-     import urllib.parse, requests
-
      def tableTransformToCorpusText(self, df, excelFile=None):
          # PDF, Excel, WordDoc
          #cl = cleanText.cleanGenText()
@@ -496,10 +119,10 @@ class word2Vec():
              try:
                  df = pd.ExcelFile(excelFile)
              except:
-                 if excelFile.endswith('.xls'):
-                     df = pd.read_excel(excelFile, engine='xlrd')
+                 if filepath.endswith('.xls'):
+                     df = pd.read_excel(filepath, engine='xlrd')
                  else:
-                     df = pd.read_excel(excelFile, engine='openpyxl')
+                     df = pd.read_excel(filepath, engine='openpyxl')
              sheetNames = df.sheet_names
              output = []
              if len(sheetNames) > 0:
@@ -519,7 +142,7 @@ class word2Vec():
          return corpus
      def helperRowTableToCorpus(self, textList):
          #cl = cleanGenText()
-         #cl = cleanText.cleanGenText()
+         cl = cleanText.cleanGenText()
          stopWords = ["NaN","Unnamed:","nan"]
          outputDF = []
          for line in textList:
@@ -531,9 +154,9 @@ class word2Vec():
                          # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
                          if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
                              #word = cl.splitStickWords(word)
-                             word = self.cl.removePunct(word)
-                             word = " ".join(self.cl.removeStopWords(word))
-                             word = self.cl.removeTabWhiteSpaceNewLine(word)
+                             word = cl.removePunct(word)
+                             word = " ".join(cl.removeStopWords(word))
+                             word = cl.removeTabWhiteSpaceNewLine(word)
                              if len(word) > 1:
                                  if len(word.split(" ")) > 1:
                                      for x in word.split(" "):
@@ -547,7 +170,7 @@ class word2Vec():
          return outputDF
      def helperColTableToCorpus(self, dfList):
          #cl = cleanGenText()
-         #cl = cleanText.cleanGenText()
+         cl = cleanText.cleanGenText()
          stopWords = ["NaN","Unnamed:","nan"]
          outputDF = []
          # use the first length line as the column ref
  # use the first length line as the column ref
@@ -563,9 +186,9 @@ class word2Vec():
563
  # remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
564
  if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
565
  #word = cl.splitStickWords(word)
566
- word = self.cl.removePunct(word)
567
- word = " ".join(self.cl.removeStopWords(word))
568
- word = self.cl.removeTabWhiteSpaceNewLine(word)
569
  if len(word) > 1:
570
  if len(word.split(" ")) > 1:
571
  for x in word.split(" "):
@@ -593,22 +216,21 @@ class word2Vec():
          Mouse is an animal.
          Jerry is mouse.'''
          texts = {}
-         #cl = cleanText.cleanGenText()
+         cl = cleanText.cleanGenText()
          #cl = cleanGenText()
-         corpus = corpusText.split("\n\n")
-         for pos in range(len(corpus)):
-             if len(corpus[pos]) > 0:
+         for pos in range(len(corpusText.split("\n\n"))):
+             if len(corpusText.split("\n\n")[pos]) > 0:
                  texts["Paragraph "+str(pos)] = []
-                 lines = corpus[pos]
+                 lines = corpusText.split("\n\n")[pos]
                  for line in lines.split("\n"):
                      for l in line.split("."):
                          if len(l) > 0:
-                             l = self.cl.removeTabWhiteSpaceNewLine(l)
+                             cl.removeTabWhiteSpaceNewLine(l)
                              l = l.lower()
                              newL = []
                              for word in l.split(" "):
                                  if len(word) > 0:
-                                     word = self.cl.removeStopWords(word)
+                                     word = cl.removeStopWords(word)
                                      for w in word:
                                          if len(w) > 0 and w.isnumeric()==False:
                                              newL.append(w)
@@ -617,86 +239,49 @@ class word2Vec():
              if len(texts["Paragraph "+str(pos)]) == 0:
                  del texts["Paragraph "+str(pos)]
          return texts
-
-     def selectParaForWC(self, corpus):
-         """
-         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
-         Heuristically determine Word2Vec parameters.
-         """
+     def selectParaForWC(self,corpus):
+         ''' corpus should be in the format:
+         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+         corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
          corSize = len(corpus)
-
-         if corSize == 0:
-             return None, None, None, None, None, None
-
-         # Adjust parameters based on corpus size
-         if corSize < 2000:
-             # Small corpus — need high generalization
-             window = 3
-             vector_size = 100
-             sample = 1e-3
-             negative = 5
-             epochs = 20
-             sg = 1  # Skip-gram preferred for rare words
-         elif corSize < 10000:
-             window = 5
-             vector_size = 150
-             sample = 1e-4
-             negative = 10
-             epochs = 20
-             sg = 1
-         elif corSize < 100000:
-             window = 7
-             vector_size = 200
-             sample = 1e-5
-             negative = 15
-             epochs = 15
-             sg = 1
-         elif corSize < 500000:
-             window = 10
-             vector_size = 250
-             sample = 1e-5
-             negative = 15
-             epochs = 10
-             sg = 0  # CBOW is okay when data is large
-         else:
-             # Very large corpus
-             window = 12
-             vector_size = 300
-             sample = 1e-6
-             negative = 20
-             epochs = 5
-             sg = 0
-
+         # less than 2000
+         if 0 < corSize < 2000:
+             window=3.5
+             vector_size=75
+             sample=1e-3
+             negative=10
+             epochs=10
+             sg=1
+         # 2000 - 100000
+         elif 2000 <= corSize < 100000:
+             window=3.5
+             vector_size=75
+             sample=1e-5
+             negative=10
+             epochs=10
+             sg=1
+         elif 100000 <=corSize < 1000000:
+             window=7.5
+             vector_size=150
+             sample=1e-5
+             negative=10
+             epochs=6
+             sg=0
          return window, vector_size, sample, negative, epochs, sg
-
-
-     def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
-                       vector_size=None,sample=None,negative=None,epochs=None,sg=None):
+     def trainWord2Vec(self,nameFile,modelName,saveFolder,window=3.5,
+                       vector_size=75,sample=1e-3,negative=10,epochs=10,sg=1):
+         # if you dont have backup file, you can use again the nameFile just to increase the lenght of corpus
          jsonFile = ""
          jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
-         if not jsonFile:
-             print("No corpus to train")
-             return
          cores = multiprocessing.cpu_count()
          combinedCorpus = []
-         for key in jsonFile:
-             combinedCorpus.extend(jsonFile[key])
-         # detect phrase before choosing parameters
-         phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
-         bigram = Phraser(phrases)
-         combinedCorpus = [bigram[sent] for sent in combinedCorpus]
-
-         if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
+         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
+         if len(jsonFile) > 0:
+             for key in jsonFile:
+                 combinedCorpus.extend(jsonFile[key])
              window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
-         # # min_count=1 ensures all words are included
-         #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
-         accept = False
-         # add retry limit because if training keeps failing (bad corpus or corrupted input), it’ll keep retrying without limit.
-         retries = 0
-         while not accept and retries < 3:
-             if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
-                 try:
-                     w2vModel = Word2Vec(
+             # # min_count=1 ensures all words are included
+             '''w2vModel = Word2Vec(
                          min_count=1,
                          window=window,
                          vector_size=vector_size,
@@ -706,39 +291,43 @@ class word2Vec():
                          negative=negative,
                          workers=cores-1,
                          epochs = epochs,
-                         sg=sg)
-                     w2vModel.build_vocab(combinedCorpus)
-                     w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
-                     accept = True
-                 except Exception as e:
-                     print(f"Retry #{retries+1} failed: {e}")
-                     retries +=1
-             else:
-                 print("no parameter to train")
-                 break
-         #w2vModel.build_vocab(combinedCorpus)
-         #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
-         #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
-         #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
-         w2vModel.save(saveFolder+"/"+modelName+".model")
-         w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
-         print("done w2v")
+                         sg=sg)'''
+             #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
+             accept = False
+             while not accept:
+                 if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
+                     try:
+                         w2vModel = Word2Vec(
+                             min_count=1,
+                             window=window,
+                             vector_size=vector_size,
+                             sample=sample,
+                             alpha=0.03,
+                             min_alpha=0.0007,
+                             negative=negative,
+                             workers=cores-1,
+                             epochs = epochs,
+                             sg=sg)
+                         w2vModel.build_vocab(combinedCorpus)
+                         w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+                         accept = True
+                     except:
+                         for key in jsonFile:
+                             combinedCorpus.extend(jsonFile[key])
+                         window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+                         print("next is " + str(len(combinedCorpus)))
+                 else:
+                     print("no parameter to train")
+                     break
+             #w2vModel.build_vocab(combinedCorpus)
+             #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+             #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
+             #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
+             w2vModel.save(saveFolder+"/"+modelName+".model")
+             w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
+             print("done w2v")
+         else: print("no corpus to train")
          #return combinedCorpus
-     def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
-         if not newCorpus:
-             raise ValueError("New corpus is empty!")
-
-         model = Word2Vec.load(modelPath)
-
-         # Phrase detection on new data
-         phrases = Phrases(newCorpus, min_count=2, threshold=10)
-         bigram = Phraser(phrases)
-         newCorpus = [bigram[sent] for sent in newCorpus]
-
-         # Update vocab & retrain
-         model.build_vocab(newCorpus, update=True)
-         model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
-
      def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
          # might not be a meaningful keyword
          #stopWords = ["show"]
@@ -765,32 +354,6 @@ class word2Vec():
                      results.append(moreNewResult)
                      currN +=1'''
          return results
-     # add more data to existing word2vec model
-     def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
-         if not newCorpus:
-             raise ValueError("New corpus is empty!")
-
-         model = Word2Vec.load(modelPath)
-
-         # Phrase detection on new data
-         phrases = Phrases(newCorpus, min_count=2, threshold=10)
-         bigram = Phraser(phrases)
-         newCorpus = [bigram[sent] for sent in newCorpus]
-
-         # Update vocab & retrain
-         model.build_vocab(newCorpus, update=True)
-         model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
-
-         # Save updated model
-         if saveFolder:
-             os.makedirs(saveFolder, exist_ok=True)
-             name = os.path.basename(modelPath).replace(".model", "_updated.model")
-             model.save(f"{saveFolder}/{name}")
-             print(f"🔁 Model updated and saved to {saveFolder}/{name}")
-         else:
-             model.save(modelPath)
-             print(f"🔁 Model updated and overwritten at {modelPath}")
-
      # adding our model into spacy
      # this deals with command line; but instead of using it, we write python script to run command line
      def loadWordVec(self,modelName,wordVec):
@@ -803,6 +366,4 @@ class word2Vec():
                          "en",
                          modelName, # this modelName comes from the saved modelName of function trainWord2Vec
                          "--vectors-loc",
-                         wordVec])
- >>>>>>> 597aa7c (WIP: Save local changes which mainly updated appUI before moving to UpdateAppUI)
-     print("done")
+                         wordVec])
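
For reference, a minimal, self-contained sketch of the train-then-query round trip that the patched trainWord2Vec and genSimilar methods perform with gensim. The toy corpus, file name, and query word are illustrative only (not part of the commit), and the small-corpus parameters follow the spirit of selectParaForWC's first branch with the window rounded to an integer:

# sketch.py -- assumes gensim 4.x; mirrors trainWord2Vec/genSimilar above
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

# toy corpus in the same nested-list format selectParaForWC expects
corpus = [["tom", "is", "cat"], ["jerry", "is", "mouse"], ["cat", "chases", "mouse"]]

# small-corpus settings (selectParaForWC's < 2000 branch uses window=3.5, vector_size=75, sg=1)
model = Word2Vec(min_count=1, window=3, vector_size=75, sample=1e-3,
                 negative=10, epochs=10, sg=1)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# save vectors in text format, then reload and query the way genSimilar does
model.wv.save_word2vec_format("wordVector_demo.txt")
kv = KeyedVectors.load_word2vec_format("wordVector_demo.txt", binary=False)
print(kv.most_similar(positive=["cat"], topn=2))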