VyLala committed on
Commit cfe9c4c · verified · 1 Parent(s): ab3f65c

Update NER/word2Vec/word2vec.py

Files changed (1)
  1. NER/word2Vec/word2vec.py +369 -363
NER/word2Vec/word2vec.py CHANGED
@@ -1,364 +1,370 @@
- '''WORD TO VECTOR'''
- import pandas as pd
- import json
- import gensim
- import spacy
- from DefaultPackages import openFile, saveFile
- from NER import cleanText
- from gensim.models.keyedvectors import KeyedVectors
- from gensim.test.utils import common_texts
- from gensim.models.word2vec import Word2Vec
- from gensim.scripts.glove2word2vec import glove2word2vec
- from gensim.test.utils import datapath, get_tmpfile
- import sys
- import subprocess
- # can try multiprocessing to run quicker
- import multiprocessing
- import copy  # used by the commented-out filtering in genSimilar
- sys.setrecursionlimit(1000)
- # create folder word2Vec
- #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
- # create word2vec model
- #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
- '''Some notes for this model:
- sometimes when we build the corpus there are adverbs which are unnecessary but might be seen as
- similar to the word we are looking up, so we can try to preprocess the text so that
- the corpus is more effective and only contains the important words; then, when we
- train the model, the important words will be treated as important. Alternatively,
- once we have the list of similar words, we can remove the words in it
- that are stopwords/unnecessary words.'''
- ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
- class word2Vec():
-     def __init__(self, nameFile=None, modelName=None):
-         self.nameFile = nameFile
-         self.modelName = modelName
-     def spacy_similarity(self, word):
-         # when using word2vec, the medium or large spaCy model works better
-         # maybe try doc similarity?
-         nlp = spacy.load("en_core_web_lg")
-         doc = nlp(word)
-         for token1 in doc:
-             for token2 in doc:
-                 print(token1.text, token2.text, token1.similarity(token2))
-     # clean text before transforming it into a corpus
-     def cleanTextBeforeCorpus(self, oriText, doi=None):
-         cl = cleanText.cleanGenText()
-         output = ""
-         alreadyRemoveDoi = False
-         for word in oriText.split(" "):
-             # remove the DOI
-             if doi != None and doi in oriText:
-                 if alreadyRemoveDoi == False:
-                     newWord = cl.removeDOI(word, doi)
-                     if len(newWord) > 0 and newWord != word:
-                         alreadyRemoveDoi = True
-                         word = newWord
-             # split stuck-together words (optional)
-             #word = cl.splitStickWords(word)
-             # remove punctuation
-             word = cl.removePunct(word, True)
-             # remove URLs
-             word = cl.removeURL(word)
-             # remove HTML tags
-             word = cl.removeHTMLTag(word)
-             # remove tabs, whitespace runs, newlines
-             word = cl.removeTabWhiteSpaceNewLine(word)
-             # optional: remove stopwords
-             #word = cl.removeStopWords(word)
-             if len(word) > 0:
-                 output += word + " "
-         return output
-     def cleanAllTextBeforeCorpus(self, allText, doi=None):
-         cleanOutput = ""
-         remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
-         if len(allText) > 0:
-             corpusText = allText
-             for pos in range(len(corpusText.split("\n\n"))):
-                 if len(corpusText.split("\n\n")[pos]) > 0:
-                     lines = corpusText.split("\n\n")[pos]
-                     for line in lines.split("\n"):
-                         if remove in line: line = line.replace(remove, "")
-                         clean_text = self.cleanTextBeforeCorpus(line, doi)
-                         cleanOutput += clean_text + "\n"
-                     cleanOutput += "\n\n"
-         return cleanOutput
-     def tableTransformToCorpusText(self, df, excelFile=None):
-         # PDF, Excel, WordDoc
-         corpus = {}
-         # PDF or df
-         if excelFile == None:
-             if len(df) > 0:
-                 try:
-                     for i in range(len(df)):
-                         # each new dimension/page is treated as a sentence ending with a period;
-                         # each new line is a new list, and each new df is a new corpus
-                         outputDF = []
-                         text = df[i].values.tolist()
-                         if len(text) > 0:
-                             outputRowDF = self.helperRowTableToCorpus(text)
-                             #outputColDF = self.helperColTableToCorpus(text)
-                             outputDF.extend(outputRowDF)
-                             #outputDF.extend(outputColDF)
-                         if len(outputDF) > 0:
-                             corpus["corpus" + str(i)] = outputDF
-                 except Exception:
-                     outputDF = []
-                     text = df.values.tolist()
-                     if len(text) > 0:
-                         outputRowDF = self.helperRowTableToCorpus(text)
-                         #outputColDF = self.helperColTableToCorpus(text)
-                         outputDF.extend(outputRowDF)
-                         #outputDF.extend(outputColDF)
-                     if len(outputDF) > 0:
-                         corpus["corpus0"] = outputDF
-         else:
-             df = pd.ExcelFile(excelFile)
-             sheetNames = df.sheet_names
-             output = []
-             if len(sheetNames) > 0:
-                 for s in range(len(sheetNames)):
-                     outputDF = []
-                     with pd.ExcelFile(excelFile) as xls:
-                         data = pd.read_excel(xls, sheetNames[s])
-                     if sheetNames[s] != 'Evaluation Warning':
-                         text = data.values.tolist()
-                         if len(text) > 0:
-                             outputRowDF = self.helperRowTableToCorpus(text)
-                             #outputColDF = self.helperColTableToCorpus(text)
-                             outputDF.extend(outputRowDF)
-                             #outputDF.extend(outputColDF)
-                     if len(outputDF) > 0:
-                         corpus["corpus" + str(s)] = outputDF
-         return corpus
-     def helperRowTableToCorpus(self, textList):
-         cl = cleanText.cleanGenText()
-         stopWords = ["NaN", "Unnamed:", "nan"]
-         outputDF = []
-         for line in textList:
-             outputLine = []
-             for words in line:
-                 words = str(words)
-                 if len(words) > 0:
-                     for word in words.split(" "):
-                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan") and bare row indices, but keep tokens like "KM1"
-                         if str(word) not in stopWords:
-                             #word = cl.splitStickWords(word)
-                             word = cl.removePunct(word)
-                             word = " ".join(cl.removeStopWords(word))
-                             word = cl.removeTabWhiteSpaceNewLine(word)
-                             if len(word) > 1:
-                                 if len(word.split(" ")) > 1:
-                                     for x in word.split(" "):
-                                         if len(x) > 1 and x.isnumeric() == False:
-                                             outputLine.append(x.lower())
-                                 else:
-                                     if word.isnumeric() == False:
-                                         outputLine.append(word.lower())
-             if len(outputLine) > 0:
-                 outputDF.append(outputLine)
-         return outputDF
-     def helperColTableToCorpus(self, dfList):
-         cl = cleanText.cleanGenText()
-         stopWords = ["NaN", "Unnamed:", "nan"]
-         outputDF = []
-         # use the first row's length as the column reference
-         for pos in range(len(dfList[0])):
-             outputLine = []
-             for line in dfList:
-                 if pos < len(line):
-                     words = line[pos]
-                     words = str(words)
-                 else: words = ""
-                 if len(words) > 0:
-                     for word in words.split(" "):
-                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan") and bare row indices, but keep tokens like "KM1"
-                         if str(word) not in stopWords:
-                             #word = cl.splitStickWords(word)
-                             word = cl.removePunct(word)
-                             word = " ".join(cl.removeStopWords(word))
-                             word = cl.removeTabWhiteSpaceNewLine(word)
-                             if len(word) > 1:
-                                 if len(word.split(" ")) > 1:
-                                     for x in word.split(" "):
-                                         if len(x) > 1 and x.isnumeric() == False:
-                                             outputLine.append(x.lower())
-                                 else:
-                                     if word.isnumeric() == False:
-                                         outputLine.append(word.lower())
-             if len(outputLine) > 0:
-                 outputDF.append(outputLine)
-         return outputDF
-     # create a corpus
-     def createCorpusText(self, corpusText):
-         '''ex: "Tom is cat. Jerry is mouse."
-         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
-         # the output should look like this:
-         '''texts = {
-         "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
-         "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
-         }
-         '''
-         # separate paragraphs
-         '''Ex: Cat is an animal. Tom is cat.
-
-         Mouse is an animal.
-         Jerry is mouse.'''
-         texts = {}
-         cl = cleanText.cleanGenText()
-         for pos in range(len(corpusText.split("\n\n"))):
-             if len(corpusText.split("\n\n")[pos]) > 0:
-                 texts["Paragraph " + str(pos)] = []
-                 lines = corpusText.split("\n\n")[pos]
-                 for line in lines.split("\n"):
-                     for l in line.split("."):
-                         if len(l) > 0:
-                             l = cl.removeTabWhiteSpaceNewLine(l)
-                             l = l.lower()
-                             newL = []
-                             for word in l.split(" "):
-                                 if len(word) > 0:
-                                     word = cl.removeStopWords(word)
-                                     for w in word:
-                                         if len(w) > 0 and w.isnumeric() == False:
-                                             newL.append(w)
-                             if len(newL) > 0:
-                                 texts["Paragraph " + str(pos)].append(newL)
-                 if len(texts["Paragraph " + str(pos)]) == 0:
-                     del texts["Paragraph " + str(pos)]
-         return texts
-     def selectParaForWC(self, corpus):
-         '''corpus should be in the format:
-         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
-         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
-         corSize = len(corpus)
-         # fewer than 2000
-         if 0 < corSize < 2000:
-             window = 3.5
-             vector_size = 75
-             sample = 1e-3
-             negative = 10
-             epochs = 10
-             sg = 1
-         # 2000 - 100000
-         elif 2000 <= corSize < 100000:
-             window = 3.5
-             vector_size = 75
-             sample = 1e-5
-             negative = 10
-             epochs = 10
-             sg = 1
-         # 100000 - 1000000
-         elif 100000 <= corSize < 1000000:
-             window = 7.5
-             vector_size = 150
-             sample = 1e-5
-             negative = 10
-             epochs = 6
-             sg = 0
-         return window, vector_size, sample, negative, epochs, sg
-     def trainWord2Vec(self, nameFile, modelName, saveFolder, window=3.5,
-                       vector_size=75, sample=1e-3, negative=10, epochs=10, sg=1):
-         # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
-         jsonFile = openFile.openJsonFile(nameFile)  # a corpus JSON file from an article
-         cores = multiprocessing.cpu_count()
-         combinedCorpus = []
-         # the keyword arguments are reset here; selectParaForWC recomputes them from the corpus size
-         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
-         if len(jsonFile) > 0:
-             for key in jsonFile:
-                 combinedCorpus.extend(jsonFile[key])
-             window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
-             # min_count=1 ensures all words are included
-             '''w2vModel = Word2Vec(
-                 min_count=1,
-                 window=window,
-                 vector_size=vector_size,
-                 sample=sample,
-                 alpha=0.03,
-                 min_alpha=0.0007,
-                 negative=negative,
-                 workers=cores-1,
-                 epochs=epochs,
-                 sg=sg)'''
-             #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
-             accept = False
-             while not accept:
-                 if window != None and vector_size != None and sample != None and negative != None and epochs != None and sg != None:
-                     try:
-                         w2vModel = Word2Vec(
-                             min_count=1,
-                             window=window,
-                             vector_size=vector_size,
-                             sample=sample,
-                             alpha=0.03,
-                             min_alpha=0.0007,
-                             negative=negative,
-                             workers=cores-1,
-                             epochs=epochs,
-                             sg=sg)
-                         w2vModel.build_vocab(combinedCorpus)
-                         w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
-                         accept = True
-                     except Exception:
-                         # training failed: grow the corpus by repeating it and pick parameters again
-                         for key in jsonFile:
-                             combinedCorpus.extend(jsonFile[key])
-                         window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
-                         print("next is " + str(len(combinedCorpus)))
-                 else:
-                     print("no parameter to train")
-                     break
-             #w2vModel.build_vocab(combinedCorpus)
-             #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
-             #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
-             #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
-             w2vModel.save(saveFolder + "/" + modelName + ".model")
-             w2vModel.wv.save_word2vec_format(saveFolder + "/" + modelName + ".txt")
-             print("done w2v")
-         else: print("no corpus to train")
-         #return combinedCorpus
-     def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
-         # might not be a meaningful keyword
-         #stopWords = ["show"]
-         # the same word, just as a plural noun or in another tense
-         simWords = [word + "s", word + "es", word + "ing", word + "ed"]
-         model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in txt format
-         results = model.most_similar(positive=[word], topn=n)
-         #removeIndex = []
-         #currN = copy.deepcopy(n)
-         '''for r in range(len(results)):
-             if len(results[r][0]) < 2:
-                 removeIndex.append(results[r])
-             # remove the same word as just a plural/singular variant, or anything below cos_thres
-             elif results[r][0] == word:
-                 removeIndex.append(results[r])
-             elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
-                 removeIndex.append(results[r])
-         for rem in removeIndex:
-             results.remove(rem)
-         while len(results) != n and len(results) != 0:
-             moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
-             if moreNewResult not in results and len(moreNewResult[0]) > 1:
-                 if moreNewResult[0] not in stopWords and results[0] != word:
-                     results.append(moreNewResult)
-                     currN += 1'''
-         return results
-     # adding our model into spaCy
-     # this deals with the command line; instead of typing it manually, we run the command from this Python script
-     def loadWordVec(self, modelName, wordVec):
-         # modelName is the name you want to save into spaCy
-         # wordVec is the trained word2vec in txt format
-         subprocess.run([sys.executable,
-                         "-m",
-                         "spacy",
-                         "init-model",
-                         "en",
-                         modelName,  # this modelName comes from the modelName saved by trainWord2Vec
-                         "--vectors-loc",
-                         wordVec])
        print("done")
 
+ '''WORD TO VECTOR'''
+ import pandas as pd
+ import json
+ import gensim
+ import spacy
+ from DefaultPackages import openFile, saveFile
+ from NER import cleanText
+ from gensim.models.keyedvectors import KeyedVectors
+ from gensim.test.utils import common_texts
+ from gensim.models.word2vec import Word2Vec
+ from gensim.scripts.glove2word2vec import glove2word2vec
+ from gensim.test.utils import datapath, get_tmpfile
+ import sys
+ import subprocess
+ # can try multiprocessing to run quicker
+ import multiprocessing
+ import copy  # used by the commented-out filtering in genSimilar
+ sys.setrecursionlimit(1000)
+ # create folder word2Vec
+ #! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
+ # create word2vec model
+ #model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
+ '''Some notes for this model:
+ sometimes when we build the corpus there are adverbs which are unnecessary but might be seen as
+ similar to the word we are looking up, so we can try to preprocess the text so that
+ the corpus is more effective and only contains the important words; then, when we
+ train the model, the important words will be treated as important. Alternatively,
+ once we have the list of similar words, we can remove the words in it
+ that are stopwords/unnecessary words.'''
+ ### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
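The Doc2Vec idea above can be tried directly in gensim. A minimal sketch with a made-up toy corpus (not part of this module):

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    docs = [TaggedDocument(words=["tom", "is", "cat"], tags=[0]),
            TaggedDocument(words=["jerry", "is", "mouse"], tags=[1])]
    d2v = Doc2Vec(docs, vector_size=50, min_count=1, epochs=20)
    sentence_vec = d2v.infer_vector(["tom", "is", "cat"])  # one vector per sentence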
+ class word2Vec():
+     def __init__(self, nameFile=None, modelName=None):
+         self.nameFile = nameFile
+         self.modelName = modelName
+     def spacy_similarity(self, word):
+         # when using word2vec, the medium or large spaCy model works better
+         # maybe try doc similarity?
+         nlp = spacy.load("en_core_web_lg")
+         doc = nlp(word)
+         for token1 in doc:
+             for token2 in doc:
+                 print(token1.text, token2.text, token1.similarity(token2))
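spacy_similarity compares every token pair in the input string, so it is meant to be called with a space-separated phrase. A usage sketch, assuming en_core_web_lg has been downloaded:

    # assumes: python -m spacy download en_core_web_lg
    w2v = word2Vec()
    w2v.spacy_similarity("heart cardiac liver")  # prints pairwise token similarities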
+     # clean text before transforming it into a corpus
+     def cleanTextBeforeCorpus(self, oriText, doi=None):
+         cl = cleanText.cleanGenText()
+         output = ""
+         alreadyRemoveDoi = False
+         for word in oriText.split(" "):
+             # remove the DOI
+             if doi != None and doi in oriText:
+                 if alreadyRemoveDoi == False:
+                     newWord = cl.removeDOI(word, doi)
+                     if len(newWord) > 0 and newWord != word:
+                         alreadyRemoveDoi = True
+                         word = newWord
+             # split stuck-together words (optional)
+             #word = cl.splitStickWords(word)
+             # remove punctuation
+             word = cl.removePunct(word, True)
+             # remove URLs
+             word = cl.removeURL(word)
+             # remove HTML tags
+             word = cl.removeHTMLTag(word)
+             # remove tabs, whitespace runs, newlines
+             word = cl.removeTabWhiteSpaceNewLine(word)
+             # optional: remove stopwords
+             #word = cl.removeStopWords(word)
+             if len(word) > 0:
+                 output += word + " "
+         return output
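A usage sketch for the cleaner; the sentence and DOI are invented, and the exact output depends on the project-internal cleanGenText helpers:

    w2v = word2Vec()
    cleaned = w2v.cleanTextBeforeCorpus(
        "Results are at https://example.org (doi:10.1000/xyz123).",
        doi="10.1000/xyz123")
    print(cleaned)  # roughly: URL, DOI and punctuation stripped, words joined by spaces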
+     def cleanAllTextBeforeCorpus(self, allText, doi=None):
+         cleanOutput = ""
+         remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
+         if len(allText) > 0:
+             corpusText = allText
+             for pos in range(len(corpusText.split("\n\n"))):
+                 if len(corpusText.split("\n\n")[pos]) > 0:
+                     lines = corpusText.split("\n\n")[pos]
+                     for line in lines.split("\n"):
+                         if remove in line: line = line.replace(remove, "")
+                         clean_text = self.cleanTextBeforeCorpus(line, doi)
+                         cleanOutput += clean_text + "\n"
+                     cleanOutput += "\n\n"
+         return cleanOutput
+     def tableTransformToCorpusText(self, df, excelFile=None):
+         # PDF, Excel, WordDoc
+         corpus = {}
+         # PDF or df
+         if excelFile == None:
+             if len(df) > 0:
+                 try:
+                     for i in range(len(df)):
+                         # each new dimension/page is treated as a sentence ending with a period;
+                         # each new line is a new list, and each new df is a new corpus
+                         outputDF = []
+                         text = df[i].values.tolist()
+                         if len(text) > 0:
+                             outputRowDF = self.helperRowTableToCorpus(text)
+                             #outputColDF = self.helperColTableToCorpus(text)
+                             outputDF.extend(outputRowDF)
+                             #outputDF.extend(outputColDF)
+                         if len(outputDF) > 0:
+                             corpus["corpus" + str(i)] = outputDF
+                 except Exception:
+                     outputDF = []
+                     text = df.values.tolist()
+                     if len(text) > 0:
+                         outputRowDF = self.helperRowTableToCorpus(text)
+                         #outputColDF = self.helperColTableToCorpus(text)
+                         outputDF.extend(outputRowDF)
+                         #outputDF.extend(outputColDF)
+                     if len(outputDF) > 0:
+                         corpus["corpus0"] = outputDF
+         else:
+             try:
+                 df = pd.ExcelFile(excelFile)
+             except Exception:
+                 # fall back to an explicit engine: xlrd for legacy .xls, openpyxl for newer formats
+                 if excelFile.endswith('.xls'):
+                     df = pd.ExcelFile(excelFile, engine='xlrd')
+                 else:
+                     df = pd.ExcelFile(excelFile, engine='openpyxl')
+             sheetNames = df.sheet_names
+             output = []
+             if len(sheetNames) > 0:
+                 for s in range(len(sheetNames)):
+                     outputDF = []
+                     with pd.ExcelFile(excelFile) as xls:
+                         data = pd.read_excel(xls, sheetNames[s])
+                     if sheetNames[s] != 'Evaluation Warning':
+                         text = data.values.tolist()
+                         if len(text) > 0:
+                             outputRowDF = self.helperRowTableToCorpus(text)
+                             #outputColDF = self.helperColTableToCorpus(text)
+                             outputDF.extend(outputRowDF)
+                             #outputDF.extend(outputColDF)
+                     if len(outputDF) > 0:
+                         corpus["corpus" + str(s)] = outputDF
+         return corpus
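A sketch of the two input modes (the table data and file name are invented): either a list of DataFrames, e.g. tables extracted from a PDF, or an Excel workbook, which yields one corpus entry per sheet:

    tables = [pd.DataFrame({"Marker": ["KM1", "KM2"], "Value": [1, 2]})]
    corpus = word2Vec().tableTransformToCorpusText(tables)
    # or, for a workbook:
    # corpus = word2Vec().tableTransformToCorpusText(None, excelFile="tables.xlsx")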
+     def helperRowTableToCorpus(self, textList):
+         cl = cleanText.cleanGenText()
+         stopWords = ["NaN", "Unnamed:", "nan"]
+         outputDF = []
+         for line in textList:
+             outputLine = []
+             for words in line:
+                 words = str(words)
+                 if len(words) > 0:
+                     for word in words.split(" "):
+                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan") and bare row indices, but keep tokens like "KM1"
+                         if str(word) not in stopWords:
+                             #word = cl.splitStickWords(word)
+                             word = cl.removePunct(word)
+                             word = " ".join(cl.removeStopWords(word))
+                             word = cl.removeTabWhiteSpaceNewLine(word)
+                             if len(word) > 1:
+                                 if len(word.split(" ")) > 1:
+                                     for x in word.split(" "):
+                                         if len(x) > 1 and x.isnumeric() == False:
+                                             outputLine.append(x.lower())
+                                 else:
+                                     if word.isnumeric() == False:
+                                         outputLine.append(word.lower())
+             if len(outputLine) > 0:
+                 outputDF.append(outputLine)
+         return outputDF
+     def helperColTableToCorpus(self, dfList):
+         cl = cleanText.cleanGenText()
+         stopWords = ["NaN", "Unnamed:", "nan"]
+         outputDF = []
+         # use the first row's length as the column reference
+         for pos in range(len(dfList[0])):
+             outputLine = []
+             for line in dfList:
+                 if pos < len(line):
+                     words = line[pos]
+                     words = str(words)
+                 else: words = ""
+                 if len(words) > 0:
+                     for word in words.split(" "):
+                         # remove table-specific stopwords ("NaN", "Unnamed: 0", "nan") and bare row indices, but keep tokens like "KM1"
+                         if str(word) not in stopWords:
+                             #word = cl.splitStickWords(word)
+                             word = cl.removePunct(word)
+                             word = " ".join(cl.removeStopWords(word))
+                             word = cl.removeTabWhiteSpaceNewLine(word)
+                             if len(word) > 1:
+                                 if len(word.split(" ")) > 1:
+                                     for x in word.split(" "):
+                                         if len(x) > 1 and x.isnumeric() == False:
+                                             outputLine.append(x.lower())
+                                 else:
+                                     if word.isnumeric() == False:
+                                         outputLine.append(word.lower())
+             if len(outputLine) > 0:
+                 outputDF.append(outputLine)
+         return outputDF
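The two helpers differ only in traversal order: helperRowTableToCorpus emits one token list per table row, helperColTableToCorpus one per column. A sketch on an invented 2x2 cell list (the exact tokens depend on cleanGenText):

    cells = [["gene alpha", "KM1"], ["gene beta", "KM2"]]
    w2v = word2Vec()
    rows = w2v.helperRowTableToCorpus(cells)  # roughly [["gene", "alpha", "km1"], ["gene", "beta", "km2"]]
    cols = w2v.helperColTableToCorpus(cells)  # roughly [["gene", "alpha", "gene", "beta"], ["km1", "km2"]]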
+     # create a corpus
+     def createCorpusText(self, corpusText):
+         '''ex: "Tom is cat. Jerry is mouse."
+         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+         # the output should look like this:
+         '''texts = {
+         "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
+         "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
+         }
+         '''
+         # separate paragraphs
+         '''Ex: Cat is an animal. Tom is cat.
+
+         Mouse is an animal.
+         Jerry is mouse.'''
+         texts = {}
+         cl = cleanText.cleanGenText()
+         for pos in range(len(corpusText.split("\n\n"))):
+             if len(corpusText.split("\n\n")[pos]) > 0:
+                 texts["Paragraph " + str(pos)] = []
+                 lines = corpusText.split("\n\n")[pos]
+                 for line in lines.split("\n"):
+                     for l in line.split("."):
+                         if len(l) > 0:
+                             l = cl.removeTabWhiteSpaceNewLine(l)
+                             l = l.lower()
+                             newL = []
+                             for word in l.split(" "):
+                                 if len(word) > 0:
+                                     word = cl.removeStopWords(word)
+                                     for w in word:
+                                         if len(w) > 0 and w.isnumeric() == False:
+                                             newL.append(w)
+                             if len(newL) > 0:
+                                 texts["Paragraph " + str(pos)].append(newL)
+                 if len(texts["Paragraph " + str(pos)]) == 0:
+                     del texts["Paragraph " + str(pos)]
+         return texts
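A usage sketch; the two-paragraph input is invented:

    text = "Cat is an animal. Tom is cat.\n\nMouse is an animal.\nJerry is mouse."
    texts = word2Vec().createCorpusText(text)
    # roughly {"Paragraph 0": [[...], [...]], "Paragraph 1": [[...], [...]]}, stopwords removed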
+     def selectParaForWC(self, corpus):
+         '''corpus should be in the format:
+         corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
+         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
+         corSize = len(corpus)
+         # fewer than 2000
+         if 0 < corSize < 2000:
+             window = 3.5
+             vector_size = 75
+             sample = 1e-3
+             negative = 10
+             epochs = 10
+             sg = 1
+         # 2000 - 100000
+         elif 2000 <= corSize < 100000:
+             window = 3.5
+             vector_size = 75
+             sample = 1e-5
+             negative = 10
+             epochs = 10
+             sg = 1
+         # 100000 - 1000000
+         elif 100000 <= corSize < 1000000:
+             window = 7.5
+             vector_size = 150
+             sample = 1e-5
+             negative = 10
+             epochs = 6
+             sg = 0
+         return window, vector_size, sample, negative, epochs, sg
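The thresholds choose skip-gram (sg=1) for small and medium corpora and CBOW (sg=0) for large ones; corpus sizes outside the covered ranges return all Nones, which trainWord2Vec reports as "no parameter to train". Note that gensim documents Word2Vec's window as an int, so the 3.5 here relies on coercion. A quick check:

    params = word2Vec().selectParaForWC([["tom", "is", "cat"]] * 500)
    print(params)  # (3.5, 75, 0.001, 10, 10, 1)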
+     def trainWord2Vec(self, nameFile, modelName, saveFolder, window=3.5,
+                       vector_size=75, sample=1e-3, negative=10, epochs=10, sg=1):
+         # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
+         jsonFile = openFile.openJsonFile(nameFile)  # a corpus JSON file from an article
+         cores = multiprocessing.cpu_count()
+         combinedCorpus = []
+         # the keyword arguments are reset here; selectParaForWC recomputes them from the corpus size
+         window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
+         if len(jsonFile) > 0:
+             for key in jsonFile:
+                 combinedCorpus.extend(jsonFile[key])
+             window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+             # min_count=1 ensures all words are included
+             '''w2vModel = Word2Vec(
+                 min_count=1,
+                 window=window,
+                 vector_size=vector_size,
+                 sample=sample,
+                 alpha=0.03,
+                 min_alpha=0.0007,
+                 negative=negative,
+                 workers=cores-1,
+                 epochs=epochs,
+                 sg=sg)'''
+             #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
+             accept = False
+             while not accept:
+                 if window != None and vector_size != None and sample != None and negative != None and epochs != None and sg != None:
+                     try:
+                         w2vModel = Word2Vec(
+                             min_count=1,
+                             window=window,
+                             vector_size=vector_size,
+                             sample=sample,
+                             alpha=0.03,
+                             min_alpha=0.0007,
+                             negative=negative,
+                             workers=cores-1,
+                             epochs=epochs,
+                             sg=sg)
+                         w2vModel.build_vocab(combinedCorpus)
+                         w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+                         accept = True
+                     except Exception:
+                         # training failed: grow the corpus by repeating it and pick parameters again
+                         for key in jsonFile:
+                             combinedCorpus.extend(jsonFile[key])
+                         window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
+                         print("next is " + str(len(combinedCorpus)))
+                 else:
+                     print("no parameter to train")
+                     break
+             #w2vModel.build_vocab(combinedCorpus)
+             #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
+             #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
+             #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
+             w2vModel.save(saveFolder + "/" + modelName + ".model")
+             w2vModel.wv.save_word2vec_format(saveFolder + "/" + modelName + ".txt")
+             print("done w2v")
+         else: print("no corpus to train")
+         #return combinedCorpus
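A sketch of a training call; the file names are invented. nameFile is expected to be a JSON file mapping keys to lists of tokenized sentences, i.e. the shape produced by createCorpusText or tableTransformToCorpusText:

    w2v = word2Vec()
    w2v.trainWord2Vec(nameFile="corpus.json", modelName="myVectors", saveFolder="models")
    # writes models/myVectors.model (gensim format) and models/myVectors.txt (word2vec text format)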
+     def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
+         # might not be a meaningful keyword
+         #stopWords = ["show"]
+         # the same word, just as a plural noun or in another tense
+         simWords = [word + "s", word + "es", word + "ing", word + "ed"]
+         model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in txt format
+         results = model.most_similar(positive=[word], topn=n)
+         #removeIndex = []
+         #currN = copy.deepcopy(n)
+         '''for r in range(len(results)):
+             if len(results[r][0]) < 2:
+                 removeIndex.append(results[r])
+             # remove the same word as just a plural/singular variant, or anything below cos_thres
+             elif results[r][0] == word:
+                 removeIndex.append(results[r])
+             elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
+                 removeIndex.append(results[r])
+         for rem in removeIndex:
+             results.remove(rem)
+         while len(results) != n and len(results) != 0:
+             moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
+             if moreNewResult not in results and len(moreNewResult[0]) > 1:
+                 if moreNewResult[0] not in stopWords and results[0] != word:
+                     results.append(moreNewResult)
+                     currN += 1'''
+         return results
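A usage sketch; the model path is invented, and the .txt file is the one written by trainWord2Vec:

    results = word2Vec().genSimilar("gene", "models/myVectors.txt", n=5)
    for token, score in results:
        print(token, round(score, 3))  # raw most_similar output; the filtering above is commented out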
+     # adding our model into spaCy
+     # this deals with the command line; instead of typing it manually, we run the command from this Python script
+     def loadWordVec(self, modelName, wordVec):
+         # modelName is the name you want to save into spaCy
+         # wordVec is the trained word2vec in txt format
+         subprocess.run([sys.executable,
+                         "-m",
+                         "spacy",
+                         "init-model",
+                         "en",
+                         modelName,  # this modelName comes from the modelName saved by trainWord2Vec
+                         "--vectors-loc",
+                         wordVec])
        print("done")
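init-model is the spaCy v2 CLI; spaCy v3 renamed it to init vectors. A sketch of the v3 equivalent, assuming a v3 environment (paths invented):

    import subprocess, sys
    subprocess.run([sys.executable, "-m", "spacy", "init", "vectors",
                    "en", "models/myVectors.txt", "spacy_model_dir"])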