Update NER/word2Vec/word2vec.py
NER/word2Vec/word2vec.py +369 -363
CHANGED
'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import sys
import subprocess
# can try multiprocessing to run quicker
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# create word2vec model
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model:
the corpus sometimes contains adverbs and other filler words that are unnecessary
but can still rank as similar to the word we are searching for. We can preprocess
the text so that the corpus only keeps the important words; then, when we train
the model, those are the words treated as important. Alternatively, once we have
the list of similar words, we can remove the stopwords/unnecessary words from it.'''
### For more complex analysis, consider sentence embedding models like Doc2Vec to represent the meaning of entire sentences instead of individual words
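# A minimal sketch (not part of the original pipeline) of the stopword filtering
# described in the notes above: drop spaCy stopwords from a most_similar() result
# list. The helper name and its usage are illustrative assumptions, not this class's API.
def _filterSimilarWords(results, nlp):
    # results: list of (word, cosine_score) tuples from KeyedVectors.most_similar()
    # nlp: a loaded spaCy pipeline, e.g. spacy.load("en_core_web_lg")
    return [(w, s) for w, s in results if w.lower() not in nlp.Defaults.stop_words]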
class word2Vec():
    def __init__(self, nameFile=None, modelName=None):
        self.nameFile = nameFile
        self.modelName = modelName
    def spacy_similarity(self, word):
        # when using word2vec, the medium or large spaCy model works better
        # maybe try doc similarity?
        nlp = spacy.load("en_core_web_lg")
        doc = nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))
        pass
    # clean text before transforming it into a corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        output = ""
        alreadyRemoveDoi = False
        for word in oriText.split(" "):
            # remove DOI
            if doi is not None and doi in oriText:
                if alreadyRemoveDoi == False:
                    newWord = cl.removeDOI(word, doi)
                    if len(newWord) > 0 and newWord != word:
                        alreadyRemoveDoi = True
                        word = newWord
            # split the stuck-together words
            #word = cl.splitStickWords(word)
            # remove punctuation
            word = cl.removePunct(word, True)
            # remove URL
            word = cl.removeURL(word)
            # remove HTML tag
            word = cl.removeHTMLTag(word)
            # remove tab, white space, newline
            word = cl.removeTabWhiteSpaceNewLine(word)
            # optional: remove stopwords
            #word = cl.removeStopWords(word)
            if len(word) > 0:
                output += word + " "
        return output
    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        cleanOutput = ""
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        if len(allText) > 0:
            corpusText = allText
            for pos in range(len(corpusText.split("\n\n"))):
                if len(corpusText.split("\n\n")[pos]) > 0:
                    lines = corpusText.split("\n\n")[pos]
                    for line in lines.split("\n"):
                        if remove in line: line = line.replace(remove, "")
                        clean_text = self.cleanTextBeforeCorpus(line, doi)
                        cleanOutput += clean_text + "\n"
                    cleanOutput += "\n\n"
        return cleanOutput
    def tableTransformToCorpusText(self, df, excelFile=None):
        # PDF, Excel, WordDoc
        #cl = cleanText.cleanGenText()
        corpus = {}
        # PDF or df
        if excelFile is None:
            if len(df) > 0:
                try:
                    for i in range(len(df)):
                        # each new dimension/page is considered a sentence which ends with a period;
                        # each new line is a new list, and each new df is a new corpus
                        outputDF = []
                        text = df[i].values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                        if len(outputDF) > 0:
                            corpus["corpus" + str(i)] = outputDF
                except Exception:
                    outputDF = []
                    text = df.values.tolist()
                    if len(text) > 0:
                        outputRowDF = self.helperRowTableToCorpus(text)
                        #outputColDF = self.helperColTableToCorpus(text)
                        outputDF.extend(outputRowDF)
                        #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus0"] = outputDF
        # Excel workbook
        else:
            try:
                df = pd.ExcelFile(excelFile)
            except Exception:
                # pick the engine explicitly from the file extension
                if excelFile.endswith('.xls'):
                    df = pd.ExcelFile(excelFile, engine='xlrd')
                else:
                    df = pd.ExcelFile(excelFile, engine='openpyxl')
            sheetNames = df.sheet_names
            output = []
            if len(sheetNames) > 0:
                for s in range(len(sheetNames)):
                    outputDF = []
                    with pd.ExcelFile(excelFile) as xls:
                        data = pd.read_excel(xls, sheetNames[s])
                        if sheetNames[s] != 'Evaluation Warning':
                            text = data.values.tolist()
                            if len(text) > 0:
                                outputRowDF = self.helperRowTableToCorpus(text)
                                #outputColDF = self.helperColTableToCorpus(text)
                                outputDF.extend(outputRowDF)
                                #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus" + str(s)] = outputDF
        return corpus
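    # Usage sketch (illustrative names, not part of the pipeline): a list of
    # DataFrames goes through the df branch, a spreadsheet path through the Excel branch.
    #   w2v = word2Vec()
    #   corpusFromTables = w2v.tableTransformToCorpusText([someDataFrame])
    #   corpusFromExcel = w2v.tableTransformToCorpusText(None, excelFile="tables.xlsx")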
144 |
+
#cl = cleanGenText()
|
145 |
+
cl = cleanText.cleanGenText()
|
146 |
+
stopWords = ["NaN","Unnamed:","nan"]
|
147 |
+
outputDF = []
|
148 |
+
for line in textList:
|
149 |
+
outputLine = []
|
150 |
+
for words in line:
|
151 |
+
words = str(words)
|
152 |
+
if len(words) > 0:
|
153 |
+
for word in words.split(" "):
|
154 |
+
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
|
155 |
+
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
|
156 |
+
#word = cl.splitStickWords(word)
|
157 |
+
word = cl.removePunct(word)
|
158 |
+
word = " ".join(cl.removeStopWords(word))
|
159 |
+
word = cl.removeTabWhiteSpaceNewLine(word)
|
160 |
+
if len(word) > 1:
|
161 |
+
if len(word.split(" ")) > 1:
|
162 |
+
for x in word.split(" "):
|
163 |
+
if len(x) > 1 and x.isnumeric()==False:
|
164 |
+
outputLine.append(x.lower())
|
165 |
+
else:
|
166 |
+
if word.isnumeric() == False:
|
167 |
+
outputLine.append(word.lower())
|
168 |
+
if len(outputLine) > 0:
|
169 |
+
outputDF.append(outputLine)
|
170 |
+
return outputDF
|
    def helperColTableToCorpus(self, dfList):
        #cl = cleanGenText()
        cl = cleanText.cleanGenText()
        stopWords = ["NaN","Unnamed:","nan"]
        outputDF = []
        # use the first line's length as the column reference
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    words = line[pos]
                    words = str(words)
                else: words = ""
                if len(words) > 0:
                    for word in words.split(" "):
                        # remove table-specific stopwords and row indices, same as helperRowTableToCorpus
                        if str(word) not in stopWords: # remove "NaN", "Unnamed:", "nan"
                            #word = cl.splitStickWords(word)
                            word = cl.removePunct(word)
                            word = " ".join(cl.removeStopWords(word))
                            word = cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and x.isnumeric() == False:
                                            outputLine.append(x.lower())
                                else:
                                    if word.isnumeric() == False:
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    # create a corpus
    def createCorpusText(self, corpusText):
        '''ex: "Tom is cat. Jerry is mouse."
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        # the output should look like this:
        '''texts = {
        "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
        "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
        }
        '''
        # separate paragraphs
        '''Ex: Cat is an animal. Tom is cat.

        Mouse is an animal.
        Jerry is mouse.'''
        texts = {}
        cl = cleanText.cleanGenText()
        #cl = cleanGenText()
        for pos in range(len(corpusText.split("\n\n"))):
            if len(corpusText.split("\n\n")[pos]) > 0:
                texts["Paragraph "+str(pos)] = []
                lines = corpusText.split("\n\n")[pos]
                for line in lines.split("\n"):
                    for l in line.split("."):
                        if len(l) > 0:
                            l = cl.removeTabWhiteSpaceNewLine(l)
                            l = l.lower()
                            newL = []
                            for word in l.split(" "):
                                if len(word) > 0:
                                    word = cl.removeStopWords(word)
                                    for w in word:
                                        if len(w) > 0 and w.isnumeric() == False:
                                            newL.append(w)
                            if len(newL) > 0:
                                texts["Paragraph "+str(pos)].append(newL)
                if len(texts["Paragraph "+str(pos)]) == 0:
                    del texts["Paragraph "+str(pos)]
        return texts
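    # For example (illustrative): given corpusText = "Cat is an animal. Tom is cat.\n\nJerry is mouse.",
    # createCorpusText returns something like
    #   {"Paragraph 0": [["cat", "animal"], ["tom", "cat"]],
    #    "Paragraph 1": [["jerry", "mouse"]]}
    # with the exact tokens depending on cleanGenText's stopword list.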
    def selectParaForWC(self, corpus):
        '''corpus should be in the format:
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        corSize, window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None, None
        corSize = len(corpus)
        # fewer than 2000 sentences
        if 0 < corSize < 2000:
            window=3.5 # note: gensim takes an integer window, so 3.5 ends up truncated to 3
            vector_size=75
            sample=1e-3
            negative=10
            epochs=10
            sg=1
        # 2000 - 100000
        elif 2000 <= corSize < 100000:
            window=3.5
            vector_size=75
            sample=1e-5
            negative=10
            epochs=10
            sg=1
        elif 100000 <= corSize < 1000000:
            window=7.5
            vector_size=150
            sample=1e-5
            negative=10
            epochs=6
            sg=0
        # corpora of 1,000,000+ sentences (or empty ones) return all Nones; the caller must handle that
        return window, vector_size, sample, negative, epochs, sg
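    # Parameter-picking sketch (illustrative numbers): a corpus of 1500 tokenized
    # sentences falls in the first bucket, so
    #   window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(corpus)
    #   # -> 3.5, 75, 1e-3, 10, 10, 1 (skip-gram)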
    def trainWord2Vec(self, nameFile, modelName, saveFolder, window=3.5,
                      vector_size=75, sample=1e-3, negative=10, epochs=10, sg=1):
        # if you don't have a backup file, you can reuse nameFile just to increase the length of the corpus
        jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
        cores = multiprocessing.cpu_count()
        combinedCorpus = []
        window, vector_size, sample, negative, epochs, sg = None, None, None, None, None, None
        if len(jsonFile) > 0:
            for key in jsonFile:
                combinedCorpus.extend(jsonFile[key])
            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
            # min_count=1 ensures all words are included
            '''w2vModel = Word2Vec(
                        min_count=1,
                        window=window,
                        vector_size=vector_size,
                        sample=sample,
                        alpha=0.03,
                        min_alpha=0.0007,
                        negative=negative,
                        workers=cores-1,
                        epochs=epochs,
                        sg=sg)'''
            #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
            accept = False
            while not accept:
                if window != None and vector_size != None and sample != None and negative != None and epochs != None and sg != None:
                    try:
                        w2vModel = Word2Vec(
                            min_count=1,
                            window=window,
                            vector_size=vector_size,
                            sample=sample,
                            alpha=0.03,
                            min_alpha=0.0007,
                            negative=negative,
                            workers=cores-1,
                            epochs=epochs,
                            sg=sg)
                        w2vModel.build_vocab(combinedCorpus)
                        w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
                        accept = True
                    except Exception:
                        # training failed (e.g. corpus too small): duplicate the corpus and pick parameters again
                        for key in jsonFile:
                            combinedCorpus.extend(jsonFile[key])
                        window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
                        print("next is " + str(len(combinedCorpus)))
                else:
                    print("no parameter to train")
                    break
            #w2vModel.build_vocab(combinedCorpus)
            #w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
            #w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
            #w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
            if accept: # only save when a model was actually trained
                w2vModel.save(saveFolder+"/"+modelName+".model")
                w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
                print("done w2v")
        else: print("no corpus to train")
        #return combinedCorpus
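    # Usage sketch (paths are illustrative, assuming corpus.json was produced by
    # createCorpusText / tableTransformToCorpusText and saved as json):
    #   w2v = word2Vec()
    #   w2v.trainWord2Vec("corpus.json", "myModel", "models")
    #   # -> models/myModel.model (gensim) and models/myModel.txt (word2vec text format)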
    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        # might not be a meaningful keyword
        #stopWords = ["show"]
        # same word, just a plural noun or a different tense
        simWords = [word+"s", word+"es", word+"ing", word+"ed"]
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False) # model file in txt format
        results = model.most_similar(positive=[word], topn=n)
        #removeIndex = []
        #currN = copy.deepcopy(n)
        '''for r in range(len(results)):
            if len(results[r][0]) < 2:
                removeIndex.append(results[r])
            # remove the same word when it is just the plural/singular form, or below cos_thres
            elif results[r][0] == word:
                removeIndex.append(results[r])
            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
                removeIndex.append(results[r])
        for rem in removeIndex:
            results.remove(rem)
        while len(results) != n and len(results) != 0:
            moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
            if moreNewResult not in results and len(moreNewResult[0]) > 1:
                if moreNewResult[0] not in stopWords and results[0] != word:
                    results.append(moreNewResult)
            currN += 1'''
        return results
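    # e.g. (illustrative): w2v.genSimilar("magnetite", "models/myModel.txt") returns
    # up to 10 (word, cosine similarity) pairs from the saved text-format vectors.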
    # adding our model into spacy
    # this deals with the command line; instead of typing it out, we write a python script to run the command
    def loadWordVec(self, modelName, wordVec):
        # modelName is the name you want to save into spacy
        # wordVec is the trained word2vec in txt format
        # note: "init-model" is the spaCy v2 command; spaCy v3 renamed it to "init vectors"
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName, # this modelName comes from the saved modelName of trainWord2Vec
                        "--vectors-loc",
                        wordVec])
print("done")
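# End-to-end sketch (illustrative file names; saveJsonFile is a hypothetical
# helper standing in for whatever DefaultPackages.saveFile provides):
#   w2v = word2Vec()
#   clean = w2v.cleanAllTextBeforeCorpus(rawArticleText, doi="10.1000/xyz123")
#   corpus = w2v.createCorpusText(clean)          # {"Paragraph 0": [[...], ...], ...}
#   saveFile.saveJsonFile(corpus, "corpus.json")  # hypothetical helper name
#   w2v.trainWord2Vec("corpus.json", "myModel", "models")
#   w2v.loadWordVec("spacyModelDir", "models/myModel.txt")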