'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import sys
import subprocess
import os
# can try multiprocessing to run quicker
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# create word2vec model
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model:
When we build the corpus, some adverbs and other unnecessary words may show up as
"similar" to the word we are querying. We can preprocess the text so that the corpus
only keeps the important words; then, when we train the model, those important words
carry the weight. Alternatively, once we already have the list of similar words, we can
remove the stopwords/unnecessary words from that list.'''
### For more complex analysis, consider using sentence embedding models like "Doc2Vec" to represent the meaning of entire sentences instead of just individual words
class word2Vec():
def __init__(self, nameFile=None, modelName=None):
self.nameFile = nameFile
self.modelName = modelName
#self.nlp = spacy.load("en_core_web_lg")
self.cl = cleanText.cleanGenText()
def spacy_similarity(self, word):
# when using spaCy similarity, the medium or large models work better than the small one
# maybe try doc-level similarity instead?
# load the spaCy model lazily here, since the load in __init__ is commented out
if getattr(self, "nlp", None) is None:
self.nlp = spacy.load("en_core_web_lg")
doc = self.nlp(word)
for token1 in doc:
for token2 in doc:
print(token1.text, token2.text, token1.similarity(token2))
# clean text before transform to corpus
def cleanTextBeforeCorpus(self,oriText, doi=None):
#cl = cleanText.cleanGenText()
#cl = cleanGenText()
output = ""
alreadyRemoveDoi = False
for word in oriText.split(" "):
# remove DOI
if doi != None and doi in oriText:
if alreadyRemoveDoi == False:
newWord = self.cl.removeDOI(word,doi)
if len(newWord) > 0 and newWord != word:
alreadyRemoveDoi = True
word = newWord
# remove punctuation
# split the sticked words
#word = cl.splitStickWords(word)
# remove punctuation
word = self.cl.removePunct(word,True)
# remove URL
word = self.cl.removeURL(word)
# remove HTMLTag
word = self.cl.removeHTMLTag(word)
# remove tab, white space, newline
word = self.cl.removeTabWhiteSpaceNewLine(word)
# optional: remove stopwords
#word = cl.removeStopWords(word)
if len(word)>0:
output += word + " "
return output
def cleanAllTextBeforeCorpus(self, allText, doi=None):
cleanOutput = ""
remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
if len(allText) > 0:
corpusText = allText.split("\n\n")
for pos in range(len(corpusText)):
lines = corpusText[pos]
if len(lines) > 0:
for line in lines.split("\n"):
if remove in line: line = line.replace(remove, "")
clean_text = self.cleanTextBeforeCorpus(line, doi)
cleanOutput += clean_text + "\n"
cleanOutput += "\n\n"
return cleanOutput
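# Usage sketch (illustrative only; the text, URL, and DOI below are hypothetical):
#   wv = word2Vec()
#   raw = "Results at https://example.org/x <b>bold</b>\n\nSecond paragraph (doi:10.1234/abcd)."
#   cleaned = wv.cleanAllTextBeforeCorpus(raw, doi="10.1234/abcd")
# Paragraph breaks ("\n\n") are preserved so createCorpusText can still split paragraphs later.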
def tableTransformToCorpusText(self, df, excelFile=None):
# PDF, Excel, WordDoc
#cl = cleanText.cleanGenText()
corpus = {}
# PDF or df
if excelFile == None:
if len(df) > 0:
try:
for i in range(len(df)):
# each new dimension/page is considered to be a sentence which ends with the period.
# each new line is a new list, and each new df is a new corpus
outputDF = []
text = df[i].values.tolist()
if len(text) > 0:
outputRowDF = self.helperRowTableToCorpus(text)
#outputColDF = self.helperColTableToCorpus(text)
outputDF.extend(outputRowDF)
#outputDF.extend(outputColDF)
if len(outputDF) > 0:
corpus["corpus" + str(i)] = outputDF
except Exception:
outputDF = []
text = df.values.tolist()
if len(text) > 0:
outputRowDF = self.helperRowTableToCorpus(text)
#outputColDF = self.helperColTableToCorpus(text)
outputDF.extend(outputRowDF)
#outputDF.extend(outputColDF)
if len(outputDF) > 0:
corpus["corpus0"] = outputDF
else:
try:
df = pd.ExcelFile(excelFile)
except Exception:
# fall back to an explicit engine so that sheet_names stays available below
engine = 'xlrd' if excelFile.endswith('.xls') else 'openpyxl'
df = pd.ExcelFile(excelFile, engine=engine)
sheetNames = df.sheet_names
if len(sheetNames) > 0:
for s in range(len(sheetNames)):
outputDF = []
# reuse the already-opened ExcelFile (and its engine) instead of reopening the file per sheet
data = pd.read_excel(df, sheet_name=sheetNames[s])
if sheetNames[s] != 'Evaluation Warning':
text = data.values.tolist()
if len(text) > 0:
outputRowDF = self.helperRowTableToCorpus(text)
#outputColDF = self.helperColTableToCorpus(text)
outputDF.extend(outputRowDF)
#outputDF.extend(outputColDF)
if len(outputDF) > 0:
corpus["corpus" + str(s)] = outputDF
return corpus
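# Usage sketch (illustrative; assumes wv = word2Vec() and a list of DataFrames, e.g. from pd.read_html):
#   tables = [pd.DataFrame({"Sample": ["KM1", "KM2"], "Location": ["Hanoi", "Hue"]})]
#   corpus = wv.tableTransformToCorpusText(tables)                   # PDF/DataFrame route
#   corpusXlsx = wv.tableTransformToCorpusText(None, "data.xlsx")    # Excel route, one corpus per sheet
# Each value in the returned dict is a list of token lists, ready to merge into the training corpus.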
def helperRowTableToCorpus(self, textList):
#cl = cleanGenText()
#cl = cleanText.cleanGenText()
stopWords = ["NaN","Unnamed:","nan"]
outputDF = []
for line in textList:
outputLine = []
for words in line:
words = str(words)
if len(words) > 0:
for word in words.split(" "):
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
#word = cl.splitStickWords(word)
word = self.cl.removePunct(word)
word = " ".join(self.cl.removeStopWords(word))
word = self.cl.removeTabWhiteSpaceNewLine(word)
if len(word) > 1:
if len(word.split(" ")) > 1:
for x in word.split(" "):
if len(x) > 1 and x.isnumeric()==False:
outputLine.append(x.lower())
else:
if word.isnumeric() == False:
outputLine.append(word.lower())
if len(outputLine) > 0:
outputDF.append(outputLine)
return outputDF
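# Expected shape (illustrative): for text = [["KM1", "Hanoi"], ["KM2", "Hue"]] the row helper
# returns roughly [["km1", "hanoi"], ["km2", "hue"]] -- one lowercased token list per table row,
# with pure numbers, "NaN"/"Unnamed:" markers, and stopwords removed by cleanGenText.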
def helperColTableToCorpus(self, dfList):
#cl = cleanGenText()
#cl = cleanText.cleanGenText()
stopWords = ["NaN","Unnamed:","nan"]
outputDF = []
# use the length of the first row as the column reference
for pos in range(len(dfList[0])):
outputLine = []
for line in dfList:
if pos < len(line):
words = line[pos]
words = str(words)
else: words = ""
if len(words) > 0:
for word in words.split(" "):
# remove specific stopwords for table: "NaN", "Unnamed: 0", row index: if the number appears first, it's just a row index; keep "KM1"
if str(word) not in stopWords: # remove "NaN", "Unnamed:","nan"
#word = cl.splitStickWords(word)
word = self.cl.removePunct(word)
word = " ".join(self.cl.removeStopWords(word))
word = self.cl.removeTabWhiteSpaceNewLine(word)
if len(word) > 1:
if len(word.split(" ")) > 1:
for x in word.split(" "):
if len(x) > 1 and x.isnumeric()==False:
outputLine.append(x.lower())
else:
if word.isnumeric() == False:
outputLine.append(word.lower())
if len(outputLine) > 0:
outputDF.append(outputLine)
return outputDF
# create a corpus
def createCorpusText(self, corpusText):
'''ex: "Tom is cat. Jerry is mouse."
corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
# the output should be like this:
'''texts = {
"Paragraph 1": [["Cat", "is", "an","animal], ["Tom", "is", "cat"]],
"Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
}
'''
# separate paragraph
'''Ex: Cat is an animal. Tom is cat.
Mouse is an animal.
Jerry is mouse.'''
texts = {}
#cl = cleanText.cleanGenText()
#cl = cleanGenText()
corpus = corpusText.split("\n\n")
for pos in range(len(corpus)):
if len(corpus[pos]) > 0:
texts["Paragraph "+str(pos)] = []
lines = corpus[pos]
for line in lines.split("\n"):
for l in line.split("."):
if len(l) > 0:
l = self.cl.removeTabWhiteSpaceNewLine(l)
l = l.lower()
newL = []
for word in l.split(" "):
if len(word) > 0:
word = self.cl.removeStopWords(word)
for w in word:
if len(w) > 0 and w.isnumeric()==False:
newL.append(w)
if len(newL)>0:
texts["Paragraph "+str(pos)].append(newL)
if len(texts["Paragraph "+str(pos)]) == 0:
del texts["Paragraph "+str(pos)]
return texts
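# Usage sketch (illustrative strings; the exact tokens depend on cleanGenText's stopword list):
#   texts = wv.createCorpusText("Cat is an animal. Tom is cat.\n\nMouse is an animal.")
#   # -> roughly {"Paragraph 0": [["cat", "animal"], ["tom", "cat"]], "Paragraph 1": [["mouse", "animal"]]}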
def selectParaForWC(self, corpus):
"""
corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
Heuristically determine Word2Vec parameters.
"""
corSize = len(corpus)
if corSize == 0:
return None, None, None, None, None, None
# Adjust parameters based on corpus size
if corSize < 2000:
# Small corpus — need high generalization
window = 3
vector_size = 100
sample = 1e-3
negative = 5
epochs = 20
sg = 1 # Skip-gram preferred for rare words
elif corSize < 10000:
window = 5
vector_size = 150
sample = 1e-4
negative = 10
epochs = 20
sg = 1
elif corSize < 100000:
window = 7
vector_size = 200
sample = 1e-5
negative = 15
epochs = 15
sg = 1
elif corSize < 500000:
window = 10
vector_size = 250
sample = 1e-5
negative = 15
epochs = 10
sg = 0 # CBOW is okay when data is large
else:
# Very large corpus
window = 12
vector_size = 300
sample = 1e-6
negative = 20
epochs = 5
sg = 0
return window, vector_size, sample, negative, epochs, sg
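# Quick check of the heuristic (illustrative): a 500-sentence corpus falls into the "< 2000" bucket:
#   window, vector_size, sample, negative, epochs, sg = wv.selectParaForWC([["tom", "cat"]] * 500)
#   # -> (3, 100, 1e-3, 5, 20, 1), i.e. skip-gram with a small window for a small corpus.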
def trainWord2Vec(self,nameFile,modelName,saveFolder,window=None,
vector_size=None,sample=None,negative=None,epochs=None,sg=None):
jsonFile = ""
jsonFile = openFile.openJsonFile(nameFile) # this is a corpus json file from an article
if not jsonFile:
print("No corpus to train")
return
cores = multiprocessing.cpu_count()
combinedCorpus = []
for key in jsonFile:
combinedCorpus.extend(jsonFile[key])
# detect phrase before choosing parameters
phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
bigram = Phraser(phrases)
combinedCorpus = [bigram[sent] for sent in combinedCorpus]
if window==None and vector_size==None and sample==None and negative==None and epochs==None and sg==None:
window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
# # min_count=1 ensures all words are included
#w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
accept = False
# retry limit: if training keeps failing (bad corpus or corrupted input), do not retry forever
retries = 0
w2vModel = None
while not accept and retries < 3:
if window!=None and vector_size!=None and sample!=None and negative!=None and epochs!=None and sg!=None:
try:
w2vModel = Word2Vec(
min_count=1,
window=window,
vector_size=vector_size,
sample=sample,
alpha=0.03,
min_alpha=0.0007,
negative=negative,
workers=max(1, cores-1),
epochs=epochs,
sg=sg)
w2vModel.build_vocab(combinedCorpus)
w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
accept = True
except Exception as e:
print(f"Retry #{retries+1} failed: {e}")
retries += 1
else:
print("no parameter to train")
break
if not accept or w2vModel is None:
print("Training did not succeed; nothing was saved.")
return
#w2vModel.build_vocab(combinedCorpus)
#w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=30)
#w2vModel.save("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".model")
#w2vModel.wv.save_word2vec_format("/content/drive/MyDrive/CollectData/NER/word2Vec/TestExamples/models/wordVector_"+modelName+".txt")
w2vModel.save(saveFolder+"/"+modelName+".model")
w2vModel.wv.save_word2vec_format(saveFolder+"/"+modelName+".txt")
print("done w2v")
#return combinedCorpus
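# Usage sketch (paths are hypothetical; openFile.openJsonFile must return a dict of token lists):
#   wv.trainWord2Vec("corpus_article1.json", "article1", "./models")
# This writes ./models/article1.model (full model) and ./models/article1.txt (word2vec text format);
# the .txt file is what genSimilar and loadWordVec expect.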
def genSimilar(self,word,modelFile,n=10, cos_thres=0.7):
# might not be a meaningful keyword
#stopWords = ["show"]
# same word but just plural nouns, tense
simWords = [word+"s",word+"es",word+"ing",word+"ed"]
model = KeyedVectors.load_word2vec_format(modelFile, binary = False) # model file in format txt
results = model.most_similar(positive=[word],topn=n)
#removeIndex = []
#currN = copy.deepcopy(n)
'''for r in range(len(results)):
if len(results[r][0]) < 2:
removeIndex.append(results[r])
# remove the same word but just plural and singular noun and lower than the cos_thres
elif results[r][0] == word:
removeIndex.append(results[r])
elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
removeIndex.append(results[r])
for rem in removeIndex:
results.remove(rem)
while len(results)!=n and len(results) != 0:
moreNewResult = model.most_similar(positive=[word],topn=currN+1)[-1]
if moreNewResult not in results and len(moreNewResult[0])>1:
if moreNewResult[0] not in stopWords and results[0] != word:
results.append(moreNewResult)
currN +=1'''
return results
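# Usage sketch (model path hypothetical; the .txt file comes from trainWord2Vec's save_word2vec_format):
#   similar = wv.genSimilar("temperature", "./models/article1.txt", n=10)
#   # -> list of (word, cosine_similarity) pairs; the commented-out filter above would drop plurals,
#   #    stopwords, and anything below cos_thres if re-enabled.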
# add more data to existing word2vec model
def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
if not newCorpus:
raise ValueError("New corpus is empty!")
model = Word2Vec.load(modelPath)
# Phrase detection on new data
phrases = Phrases(newCorpus, min_count=2, threshold=10)
bigram = Phraser(phrases)
newCorpus = [bigram[sent] for sent in newCorpus]
# Update vocab & retrain
model.build_vocab(newCorpus, update=True)
model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
# Save updated model
if saveFolder:
os.makedirs(saveFolder, exist_ok=True)
name = os.path.basename(modelPath).replace(".model", "_updated.model")
model.save(f"{saveFolder}/{name}")
print(f"🔁 Model updated and saved to {saveFolder}/{name}")
else:
model.save(modelPath)
print(f"🔁 Model updated and overwritten at {modelPath}")
# adding our model into spacy
# this wraps the spaCy command line; instead of running it manually, we call it from this script via subprocess
def loadWordVec(self,modelName,wordVec):
# modelName is the name you want to save into spacy
# wordVec is the trained word2vec in txt format
subprocess.run([sys.executable,
"-m",
"spacy",
"init-model",
"en",
modelName, # this modelName comes from the saved modelName of function trainWord2Vec
"--vectors-loc",
wordVec])
print("done")