'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import sys
import subprocess
import os
# multiprocessing can be used to speed up training
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# load a pretrained word2vec model in binary format
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model:
When we build the corpus, unnecessary adverbs can end up ranked as similar
to the word we query. Two remedies: preprocess the text so the corpus only
contains informative words before training, so that those words dominate the
learned similarities; or, once we have the list of similar words, filter out
the stopwords and other uninformative words from it.'''
### For more complex analysis, consider sentence embedding models such as Doc2Vec to represent the meaning of entire sentences instead of just individual words
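# A minimal Doc2Vec sketch of the idea above (assumption: gensim's Doc2Vec/TaggedDocument
# API; the toy sentences are hypothetical and not used elsewhere in this module):
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#docs = [TaggedDocument(words=sent, tags=[str(i)])
#        for i, sent in enumerate([["tom", "is", "cat"], ["jerry", "is", "mouse"]])]
#d2vModel = Doc2Vec(docs, vector_size=100, window=5, min_count=1, epochs=20)
#sentVec = d2vModel.infer_vector(["cat", "is", "an", "animal"])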
class word2Vec():
    def __init__(self, nameFile=None, modelName=None):
        self.nameFile = nameFile
        self.modelName = modelName
        #self.nlp = spacy.load("en_core_web_lg")
        self.cl = cleanText.cleanGenText()
    def spacy_similarity(self, word):
        # token-level similarity needs real word vectors, so prefer a medium or large spaCy model
        if not hasattr(self, "nlp"):
            self.nlp = spacy.load("en_core_web_lg")
        doc = self.nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))
    # clean text before transforming it into a corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        output = ""
        alreadyRemoveDoi = False
        for word in oriText.split(" "):
            # remove the DOI once
            if doi is not None and doi in oriText:
                if not alreadyRemoveDoi:
                    newWord = self.cl.removeDOI(word, doi)
                    if len(newWord) > 0 and newWord != word:
                        alreadyRemoveDoi = True
                        word = newWord
            # split stuck-together words
            #word = self.cl.splitStickWords(word)
            # remove punctuation
            word = self.cl.removePunct(word, True)
            # remove URLs
            word = self.cl.removeURL(word)
            # remove HTML tags
            word = self.cl.removeHTMLTag(word)
            # remove tabs, extra whitespace, newlines
            word = self.cl.removeTabWhiteSpaceNewLine(word)
            # optional: remove stopwords
            #word = self.cl.removeStopWords(word)
            if len(word) > 0:
                output += word + " "
        return output
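    # Hypothetical usage sketch (the exact output depends on the external cleanGenText helpers):
    #w2v = word2Vec()
    #cleaned = w2v.cleanTextBeforeCorpus("Results at https://example.org <b>bold</b>", doi="10.1000/xyz")
    # roughly -> "Results at bold" with the DOI, URL, and HTML tags stripped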
    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        cleanOutput = ""
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        if len(allText) > 0:
            corpusText = allText.split("\n\n")
            for pos in range(len(corpusText)):
                lines = corpusText[pos]
                if len(lines) > 0:
                    for line in lines.split("\n"):
                        if remove in line:
                            line = line.replace(remove, "")
                        clean_text = self.cleanTextBeforeCorpus(line, doi)
                        cleanOutput += clean_text + "\n"
                cleanOutput += "\n\n"
        return cleanOutput
    def tableTransformToCorpusText(self, df, excelFile=None):
        # accepts tables extracted from PDF/Word (a list of DataFrames) or an Excel file
        corpus = {}
        if excelFile is None:
            if len(df) > 0:
                try:
                    # each row becomes a token list; each DataFrame becomes its own corpus
                    for i in range(len(df)):
                        outputDF = []
                        text = df[i].values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                        if len(outputDF) > 0:
                            corpus["corpus" + str(i)] = outputDF
                except Exception:
                    # fall back to treating df as a single DataFrame
                    outputDF = []
                    text = df.values.tolist()
                    if len(text) > 0:
                        outputRowDF = self.helperRowTableToCorpus(text)
                        #outputColDF = self.helperColTableToCorpus(text)
                        outputDF.extend(outputRowDF)
                        #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus0"] = outputDF
        else:
            try:
                xls = pd.ExcelFile(excelFile)
            except Exception:
                # fall back to an explicit engine based on the extension
                engine = 'xlrd' if excelFile.endswith('.xls') else 'openpyxl'
                xls = pd.ExcelFile(excelFile, engine=engine)
            sheetNames = xls.sheet_names
            if len(sheetNames) > 0:
                for s in range(len(sheetNames)):
                    outputDF = []
                    data = pd.read_excel(xls, sheetNames[s])
                    if sheetNames[s] != 'Evaluation Warning':
                        text = data.values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus" + str(s)] = outputDF
        return corpus
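    # Hypothetical usage sketch: a list with one DataFrame yields {"corpus0": [...]}
    #df = pd.DataFrame({"Sample": ["KM1", "KM2"], "Site": ["Hanoi", "Hue"]})
    #corpus = word2Vec().tableTransformToCorpusText([df])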
    def helperRowTableToCorpus(self, textList):
        stopWords = ["NaN", "Unnamed:", "nan"]
        outputDF = []
        for line in textList:
            outputLine = []
            for words in line:
                words = str(words)
                if len(words) > 0:
                    for word in words.split(" "):
                        # drop table-specific noise ("NaN", "Unnamed:", "nan"); pure numbers
                        # are usually row indices, but mixed tokens like "KM1" are kept
                        if str(word) not in stopWords:
                            #word = self.cl.splitStickWords(word)
                            word = self.cl.removePunct(word)
                            word = " ".join(self.cl.removeStopWords(word))
                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and not x.isnumeric():
                                            outputLine.append(x.lower())
                                else:
                                    if not word.isnumeric():
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    def helperColTableToCorpus(self, dfList):
        stopWords = ["NaN", "Unnamed:", "nan"]
        outputDF = []
        # use the first row's length as the number of columns
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    words = str(line[pos])
                else:
                    words = ""
                if len(words) > 0:
                    for word in words.split(" "):
                        # drop table-specific noise ("NaN", "Unnamed:", "nan"); pure numbers
                        # are usually row indices, but mixed tokens like "KM1" are kept
                        if str(word) not in stopWords:
                            #word = self.cl.splitStickWords(word)
                            word = self.cl.removePunct(word)
                            word = " ".join(self.cl.removeStopWords(word))
                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and not x.isnumeric():
                                            outputLine.append(x.lower())
                                else:
                                    if not word.isnumeric():
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    # create a corpus from raw text
    def createCorpusText(self, corpusText):
        '''ex: "Tom is cat. Jerry is mouse."
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        # the output should look like this:
        '''texts = {
            "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
            "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
        }
        '''
        # paragraphs are separated by blank lines, e.g.:
        '''Cat is an animal. Tom is cat.
        Mouse is an animal.
        Jerry is mouse.'''
        texts = {}
        corpus = corpusText.split("\n\n")
        for pos in range(len(corpus)):
            if len(corpus[pos]) > 0:
                texts["Paragraph " + str(pos)] = []
                lines = corpus[pos]
                for line in lines.split("\n"):
                    for l in line.split("."):
                        if len(l) > 0:
                            l = self.cl.removeTabWhiteSpaceNewLine(l)
                            l = l.lower()
                            newL = []
                            for word in l.split(" "):
                                if len(word) > 0:
                                    word = self.cl.removeStopWords(word)
                                    for w in word:
                                        if len(w) > 0 and not w.isnumeric():
                                            newL.append(w)
                            if len(newL) > 0:
                                texts["Paragraph " + str(pos)].append(newL)
                if len(texts["Paragraph " + str(pos)]) == 0:
                    del texts["Paragraph " + str(pos)]
        return texts
    def selectParaForWC(self, corpus):
        """
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
        Heuristically choose Word2Vec parameters from the corpus size (number of sentences).
        """
        corSize = len(corpus)
        if corSize == 0:
            return None, None, None, None, None, None
        # Adjust parameters based on corpus size
        if corSize < 2000:
            # Small corpus: favor generalization
            window = 3
            vector_size = 100
            sample = 1e-3
            negative = 5
            epochs = 20
            sg = 1  # Skip-gram is preferred for rare words
        elif corSize < 10000:
            window = 5
            vector_size = 150
            sample = 1e-4
            negative = 10
            epochs = 20
            sg = 1
        elif corSize < 100000:
            window = 7
            vector_size = 200
            sample = 1e-5
            negative = 15
            epochs = 15
            sg = 1
        elif corSize < 500000:
            window = 10
            vector_size = 250
            sample = 1e-5
            negative = 15
            epochs = 10
            sg = 0  # CBOW is fine once the data is large
        else:
            # Very large corpus
            window = 12
            vector_size = 300
            sample = 1e-6
            negative = 20
            epochs = 5
            sg = 0
        return window, vector_size, sample, negative, epochs, sg
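    # Hypothetical usage sketch: a two-sentence corpus falls in the first bucket
    #window, vector_size, sample, negative, epochs, sg = word2Vec().selectParaForWC(
    #    [["tom", "is", "cat"], ["jerry", "is", "mouse"]])
    # -> (3, 100, 0.001, 5, 20, 1)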
    def trainWord2Vec(self, nameFile, modelName, saveFolder, window=None,
                      vector_size=None, sample=None, negative=None, epochs=None, sg=None):
        jsonFile = openFile.openJsonFile(nameFile)  # a corpus json file built from an article
        if not jsonFile:
            print("No corpus to train")
            return
        cores = multiprocessing.cpu_count()
        combinedCorpus = []
        for key in jsonFile:
            combinedCorpus.extend(jsonFile[key])
        # detect phrases before choosing parameters
        phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        combinedCorpus = [bigram[sent] for sent in combinedCorpus]
        if window is None and vector_size is None and sample is None and negative is None and epochs is None and sg is None:
            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
        # min_count=1 ensures all words are included
        #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
        accept = False
        # cap the retries: if training keeps failing (bad corpus or corrupted input), don't loop forever
        retries = 0
        while not accept and retries < 3:
            if window is not None and vector_size is not None and sample is not None and negative is not None and epochs is not None and sg is not None:
                try:
                    w2vModel = Word2Vec(
                        min_count=1,
                        window=window,
                        vector_size=vector_size,
                        sample=sample,
                        alpha=0.03,
                        min_alpha=0.0007,
                        negative=negative,
                        workers=max(1, cores - 1),
                        epochs=epochs,
                        sg=sg)
                    w2vModel.build_vocab(combinedCorpus)
                    w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
                    accept = True
                except Exception as e:
                    print(f"Retry #{retries+1} failed: {e}")
                    retries += 1
            else:
                print("no parameters to train with")
                break
        if not accept:
            print("Training did not succeed; nothing was saved")
            return
        w2vModel.save(saveFolder + "/" + modelName + ".model")
        w2vModel.wv.save_word2vec_format(saveFolder + "/" + modelName + ".txt")
        print("done w2v")
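    # Hypothetical usage sketch (the paths and names are placeholders):
    #w2v = word2Vec()
    #w2v.trainWord2Vec("corpus.json", "articleVectors", "models")
    # -> writes models/articleVectors.model and models/articleVectors.txt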
    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        # might not be meaningful keywords
        #stopWords = ["show"]
        # inflected variants of the query word (plural, tense) to filter out
        simWords = [word + "s", word + "es", word + "ing", word + "ed"]
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in txt format
        results = model.most_similar(positive=[word], topn=n)
        #removeIndex = []
        #currN = copy.deepcopy(n)
        '''for r in range(len(results)):
            if len(results[r][0]) < 2:
                removeIndex.append(results[r])
            # remove inflected variants of the query word and anything below cos_thres
            elif results[r][0] == word:
                removeIndex.append(results[r])
            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
                removeIndex.append(results[r])
        for rem in removeIndex:
            results.remove(rem)
        while len(results) != n and len(results) != 0:
            moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
            if moreNewResult not in results and len(moreNewResult[0]) > 1:
                if moreNewResult[0] not in stopWords and results[0] != word:
                    results.append(moreNewResult)
            currN += 1'''
        return results
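    # Hypothetical usage sketch: post-filter the raw neighbors the way the commented
    # block above intends (drop inflected variants of the query and low-cosine hits):
    #results = word2Vec().genSimilar("gene", "models/articleVectors.txt")
    #filtered = [(w, score) for w, score in results
    #            if w not in {"genes", "genees", "geneing", "geneed"} and score >= 0.7]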
    # add more data to an existing word2vec model
    def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
        if not newCorpus:
            raise ValueError("New corpus is empty!")
        model = Word2Vec.load(modelPath)
        # Phrase detection on the new data
        phrases = Phrases(newCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        newCorpus = [bigram[sent] for sent in newCorpus]
        # Update vocab & retrain
        model.build_vocab(newCorpus, update=True)
        model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
        # Save the updated model
        if saveFolder:
            os.makedirs(saveFolder, exist_ok=True)
            name = os.path.basename(modelPath).replace(".model", "_updated.model")
            model.save(f"{saveFolder}/{name}")
            print(f"🔁 Model updated and saved to {saveFolder}/{name}")
        else:
            model.save(modelPath)
            print(f"🔁 Model updated and overwritten at {modelPath}")
    # register our trained vectors with spaCy
    # spaCy's init command is a CLI tool; rather than typing it in a shell, we invoke it from Python
    def loadWordVec(self, modelName, wordVec):
        # modelName is the name to save the spaCy model under
        # wordVec is the trained word2vec model in txt format
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName,  # modelName comes from the model saved by trainWord2Vec
                        "--vectors-loc",
                        wordVec])
        print("done")