'''WORD TO VECTOR'''
import pandas as pd
import json
import gensim
import spacy
from DefaultPackages import openFile, saveFile
from NER import cleanText
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import sys
import subprocess
import os
# multiprocessing can be used to speed up training
import multiprocessing
import copy
sys.setrecursionlimit(1000)
# create folder word2Vec
#! mkdir /content/drive/MyDrive/CollectData/NER/word2Vec
# load a pretrained word2vec model in binary format
#model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/CollectData/NER/word2Vec', binary=True)
'''Some notes for this model:
When we build the corpus, unnecessary adverbs can end up ranked as similar
to the word we query. Two remedies: preprocess the text so the corpus only
contains informative words before training, so that those words dominate the
learned similarities; or, once we have the list of similar words, filter out
the stopwords and other uninformative words from it.'''
### For more complex analysis, consider sentence embedding models such as Doc2Vec to represent the meaning of entire sentences instead of just individual words
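# A minimal Doc2Vec sketch of the idea above (assumption: gensim's Doc2Vec/TaggedDocument
# API; the toy sentences are hypothetical and not used elsewhere in this module):
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#docs = [TaggedDocument(words=sent, tags=[str(i)])
#        for i, sent in enumerate([["tom", "is", "cat"], ["jerry", "is", "mouse"]])]
#d2vModel = Doc2Vec(docs, vector_size=100, window=5, min_count=1, epochs=20)
#sentVec = d2vModel.infer_vector(["cat", "is", "an", "animal"])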
class word2Vec():
    def __init__(self, nameFile=None, modelName=None):
        self.nameFile = nameFile
        self.modelName = modelName
        #self.nlp = spacy.load("en_core_web_lg")
        self.cl = cleanText.cleanGenText()
    def spacy_similarity(self, word):
        # token-level similarity needs real word vectors, so prefer a medium or large spaCy model
        if not hasattr(self, "nlp"):
            self.nlp = spacy.load("en_core_web_lg")
        doc = self.nlp(word)
        for token1 in doc:
            for token2 in doc:
                print(token1.text, token2.text, token1.similarity(token2))
    # clean text before transforming it into a corpus
    def cleanTextBeforeCorpus(self, oriText, doi=None):
        output = ""
        alreadyRemoveDoi = False
        for word in oriText.split(" "):
            # remove the DOI once
            if doi is not None and doi in oriText:
                if not alreadyRemoveDoi:
                    newWord = self.cl.removeDOI(word, doi)
                    if len(newWord) > 0 and newWord != word:
                        alreadyRemoveDoi = True
                        word = newWord
            # split stuck-together words
            #word = self.cl.splitStickWords(word)
            # remove punctuation
            word = self.cl.removePunct(word, True)
            # remove URLs
            word = self.cl.removeURL(word)
            # remove HTML tags
            word = self.cl.removeHTMLTag(word)
            # remove tabs, extra whitespace, newlines
            word = self.cl.removeTabWhiteSpaceNewLine(word)
            # optional: remove stopwords
            #word = self.cl.removeStopWords(word)
            if len(word) > 0:
                output += word + " "
        return output
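    # Hypothetical usage sketch (the exact output depends on the external cleanGenText helpers):
    #w2v = word2Vec()
    #cleaned = w2v.cleanTextBeforeCorpus("Results at https://example.org <b>bold</b>", doi="10.1000/xyz")
    # roughly -> "Results at bold" with the DOI, URL, and HTML tags stripped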
    def cleanAllTextBeforeCorpus(self, allText, doi=None):
        cleanOutput = ""
        remove = "Evaluation Warning: The document was created with Spire.Doc for Python."
        if len(allText) > 0:
            corpusText = allText.split("\n\n")
            for pos in range(len(corpusText)):
                lines = corpusText[pos]
                if len(lines) > 0:
                    for line in lines.split("\n"):
                        if remove in line:
                            line = line.replace(remove, "")
                        clean_text = self.cleanTextBeforeCorpus(line, doi)
                        cleanOutput += clean_text + "\n"
                cleanOutput += "\n\n"
        return cleanOutput
    def tableTransformToCorpusText(self, df, excelFile=None):
        # accepts tables extracted from PDF/Word (a list of DataFrames) or an Excel file
        corpus = {}
        if excelFile is None:
            if len(df) > 0:
                try:
                    # each row becomes a token list; each DataFrame becomes its own corpus
                    for i in range(len(df)):
                        outputDF = []
                        text = df[i].values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                        if len(outputDF) > 0:
                            corpus["corpus" + str(i)] = outputDF
                except Exception:
                    # fall back to treating df as a single DataFrame
                    outputDF = []
                    text = df.values.tolist()
                    if len(text) > 0:
                        outputRowDF = self.helperRowTableToCorpus(text)
                        #outputColDF = self.helperColTableToCorpus(text)
                        outputDF.extend(outputRowDF)
                        #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus0"] = outputDF
        else:
            try:
                xls = pd.ExcelFile(excelFile)
            except Exception:
                # fall back to an explicit engine based on the extension
                engine = 'xlrd' if excelFile.endswith('.xls') else 'openpyxl'
                xls = pd.ExcelFile(excelFile, engine=engine)
            sheetNames = xls.sheet_names
            if len(sheetNames) > 0:
                for s in range(len(sheetNames)):
                    outputDF = []
                    data = pd.read_excel(xls, sheetNames[s])
                    if sheetNames[s] != 'Evaluation Warning':
                        text = data.values.tolist()
                        if len(text) > 0:
                            outputRowDF = self.helperRowTableToCorpus(text)
                            #outputColDF = self.helperColTableToCorpus(text)
                            outputDF.extend(outputRowDF)
                            #outputDF.extend(outputColDF)
                    if len(outputDF) > 0:
                        corpus["corpus" + str(s)] = outputDF
        return corpus
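    # Hypothetical usage sketch: a list with one DataFrame yields {"corpus0": [...]}
    #df = pd.DataFrame({"Sample": ["KM1", "KM2"], "Site": ["Hanoi", "Hue"]})
    #corpus = word2Vec().tableTransformToCorpusText([df])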
    def helperRowTableToCorpus(self, textList):
        stopWords = ["NaN", "Unnamed:", "nan"]
        outputDF = []
        for line in textList:
            outputLine = []
            for words in line:
                words = str(words)
                if len(words) > 0:
                    for word in words.split(" "):
                        # drop table-specific noise ("NaN", "Unnamed:", "nan"); pure numbers
                        # are usually row indices, but mixed tokens like "KM1" are kept
                        if str(word) not in stopWords:
                            #word = self.cl.splitStickWords(word)
                            word = self.cl.removePunct(word)
                            word = " ".join(self.cl.removeStopWords(word))
                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and not x.isnumeric():
                                            outputLine.append(x.lower())
                                else:
                                    if not word.isnumeric():
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    def helperColTableToCorpus(self, dfList):
        stopWords = ["NaN", "Unnamed:", "nan"]
        outputDF = []
        # use the first row's length as the number of columns
        for pos in range(len(dfList[0])):
            outputLine = []
            for line in dfList:
                if pos < len(line):
                    words = str(line[pos])
                else:
                    words = ""
                if len(words) > 0:
                    for word in words.split(" "):
                        # drop table-specific noise ("NaN", "Unnamed:", "nan"); pure numbers
                        # are usually row indices, but mixed tokens like "KM1" are kept
                        if str(word) not in stopWords:
                            #word = self.cl.splitStickWords(word)
                            word = self.cl.removePunct(word)
                            word = " ".join(self.cl.removeStopWords(word))
                            word = self.cl.removeTabWhiteSpaceNewLine(word)
                            if len(word) > 1:
                                if len(word.split(" ")) > 1:
                                    for x in word.split(" "):
                                        if len(x) > 1 and not x.isnumeric():
                                            outputLine.append(x.lower())
                                else:
                                    if not word.isnumeric():
                                        outputLine.append(word.lower())
            if len(outputLine) > 0:
                outputDF.append(outputLine)
        return outputDF
    # create a corpus from raw text
    def createCorpusText(self, corpusText):
        '''ex: "Tom is cat. Jerry is mouse."
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]'''
        # the output should look like this:
        '''texts = {
            "Paragraph 1": [["Cat", "is", "an", "animal"], ["Tom", "is", "cat"]],
            "Paragraph 2": [["Mouse", "is", "an", "animal"], ["Jerry", "is", "mouse"]]
        }
        '''
        # paragraphs are separated by blank lines, e.g.:
        '''Cat is an animal. Tom is cat.
        Mouse is an animal.
        Jerry is mouse.'''
        texts = {}
        corpus = corpusText.split("\n\n")
        for pos in range(len(corpus)):
            if len(corpus[pos]) > 0:
                texts["Paragraph " + str(pos)] = []
                lines = corpus[pos]
                for line in lines.split("\n"):
                    for l in line.split("."):
                        if len(l) > 0:
                            l = self.cl.removeTabWhiteSpaceNewLine(l)
                            l = l.lower()
                            newL = []
                            for word in l.split(" "):
                                if len(word) > 0:
                                    word = self.cl.removeStopWords(word)
                                    for w in word:
                                        if len(w) > 0 and not w.isnumeric():
                                            newL.append(w)
                            if len(newL) > 0:
                                texts["Paragraph " + str(pos)].append(newL)
                if len(texts["Paragraph " + str(pos)]) == 0:
                    del texts["Paragraph " + str(pos)]
        return texts
    def selectParaForWC(self, corpus):
        """
        corpus = [["Tom", "is", "cat"], ["Jerry", "is", "mouse"]]
        Heuristically choose Word2Vec parameters from the corpus size (number of sentences).
        """
        corSize = len(corpus)
        if corSize == 0:
            return None, None, None, None, None, None
        # Adjust parameters based on corpus size
        if corSize < 2000:
            # Small corpus: favor generalization
            window = 3
            vector_size = 100
            sample = 1e-3
            negative = 5
            epochs = 20
            sg = 1  # Skip-gram is preferred for rare words
        elif corSize < 10000:
            window = 5
            vector_size = 150
            sample = 1e-4
            negative = 10
            epochs = 20
            sg = 1
        elif corSize < 100000:
            window = 7
            vector_size = 200
            sample = 1e-5
            negative = 15
            epochs = 15
            sg = 1
        elif corSize < 500000:
            window = 10
            vector_size = 250
            sample = 1e-5
            negative = 15
            epochs = 10
            sg = 0  # CBOW is fine once the data is large
        else:
            # Very large corpus
            window = 12
            vector_size = 300
            sample = 1e-6
            negative = 20
            epochs = 5
            sg = 0
        return window, vector_size, sample, negative, epochs, sg
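    # Hypothetical usage sketch: a two-sentence corpus falls in the first bucket
    #window, vector_size, sample, negative, epochs, sg = word2Vec().selectParaForWC(
    #    [["tom", "is", "cat"], ["jerry", "is", "mouse"]])
    # -> (3, 100, 0.001, 5, 20, 1)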
    def trainWord2Vec(self, nameFile, modelName, saveFolder, window=None,
                      vector_size=None, sample=None, negative=None, epochs=None, sg=None):
        jsonFile = openFile.openJsonFile(nameFile)  # a corpus json file built from an article
        if not jsonFile:
            print("No corpus to train")
            return
        cores = multiprocessing.cpu_count()
        combinedCorpus = []
        for key in jsonFile:
            combinedCorpus.extend(jsonFile[key])
        # detect phrases before choosing parameters
        phrases = Phrases(combinedCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        combinedCorpus = [bigram[sent] for sent in combinedCorpus]
        if window is None and vector_size is None and sample is None and negative is None and epochs is None and sg is None:
            window, vector_size, sample, negative, epochs, sg = self.selectParaForWC(combinedCorpus)
        # min_count=1 ensures all words are included
        #w2vModel = Word2Vec(vector_size=150, window=10, min_count=1, workers=4)
        accept = False
        # cap the retries: if training keeps failing (bad corpus or corrupted input), don't loop forever
        retries = 0
        while not accept and retries < 3:
            if window is not None and vector_size is not None and sample is not None and negative is not None and epochs is not None and sg is not None:
                try:
                    w2vModel = Word2Vec(
                        min_count=1,
                        window=window,
                        vector_size=vector_size,
                        sample=sample,
                        alpha=0.03,
                        min_alpha=0.0007,
                        negative=negative,
                        workers=max(1, cores - 1),
                        epochs=epochs,
                        sg=sg)
                    w2vModel.build_vocab(combinedCorpus)
                    w2vModel.train(combinedCorpus, total_examples=w2vModel.corpus_count, epochs=epochs)
                    accept = True
                except Exception as e:
                    print(f"Retry #{retries+1} failed: {e}")
                    retries += 1
            else:
                print("no parameters to train with")
                break
        if not accept:
            print("Training did not succeed; nothing was saved")
            return
        w2vModel.save(saveFolder + "/" + modelName + ".model")
        w2vModel.wv.save_word2vec_format(saveFolder + "/" + modelName + ".txt")
        print("done w2v")
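    # Hypothetical usage sketch (the paths and names are placeholders):
    #w2v = word2Vec()
    #w2v.trainWord2Vec("corpus.json", "articleVectors", "models")
    # -> writes models/articleVectors.model and models/articleVectors.txt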
    def genSimilar(self, word, modelFile, n=10, cos_thres=0.7):
        # might not be meaningful keywords
        #stopWords = ["show"]
        # inflected variants of the query word (plural, tense) to filter out
        simWords = [word + "s", word + "es", word + "ing", word + "ed"]
        model = KeyedVectors.load_word2vec_format(modelFile, binary=False)  # model file in txt format
        results = model.most_similar(positive=[word], topn=n)
        #removeIndex = []
        #currN = copy.deepcopy(n)
        '''for r in range(len(results)):
            if len(results[r][0]) < 2:
                removeIndex.append(results[r])
            # remove inflected variants of the query word and anything below cos_thres
            elif results[r][0] == word:
                removeIndex.append(results[r])
            elif results[r][0] in simWords or float(results[r][1]) < cos_thres or results[r][0] in stopWords:
                removeIndex.append(results[r])
        for rem in removeIndex:
            results.remove(rem)
        while len(results) != n and len(results) != 0:
            moreNewResult = model.most_similar(positive=[word], topn=currN+1)[-1]
            if moreNewResult not in results and len(moreNewResult[0]) > 1:
                if moreNewResult[0] not in stopWords and results[0] != word:
                    results.append(moreNewResult)
            currN += 1'''
        return results
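    # Hypothetical usage sketch: post-filter the raw neighbors the way the commented
    # block above intends (drop inflected variants of the query and low-cosine hits):
    #results = word2Vec().genSimilar("gene", "models/articleVectors.txt")
    #filtered = [(w, score) for w, score in results
    #            if w not in {"genes", "genees", "geneing", "geneed"} and score >= 0.7]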
    # add more data to an existing word2vec model
    def updateWord2Vec(self, modelPath, newCorpus, saveFolder=None):
        if not newCorpus:
            raise ValueError("New corpus is empty!")
        model = Word2Vec.load(modelPath)
        # Phrase detection on the new data
        phrases = Phrases(newCorpus, min_count=2, threshold=10)
        bigram = Phraser(phrases)
        newCorpus = [bigram[sent] for sent in newCorpus]
        # Update vocab & retrain
        model.build_vocab(newCorpus, update=True)
        model.train(newCorpus, total_examples=len(newCorpus), epochs=model.epochs)
        # Save the updated model
        if saveFolder:
            os.makedirs(saveFolder, exist_ok=True)
            name = os.path.basename(modelPath).replace(".model", "_updated.model")
            model.save(f"{saveFolder}/{name}")
            print(f"🔁 Model updated and saved to {saveFolder}/{name}")
        else:
            model.save(modelPath)
            print(f"🔁 Model updated and overwritten at {modelPath}")
    # register our trained vectors with spaCy
    # spaCy's init command is a CLI tool; rather than typing it in a shell, we invoke it from Python
    def loadWordVec(self, modelName, wordVec):
        # modelName is the name to save the spaCy model under
        # wordVec is the trained word2vec model in txt format
        subprocess.run([sys.executable,
                        "-m",
                        "spacy",
                        "init-model",
                        "en",
                        modelName,  # modelName comes from the model saved by trainWord2Vec
                        "--vectors-loc",
                        wordVec])
        print("done")