import math

import gensim
import gradio as gr
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score

WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long

def assert_gensim_keyed_vectors(model):
    """Raise a TypeError if model is not a supported gensim embedding type."""
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))

def generate_words_forms(words):
    """Generate the lower, UPPER and Title case forms of every word in words."""
    return sum([generate_one_word_forms(word) for word in words], [])

def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity
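
# Illustrative check (not in the original module):
# >>> cosine_similarity(np.array([3.0, 0.0]), np.array([1.0, 0.0]))
# 1.0
# >>> cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))
# 0.0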

def generate_one_word_forms(word):
    """Return the lower, UPPER and Title case forms of a single word."""
    return [word.lower(), word.upper(), word.title()]
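
# Illustrative check (not in the original module):
# >>> generate_one_word_forms('she')
# ['she', 'SHE', 'She']
# >>> generate_words_forms(['he', 'she'])
# ['he', 'HE', 'He', 'she', 'SHE', 'She']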

def get_seed_vector(seed, bias_word_embedding):
    """Resolve seed into a direction vector and its positive/negative end words."""
    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end
        else:
            positive_end, negative_end = seed
        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end
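
# Illustrative usage (hedged; `bias_word_embedding` is assumed to be a
# BiasWordEmbedding-like object with `model`, `positive_end`,
# `negative_end` and `direction` attributes). `seed` may be 'direction',
# 'ends', or a (positive, negative) word pair:
# >>> seed_vector, pos, neg = get_seed_vector(('he', 'she'),
# ...                                         gender_bias_we)  # hypothetical object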

def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to allow words from the positive
                              or negative lists to appear in the results.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])

    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
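
# Illustrative usage (a hedged sketch, assuming a gensim 3.x KeyedVectors
# model such as one loaded via gensim.downloader):
# >>> import gensim.downloader
# >>> model = gensim.downloader.load('glove-wiki-gigaword-50')
# >>> most_similar(model, positive=['king', 'woman'], negative=['man'],
# ...              topn=3, unrestricted=False)
# [('queen', ...), ...]  # exact scores depend on the model; with
#                        # unrestricted=False the input words are excluded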

def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
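
# Illustrative check (not in the original module):
# >>> normalize(np.array([3.0, 4.0]))
# array([0.6, 0.8])
# >>> normalize(np.zeros(3))  # a zero vector is returned unchanged
# array([0., 0., 0.])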

def project_params(u, v):
    """Project vector v onto direction u, returning the scalar projection,
    the projected vector and the rejected vector."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector

    return projection, projected_vector, rejected_vector
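
# Illustrative check: v decomposes into its projection onto u plus a
# rejection orthogonal to u (note the (u, v) argument order):
# >>> u, v = np.array([1.0, 0.0]), np.array([2.0, 3.0])
# >>> project_params(u, v)
# (2.0, array([2., 0.]), array([0., 3.]))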

def project_reject_vector(v, u):
    """Project and reject the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector

def round_to_extreme(value, digits=2):
    """Round value away from zero to the given number of digits."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value
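
# Illustrative check: rounding is away from zero, toward the extreme:
# >>> round_to_extreme(0.123)
# 0.13
# >>> round_to_extreme(-0.123)
# -0.13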

def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    """Take the n_extreme head and tail rows of a sorted DataFrame,
    optionally labeling each side in part_column."""
    head_df = df.head(n_extreme).copy()
    tail_df = df.tail(n_extreme).copy()

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
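
# Illustrative usage (assumes df is already sorted by the projection):
# >>> df = pd.DataFrame({'word': ['he', 'him', 'it', 'her', 'she'],
# ...                    'projection': [0.9, 0.5, 0.0, -0.5, -0.9]})
# >>> take_two_sides_extreme_sorted(df, n_extreme=2, part_column='side',
# ...                               head_value='positive',
# ...                               tail_value='negative')
#   word  projection      side
# 0   he         0.9  positive
# 1  him         0.5  positive
# 2  her        -0.5  negative
# 3  she        -0.9  negative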

def project_vector(v, u):
    """Project the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u

def reject_vector(v, u):
    """Reject the vector v from direction u."""
    return v - project_vector(v, u)
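
# Illustrative check: the rejection of v from u is orthogonal to u:
# >>> v, u = np.array([2.0, 3.0]), np.array([1.0, 0.0])
# >>> reject_vector(v, u)
# array([0., 3.])
# >>> reject_vector(v, u) @ u
# 0.0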

def update_word_vector(model, word, new_vector):
    """Overwrite the stored vector (and its normalized copy) for word."""
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)
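
# Illustrative usage (hedged; mutates the model in place, assuming a
# gensim 3.x KeyedVectors; `gender_direction` is a hypothetical 1-D
# direction vector):
# >>> debiased = reject_vector(model['she'], gender_direction)
# >>> update_word_vector(model, 'she', debiased)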

def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    """Cluster X into two groups with KMeans, plot a 2-D t-SNE embedding
    colored by cluster, and return the clustering accuracy against y_true
    (invariant to label swapping)."""
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
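
# Illustrative usage (a hedged sketch): cluster the vectors of two word
# groups and report how well the 2-means clustering recovers the labels.
# `male_words` and `female_words` are hypothetical word lists:
# >>> X = np.vstack([model[w] for w in male_words + female_words])
# >>> y_true = [0] * len(male_words) + [1] * len(female_words)
# >>> accuracy = plot_clustering_as_classification(X, y_true)
# >>> plt.show()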