import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from gensim.models import KeyedVectors

from utils_sesgo_en_palabras import (
    cosine_similarity,
    normalize,
    project_params,
    take_two_sides_extreme_sorted
)

DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000

# Export the classes actually defined in this module.
__all__ = ['BiasExplorer', 'WEBiasExplorer2d', 'WEBiasExplorer4d']


class Loader():
    def __init__(self):
        self.path_to_data = ''

    def load_tokenizer(self, tokenizer_path):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, do_lower_case=True)
        return tokenizer

    def load_data_from_file(self, data):
        return data

    def load_corpus_from_file(self, data):
        return data

    def load_language_model(self, model_path):
        model = AutoModelForMaskedLM.from_pretrained(
            model_path, output_hidden_states=True)
        return model
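

# A minimal usage sketch for Loader. The checkpoint name below is only an
# illustration; any Hugging Face masked-LM checkpoint works the same way:
#
#     loader = Loader()
#     tokenizer = loader.load_tokenizer('bert-base-uncased')
#     model = loader.load_language_model('bert-base-uncased')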


class Corpus():
    def __init__(self, corpus) -> None:
        # Assign the corpus before building the vocabulary, so the
        # builder can read from self.corpus.
        self.corpus = corpus
        self.vocabulary = self.load_vocabulary_from_corpus()

    def load_vocabulary_from_corpus(self):
        # Placeholder: build the vocabulary from self.corpus.
        pass

    def get_context_from_text(self, word):
        pass

    def get_frequency(self, word):
        pass

    def get_most_frequent_coocurrence(self, word):
        pass


class Embedding():
    def __init__(self, word_vectors_path) -> None:
        self.wv = self.load_we_as_keyed_vectors(word_vectors_path)

    def load_we_as_keyed_vectors(self, word_vectors_path):
        we = KeyedVectors.load_word2vec_format(word_vectors_path)
        # Normalize the vectors in place (init_sims is deprecated in
        # gensim >= 4.0 but still available there).
        we.init_sims(replace=True)
        return we

    def get_word_vector(self, word, context=None):
        # Return the stored vector for the word (the context argument is
        # kept for interface compatibility and is currently unused).
        return self.wv[word]
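

# A minimal usage sketch for Embedding, assuming a word2vec-format file at a
# hypothetical path:
#
#     embedding = Embedding('data/embeddings.vec')  # hypothetical path
#     vector = embedding.get_word_vector('palabra')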


class BiasExplorer():
    def __init__(self, model, only_lower=False, verbose=False,
                 identify_direction=False, to_normalize=True):
        # pylint: disable=undefined-variable
        # TODO: this is probably a design smell; identify_direction has no
        # meaning for the base class itself. The goal is to force this
        # interface onto the sub-classes.
        if self.__class__ == __class__ and identify_direction is not False:
            raise ValueError('identify_direction must be False'
                             ' for an instance of {}'
                             .format(__class__))

        self.model = model

        # TODO: write a unit test for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

        if to_normalize:
            self.model.init_sims(replace=True)

    def __copy__(self):
        bias_word_embedding = self.__class__(self.model,
                                             self.only_lower,
                                             self._verbose,
                                             identify_direction=False)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding

    def __deepcopy__(self, memo):
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            # Report how much variance each principal component explains.
            headers = ['Principal Component', 'Explained Variance Ratio']
            print('\t'.join(headers))
            for component, ratio in enumerate(pca.explained_variance_ratio_,
                                              start=1):
                print('{}\t{:.4f}'.format(component, ratio))

        return pca

    # TODO: add the SVD method from section 6 step 1
    # It seems there is a mistake there; I think it is the same as PCA,
    # just with SVD in its place.
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))

        if self._verbose:
            print('Identifying direction using the {} method...'.format(method))

        direction = None

        if method == 'single':
            if self._verbose:
                print('Positive definitional end:', definitional[0])
                print('Negative definitional end:', definitional[1])
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            group1_sum_vector = np.sum([self[word]
                                        for word in definitional[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in definitional[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should be'
                                   ' at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

            # Flip the direction if it came out opposite (we cannot control
            # the sign that PCA returns).
            ends_diff_projection = cosine_similarity((self[positive_end]
                                                      - self[negative_end]),
                                                     direction)
            if ends_diff_projection < 0:
                direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end
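
    # A hedged example of how a direction is identified (the word lists are
    # illustrative and must exist in the vocabulary): with method='sum', the
    # direction is the normalized difference between the normalized summed
    # vectors of the two definitional groups.
    #
    #     explorer._identify_direction(
    #         'she', 'he',
    #         definitional=(['she', 'her', 'woman'],
    #                       ['he', 'his', 'man']),
    #         method='sum')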

    def project_on_direction(self, word):
        """Project the normalized vector of the word on the direction.

        :param str word: The word to project
        :return float: The projection scalar
        """

        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe use cosine_similarities on all the vectors at once?
        # It might be faster.
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def calc_projection_data(self, words):
        """
        Calculate the projection, projected and rejected vectors of a word list.

        :param list words: List of words
        :return: :class:`pandas.DataFrame` of the projection,
                 projected and rejected vectors of the word list
        """
        projection_data = []
        for word in words:
            vector = self[word]
            normalized_vector = normalize(vector)

            (projection,
             projected_vector,
             rejected_vector) = project_params(normalized_vector,
                                               self.direction)

            projection_data.append({'word': word,
                                    'vector': vector,
                                    'projection': projection,
                                    'projected_vector': projected_vector,
                                    'rejected_vector': rejected_vector})

        return pd.DataFrame(projection_data)
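
    # Sketch of the expected output, using the column names built above:
    #
    #     df = explorer.calc_projection_data(['king', 'queen'])
    #     list(df.columns)
    #     # ['word', 'vector', 'projection',
    #     #  'projected_vector', 'rejected_vector']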

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        """Plot the distribution of projection scalars on the direction.

        :param dict word_groups: The word groups to project
        :return: The ax object of the plot
        """
        if ax is None:
            _, ax = plt.subplots(1)

        names = sorted(word_groups.keys())

        for name in names:
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            # kdeplot replaces the deprecated distplot(hist=False).
            sns.kdeplot(projections, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: You must enter a word first!"
        elif word not in self.model:
            out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    def parse_words(self, string):
        # Split a comma-separated string into a clean list of words;
        # return an empty list for blank input.
        words = string.strip()
        if words:
            words = [word.strip() for word in words.split(',') if word != ""]
        else:
            words = []
        return words
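
    # Examples (blank input yields an empty list):
    #
    #     self.parse_words(' king , queen,, man ')  # -> ['king', 'queen', 'man']
    #     self.parse_words('')                      # -> []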

    def check_oov(self, wordlists):
        # Return an error message for the first out-of-vocabulary word found
        # in any of the given comma-separated word lists, else None.
        for wordlist in wordlists:
            parsed_words = self.parse_words(wordlist)
            for word in parsed_words:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def plot_projections_2d(self,
                            wordlist,
                            wordlist_1,
                            wordlist_2,
                            wordlist_3,
                            wordlist_4,
                            color_wordlist,
                            color_wordlist_1,
                            color_wordlist_2,
                            color_wordlist_3,
                            color_wordlist_4,
                            plot_neighbors,
                            n_alpha,
                            fontsize,
                            figsize=(15, 15),
                            method='pca'
                            ):
        # Convert the word lists to vectors.
        choices = [0, 1, 2, 3, 4]
        word_list = []
        wordlist_choice = [wordlist, wordlist_1, wordlist_2,
                           wordlist_3, wordlist_4]
        err = self.check_oov(wordlist_choice)
        if err:
            return None, err

        words_colors = {}
        label_dict = {
            0: 'Diagnostic',
            1: 'Word list 1',
            2: 'Word list 2',
            3: 'Word list 3',
            4: 'Word list 4'
        }
        color_dict = {
            0: color_wordlist,
            1: color_wordlist_1,
            2: color_wordlist_2,
            3: color_wordlist_3,
            4: color_wordlist_4
        }
        word_bias_space = {}
        alpha = {}

        for raw_word_list, color in zip(wordlist_choice, choices):
            parsed_words = self.parse_words(raw_word_list)
            if parsed_words:
                for word in parsed_words:
                    word_bias_space[word] = color
                    words_colors[word] = color_dict[color]
                    alpha[word] = 1
                    if plot_neighbors:
                        neighbors = [w for w, s in
                                     self.model.most_similar(word, topn=5)]
                        for n in neighbors:
                            if n not in alpha:
                                word_bias_space[n] = color
                                words_colors[n] = color_dict[color]
                                alpha[n] = n_alpha
                        word_list += neighbors
                word_list += parsed_words

        if not word_list:
            return None, "<center><h3>" + "Enter at least 2 words to continue" + "</h3></center>"

        embeddings = [self.model[word] for word in word_list]
        words_embedded = PCA(
            n_components=2, random_state=1).fit_transform(embeddings)

        data = pd.DataFrame(words_embedded)
        data['word'] = word_list
        data['color'] = [words_colors[word] for word in word_list]
        data['alpha'] = [alpha[word] for word in word_list]
        data['word_bias_space'] = [word_bias_space[word] for word in word_list]

        fig, ax = plt.subplots(figsize=figsize)

        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )
        if plot_neighbors:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, label in enumerate(word_list):
            x, y = words_embedded[i, :]
            ax.annotate(label, xy=(x, y), xytext=(5, 2),
                        color=words_colors[label],
                        textcoords='offset points',
                        ha='right', va='bottom',
                        size=fontsize, alpha=alpha[label])

        ax.set_xticks([])
        ax.set_yticks([])

        fig.tight_layout()
        fig.canvas.draw()

        # Render the figure to an RGB array so callers can show it as an image.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im, ''
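

# A hedged usage sketch for the 2-D projection plot. Word lists are
# comma-separated strings, colors are matplotlib color names, and
# keyed_vectors stands for an already-loaded gensim KeyedVectors object:
#
#     explorer = BiasExplorer(keyed_vectors)
#     im, err = explorer.plot_projections_2d(
#         'king, queen', 'man, woman', '', '', '',
#         'gray', 'blue', 'red', 'green', 'orange',
#         plot_neighbors=False, n_alpha=0.3, fontsize=12)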


class WEBiasExplorer2d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(self,
                       palabras_extremo_1,
                       palabras_extremo_2,
                       palabras_para_situar
                       ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_para_situar
        ]
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + 'You must enter at least 1 word in the diagnostic, bias 1 and bias 2 word lists' + "</h3></center>"
                return None, err

        err = self.check_oov(wordlists)
        if err:
            return None, err

        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_para_situar = self.parse_words(palabras_para_situar)

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional=(
                palabras_extremo_1, palabras_extremo_2),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):
        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional,
            palabras_para_situar, n_extreme, ax=ax)

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalars of words on the direction.

        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """
        nombre_del_extremo_1 = ', '.join(definitional[0])
        nombre_del_extremo_2 = ', '.join(definitional[1])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection'] = projections_df['projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extreme_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)

        sns.barplot(x='projection', y='word', data=projections_df,
                    palette=projections_df['color'], ax=ax)

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        xlabel = '← {} {} {} →'.format(self.negative_end,
                                       ' ' * 20,
                                       self.positive_end)

        plt.xlabel(xlabel)
        plt.ylabel('Words')

        return ax
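

# A hedged usage sketch for the 2-D bias explorer (keyed_vectors stands for a
# loaded gensim KeyedVectors object; the word lists are illustrative):
#
#     explorer_2d = WEBiasExplorer2d(keyed_vectors)
#     im, err = explorer_2d.calculate_bias(
#         'man, he', 'woman, she', 'nurse, engineer, teacher')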


class WEBiasExplorer4d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(self,
                       palabras_extremo_1,
                       palabras_extremo_2,
                       palabras_extremo_3,
                       palabras_extremo_4,
                       palabras_para_situar
                       ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + 'To plot with 4 spaces, you must enter at least 1 word in every list!' + "</h3></center>"
                return None, err

        err = self.check_oov(wordlists)
        if err:
            return None, err

        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_extremo_3 = self.parse_words(palabras_extremo_3)
        palabras_extremo_4 = self.parse_words(palabras_extremo_4)
        palabras_para_situar = self.parse_words(palabras_para_situar)

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(
                palabras_extremo_1, palabras_extremo_2),
            definitional_2=(
                palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):
        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax)
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalars of words on two directions.

        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """
        # Follow the 2d convention: the first group of each definitional
        # pair is the positive end of its direction.
        nombre_del_extremo_1 = ', '.join(definitional_1[0])
        nombre_del_extremo_2 = ', '.join(definitional_1[1])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')
        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[0])
        nombre_del_extremo_4 = ', '.join(definitional_2[1])

        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')
        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extreme_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)

        # Draw on the provided axes with one color per point.
        ax.scatter(projections_df['projection_x'],
                   projections_df['projection_y'],
                   c=list(projections_df['color']))

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in projections_df.iterrows():
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))

        x_label = '← {} {} {} →'.format(nombre_del_extremo_2,
                                        ' ' * 20,
                                        nombre_del_extremo_1)
        y_label = '← {} {} {} →'.format(nombre_del_extremo_4,
                                        ' ' * 20,
                                        nombre_del_extremo_3)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        # Center the spines so the plot reads as four quadrants.
        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])

        return ax
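

# End-to-end sketch (hedged): the embedding path below is hypothetical;
# point it at a real word2vec-format file before running.
if __name__ == '__main__':
    embedding = Embedding('data/embeddings.vec')  # hypothetical path
    explorer_4d = WEBiasExplorer4d(embedding.wv)
    image, error = explorer_4d.calculate_bias(
        'man, he', 'woman, she',
        'rich, wealthy', 'poor, humble',
        'doctor, nurse, teacher')
    if error:
        print(error)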