Spaces:
Runtime error
Runtime error
Commit
·
8081e11
1
Parent(s):
6ff911e
Type hinted BiasExplorer classes.
Browse files
Moved utility functions to utils.py
Backtracked on using child classes.
- modules/module_BiasExplorer.py +135 -314
- modules/module_connection.py +13 -10
- modules/utils.py +58 -0
modules/module_BiasExplorer.py
CHANGED
@@ -1,67 +1,15 @@
|
|
1 |
-
# ToDo: Pendiente eliminar clases/métodos que no son utilizados. Luego, unificar sintaxix e incluir typing.
|
2 |
-
|
3 |
import copy
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
6 |
import seaborn as sns
|
7 |
import matplotlib.pyplot as plt
|
8 |
from sklearn.decomposition import PCA
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
part_column=None,
|
14 |
-
head_value='',
|
15 |
-
tail_value=''
|
16 |
-
):
|
17 |
-
|
18 |
-
head_df = df.head(n_extreme)[:]
|
19 |
-
tail_df = df.tail(n_extreme)[:]
|
20 |
-
|
21 |
-
if part_column is not None:
|
22 |
-
head_df[part_column] = head_value
|
23 |
-
tail_df[part_column] = tail_value
|
24 |
-
|
25 |
-
return (pd.concat([head_df, tail_df])
|
26 |
-
.drop_duplicates()
|
27 |
-
.reset_index(drop=True))
|
28 |
-
|
29 |
-
def normalize(v):
|
30 |
-
"""Normalize a 1-D vector."""
|
31 |
-
if v.ndim != 1:
|
32 |
-
raise ValueError('v should be 1-D, {}-D was given'.format(
|
33 |
-
v.ndim))
|
34 |
-
norm = np.linalg.norm(v)
|
35 |
-
if norm == 0:
|
36 |
-
return v
|
37 |
-
return v / norm
|
38 |
-
|
39 |
-
def project_params(u, v):
|
40 |
-
"""Projecting and rejecting the vector v onto direction u with scalar."""
|
41 |
-
normalize_u = normalize(u)
|
42 |
-
projection = (v @ normalize_u)
|
43 |
-
projected_vector = projection * normalize_u
|
44 |
-
rejected_vector = v - projected_vector
|
45 |
-
return projection, projected_vector, rejected_vector
|
46 |
-
|
47 |
-
|
48 |
-
def cosine_similarity(v, u):
|
49 |
-
"""Calculate the cosine similarity between two vectors."""
|
50 |
-
v_norm = np.linalg.norm(v)
|
51 |
-
u_norm = np.linalg.norm(u)
|
52 |
-
similarity = v @ u / (v_norm * u_norm)
|
53 |
-
return similarity
|
54 |
-
|
55 |
-
|
56 |
-
DIRECTION_METHODS = ['single', 'sum', 'pca']
|
57 |
-
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
|
58 |
-
FIRST_PC_THRESHOLD = 0.5
|
59 |
-
MAX_NON_SPECIFIC_EXAMPLES = 1000
|
60 |
-
|
61 |
-
__all__ = ['GenderBiasWE', 'BiasWordEmbedding']
|
62 |
-
|
63 |
-
|
64 |
-
class WordBiasExplorer():
|
65 |
def __init__(
|
66 |
self,
|
67 |
embedding # Class Embedding instance
|
@@ -71,10 +19,11 @@ class WordBiasExplorer():
|
|
71 |
self.direction = None
|
72 |
self.positive_end = None
|
73 |
self.negative_end = None
|
|
|
74 |
|
75 |
def __copy__(
|
76 |
self
|
77 |
-
):
|
78 |
|
79 |
bias_word_embedding = self.__class__(self.embedding)
|
80 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
@@ -84,8 +33,8 @@ class WordBiasExplorer():
|
|
84 |
|
85 |
def __deepcopy__(
|
86 |
self,
|
87 |
-
memo
|
88 |
-
):
|
89 |
|
90 |
bias_word_embedding = copy.copy(self)
|
91 |
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
@@ -115,9 +64,9 @@ class WordBiasExplorer():
|
|
115 |
|
116 |
def _identify_subspace_by_pca(
|
117 |
self,
|
118 |
-
definitional_pairs,
|
119 |
-
n_components
|
120 |
-
):
|
121 |
|
122 |
matrix = []
|
123 |
|
@@ -137,15 +86,16 @@ class WordBiasExplorer():
|
|
137 |
|
138 |
def _identify_direction(
|
139 |
self,
|
140 |
-
positive_end,
|
141 |
-
negative_end,
|
142 |
-
definitional,
|
143 |
-
method='pca'
|
144 |
-
|
|
|
145 |
|
146 |
-
if method not in DIRECTION_METHODS:
|
147 |
raise ValueError('method should be one of {}, {} was given'.format(
|
148 |
-
DIRECTION_METHODS, method))
|
149 |
|
150 |
if positive_end == negative_end:
|
151 |
raise ValueError('positive_end and negative_end'
|
@@ -170,11 +120,11 @@ class WordBiasExplorer():
|
|
170 |
|
171 |
elif method == 'pca':
|
172 |
pca = self._identify_subspace_by_pca(definitional, 10)
|
173 |
-
if pca.explained_variance_ratio_[0] <
|
174 |
raise RuntimeError('The Explained variance'
|
175 |
'of the first principal component should be'
|
176 |
'at least {}, but it is {}'
|
177 |
-
.format(
|
178 |
pca.explained_variance_ratio_[0]))
|
179 |
direction = pca.components_[0]
|
180 |
|
@@ -193,7 +143,7 @@ class WordBiasExplorer():
|
|
193 |
def project_on_direction(
|
194 |
self,
|
195 |
word: str
|
196 |
-
):
|
197 |
|
198 |
"""Project the normalized vector of the word on the direction.
|
199 |
:param str word: The word tor project
|
@@ -209,8 +159,8 @@ class WordBiasExplorer():
|
|
209 |
|
210 |
def _calc_projection_scores(
|
211 |
self,
|
212 |
-
words
|
213 |
-
):
|
214 |
|
215 |
self._is_direction_identified()
|
216 |
|
@@ -225,8 +175,8 @@ class WordBiasExplorer():
|
|
225 |
|
226 |
def calc_projection_data(
|
227 |
self,
|
228 |
-
words
|
229 |
-
):
|
230 |
|
231 |
"""
|
232 |
Calculate projection, projected and rejected vectors of a words list.
|
@@ -254,9 +204,9 @@ class WordBiasExplorer():
|
|
254 |
|
255 |
def plot_dist_projections_on_direction(
|
256 |
self,
|
257 |
-
word_groups,
|
258 |
-
ax=None
|
259 |
-
):
|
260 |
|
261 |
"""Plot the projection scalars distribution on the direction.
|
262 |
:param dict word_groups word: The groups to projects
|
@@ -289,8 +239,8 @@ class WordBiasExplorer():
|
|
289 |
|
290 |
def __errorChecking(
|
291 |
self,
|
292 |
-
word
|
293 |
-
):
|
294 |
|
295 |
out_msj = ""
|
296 |
|
@@ -304,8 +254,8 @@ class WordBiasExplorer():
|
|
304 |
|
305 |
def check_oov(
|
306 |
self,
|
307 |
-
wordlists
|
308 |
-
):
|
309 |
|
310 |
for wordlist in wordlists:
|
311 |
for word in wordlist:
|
@@ -314,201 +264,72 @@ class WordBiasExplorer():
|
|
314 |
return msg
|
315 |
return None
|
316 |
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
wordlist_right,
|
321 |
-
wordlist_left,
|
322 |
-
wordlist_top=[],
|
323 |
-
wordlist_bottom=[]
|
324 |
-
):
|
325 |
|
326 |
-
|
|
|
|
|
|
|
|
|
|
|
327 |
|
328 |
-
|
329 |
-
raise Exception('For bar plot, wordlist right and left can NOT be empty')
|
330 |
-
elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
|
331 |
-
raise Exception('For plane plot, wordlist right, left, top and down can NOT be empty')
|
332 |
|
333 |
-
|
|
|
|
|
|
|
|
|
334 |
if err:
|
335 |
raise Exception(err)
|
336 |
|
337 |
return self.get_bias_plot(
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
|
344 |
def get_bias_plot(
|
345 |
self,
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
figsize=(15, 10)
|
353 |
-
):
|
354 |
-
|
355 |
-
fig, ax = plt.subplots(1, figsize=figsize)
|
356 |
-
self.method = method
|
357 |
-
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
358 |
-
|
359 |
-
if plot_2D:
|
360 |
-
fig.tight_layout()
|
361 |
-
fig.canvas.draw()
|
362 |
-
|
363 |
-
return fig
|
364 |
-
|
365 |
-
def plot_projection_scores(
|
366 |
-
self,
|
367 |
-
plot_2D,
|
368 |
-
words,
|
369 |
-
definitional_1,
|
370 |
-
definitional_2=([], []),
|
371 |
-
n_extreme=10,
|
372 |
-
ax=None,
|
373 |
-
axis_projection_step=0.1
|
374 |
-
):
|
375 |
-
|
376 |
-
name_left = ', '.join(definitional_1[1])
|
377 |
-
name_right = ', '.join(definitional_1[0])
|
378 |
-
|
379 |
-
self._identify_direction(name_left, name_right, definitional=definitional_1, method='sum')
|
380 |
-
self._is_direction_identified()
|
381 |
-
|
382 |
-
projections_df = self._calc_projection_scores(words)
|
383 |
-
projections_df['projection_x'] = projections_df['projection'].round(2)
|
384 |
-
|
385 |
-
if not plot_2D:
|
386 |
-
name_top = ', '.join(definitional_2[1])
|
387 |
-
name_bottom = ', '.join(definitional_2[0])
|
388 |
-
self._identify_direction(name_top, name_bottom, definitional=definitional_2, method='sum')
|
389 |
-
self._is_direction_identified()
|
390 |
-
|
391 |
-
projections_df['projection_y'] = self._calc_projection_scores(words)['projection'].round(2)
|
392 |
-
|
393 |
-
if n_extreme is not None:
|
394 |
-
projections_df = take_two_sides_extreme_sorted(projections_df, n_extreme=n_extreme)
|
395 |
-
|
396 |
-
if ax is None:
|
397 |
-
_, ax = plt.subplots(1)
|
398 |
-
|
399 |
-
cmap = plt.get_cmap('RdBu')
|
400 |
-
projections_df['color'] = ((projections_df['projection'] + 0.5).apply(cmap))
|
401 |
-
most_extream_projection = np.round(
|
402 |
-
projections_df['projection']
|
403 |
-
.abs()
|
404 |
-
.max(),
|
405 |
-
decimals=1)
|
406 |
-
|
407 |
-
if plot_2D:
|
408 |
-
sns.barplot(x='projection', y='word', data=projections_df,
|
409 |
-
palette=projections_df['color'])
|
410 |
-
else:
|
411 |
-
# ToDo: revisar este warning:
|
412 |
-
# Ignoring `palette` because no `hue` variable has been assigned. sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
413 |
-
|
414 |
-
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
415 |
-
palette=projections_df['color'])
|
416 |
-
|
417 |
-
plt.xticks(np.arange(-most_extream_projection,
|
418 |
-
most_extream_projection + axis_projection_step,
|
419 |
-
axis_projection_step))
|
420 |
-
|
421 |
-
x_label = '← {} {} {} →'.format(name_left,
|
422 |
-
' ' * 20,
|
423 |
-
name_right)
|
424 |
-
if not plot_2D:
|
425 |
-
y_label = '← {} {} {} →'.format(name_top,
|
426 |
-
' ' * 20,
|
427 |
-
name_bottom)
|
428 |
-
for _, row in (projections_df.iterrows()):
|
429 |
-
ax.annotate(row['word'], (row['projection_x'], row['projection_y']))
|
430 |
-
|
431 |
-
plt.xlabel(x_label)
|
432 |
-
plt.ylabel('Words')
|
433 |
-
|
434 |
-
if not plot_2D:
|
435 |
-
ax.xaxis.set_label_position('bottom')
|
436 |
-
ax.xaxis.set_label_coords(.5, 0)
|
437 |
-
|
438 |
-
plt.ylabel(y_label)
|
439 |
-
ax.yaxis.set_label_position('left')
|
440 |
-
ax.yaxis.set_label_coords(0, .5)
|
441 |
-
|
442 |
-
ax.spines['left'].set_position('center')
|
443 |
-
ax.spines['bottom'].set_position('center')
|
444 |
-
|
445 |
-
ax.set_xticks([])
|
446 |
-
ax.set_yticks([])
|
447 |
-
|
448 |
-
return ax
|
449 |
-
|
450 |
-
# TODO: Would be erased if decided to keep all info in BiasWordExplorer
|
451 |
-
class WEBiasExplorer2d(WordBiasExplorer):
|
452 |
-
def __init__(self, word_embedding) -> None:
|
453 |
-
super().__init__(word_embedding)
|
454 |
-
|
455 |
-
def calculate_bias( self,
|
456 |
-
palabras_extremo_1,
|
457 |
-
palabras_extremo_2,
|
458 |
-
palabras_para_situar
|
459 |
-
):
|
460 |
-
wordlists = [palabras_extremo_1, palabras_extremo_2, palabras_para_situar]
|
461 |
-
|
462 |
-
err = self.check_oov(wordlists)
|
463 |
-
for wordlist in wordlists:
|
464 |
-
if not wordlist:
|
465 |
-
err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' + "<center><h3>"
|
466 |
-
if err:
|
467 |
-
return None, err
|
468 |
-
|
469 |
-
im = self.get_bias_plot(
|
470 |
-
palabras_para_situar,
|
471 |
-
definitional=(
|
472 |
-
palabras_extremo_1, palabras_extremo_2),
|
473 |
-
method='sum',
|
474 |
-
n_extreme=10
|
475 |
-
)
|
476 |
-
return im, ''
|
477 |
-
|
478 |
-
def get_bias_plot(self,
|
479 |
-
palabras_para_situar,
|
480 |
-
definitional,
|
481 |
-
method='sum',
|
482 |
-
n_extreme=10,
|
483 |
-
figsize=(10, 10)
|
484 |
-
):
|
485 |
|
486 |
fig, ax = plt.subplots(1, figsize=figsize)
|
487 |
self.method = method
|
488 |
self.plot_projection_scores(
|
489 |
definitional,
|
490 |
-
|
491 |
|
492 |
fig.tight_layout()
|
493 |
fig.canvas.draw()
|
494 |
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
499 |
|
500 |
-
def plot_projection_scores(self, definitional,
|
501 |
-
words, n_extreme=10,
|
502 |
-
ax=None, axis_projection_step=None):
|
503 |
"""Plot the projection scalar of words on the direction.
|
504 |
:param list words: The words tor project
|
505 |
:param int or None n_extreme: The number of extreme words to show
|
506 |
:return: The ax object of the plot
|
507 |
"""
|
508 |
-
|
509 |
-
|
510 |
|
511 |
-
self._identify_direction(
|
512 |
definitional=definitional,
|
513 |
method='sum')
|
514 |
|
@@ -553,80 +374,83 @@ class WEBiasExplorer2d(WordBiasExplorer):
|
|
553 |
return ax
|
554 |
|
555 |
|
556 |
-
class
|
557 |
-
def __init__(self,
|
558 |
-
super().__init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
559 |
|
560 |
-
def calculate_bias( self,
|
561 |
-
palabras_extremo_1,
|
562 |
-
palabras_extremo_2,
|
563 |
-
palabras_extremo_3,
|
564 |
-
palabras_extremo_4,
|
565 |
-
palabras_para_situar
|
566 |
-
):
|
567 |
wordlists = [
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
]
|
|
|
574 |
for wordlist in wordlists:
|
575 |
if not wordlist:
|
576 |
-
|
577 |
-
'¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + "<center><h3>"
|
578 |
|
579 |
err = self.check_oov(wordlist)
|
580 |
-
|
581 |
if err:
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
figsize=(10, 10)
|
602 |
-
):
|
603 |
|
604 |
fig, ax = plt.subplots(1, figsize=figsize)
|
605 |
self.method = method
|
606 |
self.plot_projection_scores(
|
607 |
definitional_1,
|
608 |
definitional_2,
|
609 |
-
|
610 |
fig.canvas.draw()
|
611 |
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
616 |
|
617 |
-
def plot_projection_scores(self, definitional_1, definitional_2,
|
618 |
-
words, n_extreme=10,
|
619 |
-
ax=None, axis_projection_step=None):
|
620 |
"""Plot the projection scalar of words on the direction.
|
621 |
:param list words: The words tor project
|
622 |
:param int or None n_extreme: The number of extreme words to show
|
623 |
:return: The ax object of the plot
|
624 |
"""
|
625 |
|
626 |
-
|
627 |
-
|
628 |
|
629 |
-
self._identify_direction(
|
630 |
definitional=definitional_1,
|
631 |
method='sum')
|
632 |
|
@@ -635,9 +459,9 @@ class WEBiasExplorer4d(WordBiasExplorer):
|
|
635 |
projections_df = self._calc_projection_scores(words)
|
636 |
projections_df['projection_x'] = projections_df['projection'].round(2)
|
637 |
|
638 |
-
|
639 |
-
|
640 |
-
self._identify_direction(
|
641 |
definitional=definitional_2,
|
642 |
method='sum')
|
643 |
|
@@ -673,13 +497,13 @@ class WEBiasExplorer4d(WordBiasExplorer):
|
|
673 |
for _, row in (projections_df.iterrows()):
|
674 |
ax.annotate(
|
675 |
row['word'], (row['projection_x'], row['projection_y']))
|
676 |
-
x_label = '← {} {} {} →'.format(
|
677 |
' ' * 20,
|
678 |
-
|
679 |
|
680 |
-
y_label = '← {} {} {} →'.format(
|
681 |
' ' * 20,
|
682 |
-
|
683 |
|
684 |
plt.xlabel(x_label)
|
685 |
ax.xaxis.set_label_position('bottom')
|
@@ -694,8 +518,5 @@ class WEBiasExplorer4d(WordBiasExplorer):
|
|
694 |
|
695 |
ax.set_xticks([])
|
696 |
ax.set_yticks([])
|
697 |
-
#plt.yticks([], [])
|
698 |
-
# ax.spines['left'].set_position('zero')
|
699 |
-
# ax.spines['bottom'].set_position('zero')
|
700 |
|
701 |
-
return ax
|
|
|
|
|
|
|
1 |
import copy
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import seaborn as sns
|
5 |
import matplotlib.pyplot as plt
|
6 |
from sklearn.decomposition import PCA
|
7 |
+
from typing import List, Dict, Tuple, Optional, Any
|
8 |
+
from modules.utils import normalize, cosine_similarity, project_params, take_two_sides_extreme_sorted
|
9 |
|
10 |
+
__all__ = ['WordBiasExplorer', 'WEBiasExplorer2Spaces', 'WEBiasExplorer4Spaces']
|
11 |
+
|
12 |
+
class WordBiasExplorer:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def __init__(
|
14 |
self,
|
15 |
embedding # Class Embedding instance
|
|
|
19 |
self.direction = None
|
20 |
self.positive_end = None
|
21 |
self.negative_end = None
|
22 |
+
self.DIRECTION_METHODS = ['single', 'sum', 'pca']
|
23 |
|
24 |
def __copy__(
|
25 |
self
|
26 |
+
) -> 'WordBiasExplorer':
|
27 |
|
28 |
bias_word_embedding = self.__class__(self.embedding)
|
29 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
|
|
33 |
|
34 |
def __deepcopy__(
|
35 |
self,
|
36 |
+
memo: Optional[Dict[int, Any]]
|
37 |
+
)-> 'WordBiasExplorer':
|
38 |
|
39 |
bias_word_embedding = copy.copy(self)
|
40 |
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
|
|
64 |
|
65 |
def _identify_subspace_by_pca(
|
66 |
self,
|
67 |
+
definitional_pairs: List[Tuple[str, str]],
|
68 |
+
n_components: int
|
69 |
+
) -> PCA:
|
70 |
|
71 |
matrix = []
|
72 |
|
|
|
86 |
|
87 |
def _identify_direction(
|
88 |
self,
|
89 |
+
positive_end: str,
|
90 |
+
negative_end: str,
|
91 |
+
definitional: Tuple[str, str],
|
92 |
+
method: str='pca',
|
93 |
+
first_pca_threshold: float=0.5
|
94 |
+
) -> None:
|
95 |
|
96 |
+
if method not in self.DIRECTION_METHODS:
|
97 |
raise ValueError('method should be one of {}, {} was given'.format(
|
98 |
+
self.DIRECTION_METHODS, method))
|
99 |
|
100 |
if positive_end == negative_end:
|
101 |
raise ValueError('positive_end and negative_end'
|
|
|
120 |
|
121 |
elif method == 'pca':
|
122 |
pca = self._identify_subspace_by_pca(definitional, 10)
|
123 |
+
if pca.explained_variance_ratio_[0] < first_pca_threshold:
|
124 |
raise RuntimeError('The Explained variance'
|
125 |
'of the first principal component should be'
|
126 |
'at least {}, but it is {}'
|
127 |
+
.format(first_pca_threshold,
|
128 |
pca.explained_variance_ratio_[0]))
|
129 |
direction = pca.components_[0]
|
130 |
|
|
|
143 |
def project_on_direction(
|
144 |
self,
|
145 |
word: str
|
146 |
+
) -> float:
|
147 |
|
148 |
"""Project the normalized vector of the word on the direction.
|
149 |
:param str word: The word tor project
|
|
|
159 |
|
160 |
def _calc_projection_scores(
|
161 |
self,
|
162 |
+
words: List[str]
|
163 |
+
) -> pd.DataFrame:
|
164 |
|
165 |
self._is_direction_identified()
|
166 |
|
|
|
175 |
|
176 |
def calc_projection_data(
|
177 |
self,
|
178 |
+
words: List[str]
|
179 |
+
) -> pd.DataFrame:
|
180 |
|
181 |
"""
|
182 |
Calculate projection, projected and rejected vectors of a words list.
|
|
|
204 |
|
205 |
def plot_dist_projections_on_direction(
|
206 |
self,
|
207 |
+
word_groups: Dict[str, List[str]],
|
208 |
+
ax: plt.Axes=None
|
209 |
+
) -> plt.Axes:
|
210 |
|
211 |
"""Plot the projection scalars distribution on the direction.
|
212 |
:param dict word_groups word: The groups to projects
|
|
|
239 |
|
240 |
def __errorChecking(
|
241 |
self,
|
242 |
+
word: str
|
243 |
+
) -> str:
|
244 |
|
245 |
out_msj = ""
|
246 |
|
|
|
254 |
|
255 |
def check_oov(
|
256 |
self,
|
257 |
+
wordlists: List[str]
|
258 |
+
) -> str:
|
259 |
|
260 |
for wordlist in wordlists:
|
261 |
for word in wordlist:
|
|
|
264 |
return msg
|
265 |
return None
|
266 |
|
267 |
+
class WEBiasExplorer2Spaces(WordBiasExplorer):
|
268 |
+
def __init__(self, embedding) -> None:
    """Create a two-space bias explorer.

    All initialisation is delegated to ``WordBiasExplorer.__init__``.

    :param embedding: The embedding object handed to the parent class.
    """
    super().__init__(embedding=embedding)
|
|
|
|
|
|
|
|
|
|
|
270 |
|
271 |
+
def calculate_bias(
    self,
    wordlist_to_diagnose: List[str],
    wordlist_right: List[str],
    wordlist_left: List[str]
) -> plt.Figure:
    """Validate the three word lists and build the two-space bias plot.

    :param wordlist_to_diagnose: Words to place on the plot.
    :param wordlist_right: Words defining the right end of the axis.
    :param wordlist_left: Words defining the left end of the axis.
    :return: Whatever ``self.get_bias_plot`` returns (annotated as a figure).
    :raises Exception: If any list is empty, or if ``self.check_oov``
        returns a non-empty error message.
    """
    wordlists = [wordlist_to_diagnose, wordlist_right, wordlist_left]

    # Every list must contribute at least one word.
    if any(not words for words in wordlists):
        raise Exception('At least one word should be in the to diagnose list, bias 1 list and bias 2 list')

    oov_message = self.check_oov(wordlists)
    if oov_message:
        raise Exception(oov_message)

    # The left list goes first in the definitional pair.
    definitional = (wordlist_left, wordlist_right)
    return self.get_bias_plot(
        wordlist_to_diagnose,
        definitional=definitional,
        method='sum',
        n_extreme=10
    )
|
294 |
|
295 |
def get_bias_plot(
    self,
    wordlist_to_diagnose: List[str],
    definitional: Tuple[List[str], List[str]],
    method: str = 'sum',
    n_extreme: int = 10,
    figsize: Tuple[int, int] = (10, 10)
) -> plt.Figure:
    """Render the projection plot on a fresh figure and return the figure.

    :param wordlist_to_diagnose: Words forwarded to
        ``plot_projection_scores`` as the words to plot.
    :param definitional: Pair of word lists forwarded to
        ``plot_projection_scores``.
    :param method: Stored on ``self.method``; this method does not pass it
        on to ``plot_projection_scores``.
    :param n_extreme: Forwarded to ``plot_projection_scores``.
    :param figsize: Figure size given to ``plt.subplots``.
    :return: The figure created here, after layout and a canvas draw.
    """
    figure, axes = plt.subplots(1, figsize=figsize)
    self.method = method

    # The drawing itself happens in plot_projection_scores on `axes`.
    self.plot_projection_scores(
        definitional,
        wordlist_to_diagnose,
        n_extreme,
        ax=axes
    )

    figure.tight_layout()
    figure.canvas.draw()
    return figure
|
314 |
+
|
315 |
+
def plot_projection_scores(
|
316 |
+
self,
|
317 |
+
definitional: Tuple[List[str], List[str]],
|
318 |
+
words: List[str],
|
319 |
+
n_extreme: int=10,
|
320 |
+
ax: plt.Axes=None,
|
321 |
+
axis_projection_step: float=None
|
322 |
+
) -> plt.Axes:
|
323 |
|
|
|
|
|
|
|
324 |
"""Plot the projection scalar of words on the direction.
|
325 |
:param list words: The words tor project
|
326 |
:param int or None n_extreme: The number of extreme words to show
|
327 |
:return: The ax object of the plot
|
328 |
"""
|
329 |
+
name_left = ', '.join(definitional[0])
|
330 |
+
name_right = ', '.join(definitional[1])
|
331 |
|
332 |
+
self._identify_direction(name_left, name_right,
|
333 |
definitional=definitional,
|
334 |
method='sum')
|
335 |
|
|
|
374 |
return ax
|
375 |
|
376 |
|
377 |
+
class WEBiasExplorer4Spaces(WordBiasExplorer):
|
378 |
+
def __init__(self, embedding) -> None:
    """Create a four-space bias explorer.

    All initialisation is delegated to ``WordBiasExplorer.__init__``.

    :param embedding: The embedding object handed to the parent class.
    """
    super().__init__(embedding=embedding)
|
380 |
+
|
381 |
+
def calculate_bias(
    self,
    wordlist_to_diagnose: List[str],
    wordlist_right: List[str],
    wordlist_left: List[str],
    wordlist_top: List[str],
    wordlist_bottom: List[str],
) -> plt.Figure:
    """Validate the five word lists and build the four-space bias plot.

    :param wordlist_to_diagnose: Words to place on the plot.
    :param wordlist_right: Words defining the right end of the first axis.
    :param wordlist_left: Words defining the left end of the first axis.
    :param wordlist_top: First list of the second definitional pair.
    :param wordlist_bottom: Second list of the second definitional pair.
    :return: Whatever ``self.get_bias_plot`` returns (annotated as a figure).
    :raises Exception: If any list is empty, or if ``self.check_oov``
        returns a non-empty error message.
    """
    wordlists = [
        wordlist_to_diagnose,
        wordlist_left,
        wordlist_right,
        wordlist_top,
        wordlist_bottom
    ]

    for wordlist in wordlists:
        if not wordlist:
            raise Exception('To plot with 4 spaces, you must enter at least one word in all lists')

    # check_oov iterates a list of word lists. Passing one word list would
    # make it walk the characters of each word, so pass the full collection.
    err = self.check_oov(wordlists)
    if err:
        raise Exception(err)

    return self.get_bias_plot(
        wordlist_to_diagnose,
        definitional_1=(wordlist_right, wordlist_left),
        definitional_2=(wordlist_top, wordlist_bottom),
        method='sum',
        n_extreme=10
    )
|
413 |
+
|
414 |
+
def get_bias_plot(
    self,
    wordlist_to_diagnose: List[str],
    definitional_1: Tuple[List[str], List[str]],
    definitional_2: Tuple[List[str], List[str]],
    method: str = 'sum',
    n_extreme: int = 10,
    figsize: Tuple[int, int] = (10, 10)
) -> plt.Figure:
    """Render the two-axis projection plot on a fresh figure and return it.

    :param wordlist_to_diagnose: Words forwarded to
        ``plot_projection_scores`` as the words to plot.
    :param definitional_1: First pair of word lists, forwarded unchanged.
    :param definitional_2: Second pair of word lists, forwarded unchanged.
    :param method: Stored on ``self.method``; this method does not pass it
        on to ``plot_projection_scores``.
    :param n_extreme: Forwarded to ``plot_projection_scores``.
    :param figsize: Figure size given to ``plt.subplots``.
    :return: The figure created here, after a canvas draw.
    """
    figure, axes = plt.subplots(1, figsize=figsize)
    self.method = method

    # The drawing itself happens in plot_projection_scores on `axes`.
    self.plot_projection_scores(
        definitional_1,
        definitional_2,
        wordlist_to_diagnose,
        n_extreme,
        ax=axes
    )

    figure.canvas.draw()
    return figure
|
433 |
+
|
434 |
+
def plot_projection_scores(
|
435 |
+
self,
|
436 |
+
definitional_1: Tuple[List[str], List[str]],
|
437 |
+
definitional_2: Tuple[List[str], List[str]],
|
438 |
+
words: List[str],
|
439 |
+
n_extreme: int=10,
|
440 |
+
ax: plt.Axes=None,
|
441 |
+
axis_projection_step: float=None
|
442 |
+
) -> plt.Axes:
|
443 |
|
|
|
|
|
|
|
444 |
"""Plot the projection scalar of words on the direction.
|
445 |
:param list words: The words tor project
|
446 |
:param int or None n_extreme: The number of extreme words to show
|
447 |
:return: The ax object of the plot
|
448 |
"""
|
449 |
|
450 |
+
name_left = ', '.join(definitional_1[1])
|
451 |
+
name_right = ', '.join(definitional_1[0])
|
452 |
|
453 |
+
self._identify_direction(name_left, name_right,
|
454 |
definitional=definitional_1,
|
455 |
method='sum')
|
456 |
|
|
|
459 |
projections_df = self._calc_projection_scores(words)
|
460 |
projections_df['projection_x'] = projections_df['projection'].round(2)
|
461 |
|
462 |
+
name_top = ', '.join(definitional_2[1])
|
463 |
+
name_bottom = ', '.join(definitional_2[0])
|
464 |
+
self._identify_direction(name_top, name_bottom,
|
465 |
definitional=definitional_2,
|
466 |
method='sum')
|
467 |
|
|
|
497 |
for _, row in (projections_df.iterrows()):
|
498 |
ax.annotate(
|
499 |
row['word'], (row['projection_x'], row['projection_y']))
|
500 |
+
x_label = '← {} {} {} →'.format(name_left,
|
501 |
' ' * 20,
|
502 |
+
name_right)
|
503 |
|
504 |
+
y_label = '← {} {} {} →'.format(name_top,
|
505 |
' ' * 20,
|
506 |
+
name_bottom)
|
507 |
|
508 |
plt.xlabel(x_label)
|
509 |
ax.xaxis.set_label_position('bottom')
|
|
|
518 |
|
519 |
ax.set_xticks([])
|
520 |
ax.set_yticks([])
|
|
|
|
|
|
|
521 |
|
522 |
+
return ax
|
modules/module_connection.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from abc import ABC
|
2 |
|
3 |
from modules.module_WordExplorer import WordExplorer
|
4 |
-
from modules.module_BiasExplorer import
|
5 |
from typing import List, Tuple
|
6 |
|
7 |
|
@@ -120,7 +120,10 @@ class BiasWordExplorerConnector(Connector):
|
|
120 |
else:
|
121 |
raise KeyError
|
122 |
|
123 |
-
self.
|
|
|
|
|
|
|
124 |
embedding=embedding
|
125 |
)
|
126 |
|
@@ -143,14 +146,14 @@ class BiasWordExplorerConnector(Connector):
|
|
143 |
if err:
|
144 |
return None, self.process_error(err)
|
145 |
|
146 |
-
err = self.
|
147 |
if err:
|
148 |
return None, self.process_error(err)
|
149 |
|
150 |
-
fig = self.
|
151 |
to_diagnose_list,
|
152 |
-
|
153 |
-
|
154 |
)
|
155 |
|
156 |
return fig, self.process_error(err)
|
@@ -174,20 +177,20 @@ class BiasWordExplorerConnector(Connector):
|
|
174 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
175 |
for _list in wordlists:
|
176 |
if not _list:
|
177 |
-
err = "To plot with 4 spaces, you must enter at least one word in all lists
|
178 |
if err:
|
179 |
return None, self.process_error(err)
|
180 |
|
181 |
-
err = self.
|
182 |
if err:
|
183 |
return None, self.process_error(err)
|
184 |
|
185 |
-
fig = self.
|
186 |
to_diagnose_list,
|
187 |
wordlist_1,
|
188 |
wordlist_2,
|
189 |
wordlist_3,
|
190 |
wordlist_4
|
191 |
)
|
192 |
-
|
193 |
return fig, self.process_error(err)
|
|
|
1 |
from abc import ABC
|
2 |
|
3 |
from modules.module_WordExplorer import WordExplorer
|
4 |
+
from modules.module_BiasExplorer import WEBiasExplorer2Spaces, WEBiasExplorer4Spaces
|
5 |
from typing import List, Tuple
|
6 |
|
7 |
|
|
|
120 |
else:
|
121 |
raise KeyError
|
122 |
|
123 |
+
self.bias_word_explorer_2_spaces = WEBiasExplorer2Spaces(
|
124 |
+
embedding=embedding
|
125 |
+
)
|
126 |
+
self.bias_word_explorer_4_spaces = WEBiasExplorer4Spaces(
|
127 |
embedding=embedding
|
128 |
)
|
129 |
|
|
|
146 |
if err:
|
147 |
return None, self.process_error(err)
|
148 |
|
149 |
+
err = self.bias_word_explorer_2_spaces.check_oov(word_lists)
|
150 |
if err:
|
151 |
return None, self.process_error(err)
|
152 |
|
153 |
+
fig = self.bias_word_explorer_2_spaces.calculate_bias(
|
154 |
to_diagnose_list,
|
155 |
+
wordlist_1,
|
156 |
+
wordlist_2
|
157 |
)
|
158 |
|
159 |
return fig, self.process_error(err)
|
|
|
177 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
178 |
for _list in wordlists:
|
179 |
if not _list:
|
180 |
+
err = "To plot with 4 spaces, you must enter at least one word in all lists"
|
181 |
if err:
|
182 |
return None, self.process_error(err)
|
183 |
|
184 |
+
err = self.bias_word_explorer_4_spaces.check_oov(wordlists)
|
185 |
if err:
|
186 |
return None, self.process_error(err)
|
187 |
|
188 |
+
fig = self.bias_word_explorer_4_spaces.calculate_bias(
|
189 |
to_diagnose_list,
|
190 |
wordlist_1,
|
191 |
wordlist_2,
|
192 |
wordlist_3,
|
193 |
wordlist_4
|
194 |
)
|
195 |
+
|
196 |
return fig, self.process_error(err)
|
modules/utils.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Tuple

import numpy as np
import pandas as pd
|
3 |
+
|
4 |
+
def take_two_sides_extreme_sorted(
    df: pd.DataFrame,
    n_extreme: int,
    part_column: Optional[str] = None,
    head_value: str = '',
    tail_value: str = ''
) -> pd.DataFrame:
    """Return the first and last ``n_extreme`` rows of ``df`` in one frame.

    No sorting is done here; the caller is presumably expected to pass a
    frame that is already ordered (as the name suggests).

    :param df: Source frame; it is not modified.
    :param n_extreme: Number of rows to take from each end.
    :param part_column: Optional column to add that labels which end each
        row came from.
    :param head_value: Label written to ``part_column`` for the head rows.
    :param tail_value: Label written to ``part_column`` for the tail rows.
    :return: Head rows followed by tail rows, with fully identical rows
        dropped and the index reset to 0..n-1.
    """
    # Take explicit copies: head()/tail() return slices of `df`, and adding
    # a column to a slice triggers pandas' SettingWithCopyWarning.
    head_df = df.head(n_extreme).copy()
    tail_df = df.tail(n_extreme).copy()

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    # When the two ends overlap (short frame), identical rows are collapsed.
    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
|
22 |
+
|
23 |
+
def normalize(
    v: np.ndarray
) -> np.ndarray:
    """Scale a 1-D vector to unit Euclidean length.

    :param v: A 1-D array, or any array-like that ``np.asarray`` accepts.
    :return: ``v / ||v||``. A zero-norm input is returned as is, which
        avoids dividing by zero.
    :raises ValueError: If ``v`` is not 1-D.
    """
    # asarray is a no-op for ndarrays and lets lists/tuples through too.
    v = np.asarray(v)
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
|
35 |
+
|
36 |
+
def project_params(
    u: np.ndarray,
    v: np.ndarray
) -> Tuple[float, np.ndarray, np.ndarray]:
    """Project ``v`` onto the direction ``u`` and split it into two parts.

    :param u: Direction vector; it is normalized with ``normalize`` first,
        which requires it to be 1-D.
    :param v: Vector to decompose (expected to be 1-D like ``u``).
    :return: A 3-tuple ``(projection, projected_vector, rejected_vector)``:
        the scalar ``v @ unit_u``, that scalar times ``unit_u``, and ``v``
        minus the projected vector.
    """
    unit_u = normalize(u)
    projection = v @ unit_u
    projected_vector = projection * unit_u
    # Whatever is left of v after removing its component along u.
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector
|
47 |
+
|
48 |
+
|
49 |
+
def cosine_similarity(
    v: np.ndarray,
    u: np.ndarray
) -> float:
    """Calculate the cosine similarity between two vectors.

    :param v: First vector.
    :param u: Second vector.
    :return: ``v @ u / (||v|| * ||u||)``. If either vector has zero norm
        the cosine is undefined; ``0.0`` is returned in that case instead
        of the ``nan`` (plus RuntimeWarning) a division by zero would give.
    """
    denominator = np.linalg.norm(v) * np.linalg.norm(u)
    if denominator == 0:
        return 0.0
    return v @ u / denominator
|