nanom committed on
Commit 6ff911e · 1 Parent(s): e47242d

Update all modules

.gitignore CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
-bias_tool_logs/
-*.env
+*.env
+logs_edia_we_english/
app.py CHANGED
@@ -6,26 +6,34 @@ import pandas as pd
 # --- Imports modules ---
 from modules.model_embbeding import Embedding
 
+
 # --- Imports interfaces ---
 from interfaces.interface_WordExplorer import interface as wordExplorer_interface
 from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
 
+
 # --- Tool config ---
-AVAILABLE_LOGS = True   # [True | False]
-LANGUAGE = "english"    # [spanish | english]
 EMBEDDINGS_PATH = "data/GoogleNews-vectors-negative300-SLIM.bin"
+LANGUAGE = "english"    # [spanish | english]
 MAX_NEIGHBORS = 20
+NN_METHOD = 'sklearn'   # ['sklearn' | 'ann']
+AVAILABLE_LOGS = True   # [True | False]
+
 
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
-    binary=EMBEDDINGS_PATH.endswith('.bin'),
-    limit=100_000,
+    limit=100000,
     randomizedPCA=False,
-    max_neighbors=20
+    max_neighbors=MAX_NEIGHBORS,
+    nn_method=NN_METHOD
 )
+
+
+# --- Init Vars ---
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
+
 # --- Main App ---
 INTERFACE_LIST = [
     biasWordExplorer_interface(
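
The nearest-neighbors backend is now chosen once, at app level. A minimal usage sketch of the new config block, mirroring the constructor shown in this diff (only the nn_method value is changed here for illustration):

    from modules.model_embbeding import Embedding

    embedding = Embedding(
        path="data/GoogleNews-vectors-negative300-SLIM.bin",
        limit=100000,           # load only the first 100k vectors
        randomizedPCA=False,
        max_neighbors=20,
        nn_method='ann'         # Annoy backend instead of the default 'sklearn'
    )
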
data/.gitignore CHANGED
@@ -1,2 +1 @@
 __pycache__/
-data_loader.py
data/data_loader.py DELETED
@@ -1,37 +0,0 @@
-import pandas as pd
-from sklearn.decomposition import PCA
-from gensim.models import KeyedVectors
-
-def load_embeddings(path, binary=False, randomPCA=False, limit=None):
-    if randomPCA:
-        pca = PCA(n_components=2,
-                  copy=False,
-                  whiten=False,
-                  svd_solver='randomized',
-                  iterated_power='auto'
-                  )
-    else:
-        pca = PCA(n_components=2)
-
-    print("--------> PATH:", path)
-    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
-
-    # Cased Vocab
-    cased_words = model.index_to_key
-    cased_emb = model.get_normed_vectors()
-    cased_pca = pca.fit_transform(cased_emb)
-
-    df_cased = pd.DataFrame(
-        zip(
-            cased_words,
-            cased_emb,
-            cased_pca
-        ),
-        columns=['word', 'embedding', 'pca']
-    )
-
-    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
-    df_uncased = df_cased.drop_duplicates(subset='word')
-    return df_uncased
-
-#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
examples/.gitignore CHANGED
@@ -1 +1,2 @@
-__pycache__
+__pycache__
+examples_es.py
examples/{examples.py → examples_en.py} RENAMED
File without changes
interfaces/interface_BiasWordExplorer.py CHANGED
@@ -1,48 +1,96 @@
 import gradio as gr
 import pandas as pd
-from tkinter import image_names
 
-from tool_info import TOOL_INFO
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
-from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+from examples.examples_en import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+from tool_info import TOOL_INFO
+
 
 # --- Interface ---
-def interface(embedding, available_logs, lang="spanish"):
+def interface(
+    embedding,              # Class Embedding instance
+    available_logs: bool,
+    lang: str="english"
+) -> gr.Blocks:
+
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
     )
+
     # --- Init vars ---
-    connector = BiasWordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+    connector = BiasWordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["BiasWordExplorer_interface"]
 
+    # --- Interface ---
     interface = gr.Blocks()
+
     with interface:
-        gr.Markdown(labels["step1"])
+        gr.Markdown(
+            value=labels["step1"]
+        )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    diagnose_list = gr.Textbox(
+                        lines=2,
+                        label=labels["wordListToDiagnose"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&2Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&2Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
-                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    wordlist_1 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList1"]
+                    )
+                    wordlist_2 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList2"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&4Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&4Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
-                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    wordlist_3 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList3"]
+                    )
+                    wordlist_4 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList4"]
+                    )
+
             with gr.Column():
                 with gr.Row():
-                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                    bias2d = gr.Button(
+                        value=labels["plot2SpacesButton"]
+                    )
                 with gr.Row():
-                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                    bias4d = gr.Button(
+                        value=labels["plot4SpacesButton"]
+                    )
                 with gr.Row():
-                    err_msg = gr.Markdown(label='', visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                 with gr.Row():
-                    bias_plot = gr.Plot(label="", show_label=False)
+                    bias_plot = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
+
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_2d,
@@ -54,51 +102,59 @@ def interface(embedding, available_logs, lang="spanish"):
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_4d,
-                inputs=[wordlist_1, wordlist_2,
-                        wordlist_3, wordlist_4, diagnose_list],
-                outputs=[bias_plot, err_msg],
+                inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+                outputs=[
+                    bias_plot, err_msg
+                ],
                 examples=examples2_explorar_sesgo_en_palabras,
                 label=labels["examples4Spaces"]
             )
 
         with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
     bias2d.click(
-        fn=connector.calculate_bias_2d,
-        inputs=[wordlist_1,wordlist_2,diagnose_list],
-        outputs=[bias_plot,err_msg]
+        fn=connector.calculate_bias_2d,
+        inputs=[wordlist_1, wordlist_2, diagnose_list],
+        outputs=[bias_plot, err_msg]
    )
 
    bias4d.click(
        fn=connector.calculate_bias_4d,
-        inputs=[wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list],
-        outputs=[bias_plot,err_msg]
+        inputs=[wordlist_1, wordlist_2,
+                wordlist_3, wordlist_4, diagnose_list],
+        outputs=[bias_plot, err_msg]
    )
 
    # --- Logs ---
-    save_field = [wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list]
-    log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+    save_field = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list]
+    log_callback.setup(
+        components=save_field,
+        flagging_dir="logs_word_bias"
+    )
 
    bias2d.click(
        fn=lambda *args: log_callback.flag(
            flag_data=args,
            flag_option="plot_2d",
            username="vialibre"
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
 
    bias4d.click(
        fn=lambda *args: log_callback.flag(
            flag_data=args,
            flag_option="plot_4d",
            username="vialibre"
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
-    return interface
+
+    return interface
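
Note how the logging callback now derives its dataset name from the UI language, so each language deployment writes to its own log dataset. A sketch of that routing, using only the keyword arguments visible in this diff:

    lang = "english"
    log_callback = HuggingFaceDatasetSaver(
        available_logs=True,
        dataset_name=f"logs_edia_we_{lang}"   # -> "logs_edia_we_english"
    )
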
interfaces/interface_WordExplorer.py CHANGED
@@ -2,73 +2,140 @@ import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 
-from tool_info import TOOL_INFO
 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
-from examples.examples import examples_explorar_relaciones_entre_palabras
+from examples.examples_en import examples_explorar_relaciones_entre_palabras
+from tool_info import TOOL_INFO
 
 plt.rcParams.update({'font.size': 14})
 
 def interface(
-    embedding,
+    embedding,              # Class Embedding instance
     available_logs: bool,
-    max_neighbors: int, # Updated
-    lang: str="spanish",
+    max_neighbors: int,
+    lang: str="english",
 ) -> gr.Blocks:
 
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
    )
+
    # --- Init vars ---
-    connector = WordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+    connector = WordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["WordExplorer_interface"]
 
    # --- Interface ---
    interface = gr.Blocks()
+
    with interface:
-        gr.Markdown(labels["title"])
+        gr.Markdown(
+            value=labels["title"]
+        )
+
        with gr.Row():
            with gr.Column(scale=3):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=5):
-                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                        diagnose_list = gr.Textbox(
+                            lines=2,
+                            label=labels["wordListToDiagnose"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist = gr.ColorPicker(label="",value='#000000',)
+                        color_wordlist = gr.ColorPicker(
+                            label="",
+                            value='#000000'
+                        )
+
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                        wordlist_1 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList1"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_1 = gr.ColorPicker(label="",value='#1f78b4')
+                        color_wordlist_1 = gr.ColorPicker(
+                            label="",
+                            value='#1f78b4'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                        wordlist_2 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList2"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_2 = gr.ColorPicker(label="",value='#33a02c')
+                        color_wordlist_2 = gr.ColorPicker(
+                            label="",
+                            value='#33a02c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                        wordlist_3 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList3"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_3 = gr.ColorPicker(label="",value='#e31a1c')
+                        color_wordlist_3 = gr.ColorPicker(
+                            label="",
+                            value='#e31a1c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                        wordlist_4 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList4"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_4 = gr.ColorPicker(label="",value='#6a3d9a')
+                        color_wordlist_4 = gr.ColorPicker(
+                            label="",
+                            value='#6a3d9a'
+                        )
            with gr.Column(scale=4):
                with gr.Row():
                    with gr.Row():
-                        gr.Markdown(labels["plotNeighbours"]["title"])
-                        n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
+                        gr.Markdown(
+                            value=labels["plotNeighbours"]["title"]
+                        )
+                        n_neighbors = gr.Slider(
+                            minimum=0,
+                            maximum=max_neighbors,
+                            step=1,
+                            label=labels["plotNeighbours"]["quantity"]
+                        )
                with gr.Row():
-                    alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
-                    fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
+                    alpha = gr.Slider(
+                        minimum=0.1,
+                        maximum=0.9,
+                        value=0.3,
+                        step=0.1,
+                        label=labels["options"]["transparency"]
+                    )
+                    fontsize=gr.Number(
+                        value=25,
+                        label=labels["options"]["font-size"]
+                    )
                with gr.Row():
-                    btn_plot = gr.Button(labels["plot_button"])
+                    btn_plot = gr.Button(
+                        value=labels["plot_button"]
+                    )
                with gr.Row():
-                    err_msg = gr.Markdown(label="", visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                with gr.Row():
-                    word_proyections = gr.Plot(label="", show_label=False)
+                    word_proyections = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
 
        with gr.Row():
            gr.Examples(
@@ -80,7 +147,9 @@ def interface(
            )
 
        with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
    btn_plot.click(
        fn=connector.plot_proyection_2d,
@@ -99,21 +168,25 @@ def interface(
            fontsize,
            n_neighbors
        ],
-        outputs=[word_proyections,err_msg]
+        outputs=[word_proyections, err_msg]
    )
 
    # --- Logs ---
-    save_field = [diagnose_list,wordlist_1,wordlist_2,wordlist_3,wordlist_4]
-    log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+    save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+    log_callback.setup(
+        components=save_field,
+        flagging_dir="logs_word_explorer"
+    )
 
    btn_plot.click(
        fn=lambda *args: log_callback.flag(
-            flag_data=args,
-            flag_option="explorar_palabras",
-            username="vialibre",
+            flag_data=args,
+            flag_option="word_explorer",
+            username="vialibre",
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
+
    return interface
language/.gitignore ADDED
@@ -0,0 +1 @@
+spanish.json
modules/model_embbeding.py CHANGED
@@ -3,9 +3,8 @@ from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
-from typing import List
+from typing import List, Any
 import os
-import operator
 import pandas as pd
 
 import numpy as np
@@ -14,21 +13,22 @@ from gensim import matutils
 
 
 class Embedding:
-    @profile
     def __init__(self,
        path: str,
-        binary: bool,
        limit: int=None,
        randomizedPCA: bool=False,
-        max_neighbors: int=20
+        max_neighbors: int=20,
+        nn_method: str='sklearn'
    ) -> None:
 
        # Embedding vars
        self.path = path
        self.limit = limit
        self.randomizedPCA = randomizedPCA
-        self.binary = binary
        self.max_neighbors = max_neighbors
+
+        self.availables_nn_methods = ['sklearn', 'ann']
+        self.nn_method = nn_method
 
        # Full embedding dataset
        self.ds = None
@@ -44,36 +44,34 @@ class Embedding:
        self,
    ) -> None:
 
+        assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
        # --- Prepare dataset ---
        self.ds = self.__preparate(
-            self.path, self.binary, self.limit, self.randomizedPCA
+            self.path, self.limit, self.randomizedPCA
        )
 
        # --- Estimate Nearest Neighbors
-        # Method A: Through annoy using forest tree
-        self.ann = Ann(
-            words=self.ds['word'],
-            vectors=self.ds['embedding'],
-            coord=self.ds['pca']
-        )
-        self.ann.init(
-            n_trees=20, metric='dot', n_jobs=-1
-        )
-
-        # Method B: Through Sklearn method
-        self.neigh = NearestNeighbors(
-            n_neighbors=self.max_neighbors
-        )
-        self.neigh.fit(
-            X=self.ds['embedding'].to_list()
-        )
+        if self.nn_method == 'sklearn':
+            # Method A: Through Sklearn method
+            self.__init_sklearn_method(
+                max_neighbors=self.max_neighbors,
+                vectors=self.ds['embedding'].to_list()
+            )
+
+        elif self.nn_method == 'ann':
+            # Method B: Through annoy using forest tree
+            self.__init_ann_method(
+                words=self.ds['word'].to_list(),
+                vectors=self.ds['embedding'].to_list(),
+                coord=self.ds['pca'].to_list()
+            )
 
    def __preparate(
        self,
-        path: str,
-        binary: bool,
+        path: str,
        limit: int,
        randomizedPCA: bool
    ) -> pd.DataFrame:
@@ -94,7 +92,7 @@
 
        model = KeyedVectors.load_word2vec_format(
            fname=path,
-            binary=binary,
+            binary=path.endswith('.bin'),
            limit=limit
        )
 
@@ -116,11 +114,48 @@
        df_uncased = df_cased.drop_duplicates(subset='word')
        return df_uncased
 
+    def __init_ann_method(
+        self,
+        words: List[str],
+        vectors: List[float],
+        coord: List[float],
+        n_trees: int=20,
+        metric: str='dot'
+    ) -> None:
+
+        print("Initializing Annoy method to search for nearby neighbors...")
+        self.ann = Ann(
+            words=words,
+            vectors=vectors,
+            coord=coord,
+        )
+
+        self.ann.init(
+            n_trees=n_trees,
+            metric=metric,
+            n_jobs=-1
+        )
+
+    def __init_sklearn_method(
+        self,
+        max_neighbors: int,
+        vectors: List[float]
+    ) -> None:
+
+        print("Initializing sklearn method to search for nearby neighbors...")
+        self.neigh = NearestNeighbors(
+            n_neighbors=max_neighbors
+        )
+        self.neigh.fit(
+            X=vectors
+        )
+
    def __getValue(
        self,
        word: str,
        feature: str
-    ):
+    ) -> Any:
+
        word_id, value = None, None
 
        if word in self:
@@ -128,20 +163,22 @@
 
        if word_id != None:
            value = self.ds[feature].to_list()[word_id]
+        else:
+            print(f"The word '{word}' does not exist")
 
        return value
 
    def getEmbedding(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
 
        return self.__getValue(word, 'embedding')
 
    def getPCA(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
 
        return self.__getValue(word, 'pca')
 
@@ -154,36 +191,61 @@
 
        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must be less than or equal to {self.max_neighbors}!"
 
+        assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
+        neighbors_list = []
+
+        if word not in self:
+            print(f"The word '{word}' does not exist")
+            return neighbors_list
+
        if nn_method == 'ann':
-            words = self.ann.get(word, n_neighbors)
+            if self.ann is None:
+                self.__init_ann_method(
+                    words=self.ds['word'].to_list(),
+                    vectors=self.ds['embedding'].to_list(),
+                    coord=self.ds['pca'].to_list()
+                )
+            neighbors_list = self.ann.get(word, n_neighbors)
 
        elif nn_method == 'sklearn':
-            word_emb = self.getEmbedding(word).reshape(1,-1)
-            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors+1)
-            #words = operator.itemgetter(*nn_ids[0])(self.ds['word'].to_list())
-            words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
-        else:
-            words = []
-        return words
+            if self.neigh is None:
+                self.__init_sklearn_method(
+                    max_neighbors=self.max_neighbors,
+                    vectors=self.ds['embedding'].to_list()
+                )
+
+            word_emb = self.getEmbedding(word).reshape(1,-1)
+            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
+            neighbors_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
 
-    def __contains__(
-        self,
-        word: str
-    ) -> bool:
-
-        return word in self.ds['word'].to_list()
+        return neighbors_list
 
-    # ToDo: Review these two methods used in the bias-in-words tab,
-    # since the embeddings now come normalized
-    def cosineSimilarities(self, vector_1, vectors_all):
+    def cosineSimilarities(
+        self,
+        vector_1,
+        vectors_all
+    ):
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities
 
-    def getCosineSimilarities(self, w1, w2):
+    def getCosineSimilarities(
+        self,
+        w1,
+        w2
+    ):
+
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )
+
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
+        return word in self.ds['word'].to_list()
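
With this change only the index named by nn_method is built when the embeddings load; getNearestNeighbors() lazily constructs the other backend on first use. A usage sketch under that assumption — the name of the loading step is not visible in this diff, so prepare() below is hypothetical:

    emb = Embedding(
        path="data/GoogleNews-vectors-negative300-SLIM.bin",
        limit=100000,
        nn_method='sklearn'
    )
    emb.prepare()   # hypothetical loader name; builds only the sklearn index
    emb.getNearestNeighbors("king", 5, 'sklearn')
    emb.getNearestNeighbors("king", 5, 'ann')   # Annoy index built here, on demand
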
modules/module_BiasExplorer.py CHANGED
@@ -1,3 +1,5 @@
+# ToDo: Remove classes/methods that are no longer used. Then unify syntax and add typing.
+
 import copy
 import numpy as np
 import pandas as pd
@@ -5,10 +7,14 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 
-def take_two_sides_extreme_sorted(df, n_extreme,
-                                  part_column=None,
-                                  head_value='',
-                                  tail_value=''):
+def take_two_sides_extreme_sorted(
+    df,
+    n_extreme,
+    part_column=None,
+    head_value='',
+    tail_value=''
+):
+
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]
 
@@ -56,39 +62,63 @@ __all__ = ['GenderBiasWE', 'BiasWordEmbedding']
 
 
 class WordBiasExplorer():
-    def __init__(self, vocabulary):
-        # pylint: disable=undefined-variable
+    def __init__(
+        self,
+        embedding           # Class Embedding instance
+    ) -> None:
 
-        self.vocabulary = vocabulary
+        self.embedding = embedding
        self.direction = None
        self.positive_end = None
        self.negative_end = None
 
-    def __copy__(self):
-        bias_word_embedding = self.__class__(self.vocabulary)
+    def __copy__(
+        self
+    ):
+
+        bias_word_embedding = self.__class__(self.embedding)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding
 
-    def __deepcopy__(self, memo):
+    def __deepcopy__(
+        self,
+        memo
+    ):
+
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding
 
-    def __getitem__(self, key):
-        return self.vocabulary.getEmbedding(key)
+    def __getitem__(
+        self,
+        key: str
+    ) -> np.ndarray:
+
+        return self.embedding.getEmbedding(key)
 
-    def __contains__(self, item):
-        return item in self.vocabulary
+    def __contains__(
+        self,
+        item: str
+    ) -> bool:
+
+        return item in self.embedding
 
-    def _is_direction_identified(self):
+    def _is_direction_identified(
+        self
+    ):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))
 
-    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
+    def _identify_subspace_by_pca(
+        self,
+        definitional_pairs,
+        n_components
+    ):
+
        matrix = []
 
        for word1, word2 in definitional_pairs:
@@ -105,8 +135,14 @@ class WordBiasExplorer():
        return pca
 
 
-    def _identify_direction(self, positive_end, negative_end,
-                            definitional, method='pca'):
+    def _identify_direction(
+        self,
+        positive_end,
+        negative_end,
+        definitional,
+        method='pca'
+    ):
+
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))
@@ -154,7 +190,11 @@ class WordBiasExplorer():
        self.positive_end = positive_end
        self.negative_end = negative_end
 
-    def project_on_direction(self, word):
+    def project_on_direction(
+        self,
+        word: str
+    ):
+
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
@@ -163,13 +203,15 @@ class WordBiasExplorer():
        self._is_direction_identified()
 
        vector = self[word]
-        projection_score = self.vocabulary.cosineSimilarities(self.direction,
+        projection_score = self.embedding.cosineSimilarities(self.direction,
                                                              [vector])[0]
        return projection_score
 
-
-    def _calc_projection_scores(self, words):
+    def _calc_projection_scores(
+        self,
+        words
+    ):
+
        self._is_direction_identified()
 
        df = pd.DataFrame({'word': words})
@@ -181,7 +223,11 @@ class WordBiasExplorer():
 
        return df
 
-    def calc_projection_data(self, words):
+    def calc_projection_data(
+        self,
+        words
+    ):
+
        """
        Calculate projection, projected and rejected vectors of a words list.
        :param list words: List of words
@@ -206,7 +252,12 @@ class WordBiasExplorer():
 
        return pd.DataFrame(projection_data)
 
-    def plot_dist_projections_on_direction(self, word_groups, ax=None):
+    def plot_dist_projections_on_direction(
+        self,
+        word_groups,
+        ax=None
+    ):
+
        """Plot the projection scalars distribution on the direction.
        :param dict word_groups word: The groups to projects
        :return float: The ax object of the plot
@@ -221,7 +272,7 @@ class WordBiasExplorer():
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
-            projections = self.vocabulary.cosineSimilarities(self.direction,
+            projections = self.embedding.cosineSimilarities(self.direction,
                                                             vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)
 
@@ -236,18 +287,26 @@ class WordBiasExplorer():
 
        return ax
 
-    def __errorChecking(self, word):
+    def __errorChecking(
+        self,
+        word
+    ):
+
        out_msj = ""
 
        if not word:
            out_msj = "Error: First you must enter a word!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
 
        return out_msj
 
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists
+    ):
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -255,39 +314,44 @@ class WordBiasExplorer():
                return msg
        return None
 
-    def plot_biased_words(self,
-                          words_to_diagnose,
-                          wordlist_right,
-                          wordlist_left,
-                          wordlist_top=[],
-                          wordlist_bottom=[]
-                          ):
+    def plot_biased_words(
+        self,
+        words_to_diagnose,
+        wordlist_right,
+        wordlist_left,
+        wordlist_top=[],
+        wordlist_bottom=[]
+    ):
+
        bias_2D = wordlist_top == [] and wordlist_bottom == []
 
        if bias_2D and (not wordlist_right or not wordlist_left):
            raise Exception('For bar plot, wordlist right and left can NOT be empty')
        elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
-            raise Exception('For plane (2D) plot, wordlist right, left, top and down can NOT be empty')
+            raise Exception('For plane plot, wordlist right, left, top and down can NOT be empty')
 
        err = self.check_oov([words_to_diagnose + wordlist_right + wordlist_left + wordlist_top + wordlist_bottom])
        if err:
            raise Exception(err)
 
-        return self.get_bias_plot(bias_2D,
-                                  words_to_diagnose,
-                                  definitional_1=(wordlist_right, wordlist_left),
-                                  definitional_2=(wordlist_top, wordlist_bottom)
-                                  )
+        return self.get_bias_plot(
+            bias_2D,
+            words_to_diagnose,
+            definitional_1=(wordlist_right, wordlist_left),
+            definitional_2=(wordlist_top, wordlist_bottom)
+        )
 
-    def get_bias_plot(self,
-                      plot_2D,
-                      words_to_diagnose,
-                      definitional_1,
-                      definitional_2=([], []),
-                      method='sum',
-                      n_extreme=10,
-                      figsize=(15, 10)
-                      ):
+    def get_bias_plot(
+        self,
+        plot_2D,
+        words_to_diagnose,
+        definitional_1,
+        definitional_2=([], []),
+        method='sum',
+        n_extreme=10,
+        figsize=(15, 10)
+    ):
+
        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
@@ -298,14 +362,17 @@ class WordBiasExplorer():
 
        return fig
 
-    def plot_projection_scores(self,
-                               plot_2D,
-                               words,
-                               definitional_1,
-                               definitional_2=([], []),
-                               n_extreme=10,
-                               ax=None,
-                               axis_projection_step=0.1):
+    def plot_projection_scores(
+        self,
+        plot_2D,
+        words,
+        definitional_1,
+        definitional_2=([], []),
+        n_extreme=10,
+        ax=None,
+        axis_projection_step=0.1
+    ):
+
        name_left = ', '.join(definitional_1[1])
        name_right = ', '.join(definitional_1[0])
 
@@ -341,6 +408,9 @@ class WordBiasExplorer():
            sns.barplot(x='projection', y='word', data=projections_df,
                        palette=projections_df['color'])
        else:
+            # ToDo: review this warning:
+            # Ignoring `palette` because no `hue` variable has been assigned.
+
            sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                            palette=projections_df['color'])
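
For intuition, project_on_direction() above reduces to a single cosine similarity between a word vector and the identified bias direction. A toy numeric sketch with made-up 2-d vectors:

    import numpy as np

    direction = np.array([1.0, 0.0])   # e.g. a PCA-identified definitional axis
    word_vec = np.array([0.6, 0.8])    # unit-length toy word embedding
    score = np.dot(direction, word_vec) / (np.linalg.norm(direction) * np.linalg.norm(word_vec))
    print(round(score, 2))             # 0.6 -> leans toward the positive end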
 
modules/module_WordExplorer.py CHANGED
@@ -1,3 +1,4 @@
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
@@ -5,37 +6,52 @@ from numpy.linalg import norm
 
 import matplotlib as mpl
 mpl.use('Agg')
-import matplotlib.pyplot as plt
+from typing import List, Dict, Tuple
+
 
 class WordToPlot:
-    def __init__(self, word, color, bias_space, alpha):
+    def __init__(
+        self,
+        word: str,
+        color: str,
+        bias_space: int,
+        alpha: float
+    ):
+
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha
 
+
 class WordExplorer:
-    def __init__(self, vocabulary) -> None:
-        self.vocabulary = vocabulary
+    def __init__(
+        self,
+        embedding           # Class Embedding instance
+    ) -> None:
+
+        self.embedding = embedding
 
-    def __errorChecking(self, word):
+    def __errorChecking(
+        self,
+        word: str
+    ) -> str:
+
        out_msj = ""
 
        if not word:
            out_msj = "Error: First you must enter a word!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
 
        return out_msj
 
-    def parse_words(self, string):
-        words = string.strip()
-        if words:
-            words = [word.strip() for word in words.split(',') if word != ""]
-        return words
-
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists: List[str]
+    ) -> str:
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -43,10 +59,21 @@ class WordExplorer:
                return msg
        return None
 
-    def get_neighbors(self, word, n_neighbors, nn_method):
-        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)
+    def get_neighbors(
+        self,
+        word: str,
+        n_neighbors: int,
+        nn_method: str
+    ) -> List[str]:
+
+        return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)
 
-    def get_df(self, words_embedded, processed_word_list):
+    def get_df(
+        self,
+        words_embedded: np.ndarray,
+        processed_word_list: List[str]
+    ) -> pd.DataFrame:
+
        df = pd.DataFrame(words_embedded)
 
        df['word'] = [wtp.word for wtp in processed_word_list]
@@ -55,16 +82,18 @@
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df
 
-    def get_plot(self,
-                 data,
-                 processed_word_list,
-                 words_embedded,
-                 color_dict,
-                 n_neighbors,
-                 n_alpha,
-                 fontsize=18,
-                 figsize=(20, 15)
-                 ):
+    def get_plot(
+        self,
+        data: pd.DataFrame,
+        processed_word_list: List[str],
+        words_embedded: np.ndarray,
+        color_dict: Dict,
+        n_neighbors: int,
+        n_alpha: float,
+        fontsize: int=18,
+        figsize: Tuple[int, int]=(20, 15)
+    ):
+
        fig, ax = plt.subplots(figsize=figsize)
 
        sns.scatterplot(
@@ -89,11 +118,20 @@
            legend=False,
            palette=color_dict
        )
+
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
-            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
-                        textcoords='offset points',
-                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)
+            ax.annotate(
+                wtp.word,
+                xy=(x, y),
+                xytext=(5, 2),
+                color=wtp.color,
+                textcoords='offset points',
+                ha='right',
+                va='bottom',
+                size=fontsize,
+                alpha=wtp.alpha
+            )
 
        ax.set_xticks([])
        ax.set_yticks([])
@@ -103,25 +141,27 @@
 
        return fig
 
-    def plot_projections_2d(self,
-                            wordlist_0,
-                            wordlist_1 = [],
-                            wordlist_2 = [],
-                            wordlist_3 = [],
-                            wordlist_4 = [],
-                            **kwargs
-                            ):
+    def plot_projections_2d(
+        self,
+        wordlist_0: List[str],
+        wordlist_1: List[str]=[],
+        wordlist_2: List[str]=[],
+        wordlist_3: List[str]=[],
+        wordlist_4: List[str]=[],
+        **kwargs
+    ):
+
        # convert them to vectors
        choices = [0, 1, 2, 3, 4]
        wordlist_choice = [
            wordlist_0,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4
        ]
 
        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)
@@ -139,45 +179,69 @@
        processed_word_list = []
        for word_list_to_process, color in zip(wordlist_choice, choices):
            for word in word_list_to_process:
-                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
+                processed_word_list.append(
+                    WordToPlot(word, color_dict[color], color, 1)
+                )
 
                if n_neighbors > 0:
-                    neighbors = self.get_neighbors(word,
-                                                   n_neighbors=n_neighbors,
-                                                   nn_method=kwargs.get('nn_method', 'sklearn')
-                                                   )
+                    neighbors = self.get_neighbors(
+                        word,
+                        n_neighbors=n_neighbors,
+                        nn_method=kwargs.get('nn_method', 'sklearn')
+                    )
+
                    for n in neighbors:
                        if n not in [wtp.word for wtp in processed_word_list]:
-                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
+                            processed_word_list.append(
+                                WordToPlot(n, color_dict[color], color, n_alpha)
+                            )
 
        if not processed_word_list:
            raise Exception('Only empty lists were passed')
 
-        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
+        words_embedded = np.array(
+            [self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
+        )
 
-        data = self.get_df(words_embedded, processed_word_list)
+        data = self.get_df(
+            words_embedded,
+            processed_word_list
+        )
 
-        fig = self.get_plot(data, processed_word_list, words_embedded,
-                            color_dict, n_neighbors, n_alpha,
-                            kwargs.get('fontsize', 18),
-                            kwargs.get('figsize', (20, 15))
-                            )
+        fig = self.get_plot(
+            data,
+            processed_word_list,
+            words_embedded,
+            color_dict,
+            n_neighbors,
+            n_alpha,
+            kwargs.get('fontsize', 18),
+            kwargs.get('figsize', (20, 15))
+        )
+
        plt.show()
        return fig
 
-    def doesnt_match(self, wordlist):
+    # ToDo: This method has no usages. Delete?
+    def doesnt_match(
+        self,
+        wordlist: List[str]
+    ) -> str:
+
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)
 
-        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
+        words_emb = np.array([self.embedding.getEmbedding(word)
+                              for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)
 
        doesnt_match = ""
        farthest_emb = 1.0
        for word in wordlist:
-            word_emb = self.vocabulary.getEmbedding(word)
-            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec)*norm(word_emb))
+            word_emb = self.embedding.getEmbedding(word)
+            cos_sim = np.dot(mean_vec, word_emb) / \
+                (norm(mean_vec)*norm(word_emb))
            if cos_sim <= farthest_emb:
                farthest_emb = cos_sim
                doesnt_match = word
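
The doesnt_match() logic above picks the word whose vector is least cosine-similar to the mean of all the vectors. A standalone sketch of the same computation with toy 2-d embeddings:

    import numpy as np
    from numpy.linalg import norm

    vectors = {
        "apple": np.array([0.9, 0.1]),   # made-up vectors, illustration only
        "pear":  np.array([0.8, 0.2]),
        "car":   np.array([0.1, 0.9]),
    }
    mean_vec = np.mean(list(vectors.values()), axis=0)
    outlier = min(
        vectors,
        key=lambda w: np.dot(mean_vec, vectors[w]) / (norm(mean_vec) * norm(vectors[w]))
    )
    print(outlier)   # -> 'car'
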
modules/module_ann.py CHANGED
@@ -1,45 +1,71 @@
 import time
-import operator
 from tqdm import tqdm
 from annoy import AnnoyIndex
 from memory_profiler import profile
+from typing import List
 
 class TicToc:
-    def __init__(self):
+    def __init__(
+        self
+    ) -> None:
+
        self.i = None
-    def start(self):
+
+    def start(
+        self
+    ) -> None:
+
        self.i = time.time()
-    def stop(self):
+
+    def stop(
+        self
+    ) -> None:
+
        f = time.time()
        print(f - self.i, "seg.")
 
+
 class Ann:
-    def __init__(self, words, vectors, coord):
-        self.words = words.to_list()
-        self.vectors = vectors.to_list()
-        self.coord = coord.to_list()
+    def __init__(
+        self,
+        words: List[str],
+        vectors: List,
+        coord: List,
+    ) -> None:
+
+        self.words = words
+        self.vectors = vectors
+        self.coord = coord
        self.tree = None
 
        self.tt = TicToc()
 
-    @profile
-    def init(self, n_trees=10, metric='angular', n_jobs=-1):
-        # metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
-        # n_jobs=-1 Run over all CPU availables
+    def init(self,
+        n_trees: int=10,
+        metric: str='angular',
+        n_jobs: int=-1      # n_jobs=-1 runs over all available CPUs
+    ) -> None:
 
-        print("Init tree...")
+        availables_metrics = ['angular','euclidean','manhattan','hamming','dot']
+        assert(metric in availables_metrics), f"Error: The value of the parameter 'metric' can only be {availables_metrics}!"
+
+        print("\tInit tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
-        for i,v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
-            self.tree.add_item(i,v)
+        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
+            self.tree.add_item(i, v)
        self.tt.stop()
 
-        print("Build tree...")
+        print("\tBuild tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()
 
-    def __getWordId(self, word):
+    def __getWordId(
+        self,
+        word: str
+    ) -> int:
+
        word_id = None
        try:
            word_id = self.words.index(word)
@@ -47,16 +73,20 @@ class Ann:
            pass
        return word_id
 
-    def get(self, word, n_neighbors=10):
+    def get(
+        self,
+        word: str,
+        n_neighbors: int=10
+    ) -> List[str]:
+
        word_id = self.__getWordId(word)
-        reword_xy_list = None
+        neighbors_list = None
 
        if word_id != None:
-            neighbord_id = self.tree.get_nns_by_item(word_id, n_neighbors)
-            # word_xy_list = list(map(lambda i: (self.words[i],self.coord[i]), neighbord_id))
-            # word_xy_list = list(map(lambda i: self.words[i], neighbord_id))
-            word_xy_list = operator.itemgetter(*neighbord_id)(self.words)
+            neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
+            neighbors_list = [self.words[idx] for idx in neighbords_id][1:]
        else:
            print(f"The word '{word}' does not exist")
 
-        return word_xy_list
+        return neighbors_list
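
Ann.get() now asks Annoy for n_neighbors + 1 items and drops the first hit, because with normalized vectors the closest item to a word is the word itself. A minimal sketch of that behavior against Annoy directly:

    from annoy import AnnoyIndex

    index = AnnoyIndex(2, metric='dot')
    for i, v in enumerate([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]):   # toy unit-ish vectors
        index.add_item(i, v)
    index.build(10)

    ids = index.get_nns_by_item(0, 2 + 1)   # ask for 2 neighbors of item 0, plus itself
    print(ids[1:])                          # -> [1, 2]; item 0 excluded
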
modules/module_connection.py CHANGED
@@ -1,52 +1,75 @@
-import numpy as np
-import pandas as pd
-import gradio as gr
-from abc import ABC, abstractmethod
 
 from modules.module_WordExplorer import WordExplorer
 from modules.module_BiasExplorer import WordBiasExplorer
 
 class Connector(ABC):
-    def parse_word(self, word : str):
        return word.lower().strip()
 
-    def parse_words(self, array_in_string : str):
        words = array_in_string.strip()
        if not words:
            return []
-        words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
        return words
 
-    def process_error(self, err: str):
-        if err is None:
-            return
-        return "<center><h3>" + err + "</h3></center>"
 
 
 class WordExplorerConnector(Connector):
 
-    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.word_explorer = WordExplorer(embedding)
-
-    def plot_proyection_2d( self,
-                            wordlist_0,
-                            wordlist_1,
-                            wordlist_2,
-                            wordlist_3,
-                            wordlist_4,
-                            color_wordlist_0,
-                            color_wordlist_1,
-                            color_wordlist_2,
-                            color_wordlist_3,
-                            color_wordlist_4,
-                            n_alpha,
-                            fontsize,
-                            n_neighbors
-                            ):
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
@@ -59,49 +82,63 @@ class WordExplorerConnector(Connector):
        err = self.process_error("Enter at least one word to continue")
        return None, err
 
-        err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
        if err:
            return None, self.process_error(err)
 
-        fig = self.word_explorer.plot_projections_2d(wordlist_0,
-                                                     wordlist_1,
-                                                     wordlist_2,
-                                                     wordlist_3,
-                                                     wordlist_4,
-                                                     color_wordlist_0=color_wordlist_0,
-                                                     color_wordlist_1=color_wordlist_1,
-                                                     color_wordlist_2=color_wordlist_2,
-                                                     color_wordlist_3=color_wordlist_3,
-                                                     color_wordlist_4=color_wordlist_4,
-                                                     n_alpha=n_alpha,
-                                                     fontsize=fontsize,
-                                                     n_neighbors=n_neighbors,
-                                                     nn_method = neighbors_method
-                                                     )
        return fig, self.process_error(err)
 
 class BiasWordExplorerConnector(Connector):
 
-    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.bias_word_explorer = WordBiasExplorer(embedding)
 
-    def calculate_bias_2d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          to_diagnose_list
-                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
-        for list in word_lists:
-            if not list:
            err = "At least one word should be in the to diagnose list, bias 1 list and bias 2 list"
        if err:
            return None, self.process_error(err)
@@ -110,17 +147,23 @@ class BiasWordExplorerConnector(Connector):
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)
 
        return fig, self.process_error(err)
 
-    def calculate_bias_4d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          wordlist_3,
-                          wordlist_4,
-                          to_diagnose_list
-                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
@@ -129,8 +172,8 @@
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
-        for list in wordlists:
-            if not list:
            err = "To plot with 4 spaces, you must enter at least one word in all lists."
        if err:
            return None, self.process_error(err)
@@ -139,5 +182,12 @@
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
        return fig, self.process_error(err)

+from abc import ABC
 
 from modules.module_WordExplorer import WordExplorer
 from modules.module_BiasExplorer import WordBiasExplorer
+from typing import List, Tuple
+
 
 class Connector(ABC):
+    def parse_word(
+        self,
+        word: str
+    ) -> str:
+
        return word.lower().strip()
 
+    def parse_words(
+        self,
+        array_in_string: str
+    ) -> List[str]:
+
        words = array_in_string.strip()
        if not words:
            return []
+
+        words = [
+            self.parse_word(word)
+            for word in words.split(',') if word.strip() != ''
+        ]
        return words
 
+    def process_error(
+        self,
+        err: str
+    ) -> str:
+
+        if err:
+            err = "<center><h3>" + err + "</h3></center>"
+        return err
 
 
 class WordExplorerConnector(Connector):
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
 
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
+
+        self.word_explorer = WordExplorer(
+            embedding=embedding
+        )
+
+    def plot_proyection_2d(
+        self,
+        wordlist_0: str,
+        wordlist_1: str,
+        wordlist_2: str,
+        wordlist_3: str,
+        wordlist_4: str,
+        color_wordlist_0: str,
+        color_wordlist_1: str,
+        color_wordlist_2: str,
+        color_wordlist_3: str,
+        color_wordlist_4: str,
+        n_alpha: float,
+        fontsize: int,
+        n_neighbors: int
+    ) -> Tuple:
+
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
 
        err = self.process_error("Enter at least one word to continue")
        return None, err
 
+        err = self.word_explorer.check_oov(
+            [wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        )
+
        if err:
            return None, self.process_error(err)
 
+        fig = self.word_explorer.plot_projections_2d(
+            wordlist_0,
+            wordlist_1,
+            wordlist_2,
+            wordlist_3,
+            wordlist_4,
+            color_wordlist_0=color_wordlist_0,
+            color_wordlist_1=color_wordlist_1,
+            color_wordlist_2=color_wordlist_2,
+            color_wordlist_3=color_wordlist_3,
102
+ color_wordlist_4=color_wordlist_4,
103
+ n_alpha=n_alpha,
104
+ fontsize=fontsize,
105
+ n_neighbors=n_neighbors,
106
+ nn_method=neighbors_method
107
+ )
108
+
109
  return fig, self.process_error(err)
110
 
111
  class BiasWordExplorerConnector(Connector):
112
 
113
+ def __init__(
114
+ self,
115
+ **kwargs
116
+ ) -> None:
117
+
118
  if 'embedding' in kwargs:
119
  embedding = kwargs.get('embedding')
120
  else:
121
  raise KeyError
 
122
 
123
+ self.bias_word_explorer = WordBiasExplorer(
124
+ embedding=embedding
125
+ )
126
+
127
+ def calculate_bias_2d(
128
+ self,
129
+ wordlist_1: str,
130
+ wordlist_2: str,
131
+ to_diagnose_list: str
132
+ ) -> Tuple:
133
+
134
  err = ""
135
  wordlist_1 = self.parse_words(wordlist_1)
136
  wordlist_2 = self.parse_words(wordlist_2)
137
  to_diagnose_list = self.parse_words(to_diagnose_list)
138
 
139
  word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
140
+ for _list in word_lists:
141
+ if not _list:
142
  err = "At least one word should be in the to diagnose list, bias 1 list and bias 2 list"
143
  if err:
144
  return None, self.process_error(err)
 
147
  if err:
148
  return None, self.process_error(err)
149
 
150
+ fig = self.bias_word_explorer.plot_biased_words(
151
+ to_diagnose_list,
152
+ wordlist_2,
153
+ wordlist_1
154
+ )
155
 
156
  return fig, self.process_error(err)
157
 
158
+ def calculate_bias_4d(
159
+ self,
160
+ wordlist_1: str,
161
+ wordlist_2: str,
162
+ wordlist_3: str,
163
+ wordlist_4: str,
164
+ to_diagnose_list: str
165
+ ) -> Tuple:
166
+
167
  err = ""
168
  wordlist_1 = self.parse_words(wordlist_1)
169
  wordlist_2 = self.parse_words(wordlist_2)
 
172
  to_diagnose_list = self.parse_words(to_diagnose_list)
173
 
174
  wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
175
+ for _list in wordlists:
176
+ if not _list:
177
  err = "To plot with 4 spaces, you must enter at least one word in all lists."
178
  if err:
179
  return None, self.process_error(err)
 
182
  if err:
183
  return None, self.process_error(err)
184
 
185
+ fig = self.bias_word_explorer.plot_biased_words(
186
+ to_diagnose_list,
187
+ wordlist_1,
188
+ wordlist_2,
189
+ wordlist_3,
190
+ wordlist_4
191
+ )
192
+
193
  return fig, self.process_error(err)
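
For reference, a small self-contained sketch of the parsing contract the refactored `Connector` enforces (mirroring `parse_words` and `process_error` above; the sample inputs are hypothetical): a comma-separated UI string becomes a lowercased, trimmed word list, and any error message is wrapped as centered HTML for the Gradio output.

```python
from typing import List

def parse_words(array_in_string: str) -> List[str]:
    # Mirrors Connector.parse_words: split on commas, trim, lowercase, drop empties.
    words = array_in_string.strip()
    if not words:
        return []
    return [w.lower().strip() for w in words.split(',') if w.strip() != '']

def process_error(err: str) -> str:
    # Mirrors Connector.process_error: wrap a non-empty message as centered HTML.
    return "<center><h3>" + err + "</h3></center>" if err else err

print(parse_words(" King, Queen , ,woman "))  # ['king', 'queen', 'woman']
print(process_error("Enter at least one word to continue"))
```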
modules/module_logsManager.py CHANGED
@@ -1,26 +1,36 @@
1
- import csv, os, pytz
 
2
  from gradio import utils
3
- from datetime import datetime
4
- from dotenv import load_dotenv
5
- from distutils.log import debug
6
  from typing import Any, List, Optional
7
- from gradio.components import IOComponent
8
- from gradio.flagging import FlaggingCallback, _get_dataset_features_info
 
9
 
10
 
11
  # --- Load environment vars ---
12
  load_dotenv()
13
 
 
14
  # --- Classes declaration ---
15
  class DateLogs:
16
- def __init__(self, zone="America/Argentina/Cordoba"):
 
 
 
 
17
  self.time_zone = pytz.timezone(zone)
18
 
19
- def full(self):
 
 
 
20
  now = datetime.now(self.time_zone)
21
  return now.strftime("%H:%M:%S %d-%m-%Y")
22
 
23
- def day(self):
 
 
 
24
  now = datetime.now(self.time_zone)
25
  return now.strftime("%d-%m-%Y")
26
 
@@ -40,12 +50,12 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
40
 
41
  def __init__(
42
  self,
43
- hf_token: str = os.getenv('HF_TOKEN'),
44
- dataset_name: str = os.getenv('DS_LOGS_NAME'),
45
- organization: Optional[str] = os.getenv('ORG_NAME'),
46
- private: bool = True,
47
- available_logs: bool = False
48
- ):
49
  """
50
  Parameters:
51
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
@@ -53,6 +63,8 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
53
  organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
54
  private: Whether the dataset should be private (defaults to True).
55
  """
 
 
56
  self.hf_token = hf_token
57
  self.dataset_name = dataset_name
58
  self.organization_name = organization
@@ -65,10 +77,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
65
 
66
 
67
  def setup(
68
- self,
69
- components: List[IOComponent],
70
- flagging_dir: str
71
- ):
72
  """
73
  Params:
74
  flagging_dir (str): local directory where the dataset is cloned,
@@ -112,9 +124,9 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
112
  def flag(
113
  self,
114
  flag_data: List[Any],
115
- flag_option: Optional[str] = None,
116
- flag_index: Optional[int] = None,
117
- username: Optional[str] = None,
118
  ) -> int:
119
 
120
  if self.available_logs:
 
1
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
2
+ from gradio.components import IOComponent
3
  from gradio import utils
 
 
 
4
  from typing import Any, List, Optional
5
+ from dotenv import load_dotenv
6
+ from datetime import datetime
7
+ import csv, os, pytz
8
 
9
 
10
  # --- Load environment vars ---
11
  load_dotenv()
12
 
13
+
14
  # --- Classes declaration ---
15
  class DateLogs:
16
+ def __init__(
17
+ self,
18
+ zone: str="America/Argentina/Cordoba"
19
+ ) -> None:
20
+
21
  self.time_zone = pytz.timezone(zone)
22
 
23
+ def full(
24
+ self
25
+ ) -> str:
26
+
27
  now = datetime.now(self.time_zone)
28
  return now.strftime("%H:%M:%S %d-%m-%Y")
29
 
30
+ def day(
31
+ self
32
+ ) -> str:
33
+
34
  now = datetime.now(self.time_zone)
35
  return now.strftime("%d-%m-%Y")
36
 
 
50
 
51
  def __init__(
52
  self,
53
+ dataset_name: Optional[str]=None,
54
+ hf_token: str=os.getenv('HF_TOKEN'),
55
+ organization: Optional[str]=os.getenv('ORG_NAME'),
56
+ private: bool=True,
57
+ available_logs: bool=False
58
+ ) -> None:
59
  """
60
  Parameters:
61
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
62
  dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1".
63
  organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
64
  private: Whether the dataset should be private (defaults to True).
65
  """
66
+ assert dataset_name is not None, "Error: Parameter 'dataset_name' cannot be empty!"
67
+
68
  self.hf_token = hf_token
69
  self.dataset_name = dataset_name
70
  self.organization_name = organization
 
77
 
78
 
79
  def setup(
80
+ self,
81
+ components: List[IOComponent],
82
+ flagging_dir: str
83
+ ) -> None:
84
  """
85
  Params:
86
  flagging_dir (str): local directory where the dataset is cloned,
 
124
  def flag(
125
  self,
126
  flag_data: List[Any],
127
+ flag_option: Optional[str]=None,
128
+ flag_index: Optional[int]=None,
129
+ username: Optional[str]=None,
130
  ) -> int:
131
 
132
  if self.available_logs:
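
For context, a hedged sketch of how this logger is meant to be wired in: because `HuggingFaceDatasetSaver` subclasses Gradio's `FlaggingCallback`, it can be passed to an `Interface`, which calls `setup()` at launch and `flag()` on each logged interaction. The dataset name and the toy interface below are assumptions for illustration; `HF_TOKEN` and `ORG_NAME` are read from the environment via `load_dotenv()` as in the module above.

```python
import gradio as gr
from modules.module_logsManager import HuggingFaceDatasetSaver

# Assumed dataset name for illustration; the real Space configures its own.
logger = HuggingFaceDatasetSaver(
    dataset_name="edia_we_english_logs",
    available_logs=True   # with False, flag() skips writing entirely
)

demo = gr.Interface(
    fn=lambda word: word.lower().strip(),   # placeholder function
    inputs="text",
    outputs="text",
    allow_flagging="auto",                  # log every submission automatically
    flagging_callback=logger
)
demo.launch()
```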
tool_info.py CHANGED
@@ -4,7 +4,7 @@ TOOL_INFO = """
4
  * [Read Full Paper](https://arxiv.org/abs/2207.06591)
5
 
6
  > ### Licensing Information
7
- * [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/bias_we_std_tool/resolve/main/LICENSE)
8
 
9
  > ### Citation Information
10
  ```c
 
4
  * [Read Full Paper](https://arxiv.org/abs/2207.06591)
5
 
6
  > ### Licensing Information
7
+ * [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_en/resolve/main/LICENSE)
8
 
9
  > ### Citation Information
10
  ```c