nanom committed on
Commit 6ff911e · 1 Parent(s): e47242d

Update all modules

.gitignore CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
-bias_tool_logs/
-*.env
+*.env
+logs_edia_we_english/
app.py CHANGED
@@ -6,26 +6,34 @@ import pandas as pd
 # --- Imports modules ---
 from modules.model_embbeding import Embedding
 
+
 # --- Imports interfaces ---
 from interfaces.interface_WordExplorer import interface as wordExplorer_interface
 from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
 
+
 # --- Tool config ---
-AVAILABLE_LOGS = True   # [True | False]
-LANGUAGE = "english"    # [spanish | english]
 EMBEDDINGS_PATH = "data/GoogleNews-vectors-negative300-SLIM.bin"
+LANGUAGE = "english"    # [spanish | english]
 MAX_NEIGHBORS = 20
+NN_METHOD = 'sklearn'   # ['sklearn' | 'ann']
+AVAILABLE_LOGS = True   # [True | False]
+
 
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
-    binary=EMBEDDINGS_PATH.endswith('.bin'),
-    limit=100_000,
+    limit=100000,
     randomizedPCA=False,
-    max_neighbors=20
+    max_neighbors=MAX_NEIGHBORS,
+    nn_method=NN_METHOD
 )
+
+
+# --- Init Vars ---
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
+
 # --- Main App ---
 INTERFACE_LIST = [
     biasWordExplorer_interface(
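
The nearest-neighbors backend is now chosen once, at app level. A minimal usage sketch of the new config block, mirroring the constructor shown in this diff (only the nn_method value is changed here for illustration):

    from modules.model_embbeding import Embedding

    embedding = Embedding(
        path="data/GoogleNews-vectors-negative300-SLIM.bin",
        limit=100000,           # load only the first 100k vectors
        randomizedPCA=False,
        max_neighbors=20,
        nn_method='ann'         # Annoy backend instead of the default 'sklearn'
    )
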
data/.gitignore CHANGED
@@ -1,2 +1 @@
 __pycache__/
-data_loader.py
data/data_loader.py DELETED
@@ -1,37 +0,0 @@
-import pandas as pd
-from sklearn.decomposition import PCA
-from gensim.models import KeyedVectors
-
-def load_embeddings(path, binary=False, randomPCA=False, limit=None):
-    if randomPCA:
-        pca = PCA(n_components=2,
-                  copy=False,
-                  whiten=False,
-                  svd_solver='randomized',
-                  iterated_power='auto'
-                  )
-    else:
-        pca = PCA(n_components=2)
-
-    print("--------> PATH:", path)
-    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
-
-    # Cased Vocab
-    cased_words = model.index_to_key
-    cased_emb = model.get_normed_vectors()
-    cased_pca = pca.fit_transform(cased_emb)
-
-    df_cased = pd.DataFrame(
-        zip(
-            cased_words,
-            cased_emb,
-            cased_pca
-        ),
-        columns=['word', 'embedding', 'pca']
-    )
-
-    df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
-    df_uncased = df_cased.drop_duplicates(subset='word')
-    return df_uncased
-
-#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
examples/.gitignore CHANGED
@@ -1 +1,2 @@
-__pycache__
+__pycache__
+examples_es.py
examples/{examples.py → examples_en.py} RENAMED
File without changes
interfaces/interface_BiasWordExplorer.py CHANGED
@@ -1,48 +1,96 @@
 import gradio as gr
 import pandas as pd
-from tkinter import image_names
 
-from tool_info import TOOL_INFO
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
-from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+from examples.examples_en import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+from tool_info import TOOL_INFO
+
 
 # --- Interface ---
-def interface(embedding, available_logs, lang="spanish"):
+def interface(
+    embedding,              # Class Embedding instance
+    available_logs: bool,
+    lang: str="english"
+) -> gr.Blocks:
+
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
     )
+
     # --- Init vars ---
-    connector = BiasWordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+    connector = BiasWordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["BiasWordExplorer_interface"]
 
+    # --- Interface ---
     interface = gr.Blocks()
+
     with interface:
-        gr.Markdown(labels["step1"])
+        gr.Markdown(
+            value=labels["step1"]
+        )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    diagnose_list = gr.Textbox(
+                        lines=2,
+                        label=labels["wordListToDiagnose"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&2Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&2Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
-                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    wordlist_1 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList1"]
+                    )
+                    wordlist_2 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList2"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&4Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&4Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
-                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    wordlist_3 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList3"]
+                    )
+                    wordlist_4 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList4"]
+                    )
+
             with gr.Column():
                 with gr.Row():
-                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                    bias2d = gr.Button(
+                        value=labels["plot2SpacesButton"]
+                    )
                 with gr.Row():
-                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                    bias4d = gr.Button(
+                        value=labels["plot4SpacesButton"]
+                    )
                 with gr.Row():
-                    err_msg = gr.Markdown(label='', visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                 with gr.Row():
-                    bias_plot = gr.Plot(label="", show_label=False)
+                    bias_plot = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
+
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_2d,
@@ -54,51 +102,59 @@ def interface(embedding, available_logs, lang="spanish"):
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_4d,
-                inputs=[wordlist_1, wordlist_2,
-                        wordlist_3, wordlist_4, diagnose_list],
-                outputs=[bias_plot, err_msg],
+                inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+                outputs=[
+                    bias_plot, err_msg
+                ],
                 examples=examples2_explorar_sesgo_en_palabras,
                 label=labels["examples4Spaces"]
             )
 
         with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
     bias2d.click(
-        fn=connector.calculate_bias_2d,
-        inputs=[wordlist_1,wordlist_2,diagnose_list],
-        outputs=[bias_plot,err_msg]
+        fn=connector.calculate_bias_2d,
+        inputs=[wordlist_1, wordlist_2, diagnose_list],
+        outputs=[bias_plot, err_msg]
    )
 
    bias4d.click(
        fn=connector.calculate_bias_4d,
-        inputs=[wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list],
-        outputs=[bias_plot,err_msg]
+        inputs=[wordlist_1, wordlist_2,
+                wordlist_3, wordlist_4, diagnose_list],
+        outputs=[bias_plot, err_msg]
    )
 
    # --- Logs ---
-    save_field = [wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list]
-    log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+    save_field = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list]
+    log_callback.setup(
+        components=save_field,
+        flagging_dir="logs_word_bias"
+    )
 
    bias2d.click(
        fn=lambda *args: log_callback.flag(
            flag_data=args,
            flag_option="plot_2d",
            username="vialibre"
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
 
    bias4d.click(
        fn=lambda *args: log_callback.flag(
            flag_data=args,
            flag_option="plot_4d",
            username="vialibre"
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
-    return interface
+
+    return interface
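
Note how the logging callback now derives its dataset name from the UI language, so each language deployment writes to its own log dataset. A sketch of that routing, using only the keyword arguments visible in this diff:

    lang = "english"
    log_callback = HuggingFaceDatasetSaver(
        available_logs=True,
        dataset_name=f"logs_edia_we_{lang}"   # -> "logs_edia_we_english"
    )
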
interfaces/interface_WordExplorer.py CHANGED
@@ -2,73 +2,140 @@ import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 
-from tool_info import TOOL_INFO
 from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
-from examples.examples import examples_explorar_relaciones_entre_palabras
+from examples.examples_en import examples_explorar_relaciones_entre_palabras
+from tool_info import TOOL_INFO
 
 plt.rcParams.update({'font.size': 14})
 
 def interface(
-    embedding,
+    embedding,              # Class Embedding instance
     available_logs: bool,
-    max_neighbors: int, # Updated
-    lang: str="spanish",
+    max_neighbors: int,
+    lang: str="english",
 ) -> gr.Blocks:
 
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
    )
+
    # --- Init vars ---
-    connector = WordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+    connector = WordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["WordExplorer_interface"]
 
    # --- Interface ---
    interface = gr.Blocks()
+
    with interface:
-        gr.Markdown(labels["title"])
+        gr.Markdown(
+            value=labels["title"]
+        )
+
        with gr.Row():
            with gr.Column(scale=3):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=5):
-                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                        diagnose_list = gr.Textbox(
+                            lines=2,
+                            label=labels["wordListToDiagnose"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist = gr.ColorPicker(label="",value='#000000',)
+                        color_wordlist = gr.ColorPicker(
+                            label="",
+                            value='#000000'
+                        )
+
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                        wordlist_1 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList1"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_1 = gr.ColorPicker(label="",value='#1f78b4')
+                        color_wordlist_1 = gr.ColorPicker(
+                            label="",
+                            value='#1f78b4'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                        wordlist_2 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList2"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_2 = gr.ColorPicker(label="",value='#33a02c')
+                        color_wordlist_2 = gr.ColorPicker(
+                            label="",
+                            value='#33a02c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                        wordlist_3 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList3"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_3 = gr.ColorPicker(label="",value='#e31a1c')
+                        color_wordlist_3 = gr.ColorPicker(
+                            label="",
+                            value='#e31a1c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                        wordlist_4 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList4"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_4 = gr.ColorPicker(label="",value='#6a3d9a')
+                        color_wordlist_4 = gr.ColorPicker(
+                            label="",
+                            value='#6a3d9a'
+                        )
            with gr.Column(scale=4):
                with gr.Row():
                    with gr.Row():
-                        gr.Markdown(labels["plotNeighbours"]["title"])
-                        n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
+                        gr.Markdown(
+                            value=labels["plotNeighbours"]["title"]
+                        )
+                        n_neighbors = gr.Slider(
+                            minimum=0,
+                            maximum=max_neighbors,
+                            step=1,
+                            label=labels["plotNeighbours"]["quantity"]
+                        )
                with gr.Row():
-                    alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
-                    fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
+                    alpha = gr.Slider(
+                        minimum=0.1,
+                        maximum=0.9,
+                        value=0.3,
+                        step=0.1,
+                        label=labels["options"]["transparency"]
+                    )
+                    fontsize=gr.Number(
+                        value=25,
+                        label=labels["options"]["font-size"]
+                    )
                with gr.Row():
-                    btn_plot = gr.Button(labels["plot_button"])
+                    btn_plot = gr.Button(
+                        value=labels["plot_button"]
+                    )
                with gr.Row():
-                    err_msg = gr.Markdown(label="", visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                with gr.Row():
-                    word_proyections = gr.Plot(label="", show_label=False)
+                    word_proyections = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
 
        with gr.Row():
            gr.Examples(
@@ -80,7 +147,9 @@ def interface(
            )
 
        with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
    btn_plot.click(
        fn=connector.plot_proyection_2d,
@@ -99,21 +168,25 @@ def interface(
            fontsize,
            n_neighbors
        ],
-        outputs=[word_proyections,err_msg]
+        outputs=[word_proyections, err_msg]
    )
 
    # --- Logs ---
-    save_field = [diagnose_list,wordlist_1,wordlist_2,wordlist_3,wordlist_4]
-    log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+    save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+    log_callback.setup(
+        components=save_field,
+        flagging_dir="logs_word_explorer"
+    )
 
    btn_plot.click(
        fn=lambda *args: log_callback.flag(
-            flag_data=args,
-            flag_option="explorar_palabras",
-            username="vialibre",
+            flag_data=args,
+            flag_option="word_explorer",
+            username="vialibre",
        ),
        inputs=save_field,
        outputs=None,
        preprocess=False
    )
+
    return interface
language/.gitignore ADDED
@@ -0,0 +1 @@
+spanish.json
modules/model_embbeding.py CHANGED
@@ -3,9 +3,8 @@ from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
-from typing import List
+from typing import List, Any
 import os
-import operator
 import pandas as pd
 
 import numpy as np
@@ -14,21 +13,22 @@ from gensim import matutils
 
 
 class Embedding:
-    @profile
     def __init__(self,
        path: str,
-        binary: bool,
        limit: int=None,
        randomizedPCA: bool=False,
-        max_neighbors: int=20
+        max_neighbors: int=20,
+        nn_method: str='sklearn'
    ) -> None:
 
        # Embedding vars
        self.path = path
        self.limit = limit
        self.randomizedPCA = randomizedPCA
-        self.binary = binary
        self.max_neighbors = max_neighbors
+
+        self.availables_nn_methods = ['sklearn', 'ann']
+        self.nn_method = nn_method
 
        # Full embedding dataset
        self.ds = None
@@ -44,36 +44,34 @@ class Embedding:
        self,
    ) -> None:
 
+        assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
        # --- Prepare dataset ---
        self.ds = self.__preparate(
-            self.path, self.binary, self.limit, self.randomizedPCA
+            self.path, self.limit, self.randomizedPCA
        )
 
        # --- Estimate Nearest Neighbors
-        # Method A: Through annoy using forest tree
-        self.ann = Ann(
-            words=self.ds['word'],
-            vectors=self.ds['embedding'],
-            coord=self.ds['pca']
-        )
-        self.ann.init(
-            n_trees=20, metric='dot', n_jobs=-1
-        )
-
-        # Method B: Through Sklearn method
-        self.neigh = NearestNeighbors(
-            n_neighbors=self.max_neighbors
-        )
-        self.neigh.fit(
-            X=self.ds['embedding'].to_list()
-        )
+        if self.nn_method == 'sklearn':
+            # Method A: Through Sklearn method
+            self.__init_sklearn_method(
+                max_neighbors=self.max_neighbors,
+                vectors=self.ds['embedding'].to_list()
+            )
+
+        elif self.nn_method == 'ann':
+            # Method B: Through annoy using forest tree
+            self.__init_ann_method(
+                words=self.ds['word'].to_list(),
+                vectors=self.ds['embedding'].to_list(),
+                coord=self.ds['pca'].to_list()
+            )
 
    def __preparate(
        self,
-        path: str,
-        binary: bool,
+        path: str,
        limit: int,
        randomizedPCA: bool
    ) -> pd.DataFrame:
@@ -94,7 +92,7 @@
 
        model = KeyedVectors.load_word2vec_format(
            fname=path,
-            binary=binary,
+            binary=path.endswith('.bin'),
            limit=limit
        )
 
@@ -116,11 +114,48 @@
        df_uncased = df_cased.drop_duplicates(subset='word')
        return df_uncased
 
+    def __init_ann_method(
+        self,
+        words: List[str],
+        vectors: List[float],
+        coord: List[float],
+        n_trees: int=20,
+        metric: str='dot'
+    ) -> None:
+
+        print("Initializing Annoy method to search for nearby neighbors...")
+        self.ann = Ann(
+            words=words,
+            vectors=vectors,
+            coord=coord,
+        )
+
+        self.ann.init(
+            n_trees=n_trees,
+            metric=metric,
+            n_jobs=-1
+        )
+
+    def __init_sklearn_method(
+        self,
+        max_neighbors: int,
+        vectors: List[float]
+    ) -> None:
+
+        print("Initializing sklearn method to search for nearby neighbors...")
+        self.neigh = NearestNeighbors(
+            n_neighbors=max_neighbors
+        )
+        self.neigh.fit(
+            X=vectors
+        )
+
    def __getValue(
        self,
        word: str,
        feature: str
-    ):
+    ) -> Any:
+
        word_id, value = None, None
 
        if word in self:
@@ -128,20 +163,22 @@
 
        if word_id != None:
            value = self.ds[feature].to_list()[word_id]
+        else:
+            print(f"The word '{word}' does not exist")
 
        return value
 
    def getEmbedding(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
 
        return self.__getValue(word, 'embedding')
 
    def getPCA(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
 
        return self.__getValue(word, 'pca')
 
@@ -154,36 +191,61 @@
 
        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must be less than or equal to {self.max_neighbors}!"
 
+        assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
+        neighbors_list = []
+
+        if word not in self:
+            print(f"The word '{word}' does not exist")
+            return neighbors_list
+
        if nn_method == 'ann':
-            words = self.ann.get(word, n_neighbors)
+            if self.ann is None:
+                self.__init_ann_method(
+                    words=self.ds['word'].to_list(),
+                    vectors=self.ds['embedding'].to_list(),
+                    coord=self.ds['pca'].to_list()
+                )
+            neighbors_list = self.ann.get(word, n_neighbors)
 
        elif nn_method == 'sklearn':
-            word_emb = self.getEmbedding(word).reshape(1,-1)
-            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors+1)
-            #words = operator.itemgetter(*nn_ids[0])(self.ds['word'].to_list())
-            words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
-        else:
-            words = []
-        return words
+            if self.neigh is None:
+                self.__init_sklearn_method(
+                    max_neighbors=self.max_neighbors,
+                    vectors=self.ds['embedding'].to_list()
+                )
+
+            word_emb = self.getEmbedding(word).reshape(1,-1)
+            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
+            neighbors_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
 
-    def __contains__(
-        self,
-        word: str
-    ) -> bool:
-
-        return word in self.ds['word'].to_list()
+        return neighbors_list
 
-    # ToDo: Review these two methods used in the bias-in-words tab,
-    # since the embeddings now come normalized
-    def cosineSimilarities(self, vector_1, vectors_all):
+    def cosineSimilarities(
+        self,
+        vector_1,
+        vectors_all
+    ):
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities
 
-    def getCosineSimilarities(self, w1, w2):
+    def getCosineSimilarities(
+        self,
+        w1,
+        w2
+    ):
+
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )
+
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
+        return word in self.ds['word'].to_list()
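
With this change only the index named by nn_method is built when the embeddings load; getNearestNeighbors() lazily constructs the other backend on first use. A usage sketch under that assumption — the name of the loading step is not visible in this diff, so prepare() below is hypothetical:

    emb = Embedding(
        path="data/GoogleNews-vectors-negative300-SLIM.bin",
        limit=100000,
        nn_method='sklearn'
    )
    emb.prepare()   # hypothetical loader name; builds only the sklearn index
    emb.getNearestNeighbors("king", 5, 'sklearn')
    emb.getNearestNeighbors("king", 5, 'ann')   # Annoy index built here, on demand
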
modules/module_BiasExplorer.py CHANGED
@@ -1,3 +1,5 @@
+# ToDo: Remove classes/methods that are no longer used. Then unify syntax and add typing.
+
 import copy
 import numpy as np
 import pandas as pd
@@ -5,10 +7,14 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 
-def take_two_sides_extreme_sorted(df, n_extreme,
-                                  part_column=None,
-                                  head_value='',
-                                  tail_value=''):
+def take_two_sides_extreme_sorted(
+    df,
+    n_extreme,
+    part_column=None,
+    head_value='',
+    tail_value=''
+):
+
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]
 
@@ -56,39 +62,63 @@ __all__ = ['GenderBiasWE', 'BiasWordEmbedding']
 
 
 class WordBiasExplorer():
-    def __init__(self, vocabulary):
-        # pylint: disable=undefined-variable
+    def __init__(
+        self,
+        embedding           # Class Embedding instance
+    ) -> None:
 
-        self.vocabulary = vocabulary
+        self.embedding = embedding
        self.direction = None
        self.positive_end = None
        self.negative_end = None
 
-    def __copy__(self):
-        bias_word_embedding = self.__class__(self.vocabulary)
+    def __copy__(
+        self
+    ):
+
+        bias_word_embedding = self.__class__(self.embedding)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding
 
-    def __deepcopy__(self, memo):
+    def __deepcopy__(
+        self,
+        memo
+    ):
+
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding
 
-    def __getitem__(self, key):
-        return self.vocabulary.getEmbedding(key)
+    def __getitem__(
+        self,
+        key: str
+    ) -> np.ndarray:
+
+        return self.embedding.getEmbedding(key)
 
-    def __contains__(self, item):
-        return item in self.vocabulary
+    def __contains__(
+        self,
+        item: str
+    ) -> bool:
+
+        return item in self.embedding
 
-    def _is_direction_identified(self):
+    def _is_direction_identified(
+        self
+    ):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))
 
-    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
+    def _identify_subspace_by_pca(
+        self,
+        definitional_pairs,
+        n_components
+    ):
+
        matrix = []
 
        for word1, word2 in definitional_pairs:
@@ -105,8 +135,14 @@ class WordBiasExplorer():
        return pca
 
 
-    def _identify_direction(self, positive_end, negative_end,
-                            definitional, method='pca'):
+    def _identify_direction(
+        self,
+        positive_end,
+        negative_end,
+        definitional,
+        method='pca'
+    ):
+
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))
@@ -154,7 +190,11 @@ class WordBiasExplorer():
        self.positive_end = positive_end
        self.negative_end = negative_end
 
-    def project_on_direction(self, word):
+    def project_on_direction(
+        self,
+        word: str
+    ):
+
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
@@ -163,13 +203,15 @@ class WordBiasExplorer():
        self._is_direction_identified()
 
        vector = self[word]
-        projection_score = self.vocabulary.cosineSimilarities(self.direction,
+        projection_score = self.embedding.cosineSimilarities(self.direction,
                                                              [vector])[0]
        return projection_score
 
-
-    def _calc_projection_scores(self, words):
+    def _calc_projection_scores(
+        self,
+        words
+    ):
+
        self._is_direction_identified()
 
        df = pd.DataFrame({'word': words})
@@ -181,7 +223,11 @@ class WordBiasExplorer():
 
        return df
 
-    def calc_projection_data(self, words):
+    def calc_projection_data(
+        self,
+        words
+    ):
+
        """
        Calculate projection, projected and rejected vectors of a words list.
        :param list words: List of words
@@ -206,7 +252,12 @@ class WordBiasExplorer():
 
        return pd.DataFrame(projection_data)
 
-    def plot_dist_projections_on_direction(self, word_groups, ax=None):
+    def plot_dist_projections_on_direction(
+        self,
+        word_groups,
+        ax=None
+    ):
+
        """Plot the projection scalars distribution on the direction.
        :param dict word_groups word: The groups to projects
        :return float: The ax object of the plot
@@ -221,7 +272,7 @@ class WordBiasExplorer():
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
-            projections = self.vocabulary.cosineSimilarities(self.direction,
+            projections = self.embedding.cosineSimilarities(self.direction,
                                                             vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)
 
@@ -236,18 +287,26 @@ class WordBiasExplorer():
 
        return ax
 
-    def __errorChecking(self, word):
+    def __errorChecking(
+        self,
+        word
+    ):
+
        out_msj = ""
 
        if not word:
            out_msj = "Error: First you must enter a word!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
 
        return out_msj
 
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists
+    ):
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -255,39 +314,44 @@ class WordBiasExplorer():
                return msg
        return None
 
-    def plot_biased_words(self,
-                          words_to_diagnose,
-                          wordlist_right,
-                          wordlist_left,
-                          wordlist_top=[],
-                          wordlist_bottom=[]
-                          ):
+    def plot_biased_words(
+        self,
+        words_to_diagnose,
+        wordlist_right,
+        wordlist_left,
+        wordlist_top=[],
+        wordlist_bottom=[]
+    ):
+
        bias_2D = wordlist_top == [] and wordlist_bottom == []
 
        if bias_2D and (not wordlist_right or not wordlist_left):
            raise Exception('For bar plot, wordlist right and left can NOT be empty')
        elif not bias_2D and (not wordlist_right or not wordlist_left or not wordlist_top or not wordlist_bottom):
-            raise Exception('For plane (2D) plot, wordlist right, left, top and down can NOT be empty')
+            raise Exception('For plane plot, wordlist right, left, top and down can NOT be empty')
 
        err = self.check_oov([words_to_diagnose + wordlist_right + wordlist_left + wordlist_top + wordlist_bottom])
        if err:
            raise Exception(err)
 
-        return self.get_bias_plot(bias_2D,
-                                  words_to_diagnose,
-                                  definitional_1=(wordlist_right, wordlist_left),
-                                  definitional_2=(wordlist_top, wordlist_bottom)
-                                  )
+        return self.get_bias_plot(
+            bias_2D,
+            words_to_diagnose,
+            definitional_1=(wordlist_right, wordlist_left),
+            definitional_2=(wordlist_top, wordlist_bottom)
+        )
 
-    def get_bias_plot(self,
-                      plot_2D,
-                      words_to_diagnose,
-                      definitional_1,
-                      definitional_2=([], []),
-                      method='sum',
-                      n_extreme=10,
-                      figsize=(15, 10)
-                      ):
+    def get_bias_plot(
+        self,
+        plot_2D,
+        words_to_diagnose,
+        definitional_1,
+        definitional_2=([], []),
+        method='sum',
+        n_extreme=10,
+        figsize=(15, 10)
+    ):
+
        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
@@ -298,14 +362,17 @@ class WordBiasExplorer():
 
        return fig
 
-    def plot_projection_scores(self,
-                               plot_2D,
-                               words,
-                               definitional_1,
-                               definitional_2=([], []),
-                               n_extreme=10,
-                               ax=None,
-                               axis_projection_step=0.1):
+    def plot_projection_scores(
+        self,
+        plot_2D,
+        words,
+        definitional_1,
+        definitional_2=([], []),
+        n_extreme=10,
+        ax=None,
+        axis_projection_step=0.1
+    ):
+
        name_left = ', '.join(definitional_1[1])
        name_right = ', '.join(definitional_1[0])
 
@@ -341,6 +408,9 @@ class WordBiasExplorer():
            sns.barplot(x='projection', y='word', data=projections_df,
                        palette=projections_df['color'])
        else:
+            # ToDo: review this warning:
+            # Ignoring `palette` because no `hue` variable has been assigned.
+
            sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                            palette=projections_df['color'])
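
For intuition, project_on_direction() above reduces to a single cosine similarity between a word vector and the identified bias direction. A toy numeric sketch with made-up 2-d vectors:

    import numpy as np

    direction = np.array([1.0, 0.0])   # e.g. a PCA-identified definitional axis
    word_vec = np.array([0.6, 0.8])    # unit-length toy word embedding
    score = np.dot(direction, word_vec) / (np.linalg.norm(direction) * np.linalg.norm(word_vec))
    print(round(score, 2))             # 0.6 -> leans toward the positive end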
 
modules/module_WordExplorer.py CHANGED
@@ -1,3 +1,4 @@
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
@@ -5,37 +6,52 @@ from numpy.linalg import norm
 
 import matplotlib as mpl
 mpl.use('Agg')
-import matplotlib.pyplot as plt
+from typing import List, Dict, Tuple
+
 
 class WordToPlot:
-    def __init__(self, word, color, bias_space, alpha):
+    def __init__(
+        self,
+        word: str,
+        color: str,
+        bias_space: int,
+        alpha: float
+    ):
+
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha
 
+
 class WordExplorer:
-    def __init__(self, vocabulary) -> None:
-        self.vocabulary = vocabulary
+    def __init__(
+        self,
+        embedding           # Class Embedding instance
+    ) -> None:
+
+        self.embedding = embedding
 
-    def __errorChecking(self, word):
+    def __errorChecking(
+        self,
+        word: str
+    ) -> str:
+
        out_msj = ""
 
        if not word:
            out_msj = "Error: First you must enter a word!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: The word '<b>{word}</b>' is not in the vocabulary!"
 
        return out_msj
 
-    def parse_words(self, string):
-        words = string.strip()
-        if words:
-            words = [word.strip() for word in words.split(',') if word != ""]
-        return words
-
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists: List[str]
+    ) -> str:
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -43,10 +59,21 @@ class WordExplorer:
                return msg
        return None
 
-    def get_neighbors(self, word, n_neighbors, nn_method):
-        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)
+    def get_neighbors(
+        self,
+        word: str,
+        n_neighbors: int,
+        nn_method: str
+    ) -> List[str]:
+
+        return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)
 
-    def get_df(self, words_embedded, processed_word_list):
+    def get_df(
+        self,
+        words_embedded: np.ndarray,
+        processed_word_list: List[str]
+    ) -> pd.DataFrame:
+
        df = pd.DataFrame(words_embedded)
 
        df['word'] = [wtp.word for wtp in processed_word_list]
@@ -55,16 +82,18 @@
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df
 
-    def get_plot(self,
-                 data,
-                 processed_word_list,
-                 words_embedded,
-                 color_dict,
-                 n_neighbors,
-                 n_alpha,
-                 fontsize=18,
-                 figsize=(20, 15)
-                 ):
+    def get_plot(
+        self,
+        data: pd.DataFrame,
+        processed_word_list: List[str],
+        words_embedded: np.ndarray,
+        color_dict: Dict,
+        n_neighbors: int,
+        n_alpha: float,
+        fontsize: int=18,
+        figsize: Tuple[int, int]=(20, 15)
+    ):
+
        fig, ax = plt.subplots(figsize=figsize)
 
        sns.scatterplot(
@@ -89,11 +118,20 @@
            legend=False,
            palette=color_dict
        )
+
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
-            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
-                        textcoords='offset points',
-                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)
+            ax.annotate(
+                wtp.word,
+                xy=(x, y),
+                xytext=(5, 2),
+                color=wtp.color,
+                textcoords='offset points',
+                ha='right',
+                va='bottom',
+                size=fontsize,
+                alpha=wtp.alpha
+            )
 
        ax.set_xticks([])
        ax.set_yticks([])
@@ -103,25 +141,27 @@
 
        return fig
 
-    def plot_projections_2d(self,
-                            wordlist_0,
-                            wordlist_1 = [],
-                            wordlist_2 = [],
-                            wordlist_3 = [],
-                            wordlist_4 = [],
-                            **kwargs
-                            ):
+    def plot_projections_2d(
+        self,
+        wordlist_0: List[str],
+        wordlist_1: List[str]=[],
+        wordlist_2: List[str]=[],
+        wordlist_3: List[str]=[],
+        wordlist_4: List[str]=[],
+        **kwargs
+    ):
+
        # convert them to vectors
        choices = [0, 1, 2, 3, 4]
        wordlist_choice = [
            wordlist_0,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4
        ]
 
        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)
@@ -139,45 +179,69 @@
        processed_word_list = []
        for word_list_to_process, color in zip(wordlist_choice, choices):
            for word in word_list_to_process:
-                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
+                processed_word_list.append(
+                    WordToPlot(word, color_dict[color], color, 1)
+                )
 
                if n_neighbors > 0:
-                    neighbors = self.get_neighbors(word,
-                                                   n_neighbors=n_neighbors,
-                                                   nn_method=kwargs.get('nn_method', 'sklearn')
-                                                   )
+                    neighbors = self.get_neighbors(
+                        word,
+                        n_neighbors=n_neighbors,
+                        nn_method=kwargs.get('nn_method', 'sklearn')
+                    )
+
                    for n in neighbors:
                        if n not in [wtp.word for wtp in processed_word_list]:
-                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
+                            processed_word_list.append(
+                                WordToPlot(n, color_dict[color], color, n_alpha)
+                            )
 
        if not processed_word_list:
            raise Exception('Only empty lists were passed')
 
-        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
+        words_embedded = np.array(
+            [self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
+        )
 
-        data = self.get_df(words_embedded, processed_word_list)
+        data = self.get_df(
+            words_embedded,
+            processed_word_list
+        )
 
-        fig = self.get_plot(data, processed_word_list, words_embedded,
-                            color_dict, n_neighbors, n_alpha,
-                            kwargs.get('fontsize', 18),
-                            kwargs.get('figsize', (20, 15))
-                            )
+        fig = self.get_plot(
+            data,
+            processed_word_list,
+            words_embedded,
+            color_dict,
+            n_neighbors,
+            n_alpha,
+            kwargs.get('fontsize', 18),
+            kwargs.get('figsize', (20, 15))
+        )
+
        plt.show()
        return fig
 
-    def doesnt_match(self, wordlist):
+    # ToDo: This method has no usages. Delete?
+    def doesnt_match(
+        self,
+        wordlist: List[str]
+    ) -> str:
+
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)
 
-        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
+        words_emb = np.array([self.embedding.getEmbedding(word)
+                              for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)
 
        doesnt_match = ""
        farthest_emb = 1.0
        for word in wordlist:
-            word_emb = self.vocabulary.getEmbedding(word)
-            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec)*norm(word_emb))
+            word_emb = self.embedding.getEmbedding(word)
+            cos_sim = np.dot(mean_vec, word_emb) / \
+                (norm(mean_vec)*norm(word_emb))
            if cos_sim <= farthest_emb:
                farthest_emb = cos_sim
                doesnt_match = word
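
The doesnt_match() logic above picks the word whose vector is least cosine-similar to the mean of all the vectors. A standalone sketch of the same computation with toy 2-d embeddings:

    import numpy as np
    from numpy.linalg import norm

    vectors = {
        "apple": np.array([0.9, 0.1]),   # made-up vectors, illustration only
        "pear":  np.array([0.8, 0.2]),
        "car":   np.array([0.1, 0.9]),
    }
    mean_vec = np.mean(list(vectors.values()), axis=0)
    outlier = min(
        vectors,
        key=lambda w: np.dot(mean_vec, vectors[w]) / (norm(mean_vec) * norm(vectors[w]))
    )
    print(outlier)   # -> 'car'
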
modules/module_ann.py CHANGED
@@ -1,45 +1,71 @@
 import time
-import operator
 from tqdm import tqdm
 from annoy import AnnoyIndex
 from memory_profiler import profile
+from typing import List
 
 class TicToc:
-    def __init__(self):
+    def __init__(
+        self
+    ) -> None:
+
        self.i = None
-    def start(self):
+
+    def start(
+        self
+    ) -> None:
+
        self.i = time.time()
-    def stop(self):
+
+    def stop(
+        self
+    ) -> None:
+
        f = time.time()
        print(f - self.i, "seg.")
 
+
 class Ann:
-    def __init__(self, words, vectors, coord):
-        self.words = words.to_list()
-        self.vectors = vectors.to_list()
-        self.coord = coord.to_list()
+    def __init__(
+        self,
+        words: List[str],
+        vectors: List,
+        coord: List,
+    ) -> None:
+
+        self.words = words
+        self.vectors = vectors
+        self.coord = coord
        self.tree = None
 
        self.tt = TicToc()
 
-    @profile
-    def init(self, n_trees=10, metric='angular', n_jobs=-1):
-        # metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
-        # n_jobs=-1 Run over all CPU availables
+    def init(self,
+        n_trees: int=10,
+        metric: str='angular',
+        n_jobs: int=-1      # n_jobs=-1 runs over all available CPUs
+    ) -> None:
 
-        print("Init tree...")
+        availables_metrics = ['angular','euclidean','manhattan','hamming','dot']
+        assert(metric in availables_metrics), f"Error: The value of the parameter 'metric' can only be {availables_metrics}!"
+
+        print("\tInit tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
-        for i,v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
-            self.tree.add_item(i,v)
+        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
+            self.tree.add_item(i, v)
        self.tt.stop()
 
-        print("Build tree...")
+        print("\tBuild tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()
 
-    def __getWordId(self, word):
+    def __getWordId(
+        self,
+        word: str
+    ) -> int:
+
        word_id = None
        try:
            word_id = self.words.index(word)
@@ -47,16 +73,20 @@ class Ann:
            pass
        return word_id
 
-    def get(self, word, n_neighbors=10):
+    def get(
+        self,
+        word: str,
+        n_neighbors: int=10
+    ) -> List[str]:
+
        word_id = self.__getWordId(word)
-        reword_xy_list = None
+        neighbors_list = None
 
        if word_id != None:
-            neighbord_id = self.tree.get_nns_by_item(word_id, n_neighbors)
-            # word_xy_list = list(map(lambda i: (self.words[i],self.coord[i]), neighbord_id))
-            # word_xy_list = list(map(lambda i: self.words[i], neighbord_id))
-            word_xy_list = operator.itemgetter(*neighbord_id)(self.words)
+            neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
+            neighbors_list = [self.words[idx] for idx in neighbords_id][1:]
        else:
            print(f"The word '{word}' does not exist")
 
-        return word_xy_list
+        return neighbors_list
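
Ann.get() now asks Annoy for n_neighbors + 1 items and drops the first hit, because with normalized vectors the closest item to a word is the word itself. A minimal sketch of that behavior against Annoy directly:

    from annoy import AnnoyIndex

    index = AnnoyIndex(2, metric='dot')
    for i, v in enumerate([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]):   # toy unit-ish vectors
        index.add_item(i, v)
    index.build(10)

    ids = index.get_nns_by_item(0, 2 + 1)   # ask for 2 neighbors of item 0, plus itself
    print(ids[1:])                          # -> [1, 2]; item 0 excluded
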
modules/module_connection.py CHANGED
@@ -1,52 +1,75 @@
-import numpy as np
-import pandas as pd
-import gradio as gr
-from abc import ABC, abstractmethod
 
 from modules.module_WordExplorer import WordExplorer
 from modules.module_BiasExplorer import WordBiasExplorer
 
 class Connector(ABC):
-    def parse_word(self, word : str):
        return word.lower().strip()
 
-    def parse_words(self, array_in_string : str):
        words = array_in_string.strip()
        if not words:
            return []
-        words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
        return words
 
-    def process_error(self, err: str):
-        if err is None:
-            return
-        return "<center><h3>" + err + "</h3></center>"
 
 
 class WordExplorerConnector(Connector):
 
-    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.word_explorer = WordExplorer(embedding)
-
-    def plot_proyection_2d( self,
-                            wordlist_0,
-                            wordlist_1,
-                            wordlist_2,
-                            wordlist_3,
-                            wordlist_4,
-                            color_wordlist_0,
-                            color_wordlist_1,
-                            color_wordlist_2,
-                            color_wordlist_3,
-                            color_wordlist_4,
-                            n_alpha,
-                            fontsize,
-                            n_neighbors
-                            ):
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
@@ -59,49 +82,63 @@ class WordExplorerConnector(Connector):
        err = self.process_error("Enter at least one word to continue")
        return None, err
 
-        err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
        if err:
            return None, self.process_error(err)
 
-        fig = self.word_explorer.plot_projections_2d(wordlist_0,
-                                                     wordlist_1,
-                                                     wordlist_2,
-                                                     wordlist_3,
-                                                     wordlist_4,
-                                                     color_wordlist_0=color_wordlist_0,
-                                                     color_wordlist_1=color_wordlist_1,
-                                                     color_wordlist_2=color_wordlist_2,
-                                                     color_wordlist_3=color_wordlist_3,
-                                                     color_wordlist_4=color_wordlist_4,
-                                                     n_alpha=n_alpha,
-                                                     fontsize=fontsize,
-                                                     n_neighbors=n_neighbors,
-                                                     nn_method = neighbors_method
-                                                     )
        return fig, self.process_error(err)
 
 class BiasWordExplorerConnector(Connector):
 
-    def __init__(self, **kwargs):
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.bias_word_explorer = WordBiasExplorer(embedding)
 
-    def calculate_bias_2d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          to_diagnose_list
-                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
-        for list in word_lists:
-            if not list:
            err = "At least one word should be in the to diagnose list, bias 1 list and bias 2 list"
        if err:
            return None, self.process_error(err)
@@ -110,17 +147,23 @@ class BiasWordExplorerConnector(Connector):
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)
 
        return fig, self.process_error(err)
 
-    def calculate_bias_4d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          wordlist_3,
-                          wordlist_4,
-                          to_diagnose_list
-                          ):
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
@@ -129,8 +172,8 @@
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
-        for list in wordlists:
-            if not list:
            err = "To plot with 4 spaces, you must enter at least one word in all lists."
        if err:
            return None, self.process_error(err)
@@ -139,5 +182,12 @@
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
        return fig, self.process_error(err)

+from abc import ABC
 
 from modules.module_WordExplorer import WordExplorer
 from modules.module_BiasExplorer import WordBiasExplorer
+from typing import List, Tuple
+
 
 class Connector(ABC):
+    def parse_word(
+        self,
+        word: str
+    ) -> str:
+
        return word.lower().strip()
 
+    def parse_words(
+        self,
+        array_in_string: str
+    ) -> List[str]:
+
        words = array_in_string.strip()
        if not words:
            return []
+
+        words = [
+            self.parse_word(word)
+            for word in words.split(',') if word.strip() != ''
+        ]
        return words
 
+    def process_error(
+        self,
+        err: str
+    ) -> str:
+
+        if err:
+            err = "<center><h3>" + err + "</h3></center>"
+        return err
 
 
 class WordExplorerConnector(Connector):
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
 
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
+
+        self.word_explorer = WordExplorer(
+            embedding=embedding
+        )
+
+    def plot_proyection_2d(
+        self,
+        wordlist_0: str,
+        wordlist_1: str,
+        wordlist_2: str,
+        wordlist_3: str,
+        wordlist_4: str,
+        color_wordlist_0: str,
+        color_wordlist_1: str,
+        color_wordlist_2: str,
+        color_wordlist_3: str,
+        color_wordlist_4: str,
+        n_alpha: float,
+        fontsize: int,
+        n_neighbors: int
+    ) -> Tuple:
+
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
 
        err = self.process_error("Enter at least one word to continue")
        return None, err
 
+        err = self.word_explorer.check_oov(
+            [wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        )
+
        if err:
            return None, self.process_error(err)
 
+        fig = self.word_explorer.plot_projections_2d(
+            wordlist_0,
+            wordlist_1,
+            wordlist_2,
+            wordlist_3,
+            wordlist_4,
+            color_wordlist_0=color_wordlist_0,
+            color_wordlist_1=color_wordlist_1,
+            color_wordlist_2=color_wordlist_2,
+            color_wordlist_3=color_wordlist_3,
102
+ color_wordlist_4=color_wordlist_4,
103
+ n_alpha=n_alpha,
104
+ fontsize=fontsize,
105
+ n_neighbors=n_neighbors,
106
+ nn_method=neighbors_method
107
+ )
108
+
109
  return fig, self.process_error(err)
110
 
111
  class BiasWordExplorerConnector(Connector):
112
 
113
+ def __init__(
114
+ self,
115
+ **kwargs
116
+ ) -> None:
117
+
118
  if 'embedding' in kwargs:
119
  embedding = kwargs.get('embedding')
120
  else:
121
  raise KeyError
 
122
 
123
+ self.bias_word_explorer = WordBiasExplorer(
124
+ embedding=embedding
125
+ )
126
+
127
+ def calculate_bias_2d(
128
+ self,
129
+ wordlist_1: str,
130
+ wordlist_2: str,
131
+ to_diagnose_list: str
132
+ ) -> Tuple:
133
+
134
  err = ""
135
  wordlist_1 = self.parse_words(wordlist_1)
136
  wordlist_2 = self.parse_words(wordlist_2)
137
  to_diagnose_list = self.parse_words(to_diagnose_list)
138
 
139
  word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
140
+ for _list in word_lists:
141
+ if not _list:
142
  err = "At least one word should be in the to diagnose list, bias 1 list and bias 2 list"
143
  if err:
144
  return None, self.process_error(err)
 
147
  if err:
148
  return None, self.process_error(err)
149
 
150
+ fig = self.bias_word_explorer.plot_biased_words(
151
+ to_diagnose_list,
152
+ wordlist_2,
153
+ wordlist_1
154
+ )
155
 
156
  return fig, self.process_error(err)
157
 
158
+ def calculate_bias_4d(
159
+ self,
160
+ wordlist_1: str,
161
+ wordlist_2: str,
162
+ wordlist_3: str,
163
+ wordlist_4: str,
164
+ to_diagnose_list: str
165
+ ) -> Tuple:
166
+
167
  err = ""
168
  wordlist_1 = self.parse_words(wordlist_1)
169
  wordlist_2 = self.parse_words(wordlist_2)
 
172
  to_diagnose_list = self.parse_words(to_diagnose_list)
173
 
174
  wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
175
+ for _list in wordlists:
176
+ if not _list:
177
  err = "To plot with 4 spaces, you must enter at least one word in all lists."
178
  if err:
179
  return None, self.process_error(err)
 
182
  if err:
183
  return None, self.process_error(err)
184
 
185
+ fig = self.bias_word_explorer.plot_biased_words(
186
+ to_diagnose_list,
187
+ wordlist_1,
188
+ wordlist_2,
189
+ wordlist_3,
190
+ wordlist_4
191
+ )
192
+
193
  return fig, self.process_error(err)
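
For reference, a small self-contained sketch of the parsing contract the refactored `Connector` enforces (mirroring `parse_words` and `process_error` above; the sample inputs are hypothetical): a comma-separated UI string becomes a lowercased, trimmed word list, and any error message is wrapped as centered HTML for the Gradio output.

```python
from typing import List

def parse_words(array_in_string: str) -> List[str]:
    # Mirrors Connector.parse_words: split on commas, trim, lowercase, drop empties.
    words = array_in_string.strip()
    if not words:
        return []
    return [w.lower().strip() for w in words.split(',') if w.strip() != '']

def process_error(err: str) -> str:
    # Mirrors Connector.process_error: wrap a non-empty message as centered HTML.
    return "<center><h3>" + err + "</h3></center>" if err else err

print(parse_words(" King, Queen , ,woman "))  # ['king', 'queen', 'woman']
print(process_error("Enter at least one word to continue"))
```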
modules/module_logsManager.py CHANGED
@@ -1,26 +1,36 @@
1
- import csv, os, pytz
 
2
  from gradio import utils
3
- from datetime import datetime
4
- from dotenv import load_dotenv
5
- from distutils.log import debug
6
  from typing import Any, List, Optional
7
- from gradio.components import IOComponent
8
- from gradio.flagging import FlaggingCallback, _get_dataset_features_info
 
9
 
10
 
11
  # --- Load environment vars ---
12
  load_dotenv()
13
 
 
14
  # --- Classes declaration ---
15
  class DateLogs:
16
- def __init__(self, zone="America/Argentina/Cordoba"):
 
 
 
 
17
  self.time_zone = pytz.timezone(zone)
18
 
19
- def full(self):
 
 
 
20
  now = datetime.now(self.time_zone)
21
  return now.strftime("%H:%M:%S %d-%m-%Y")
22
 
23
- def day(self):
 
 
 
24
  now = datetime.now(self.time_zone)
25
  return now.strftime("%d-%m-%Y")
26
 
@@ -40,12 +50,12 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
40
 
41
  def __init__(
42
  self,
43
- hf_token: str = os.getenv('HF_TOKEN'),
44
- dataset_name: str = os.getenv('DS_LOGS_NAME'),
45
- organization: Optional[str] = os.getenv('ORG_NAME'),
46
- private: bool = True,
47
- available_logs: bool = False
48
- ):
49
  """
50
  Parameters:
51
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
@@ -53,6 +63,8 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
53
  organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
54
  private: Whether the dataset should be private (defaults to True).
55
  """
 
 
56
  self.hf_token = hf_token
57
  self.dataset_name = dataset_name
58
  self.organization_name = organization
@@ -65,10 +77,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
65
 
66
 
67
  def setup(
68
- self,
69
- components: List[IOComponent],
70
- flagging_dir: str
71
- ):
72
  """
73
  Params:
74
  flagging_dir (str): local directory where the dataset is cloned,
@@ -112,9 +124,9 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
112
  def flag(
113
  self,
114
  flag_data: List[Any],
115
- flag_option: Optional[str] = None,
116
- flag_index: Optional[int] = None,
117
- username: Optional[str] = None,
118
  ) -> int:
119
 
120
  if self.available_logs:
 
1
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
2
+ from gradio.components import IOComponent
3
  from gradio import utils
 
 
 
4
  from typing import Any, List, Optional
5
+ from dotenv import load_dotenv
6
+ from datetime import datetime
7
+ import csv, os, pytz
8
 
9
 
10
  # --- Load environment vars ---
11
  load_dotenv()
12
 
13
+
14
  # --- Classes declaration ---
15
  class DateLogs:
16
+ def __init__(
17
+ self,
18
+ zone: str="America/Argentina/Cordoba"
19
+ ) -> None:
20
+
21
  self.time_zone = pytz.timezone(zone)
22
 
23
+ def full(
24
+ self
25
+ ) -> str:
26
+
27
  now = datetime.now(self.time_zone)
28
  return now.strftime("%H:%M:%S %d-%m-%Y")
29
 
30
+ def day(
31
+ self
32
+ ) -> str:
33
+
34
  now = datetime.now(self.time_zone)
35
  return now.strftime("%d-%m-%Y")
36
 
 
50
 
51
  def __init__(
52
  self,
53
+ dataset_name: Optional[str]=None,
54
+ hf_token: str=os.getenv('HF_TOKEN'),
55
+ organization: Optional[str]=os.getenv('ORG_NAME'),
56
+ private: bool=True,
57
+ available_logs: bool=False
58
+ ) -> None:
59
  """
60
  Parameters:
61
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
62
  dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1".
63
  organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
64
  private: Whether the dataset should be private (defaults to True).
65
  """
66
+ assert dataset_name is not None, "Error: Parameter 'dataset_name' cannot be empty!"
67
+
68
  self.hf_token = hf_token
69
  self.dataset_name = dataset_name
70
  self.organization_name = organization
 
77
 
78
 
79
  def setup(
80
+ self,
81
+ components: List[IOComponent],
82
+ flagging_dir: str
83
+ ) -> None:
84
  """
85
  Params:
86
  flagging_dir (str): local directory where the dataset is cloned,
 
124
  def flag(
125
  self,
126
  flag_data: List[Any],
127
+ flag_option: Optional[str]=None,
128
+ flag_index: Optional[int]=None,
129
+ username: Optional[str]=None,
130
  ) -> int:
131
 
132
  if self.available_logs:
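
For context, a hedged sketch of how this logger is meant to be wired in: because `HuggingFaceDatasetSaver` subclasses Gradio's `FlaggingCallback`, it can be passed to an `Interface`, which calls `setup()` at launch and `flag()` on each logged interaction. The dataset name and the toy interface below are assumptions for illustration; `HF_TOKEN` and `ORG_NAME` are read from the environment via `load_dotenv()` as in the module above.

```python
import gradio as gr
from modules.module_logsManager import HuggingFaceDatasetSaver

# Assumed dataset name for illustration; the real Space configures its own.
logger = HuggingFaceDatasetSaver(
    dataset_name="edia_we_english_logs",
    available_logs=True   # with False, flag() skips writing entirely
)

demo = gr.Interface(
    fn=lambda word: word.lower().strip(),   # placeholder function
    inputs="text",
    outputs="text",
    allow_flagging="auto",                  # log every submission automatically
    flagging_callback=logger
)
demo.launch()
```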
tool_info.py CHANGED
@@ -4,7 +4,7 @@ TOOL_INFO = """
4
  * [Read Full Paper](https://arxiv.org/abs/2207.06591)
5
 
6
  > ### Licensing Information
7
- * [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/bias_we_std_tool/resolve/main/LICENSE)
8
 
9
  > ### Citation Information
10
  ```c
 
4
  * [Read Full Paper](https://arxiv.org/abs/2207.06591)
5
 
6
  > ### Licensing Information
7
+ * [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_en/resolve/main/LICENSE)
8
 
9
  > ### Citation Information
10
  ```c