LMartinezEXEX committed
Commit 785b2ef · Parent: 8a3c920

Using data_loader to load vector files instead of json.

.gitattributes CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
 data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
-data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
+data/fasttext-sbwc.100k.vec filter=lfs diff=lfs merge=lfs -text
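
Large vector files like the new `.vec` asset are stored through Git LFS via the filter rules above. As a purely illustrative sketch (this helper is not part of the commit), the same rule could be appended programmatically when further vector files are added:

    from pathlib import Path

    def ensure_lfs_rule(filename, attributes=Path(".gitattributes")):
        """Append a Git LFS filter rule for `filename` if not already present."""
        rule = f"{filename} filter=lfs diff=lfs merge=lfs -text"
        lines = attributes.read_text().splitlines() if attributes.exists() else []
        if rule not in lines:
            with attributes.open("a") as f:
                f.write(rule + "\n")

    # Mirrors the rule added in this commit:
    ensure_lfs_rule("data/fasttext-sbwc.100k.vec")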
app.py CHANGED
@@ -13,11 +13,14 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
 # --- Tool config ---
 AVAILABLE_LOGS = True # [True | False]
 LANGUAGE = "spanish" # [spanish | english]
-EMBEDDING_SUBSET = "fasttext" # [fasttext | mini]
+EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
 
 # --- Init classes ---
 embedding = Embedding(
-    subset_name=EMBEDDING_SUBSET
+    path=EMBEDDINGS_PATH,
+    binary=EMBEDDINGS_PATH.endswith('.bin'),
+    limit=None,
+    randomizedPCA=False
 )
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
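Deriving `binary` from the file extension lets the same config line handle both plain-text `.vec` and packed binary `.bin` word2vec files. A minimal sketch of that dispatch, assuming gensim's `KeyedVectors` loader as `data_loader` uses it (the path and limit below are placeholders):

    from gensim.models import KeyedVectors

    def load_vectors(path, limit=None):
        # .bin files hold the packed word2vec format; .vec files are plain text.
        return KeyedVectors.load_word2vec_format(
            path,
            binary=path.endswith(".bin"),
            limit=limit,  # cap the vocabulary size to bound memory use
        )

    vectors = load_vectors("data/fasttext-sbwc.100k.vec", limit=1000)
    print(len(vectors.index_to_key), "words loaded")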
data/data_loader.py CHANGED
@@ -13,16 +13,12 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
     else:
         pca = PCA(n_components=2)
 
+    print("--------> PATH:", path)
     model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
 
     # Cased Vocab
-    cased_words = model.vocab.keys()
-
-    #Normalized vectors
-    model.init_sims(replace=True)
-    cased_emb = [model[word] for word in cased_words]
-
-    # PCA reduction
+    cased_words = model.index_to_key
+    cased_emb = model.get_normed_vectors()
     cased_pca = pca.fit_transform(cased_emb)
 
     df_cased = pd.DataFrame(
@@ -36,6 +32,6 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
 
     df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
     df_uncased = df_cased.drop_duplicates(subset='word')
-    df_uncased.to_json(path[:-3] + 'json')
+    return df_uncased
 
-load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
+#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
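
The replacements above track the gensim 4 API: `model.vocab` and `init_sims()` were removed, and `index_to_key` plus `get_normed_vectors()` are their successors. A small sketch of the mapping (the path and limit are placeholders):

    from gensim.models import KeyedVectors

    model = KeyedVectors.load_word2vec_format("data/fasttext-sbwc.100k.vec", limit=1000)

    # gensim < 4: model.vocab.keys()
    words = model.index_to_key

    # gensim < 4: model.init_sims(replace=True), then model[word] per word.
    # gensim >= 4: all unit-length vectors at once, row-aligned with
    # index_to_key, without mutating the model in place.
    normed = model.get_normed_vectors()

    assert len(normed) == len(words)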
data/fasttext-sbwc.100k.vec ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:062e67c948fc14de8c9327868293c76a6d6f536f62d6df3d8afbf6d62c4b9cc1
+size 262330934
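
What lands in the repository is this three-line Git LFS pointer, not the ~262 MB of vectors themselves. On a clone without `git lfs`, the pointer text sits at that path and `load_embeddings` would fail on it. A minimal guard, where `is_lfs_pointer` is a hypothetical helper not present in this repo:

    LFS_SPEC = "version https://git-lfs.github.com/spec/v1"

    def is_lfs_pointer(path):
        """True if `path` still holds an un-fetched Git LFS pointer."""
        try:
            with open(path, "r", encoding="utf-8") as f:
                return f.readline().strip() == LFS_SPEC
        except (OSError, UnicodeDecodeError):
            return False  # binary or unreadable content is not a pointer

    if is_lfs_pointer("data/fasttext-sbwc.100k.vec"):
        raise RuntimeError("Run `git lfs pull` to fetch the real vectors first.")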
modules/model_embbeding.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import operator
 import numpy as np
 import pandas as pd
@@ -6,14 +7,14 @@ from gensim import matutils
 from modules.module_ann import Ann
 from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
+from data.data_loader import load_embeddings
 
 
 class Embedding:
     @profile
-    def __init__(self, subset_name):
+    def __init__(self, path, binary, limit = None, randomizedPCA = False):
         # Dataset info
-        self.ds_subset = subset_name
-        self.ds_path = f"data/{subset_name}_embedding_v6.zip"
+        self.path = path
 
         # Pandas dataset
         self.ds = None
@@ -25,16 +26,16 @@
         self.ann = None
 
         # Load embedding and pca dataset
-        self.__load()
+        self.__load(binary, limit, randomizedPCA)
 
     def __contains__(self, word):
         return word in self.ds['word'].to_list()
 
-    def __load(self):
-        print(f"Preparing {self.ds_subset} embedding...")
+    def __load(self, binary, limit, randomizedPCA):
+        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
-        # --- Download dataset ---
-        self.ds = pd.read_json(self.ds_path)
+        # --- Prepare dataset ---
+        self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)
 
         # --- Get embedding from string
         self.embedding = self.ds['embedding'].to_list()
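
`__load` forwards its arguments positionally in the order `load_embeddings(path, binary, randomPCA, limit)` expects; callers of `Embedding` are safer with keywords, as `app.py` now does. A usage sketch mirroring that call (the small `limit` here is only to keep the example light):

    from modules.model_embbeding import Embedding

    embedding = Embedding(
        path="data/fasttext-sbwc.100k.vec",
        binary=False,         # .vec files are plain-text word2vec
        limit=1000,           # app.py passes None to load the full vocabulary
        randomizedPCA=False,  # plain PCA(n_components=2) in data_loader
    )
    print("hola" in embedding)  # __contains__ checks the loaded vocabulary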