Spaces:
Configuration error
Configuration error
LMartinezEXEX
committed on
Commit
·
785b2ef
1
Parent(s):
8a3c920
Using data_loader to load vector files instead of json.
Browse files
- .gitattributes +2 -1
- app.py +5 -2
- data/data_loader.py +5 -9
- data/fasttext-sbwc.100k.vec +3 -0
- modules/model_embbeding.py +9 -8
.gitattributes
CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
34 |
data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
|
36 |
-
data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
34 |
data/semi_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
data/half_embedding_v6.zip filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/wiki-news-300d-1M.vec filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/fasttext-sbwc.100k.vec filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -13,11 +13,14 @@ from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_
|
|
13 |
# --- Tool config ---
|
14 |
AVAILABLE_LOGS = True # [True | False]
|
15 |
LANGUAGE = "spanish" # [spanish | english]
|
16 |
-
|
17 |
|
18 |
# --- Init classes ---
|
19 |
embedding = Embedding(
|
20 |
-
|
|
|
|
|
|
|
21 |
)
|
22 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
23 |
|
|
|
13 |
# --- Tool config ---
|
14 |
AVAILABLE_LOGS = True # [True | False]
|
15 |
LANGUAGE = "spanish" # [spanish | english]
|
16 |
+
EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
|
17 |
|
18 |
# --- Init classes ---
|
19 |
embedding = Embedding(
|
20 |
+
path=EMBEDDINGS_PATH,
|
21 |
+
binary=EMBEDDINGS_PATH.endswith('.bin'),
|
22 |
+
limit=None,
|
23 |
+
randomizedPCA=False
|
24 |
)
|
25 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
26 |
|
data/data_loader.py
CHANGED
@@ -13,16 +13,12 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
|
|
13 |
else:
|
14 |
pca = PCA(n_components=2)
|
15 |
|
|
|
16 |
model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
|
17 |
|
18 |
# Cased Vocab
|
19 |
-
cased_words = model.
|
20 |
-
|
21 |
-
#Normalized vectors
|
22 |
-
model.init_sims(replace=True)
|
23 |
-
cased_emb = [model[word] for word in cased_words]
|
24 |
-
|
25 |
-
# PCA reduction
|
26 |
cased_pca = pca.fit_transform(cased_emb)
|
27 |
|
28 |
df_cased = pd.DataFrame(
|
@@ -36,6 +32,6 @@ def load_embeddings(path, binary = False, randomPCA = False, limit = None):
|
|
36 |
|
37 |
df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
|
38 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
39 |
-
df_uncased
|
40 |
|
41 |
-
load_embeddings('
|
|
|
13 |
else:
|
14 |
pca = PCA(n_components=2)
|
15 |
|
16 |
+
print("--------> PATH:", path)
|
17 |
model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)
|
18 |
|
19 |
# Cased Vocab
|
20 |
+
cased_words = model.index_to_key
|
21 |
+
cased_emb = model.get_normed_vectors()
|
|
|
|
|
|
|
|
|
|
|
22 |
cased_pca = pca.fit_transform(cased_emb)
|
23 |
|
24 |
df_cased = pd.DataFrame(
|
|
|
32 |
|
33 |
df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
|
34 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
35 |
+
return df_uncased
|
36 |
|
37 |
+
#load_embeddings('data/fasttext-sbwc.100k.vec', limit=1000)
|
data/fasttext-sbwc.100k.vec
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:062e67c948fc14de8c9327868293c76a6d6f536f62d6df3d8afbf6d62c4b9cc1
|
3 |
+
size 262330934
|
modules/model_embbeding.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import operator
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
@@ -6,14 +7,14 @@ from gensim import matutils
|
|
6 |
from modules.module_ann import Ann
|
7 |
from memory_profiler import profile
|
8 |
from sklearn.neighbors import NearestNeighbors
|
|
|
9 |
|
10 |
|
11 |
class Embedding:
|
12 |
@profile
|
13 |
-
def __init__(self,
|
14 |
# Dataset info
|
15 |
-
self.
|
16 |
-
self.ds_path = f"data/{subset_name}_embedding_v6.zip"
|
17 |
|
18 |
# Pandas dataset
|
19 |
self.ds = None
|
@@ -25,16 +26,16 @@ class Embedding:
|
|
25 |
self.ann = None
|
26 |
|
27 |
# Load embedding and pca dataset
|
28 |
-
self.__load()
|
29 |
|
30 |
def __contains__(self, word):
|
31 |
return word in self.ds['word'].to_list()
|
32 |
|
33 |
-
def __load(self):
|
34 |
-
print(f"Preparing {self.
|
35 |
|
36 |
-
# ---
|
37 |
-
self.ds =
|
38 |
|
39 |
# --- Get embedding from string
|
40 |
self.embedding = self.ds['embedding'].to_list()
|
|
|
1 |
+
import os
|
2 |
import operator
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
|
|
7 |
from modules.module_ann import Ann
|
8 |
from memory_profiler import profile
|
9 |
from sklearn.neighbors import NearestNeighbors
|
10 |
+
from data.data_loader import load_embeddings
|
11 |
|
12 |
|
13 |
class Embedding:
|
14 |
@profile
|
15 |
+
def __init__(self, path, binary, limit = None, randomizedPCA = False):
|
16 |
# Dataset info
|
17 |
+
self.path = path
|
|
|
18 |
|
19 |
# Pandas dataset
|
20 |
self.ds = None
|
|
|
26 |
self.ann = None
|
27 |
|
28 |
# Load embedding and pca dataset
|
29 |
+
self.__load(binary, limit, randomizedPCA)
|
30 |
|
31 |
def __contains__(self, word):
|
32 |
return word in self.ds['word'].to_list()
|
33 |
|
34 |
+
def __load(self, binary, limit, randomizedPCA):
|
35 |
+
print(f"Preparing {os.path.basename(self.path)} embeddings...")
|
36 |
|
37 |
+
# --- Prepare dataset ---
|
38 |
+
self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)
|
39 |
|
40 |
# --- Get embedding from string
|
41 |
self.embedding = self.ds['embedding'].to_list()
|