Spaces:
Configuration error
Configuration error
Commit
·
0c56fea
1
Parent(s):
e7eeec5
Added alternative approaches for loading embeddings from files with .vec and .bin extensions
Browse files- modules/model_embbeding.py +44 -5
modules/model_embbeding.py
CHANGED
@@ -3,6 +3,7 @@ from memory_profiler import profile
|
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
from sklearn.decomposition import PCA
|
5 |
from gensim.models import KeyedVectors
|
|
|
6 |
from typing import List, Any
|
7 |
import os
|
8 |
import pandas as pd
|
@@ -89,12 +90,50 @@ class Embedding:
|
|
89 |
pca = PCA(
|
90 |
n_components=2
|
91 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
model
|
94 |
-
|
95 |
-
binary=path.endswith('.bin'),
|
96 |
-
limit=limit
|
97 |
-
)
|
98 |
|
99 |
# Cased Vocab
|
100 |
cased_words = model.index_to_key
|
|
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
from sklearn.decomposition import PCA
|
5 |
from gensim.models import KeyedVectors
|
6 |
+
from gensim.models.fasttext import load_facebook_vectors
|
7 |
from typing import List, Any
|
8 |
import os
|
9 |
import pandas as pd
|
|
|
90 |
pca = PCA(
|
91 |
n_components=2
|
92 |
)
|
93 |
+
|
94 |
+
model = None
|
95 |
+
binary = path.endswith('.bin')
|
96 |
+
|
97 |
+
# Should be enough for all .vec files
|
98 |
+
try:
|
99 |
+
model = KeyedVectors.load_word2vec_format(
|
100 |
+
fname=path,
|
101 |
+
binary=binary,
|
102 |
+
limit=limit,
|
103 |
+
unicode_errors='ignore'
|
104 |
+
)
|
105 |
+
|
106 |
+
except UnicodeDecodeError:
|
107 |
+
pass #Try other way of loading
|
108 |
+
|
109 |
+
# If it's a .bin Fasttext saved model
|
110 |
+
if model is None and binary:
|
111 |
+
print('Error during load of provided model. \
|
112 |
+
Using different approaches. \
|
113 |
+
The limit parameter won\'t be taken into account'
|
114 |
+
)
|
115 |
+
|
116 |
+
# If it's a Fasttext model
|
117 |
+
try:
|
118 |
+
model = load_facebook_vectors(
|
119 |
+
path=path
|
120 |
+
)
|
121 |
+
|
122 |
+
except UnicodeDecodeError:
|
123 |
+
pass
|
124 |
+
|
125 |
+
if model is None:
|
126 |
+
# Last chance, if it is a Word2Vec model
|
127 |
+
try:
|
128 |
+
model = KeyedVectors.load(
|
129 |
+
fname=path
|
130 |
+
)
|
131 |
+
|
132 |
+
except:
|
133 |
+
pass
|
134 |
|
135 |
+
if model is None:
|
136 |
+
raise Exception(f'Can\'t load {path} after multiple approaches.')
|
|
|
|
|
|
|
137 |
|
138 |
# Cased Vocab
|
139 |
cased_words = model.index_to_key
|