Spaces:

vialibre
/

edia_we_es

Configuration error

App Files Files Community

LMartinezEXEX committed on Dec 16, 2022

Commit

0c56fea

1 Parent(s): e7eeec5

Added other approaches to load embeddings from files with .vec and .bin extensions

Browse files

Files changed (1) hide show

modules/model_embbeding.py +44 -5

modules/model_embbeding.py CHANGED Viewed

@@ -3,6 +3,7 @@ from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
 from typing import List, Any
 import os
 import pandas as pd
@@ -89,12 +90,50 @@ class Embedding:
             pca = PCA(
                 n_components=2
             )
-        model = KeyedVectors.load_word2vec_format(
-            fname=path,
-            binary=path.endswith('.bin'),
-            limit=limit
-        )
         # Cased Vocab
         cased_words = model.index_to_key

 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
+from gensim.models.fasttext import load_facebook_vectors
 from typing import List, Any
 import os
 import pandas as pd
             pca = PCA(
                 n_components=2
             )
+        model = None
+        binary = path.endswith('.bin')
+        # Should be enough for all .vec files
+        try:
+            model = KeyedVectors.load_word2vec_format(
+                fname=path,
+                binary=binary,
+                limit=limit,
+                unicode_errors='ignore'
+            )
+        except UnicodeDecodeError:
+            pass    #Try other way of loading
+        # If it's a .bin Fasttext saved model
+        if model is None and binary:
+            print('Error during load of provided model. \
+                   Using different approaches. \
+                   The limit parameter won\'t be taken into account'
+                )
+            # If it's a Fasttext model
+            try:
+                model = load_facebook_vectors(
+                    path=path
+                )
+            except UnicodeDecodeError:
+                pass
+            if model is None:
+                # Last chance, if it is a Word2Vec model
+                try:
+                    model = KeyedVectors.load(
+                        fname=path
+                    )
+                except:
+                    pass
+        if model is None:
+            raise Exception(f'Can\'t load {path} after multiple approaches.')
         # Cased Vocab
         cased_words = model.index_to_key