Spaces:

cnmoro
/

SemanticCompression

Running

cnmoro committed on May 19, 2024

Commit

bb73643

verified ·

1 Parent(s): b218a36

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,17 +3,15 @@ from minivectordb.embedding_model import EmbeddingModel
 from minivectordb.vector_database import VectorDatabase
 from multiprocessing import cpu_count
 from functools import lru_cache
-import fasttext, random, nltk, tiktoken, os
 import concurrent.futures
-nltk.download('stopwords')
-from nltk.corpus import stopwords
 os.environ['TOKENIZERS_PARALLELISM'] = 'true'
 langdetect_model = fasttext.load_model('lid.176.ftz')
 embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
-en_stop_words = stopwords.words('english')
-pt_stop_words = stopwords.words('portuguese')
 tokenizer = tiktoken.encoding_for_model("gpt-4")
 def count_tokens_tiktoken(text):

 from minivectordb.vector_database import VectorDatabase
 from multiprocessing import cpu_count
 from functools import lru_cache
+import fasttext, random, tiktoken, os, pickle
 import concurrent.futures
 os.environ['TOKENIZERS_PARALLELISM'] = 'true'
 langdetect_model = fasttext.load_model('lid.176.ftz')
 embedding_model = EmbeddingModel(onnx_model_cpu_core_count=1)
+en_stop_words = pickle.load(open("en_stopwords.pkl", "rb"))
+pt_stop_words = pickle.load(open("pt_stopwords.pkl", "rb"))
 tokenizer = tiktoken.encoding_for_model("gpt-4")
 def count_tokens_tiktoken(text):