deenasun committed on
Commit
c4f7a31
·
1 Parent(s): f37f939

add python-version, update gitignore, update vectorizer to use cached gensim-data if available

Browse files
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .DS_Store
2
- .env
 
 
 
1
  .DS_Store
2
+ .env
3
+ __pycache__/*
4
+ gensim-data/*
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11.13
__pycache__/asl_gloss.cpython-311.pyc DELETED
Binary file (14.3 kB)
 
__pycache__/document_parsing.cpython-311.pyc DELETED
Binary file (15.2 kB)
 
__pycache__/document_parsing.cpython-313.pyc DELETED
Binary file (10.6 kB)
 
__pycache__/vectorizer.cpython-311.pyc DELETED
Binary file (7.07 kB)
 
vectorizer.py CHANGED
@@ -1,5 +1,6 @@
1
  import gensim
2
  import gensim.downloader
 
3
  import numpy as np
4
  import pandas as pd
5
  import os
@@ -18,8 +19,16 @@ class Vectorizer:
18
  """
19
  Returns a KeyedVector object loaded from gensim
20
  """
 
21
  try:
 
 
 
 
 
 
22
  kv = gensim.downloader.load(model_name) # returns a keyedvector
 
23
  return kv
24
  except Exception as e:
25
  print(f"Unable to load embedding model from gensim: {e}")
 
1
  import gensim
2
  import gensim.downloader
3
+ from gensim.models import KeyedVectors
4
  import numpy as np
5
  import pandas as pd
6
  import os
 
19
  """
20
  Returns a KeyedVector object loaded from gensim
21
  """
22
+ model_path = os.path.join(os.getcwd(), 'gensim-data', 'GoogleNews-vectors-negative300.bin.gz')
23
  try:
24
+ print(f"Loading model from {model_path}")
25
+ kv = KeyedVectors.load_word2vec_format(model_path, binary=True)
26
+ print("Word2Vec model loaded successfully as KeyedVectors object.")
27
+ return kv
28
+ except FileNotFoundError:
29
+ print(f"Error: Model file not found at {model_path}. Trying to download...")
30
  kv = gensim.downloader.load(model_name) # returns a keyedvector
31
+ print("Word2Vec model loaded successfully as KeyedVectors object.")
32
  return kv
33
  except Exception as e:
34
  print(f"Unable to load embedding model from gensim: {e}")