Kolumbus Lindh committed on
Commit
f0678ae
·
1 Parent(s): 0cb7604

updated to a new LLM with more tokens and larger vectors

Browse files
Files changed (1) hide show
  1. pinecone_handler.py +5 -3
pinecone_handler.py CHANGED
@@ -45,13 +45,15 @@ class PineconeHandler:
45
 
46
  self.pc.create_index(
47
  name=PINECONE_INDEX_NAME,
48
- dimension=384,
49
  metric="cosine",
50
  spec=spec
51
  )
52
  self.index = self.pc.Index(PINECONE_INDEX_NAME)
53
 
54
- self.model = SentenceTransformer('all-MiniLM-L6-v2')
 
 
55
  log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
56
 
57
  def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
@@ -91,7 +93,7 @@ class PineconeHandler:
91
  'city': (workplace_address.get('municipality', '') or '')[:100],
92
  'occupation': (occupation.get('label', '') or '')[:100],
93
  'headline': (ad.get('headline', '') or '')[:200],
94
- 'description': (description.get('text', '') or '')[:1000],
95
  'logo_url': (ad.get('logo_url', '') or '')[:200],
96
  'webpage_url': (ad.get('webpage_url', '') or '')[:200],
97
  'published': (ad.get('publication_date', '') or '')[:50]
 
45
 
46
  self.pc.create_index(
47
  name=PINECONE_INDEX_NAME,
48
+ dimension=512,
49
  metric="cosine",
50
  spec=spec
51
  )
52
  self.index = self.pc.Index(PINECONE_INDEX_NAME)
53
 
54
+ #self.model = SentenceTransformer('all-MiniLM-L6-v2')
55
+ #512 token max length, embedding dim 768
56
+ self.model = SentenceTransformer('sentence-transformers/allenai-specter')
57
  log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
58
 
59
  def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
 
93
  'city': (workplace_address.get('municipality', '') or '')[:100],
94
  'occupation': (occupation.get('label', '') or '')[:100],
95
  'headline': (ad.get('headline', '') or '')[:200],
96
+ 'description': (description.get('text', '') or '')[:2000],
97
  'logo_url': (ad.get('logo_url', '') or '')[:200],
98
  'webpage_url': (ad.get('webpage_url', '') or '')[:200],
99
  'published': (ad.get('publication_date', '') or '')[:50]