Spaces:
Running
Running
Kolumbus Lindh
committed on
Commit
·
f0678ae
1
Parent(s):
0cb7604
updated to a new LLM with more tokens and larger vectors
Browse files
- pinecone_handler.py +5 -3
pinecone_handler.py
CHANGED
@@ -45,13 +45,15 @@ class PineconeHandler:
|
|
45 |
|
46 |
self.pc.create_index(
|
47 |
name=PINECONE_INDEX_NAME,
|
48 |
-
dimension=
|
49 |
metric="cosine",
|
50 |
spec=spec
|
51 |
)
|
52 |
self.index = self.pc.Index(PINECONE_INDEX_NAME)
|
53 |
|
54 |
-
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
55 |
log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
|
56 |
|
57 |
def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
|
@@ -91,7 +93,7 @@ class PineconeHandler:
|
|
91 |
'city': (workplace_address.get('municipality', '') or '')[:100],
|
92 |
'occupation': (occupation.get('label', '') or '')[:100],
|
93 |
'headline': (ad.get('headline', '') or '')[:200],
|
94 |
-
'description': (description.get('text', '') or '')[:
|
95 |
'logo_url': (ad.get('logo_url', '') or '')[:200],
|
96 |
'webpage_url': (ad.get('webpage_url', '') or '')[:200],
|
97 |
'published': (ad.get('publication_date', '') or '')[:50]
|
|
|
45 |
|
46 |
self.pc.create_index(
|
47 |
name=PINECONE_INDEX_NAME,
|
48 |
+
dimension=512,
|
49 |
metric="cosine",
|
50 |
spec=spec
|
51 |
)
|
52 |
self.index = self.pc.Index(PINECONE_INDEX_NAME)
|
53 |
|
54 |
+
#self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
55 |
+
#512 token max length, embedding dim 768
|
56 |
+
self.model = SentenceTransformer('sentence-transformers/allenai-specter')
|
57 |
log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
|
58 |
|
59 |
def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
|
|
|
93 |
'city': (workplace_address.get('municipality', '') or '')[:100],
|
94 |
'occupation': (occupation.get('label', '') or '')[:100],
|
95 |
'headline': (ad.get('headline', '') or '')[:200],
|
96 |
+
'description': (description.get('text', '') or '')[:2000],
|
97 |
'logo_url': (ad.get('logo_url', '') or '')[:200],
|
98 |
'webpage_url': (ad.get('webpage_url', '') or '')[:200],
|
99 |
'published': (ad.get('publication_date', '') or '')[:50]
|