Mbonea committed on
Commit
eb82136
·
1 Parent(s): ca40eab

create an index if none exists

Browse files
Files changed (1) hide show
  1. App/Embedding/utils/Initialize.py +27 -34
App/Embedding/utils/Initialize.py CHANGED
@@ -7,23 +7,37 @@ import os
7
 
8
 
9
 
10
-
11
- async def delete_documents(task_id):
12
- # get api key from app.pinecone.io
13
- PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
14
  # find your environment next to the api key in pinecone console
15
  PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
16
 
17
 
18
  index_name = "transcript-bits"
19
- model_name = "thenlper/gte-base"
20
  embeddings = HuggingFaceEmbeddings(model_name=model_name)
21
  spec = PodSpec()
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- pc=pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV,spec=spec)
25
- vector_index = pc.Index(index_name)
26
- docsearch = Pinecone.from_existing_index(index_name, embeddings)
 
 
27
 
28
  docsearch.delete(
29
  filter={
@@ -54,20 +68,7 @@ def generateChunks(chunks, task_id, n=100):
54
 
55
 
56
  def search(query: str, task_id: str):
57
- # get api key from app.pinecone.io
58
- PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
59
- # find your environment next to the api key in pinecone console
60
- PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
61
-
62
-
63
- index_name = "transcript-bits"
64
- model_name = "thenlper/gte-base"
65
- embeddings = HuggingFaceEmbeddings(model_name=model_name)
66
-
67
-
68
- pc=pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
69
- vector_index = pc.Index(index_name)
70
- docsearch = Pinecone.from_existing_index(index_name, embeddings)
71
 
72
  filtering_conditions = {
73
  "task_id": {"$eq": task_id},
@@ -81,18 +82,10 @@ def search(query: str, task_id: str):
81
 
82
 
83
  def encode(temp: list[Document]):
84
- PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
85
- # find your environment next to the api key in pinecone console
86
- PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
87
-
88
 
89
- index_name = "transcript-bits"
90
- model_name = "thenlper/gte-base"
91
- embeddings = HuggingFaceEmbeddings(model_name=model_name)
92
 
93
 
94
- pc=pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
95
- vector_index = pc.Index(index_name)
96
- docsearch = Pinecone.from_existing_index(index_name, embeddings)
97
- docsearch.add_documents(temp)
98
- # return embeddings.embed_documents(texts = [d.page_content for d in temp])
 
7
 
8
 
9
 
10
def initDocument():
    """Return a vector store bound to the ``transcript-bits`` Pinecone index.

    Reads ``PINECONE_API_KEY`` and ``PINECONE_ENVIRONMENT`` from the process
    environment, builds the HuggingFace embedding model, and makes sure the
    index exists (creating it when the lookup fails) before wrapping it with
    ``Pinecone.from_existing_index``.

    Returns:
        The object returned by ``Pinecone.from_existing_index(index_name,
        embeddings)``.

    Raises:
        Whatever the Pinecone client raises when the client cannot be
        constructed or the index can be neither opened nor created.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "Alibaba-NLP/gte-base-en-v1.5"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): PodSpec is built without arguments even though
    # PINECONE_ENV is read above — confirm this is intended.
    spec = PodSpec()

    # Build the client outside the try block: if construction fails there is
    # no client to create an index with, so the original error must surface
    # instead of being masked by a NameError on ``pc`` in the handler.
    pc = pinecone.Pinecone(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_ENV,
        spec=spec,
    )

    try:
        # Used only as an existence probe; the handle itself is not needed.
        pc.Index(index_name)
    except Exception:
        # Opening the index failed; assume it does not exist yet and create
        # it. ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pc.create_index(
            name=index_name,
            # 768 is expected to match the embedding size of the model above.
            dimension=768,
            metric="cosine",
            spec=spec,
        )

    return Pinecone.from_existing_index(index_name, embeddings)
35
 
36
+
37
+
38
+
39
+ async def delete_documents(task_id):
40
+ docsearch=initDocument()
41
 
42
  docsearch.delete(
43
  filter={
 
68
 
69
 
70
  def search(query: str, task_id: str):
71
+ docsearch=initDocument()
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  filtering_conditions = {
74
  "task_id": {"$eq": task_id},
 
82
 
83
 
84
  def encode(temp: list[Document]):
85
+ docsearch=initDocument()
86
+ docsearch.add_documents(temp)
87
+ # return embeddings.embed_documents(texts = [d.page_content for d in temp])
 
88
 
 
 
 
89
 
90
 
91
# Call initDocument() once here, discarding its return value; its except
# branch creates the "transcript-bits" index when opening it fails.
# NOTE(review): presumably this runs at module import time — confirm.
+ initDocument()