Mbonea committed on
Commit
1fba972
·
1 Parent(s): f5b43ca

filter conditions

Browse files
Files changed (1) hide show
  1. App/Embedding/utils/Initialize.py +30 -101
App/Embedding/utils/Initialize.py CHANGED
@@ -1,73 +1,32 @@
1
  from langchain.embeddings import HuggingFaceEmbeddings
2
  from langchain.docstore.document import Document
3
- from langchain.vectorstores import MongoDBAtlasVectorSearch
4
- from pymongo import MongoClient
5
- from motor.motor_asyncio import AsyncIOMotorClient
6
- import os,pprint
7
-
8
-
9
- completion_base = os.environ.get("completion_base")
10
- openai_api_key = os.environ.get("openai_api_key")
11
- mongoDB = os.environ.get("MONGO_DB")
12
- template = """### Given the following context
13
- ### Context
14
- {context}
15
- ### Use it to explain the question: {question}
16
- """
17
-
18
-
19
- async def fetch_data(question, context):
20
- url = completion_base
21
-
22
- payload = json.dumps(
23
- {
24
- "messages": [
25
- {
26
- "role": "system",
27
- "content": "### You provide explanations based on the provided context",
28
- },
29
- {
30
- "role": "user",
31
- "content": template.format(context=context, question=question),
32
- },
33
- ],
34
- "model": "gpt-3.5-turbo",
35
- "temperature": 1,
36
- "presence_penalty": 0,
37
- "top_p": 0.95,
38
- "frequency_penalty": 0,
39
- "stream": False,
40
- }
41
- )
42
 
43
- headers = {
44
- "Content-Type": "application/json",
45
- "Authorization": f"Bearer {openai_api_key}",
46
- }
47
 
48
- async with aiohttp.ClientSession() as session:
49
- async with session.post(url, headers=headers, data=payload) as response:
50
- response = await response.json()
51
- return response["choices"][0]["message"]["content"]
52
 
 
 
 
53
 
54
- async def delete_documents(task_id):
55
- client = AsyncIOMotorClient(mongoDB)
56
- db = client["transcriptions"]
57
- collection = db["videos"]
58
 
59
- result = await collection.delete_many({"task_id": task_id})
60
- print(f"Deleted {result.deleted_count} document(s)")
 
 
61
 
 
 
 
 
 
 
62
 
63
- # mongo_client = MongoClient(
64
- # mongoDB
65
- # )
66
- # model_name = "BAAI/bge-base-en"
67
- # collection = mongo_client["transcriptions"]["videos"]
68
- # embeddings = HuggingFaceEmbeddings(model_name=model_name)
69
- # index_name = "test_embeddings"
70
- # vectorstore = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)
71
 
72
 
73
  def generateChunks(chunks, task_id, n=100):
@@ -91,47 +50,17 @@ def generateChunks(chunks, task_id, n=100):
91
 
92
 
93
  def search(query: str, task_id: str):
94
- mongo_client = MongoClient(mongoDB)
95
- model_name = "BAAI/bge-base-en"
96
- collection = mongo_client["transcriptions"]["videos"]
97
- embeddings = HuggingFaceEmbeddings(model_name=model_name)
98
- index_name = "test_embedding"
99
- k = 5
100
- vectorstore = MongoDBAtlasVectorSearch(
101
- collection,
102
- embedding=embeddings,
103
- index_name="test_embedding",
104
- )
105
-
106
- data = vectorstore.similarity_search(
107
- query=query,
108
- pre_filter={"text": {"path": "task_id", "query": task_id}},
109
- search_kwargs={
110
- "k": k, # overrequest k during search
111
- "pre_filter": {"path": "task_id", "equals": task_id},
112
- "post_filter_pipeline": [{"$limit": k}], # limit results to top k
113
- },
114
- )
115
- # data =[d.dict() for d in data]
116
- # print(data[0].metadata.exclude({'_id','embedding'}))
117
- # pprint.pprint(data[0].metadata)
118
- return [{"text": d.page_content,'start':d.metadata['start'],"end":d.metadata['end']} for d in data]
119
- # agent =vectorstore.as_retriever(
120
 
121
- # )
122
- # return agent.get_relevant_documents
123
 
124
 
125
  def encode(temp: list[Document]):
126
- mongo_client = MongoClient(mongoDB)
127
- model_name = "BAAI/bge-base-en"
128
- collection = mongo_client["transcriptions"]["videos"]
129
- embeddings = HuggingFaceEmbeddings(model_name=model_name)
130
- index_name = "test_embedding"
131
- vectorstore = MongoDBAtlasVectorSearch(
132
- collection, embeddings, index_name=index_name
133
- )
134
- vectorstore.from_documents(
135
- temp, embedding=embeddings, collection=collection, index_name=index_name
136
- )
137
- # return embeddings.embed_documents(texts = [d.page_content for d in temp])
 
1
  from langchain.embeddings import HuggingFaceEmbeddings
2
  from langchain.docstore.document import Document
3
+ from langchain.vectorstores import Pinecone
4
+ import pinecone
5
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ # Get the API key from app.pinecone.io
8
+ PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
9
+ # Find your environment next to the API key in the Pinecone console
10
+ PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
11
 
 
 
 
 
12
 
13
+ index_name = "transcript-bits"
14
+ model_name = "thenlper/gte-base"
15
+ embeddings = HuggingFaceEmbeddings(model_name=model_name)
16
 
 
 
 
 
17
 
18
+ pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
19
+ vector_index = pinecone.Index(index_name=index_name)
20
+ docsearch = Pinecone.from_existing_index(index_name, embeddings)
21
+
22
 
23
async def delete_documents(task_id):
    """Delete the vectors whose ``task_id`` metadata equals *task_id*.

    Args:
        task_id: Identifier of the task whose vectors should be removed
            from the module-level ``docsearch`` vector store.

    Note:
        The vector-store call is made without ``await``; the function
        stays ``async`` so callers that already await it keep working.
    """
    # Filter on the *value* of task_id, not on the string "task_id";
    # otherwise the delete never targets the requested task.
    docsearch.delete(filter={"task_id": {"$eq": task_id}})
29
 
 
 
 
 
 
 
 
 
30
 
31
 
32
  def generateChunks(chunks, task_id, n=100):
 
50
 
51
 
52
  def search(query: str, task_id: str):
53
+ filtering_conditions = {
54
+ "task_id": {"$eq": "task_id"},
55
+ }
56
+ data =docsearch.similarity_search(query, k=10, filter=filtering_conditions)
57
+ return [
58
+ {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
59
+ for d in data
60
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
62
 
63
 
64
  def encode(temp: list[Document]):
65
+ docsearch.add_documents([temp])
66
+ # return embeddings.embed_documents(texts = [d.page_content for d in temp])