ajoy0071998 committed on
Commit
effc96d
·
verified ·
1 Parent(s): 79f08dd

Upload 5 files

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. chain.py +42 -0
  3. embeddings.py +50 -0
  4. response.py +28 -0
  5. user_data.py +37 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a live API key is committed here — rotate this key immediately and keep .env out of version control (.gitignore).
+ MISTRAL_API_KEY="VblmlGe1ROVcFpraMchfapRp4QvFacqf"
chain.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from embeddings import set_embedding
3
+ from embeddings import get_chunks
4
+ from user_data import user_doc,get_docs_by_user
5
+ from response import get_answer
6
+ from PyPDF2 import PdfReader
7
+ import os
8
+ import shutil
9
+
10
+
11
def extract_pdf_with_user(user_id: str, pdf_path: str, name: str) -> list:
    """Store the uploaded PDF under *name*, index its text, and return the user's docs.

    Copies the PDF to ``<dir>/<name>.pdf`` (skipping the copy when the path is
    already correct, which would otherwise raise ``SameFileError``), extracts
    its text, embeds it for *user_id*, and records the ownership row.

    Args:
        user_id: identifier used to tag embeddings and the ownership row.
        pdf_path: path of the uploaded PDF file.
        name: desired base name for the stored copy (without extension).

    Returns:
        The rows from ``get_docs_by_user(user_id)`` — a list of
        ``(user_id, doc_id)`` tuples.  (The original ``-> tuple`` annotation
        was wrong: ``get_docs_by_user`` returns a list.)
    """
    dir_path = os.path.dirname(pdf_path)
    new_pdf_path = os.path.join(dir_path, f"{name}.pdf")

    if pdf_path != new_pdf_path:
        shutil.copy(pdf_path, new_pdf_path)

    reader = PdfReader(new_pdf_path)
    # extract_text() may return None for image-only pages; join in one pass
    # instead of quadratic `+=` concatenation.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    set_embedding(text, new_pdf_path, user_id)
    user_doc(user_id, new_pdf_path)
    return get_docs_by_user(user_id)
28
+
29
def get_final_aswer(query: str, user_id: str, pdf_path: str) -> str:
    """Answer *query* from the user's indexed documents.

    Retrieves the top matching chunks for *user_id*, concatenates them into a
    single context string, and asks the LLM for the final answer.

    NOTE(review): the misspelled public name (`aswer`) is kept so existing
    callers do not break.  The original ``-> list`` annotation was wrong:
    ``get_answer`` returns the model's text content.

    Args:
        query: the user's question.
        user_id: whose documents to search.
        pdf_path: forwarded to ``get_chunks`` (currently unused there).

    Returns:
        The LLM's answer text.
    """
    chunks = get_chunks(query, user_id, pdf_path)
    # join() builds the context in one pass instead of quadratic `+=`.
    full_text = "".join(chunks)
    return get_answer(query, full_text)
38
+
39
+
40
+
41
+
42
+
embeddings.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Sentence-based embedding layer: split text into chunks, embed them with a
# HuggingFace sentence-transformer, and persist the vectors in Chroma.
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma

# Small CPU-friendly model; normalize_embeddings=True makes cosine similarity
# equivalent to a dot product on the stored vectors.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Persistent Chroma store shared by set_embedding() and get_chunks().
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
21
+
22
+
23
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Chunk *text* and index every chunk in the Chroma vector store.

    Each chunk becomes a ``Document`` tagged with ``doc_id``/``user_id``
    metadata and the deterministic id ``user_id + doc_id + index``, so
    re-indexing the same document overwrites rather than duplicates.

    Args:
        text: full extracted document text (may be empty).
        doc_id: document identifier (the stored PDF path).
        user_id: owner of the document.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list[str]

    # Build plain local Documents — the original stuffed each one into
    # globals() (leaking a module-level name per chunk) and called the store
    # once per chunk; a single batched add_documents() call is equivalent.
    documents = [
        Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=user_id + doc_id + str(i),
        )
        for i, chunk in enumerate(texts)
    ]
    # Guard the empty case (the original crashed on texts[0] for empty input).
    if documents:
        vector_store.add_documents(documents)
40
+
41
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the page contents of the top-5 chunks matching *query*.

    Results are restricted to the given user's documents via a metadata
    filter on ``user_id``.

    NOTE(review): ``doc_id`` is accepted but unused — the original filtered
    on ``user_id`` only.  Restricting to one document would need a Chroma
    ``{"$and": [...]}`` filter; confirm the intended scope before tightening.

    Args:
        query: free-text search query.
        user_id: whose documents to search.
        doc_id: currently ignored (see NOTE above).

    Returns:
        List of chunk texts, most similar first.
    """
    results = vector_store.similarity_search(
        query,
        k=5,
        filter={"user_id": user_id},
    )
    return [res.page_content for res in results]
response.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLM answering layer: formats the retrieved context and the user question
# into a prompt and queries Mistral for the final answer.
# load_dotenv() must run BEFORE ChatMistralAI is constructed so that
# MISTRAL_API_KEY from .env is visible in the environment.
from dotenv import load_dotenv
load_dotenv()

from langchain_mistralai import ChatMistralAI

# temperature=0 for deterministic answers; a single retry bounds latency.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=1,
)

# NOTE(review): name keeps the original spelling ("tamplet") — it is
# referenced by get_answer() below.
prompt_tamplet = """
You just need to answer the question based on the following context.
QUESTIONS : {question}
CONTEXT : {context}
"""
21
+
22
+
23
+
24
def get_answer(question: str, context: str) -> str:
    """Ask the LLM *question*, grounded in the retrieved *context*.

    Args:
        question: the user's query.
        context: concatenated retrieved chunks to ground the answer.

    Returns:
        The model's reply text (``response.content``).
    """
    final_prompt = prompt_tamplet.format(question=question, context=context)
    response = llm.invoke(final_prompt)
    return response.content
user_data.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sqlite3
import os

# SQLite file mapping users to the documents they uploaded.
db_path = os.path.join("data", "user_docs.db")


def init_db() -> None:
    """Create the ``user_docs`` table (and its parent directory) if missing.

    ``sqlite3.connect`` fails with ``OperationalError`` when the parent
    directory does not exist, so ``data/`` is created first — the original
    crashed on a fresh checkout without that directory.
    """
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS user_docs (
                user_id TEXT,
                doc_id TEXT
            )
        ''')
        conn.commit()
    finally:
        # Close even if table creation raises (original leaked on error).
        conn.close()


# Initialize the schema at import time so user_doc()/get_docs_by_user()
# can assume the table exists.
init_db()
17
def user_doc(user_id: str, doc_id: str, db_file=None) -> None:
    """Record that *user_id* owns *doc_id* in the ``user_docs`` table.

    Args:
        user_id: owner of the document.
        doc_id: document identifier (the stored PDF path).
        db_file: optional alternate SQLite database path; defaults to the
            module-level ``db_path``.  (Backward-compatible addition.)
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        cursor = conn.cursor()
        # Parameterized query — never interpolate user input into SQL.
        cursor.execute('''
        INSERT INTO user_docs (user_id, doc_id)
        VALUES (?, ?)
        ''', (user_id, doc_id))
        conn.commit()
    finally:
        # Close even when the INSERT raises (original leaked the connection).
        conn.close()
26
+
27
+
28
def get_docs_by_user(user_id: str, db_file=None) -> list:
    """Return all ``(user_id, doc_id)`` rows recorded for *user_id*.

    Args:
        user_id: owner whose documents to list.
        db_file: optional alternate SQLite database path; defaults to the
            module-level ``db_path``.  (Backward-compatible addition.)

    Returns:
        List of ``(user_id, doc_id)`` tuples; empty list when none exist.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        cursor = conn.cursor()
        cursor.execute('''
        SELECT user_id, doc_id FROM user_docs WHERE user_id = ?
        ''', (user_id,))
        return cursor.fetchall()
    finally:
        # Close even when the SELECT raises (original leaked the connection).
        conn.close()
37
+