ajoy0071998 committed on
Commit
effc96d
·
verified ·
1 Parent(s): 79f08dd

Upload 5 files

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. chain.py +42 -0
  3. embeddings.py +50 -0
  4. response.py +28 -0
  5. user_data.py +37 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ # SECURITY: a live API key is committed here — rotate this key immediately and keep .env out of version control (.gitignore).
+ MISTRAL_API_KEY="VblmlGe1ROVcFpraMchfapRp4QvFacqf"
chain.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from embeddings import set_embedding
3
+ from embeddings import get_chunks
4
+ from user_data import user_doc,get_docs_by_user
5
+ from response import get_answer
6
+ from PyPDF2 import PdfReader
7
+ import os
8
+ import shutil
9
+
10
+
11
def extract_pdf_with_user(user_id: str, pdf_path: str, name: str) -> list:
    """Store the uploaded PDF under *name*, index its text, and return the user's docs.

    Copies the PDF to ``<dir>/<name>.pdf`` (skipping the copy when the path is
    already correct, which would otherwise raise ``SameFileError``), extracts
    its text, embeds it for *user_id*, and records the ownership row.

    Args:
        user_id: identifier used to tag embeddings and the ownership row.
        pdf_path: path of the uploaded PDF file.
        name: desired base name for the stored copy (without extension).

    Returns:
        The rows from ``get_docs_by_user(user_id)`` — a list of
        ``(user_id, doc_id)`` tuples.  (The original ``-> tuple`` annotation
        was wrong: ``get_docs_by_user`` returns a list.)
    """
    dir_path = os.path.dirname(pdf_path)
    new_pdf_path = os.path.join(dir_path, f"{name}.pdf")

    if pdf_path != new_pdf_path:
        shutil.copy(pdf_path, new_pdf_path)

    reader = PdfReader(new_pdf_path)
    # extract_text() may return None for image-only pages; join in one pass
    # instead of quadratic `+=` concatenation.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    set_embedding(text, new_pdf_path, user_id)
    user_doc(user_id, new_pdf_path)
    return get_docs_by_user(user_id)
28
+
29
def get_final_aswer(query: str, user_id: str, pdf_path: str) -> str:
    """Answer *query* from the user's indexed documents.

    Retrieves the top matching chunks for *user_id*, concatenates them into a
    single context string, and asks the LLM for the final answer.

    NOTE(review): the misspelled public name (`aswer`) is kept so existing
    callers do not break.  The original ``-> list`` annotation was wrong:
    ``get_answer`` returns the model's text content.

    Args:
        query: the user's question.
        user_id: whose documents to search.
        pdf_path: forwarded to ``get_chunks`` (currently unused there).

    Returns:
        The LLM's answer text.
    """
    chunks = get_chunks(query, user_id, pdf_path)
    # join() builds the context in one pass instead of quadratic `+=`.
    full_text = "".join(chunks)
    return get_answer(query, full_text)
38
+
39
+
40
+
41
+
42
+
embeddings.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Sentence-based embedding layer: split text into chunks, embed them with a
# HuggingFace sentence-transformer, and persist the vectors in Chroma.
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma

# Small CPU-friendly model; normalize_embeddings=True makes cosine similarity
# equivalent to a dot product on the stored vectors.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Persistent Chroma store shared by set_embedding() and get_chunks().
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
21
+
22
+
23
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Chunk *text* and index every chunk in the Chroma vector store.

    Each chunk becomes a ``Document`` tagged with ``doc_id``/``user_id``
    metadata and the deterministic id ``user_id + doc_id + index``, so
    re-indexing the same document overwrites rather than duplicates.

    Args:
        text: full extracted document text (may be empty).
        doc_id: document identifier (the stored PDF path).
        user_id: owner of the document.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list[str]

    # Build plain local Documents — the original stuffed each one into
    # globals() (leaking a module-level name per chunk) and called the store
    # once per chunk; a single batched add_documents() call is equivalent.
    documents = [
        Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=user_id + doc_id + str(i),
        )
        for i, chunk in enumerate(texts)
    ]
    # Guard the empty case (the original crashed on texts[0] for empty input).
    if documents:
        vector_store.add_documents(documents)
40
+
41
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the page contents of the top-5 chunks matching *query*.

    Results are restricted to the given user's documents via a metadata
    filter on ``user_id``.

    NOTE(review): ``doc_id`` is accepted but unused — the original filtered
    on ``user_id`` only.  Restricting to one document would need a Chroma
    ``{"$and": [...]}`` filter; confirm the intended scope before tightening.

    Args:
        query: free-text search query.
        user_id: whose documents to search.
        doc_id: currently ignored (see NOTE above).

    Returns:
        List of chunk texts, most similar first.
    """
    results = vector_store.similarity_search(
        query,
        k=5,
        filter={"user_id": user_id},
    )
    return [res.page_content for res in results]
response.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLM answering layer: formats the retrieved context and the user question
# into a prompt and queries Mistral for the final answer.
# load_dotenv() must run BEFORE ChatMistralAI is constructed so that
# MISTRAL_API_KEY from .env is visible in the environment.
from dotenv import load_dotenv
load_dotenv()

from langchain_mistralai import ChatMistralAI

# temperature=0 for deterministic answers; a single retry bounds latency.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=1,
)

# NOTE(review): name keeps the original spelling ("tamplet") — it is
# referenced by get_answer() below.
prompt_tamplet = """
You just need to answer the question based on the following context.
QUESTIONS : {question}
CONTEXT : {context}
"""
21
+
22
+
23
+
24
def get_answer(question: str, context: str) -> str:
    """Ask the LLM *question*, grounded in the retrieved *context*.

    Args:
        question: the user's query.
        context: concatenated retrieved chunks to ground the answer.

    Returns:
        The model's reply text (``response.content``).
    """
    final_prompt = prompt_tamplet.format(question=question, context=context)
    response = llm.invoke(final_prompt)
    return response.content
user_data.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sqlite3
import os

# SQLite file mapping users to the documents they uploaded.
db_path = os.path.join("data", "user_docs.db")


def init_db() -> None:
    """Create the ``user_docs`` table (and its parent directory) if missing.

    ``sqlite3.connect`` fails with ``OperationalError`` when the parent
    directory does not exist, so ``data/`` is created first — the original
    crashed on a fresh checkout without that directory.
    """
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS user_docs (
                user_id TEXT,
                doc_id TEXT
            )
        ''')
        conn.commit()
    finally:
        # Close even if table creation raises (original leaked on error).
        conn.close()


# Initialize the schema at import time so user_doc()/get_docs_by_user()
# can assume the table exists.
init_db()
17
def user_doc(user_id: str, doc_id: str, db_file=None) -> None:
    """Record that *user_id* owns *doc_id* in the ``user_docs`` table.

    Args:
        user_id: owner of the document.
        doc_id: document identifier (the stored PDF path).
        db_file: optional alternate SQLite database path; defaults to the
            module-level ``db_path``.  (Backward-compatible addition.)
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        cursor = conn.cursor()
        # Parameterized query — never interpolate user input into SQL.
        cursor.execute('''
        INSERT INTO user_docs (user_id, doc_id)
        VALUES (?, ?)
        ''', (user_id, doc_id))
        conn.commit()
    finally:
        # Close even when the INSERT raises (original leaked the connection).
        conn.close()
26
+
27
+
28
def get_docs_by_user(user_id: str, db_file=None) -> list:
    """Return all ``(user_id, doc_id)`` rows recorded for *user_id*.

    Args:
        user_id: owner whose documents to list.
        db_file: optional alternate SQLite database path; defaults to the
            module-level ``db_path``.  (Backward-compatible addition.)

    Returns:
        List of ``(user_id, doc_id)`` tuples; empty list when none exist.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        cursor = conn.cursor()
        cursor.execute('''
        SELECT user_id, doc_id FROM user_docs WHERE user_id = ?
        ''', (user_id,))
        return cursor.fetchall()
    finally:
        # Close even when the SELECT raises (original leaked the connection).
        conn.close()
37
+