ajoy0071998 committed on
Commit
6d9f20a
·
verified ·
1 Parent(s): dbce58a

Upload 5 files

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. app.py +53 -0
  3. chain.py +42 -0
  4. embeddings.py +50 -0
  5. response.py +28 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ MISTRAL_API_KEY="VblmlGe1ROVcFpraMchfapRp4QvFacqf"  # SECURITY: this secret is committed to the repository — revoke/rotate the key and keep .env out of version control
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end: upload a PDF, embed it, then ask questions about it."""
import streamlit as st
import os
import shutil
from chain import extract_pdf_with_user, get_final_aswer

# Every uploaded PDF is persisted under this directory.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

st.title("PDF Extraction and Question Answering")

# --- Step 1: Upload PDF and extract ---
st.header("Upload PDF and extract text")

user_id = st.text_input("User ID")
name = st.text_input("Save PDF name (without extension)")
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if st.button("Extract PDF"):
    if user_id and name and uploaded_file:
        # Persist the upload to disk under its original filename;
        # extract_pdf_with_user() re-saves it as "<name>.pdf" alongside it.
        destination = os.path.join(UPLOAD_DIR, uploaded_file.name)
        with open(destination, "wb") as out_fh:
            shutil.copyfileobj(uploaded_file, out_fh)

        # Extract + embed, then show what was indexed for this user.
        documents = extract_pdf_with_user(user_id, destination, name)
        st.success("PDF processed!")
        st.write("Extracted documents:")
        st.write(documents)
    else:
        st.error("Please provide User ID, name, and upload a PDF file.")

# --- Step 2: Ask a question ---
st.header("Ask a question")

query = st.text_input("Enter your question")

if st.button("Get Answer"):
    if not (user_id and name and query):
        st.error("Please provide User ID, name, and your question.")
    else:
        pdf_path = os.path.join(UPLOAD_DIR, f"{name}.pdf")
        if os.path.exists(pdf_path):
            answer = get_final_aswer(query, user_id, pdf_path)
            st.success("Answer generated!")
            st.write(answer)
        else:
            st.error(f"No PDF found with name '{name}.pdf'. Please upload and extract first.")
chain.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from embeddings import set_embedding
3
+ from embeddings import get_chunks
4
+ from user_data import user_doc,get_docs_by_user
5
+ from response import get_answer
6
+ from PyPDF2 import PdfReader
7
+ import os
8
+ import shutil
9
+
10
+
11
def extract_pdf_with_user(user_id: str, pdf_path: str, name: str) -> tuple:
    """Copy the PDF to a canonical ``<name>.pdf`` path, extract its text,
    embed it, and register it against the user.

    Args:
        user_id: Owner of the document.
        pdf_path: Path of the uploaded PDF on disk.
        name: Target file name (without the ``.pdf`` extension).

    Returns:
        The user's document list, as reported by ``get_docs_by_user``.
    """
    # Canonical location: same directory as the upload, renamed to <name>.pdf.
    dir_path = os.path.dirname(pdf_path)
    new_pdf_path = os.path.join(dir_path, f"{name}.pdf")
    if pdf_path != new_pdf_path:
        shutil.copy(pdf_path, new_pdf_path)

    # Concatenate text from every page with join (the original used += in a
    # loop, which is quadratic). extract_text() may return None for
    # image-only pages, hence the "or ''" guard.
    reader = PdfReader(new_pdf_path)
    text = "".join(page.extract_text() or "" for page in reader.pages)

    set_embedding(text, new_pdf_path, user_id)
    user_doc(user_id, new_pdf_path)
    return get_docs_by_user(user_id)
28
+
29
def get_final_aswer(query: str, user_id: str, pdf_path: str) -> str:
    """Answer *query* from the stored chunks of the user's *pdf_path* document.

    Retrieves the most similar chunks, concatenates them into one context
    string, and asks the LLM. Returns the model's text reply (the original
    annotation said ``list``, but ``get_answer`` returns ``response.content``,
    a string).

    NOTE(review): the name keeps the original "aswer" typo because callers
    (e.g. app.py) import it under this exact name.
    """
    chunks = get_chunks(query, user_id, pdf_path)
    # Single join instead of the original quadratic += loop.
    full_text = "".join(chunks)
    return get_answer(query, full_text)
38
+
39
+
40
+
41
+
42
+
embeddings.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Sentence-based embedding storage and retrieval backed by Chroma."""
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma

# CPU-only MiniLM sentence embeddings, L2-normalised so cosine similarity
# behaves well in the vector store.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Persistent on-disk vector store shared by set_embedding() and get_chunks().
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
21
+
22
+
23
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into token-based chunks and index them in the vector store.

    Each chunk becomes a Document tagged with ``doc_id``/``user_id`` metadata
    and a deterministic id of ``user_id + doc_id + chunk_index``, so
    re-indexing the same document reuses the same ids.

    Args:
        text: Full extracted text of the PDF.
        doc_id: Canonical path of the PDF (used as the document identifier).
        user_id: Owner of the document.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list[str]

    # Plain local Documents: the original stashed each one in globals()
    # (globals()[f"document_{i}"]), leaking every chunk for the lifetime
    # of the process and shadowing module state.
    documents = [
        Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=f"{user_id}{doc_id}{i}",
        )
        for i, chunk in enumerate(texts)
    ]

    if documents:
        # One batched insert instead of one add_documents() call per chunk.
        vector_store.add_documents(documents)
    for i, doc in enumerate(documents):
        print(f"Added document {i} with id {doc.id}")
40
+
41
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the page contents of the 5 chunks most similar to *query*,
    restricted to this user's chunks for *doc_id*.

    Bug fix: the original accepted ``doc_id`` but filtered only on
    ``user_id``, so answers could draw on any of the user's documents
    instead of the one requested.
    """
    results = vector_store.similarity_search(
        query,
        k=5,
        # Chroma requires $and to combine multiple metadata conditions.
        filter={"$and": [{"user_id": user_id}, {"doc_id": doc_id}]},
    )
    return [res.page_content for res in results]
response.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""LLM answering layer: formats retrieved context into a prompt for Mistral."""
# load_dotenv() must run before the client is built so MISTRAL_API_KEY
# from .env is visible to ChatMistralAI.
from dotenv import load_dotenv
load_dotenv()

from langchain_mistralai import ChatMistralAI

# Deterministic (temperature=0) large Mistral model, single retry on failure.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=1,
)

# NOTE(review): "tamplet" is a typo for "template"; the name is kept because
# other code may reference it at module level.
prompt_tamplet = """
You just need to answer the question based on the following context.
QUESTIONS : {question}
CONTEXT : {context}
"""
21
+
22
+
23
+
24
def get_answer(question: str, context: str) -> str:
    """Fill the prompt template with *question* and *context*, invoke the
    LLM, and return the model's text reply."""
    final_prompt = prompt_tamplet.format(question=question, context=context)
    reply = llm.invoke(final_prompt)
    return reply.content