Spaces:
Running
Running
Upload 5 files
Browse files- .env +1 -0
- app.py +53 -0
- chain.py +42 -0
- embeddings.py +50 -0
- response.py +28 -0
.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# SECURITY: a real API key was committed here — revoke it immediately and
# supply the value via the platform's secrets manager instead of the repo.
MISTRAL_API_KEY="<your-mistral-api-key>"
|
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit front-end: upload a PDF, index it for a user, then ask questions."""
import streamlit as st
import os
import shutil
from chain import extract_pdf_with_user, get_final_aswer

# Directory where uploaded PDFs are persisted between the two steps.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

st.title("PDF Extraction and Question Answering")

# --- Step 1: Upload PDF and extract ---

st.header("Upload PDF and extract text")

user_id = st.text_input("User ID")
name = st.text_input("Save PDF name (without extension)")

uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if st.button("Extract PDF"):
    if not user_id or not name or not uploaded_file:
        st.error("Please provide User ID, name, and upload a PDF file.")
    else:
        # Save the uploaded PDF. basename() strips any directory components a
        # crafted filename could carry (path-traversal guard).
        upload_path = os.path.join(UPLOAD_DIR, os.path.basename(uploaded_file.name))
        with open(upload_path, "wb") as f:
            shutil.copyfileobj(uploaded_file, f)

        # Extract text, embed it, and register the document for this user.
        documents = extract_pdf_with_user(user_id, upload_path, name)
        st.success("PDF processed!")
        st.write("Extracted documents:")
        st.write(documents)

# --- Step 2: Ask a question ---

st.header("Ask a question")

query = st.text_input("Enter your question")

if st.button("Get Answer"):
    if not user_id or not name or not query:
        st.error("Please provide User ID, name, and your question.")
    else:
        # Step 1 stores a copy named "<name>.pdf"; that path doubles as the
        # document id used when filtering embeddings.
        pdf_path = os.path.join(UPLOAD_DIR, f"{name}.pdf")
        if not os.path.exists(pdf_path):
            st.error(f"No PDF found with name '{name}.pdf'. Please upload and extract first.")
        else:
            answer = get_final_aswer(query, user_id, pdf_path)
            st.success("Answer generated!")
            st.write(answer)
|
chain.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from embeddings import set_embedding
|
3 |
+
from embeddings import get_chunks
|
4 |
+
from user_data import user_doc,get_docs_by_user
|
5 |
+
from response import get_answer
|
6 |
+
from PyPDF2 import PdfReader
|
7 |
+
import os
|
8 |
+
import shutil
|
9 |
+
|
10 |
+
|
11 |
+
def extract_pdf_with_user(user_id: str, pdf_path: str, name: str):
    """Copy the PDF to "<name>.pdf", extract its text, embed it, and register it.

    Args:
        user_id: Owner of the document; used to tag the stored embeddings.
        pdf_path: Path of the uploaded PDF file.
        name: Target file name (without extension) for the stored copy.

    Returns:
        The user's registered documents, as returned by ``get_docs_by_user``.
        (The original annotated this ``-> tuple``, which did not match.)
    """
    # Store the document under a caller-chosen, stable name; skip the copy
    # when the upload already has that exact path.
    dir_path = os.path.dirname(pdf_path)
    new_pdf_path = os.path.join(dir_path, f"{name}.pdf")
    if pdf_path != new_pdf_path:
        shutil.copy(pdf_path, new_pdf_path)

    # Concatenate every page's text. extract_text() can return None for
    # pages with no extractable text, hence the `or ""` guard; join() avoids
    # the quadratic string `+=` of the original loop.
    reader = PdfReader(new_pdf_path)
    text = "".join((page.extract_text() or "") for page in reader.pages)

    # The stored path doubles as the document id in the vector store.
    set_embedding(text, new_pdf_path, user_id)
    user_doc(user_id, new_pdf_path)
    return get_docs_by_user(user_id)
|
29 |
+
def get_final_aswer(query: str, user_id: str, pdf_path: str) -> str:
    """Answer *query* from chunks retrieved for this user's document.

    NOTE(review): the misspelled name ("aswer") is kept because app.py
    imports the function under this exact name.

    Args:
        query: The user's question.
        user_id: Owner whose embeddings should be searched.
        pdf_path: Stored PDF path, used as the document id for retrieval.

    Returns:
        The model's answer text (``response.content`` from the LLM; the
        original's ``-> list`` annotation did not match).
    """
    chunks = get_chunks(query, user_id, pdf_path)
    # Join the retrieved chunks into one context string (join() instead of
    # the original quadratic `+=` accumulation).
    full_text = "".join(chunks)
    return get_answer(query, full_text)
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
|
embeddings.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##return embedding vector for a given text
|
2 |
+
##uses senetence based emebdings
|
3 |
+
from langchain_text_splitters import CharacterTextSplitter
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_core.documents import Document
|
6 |
+
from langchain_chroma import Chroma
|
7 |
+
|
8 |
+
# Embedding model: sentence-level MiniLM run on CPU; output vectors are
# L2-normalized so cosine similarity reduces to a dot product.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


# Persistent Chroma collection shared by set_embedding() and get_chunks();
# data survives process restarts via the local "chroma_langchain_db" directory.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
|
21 |
+
|
22 |
+
|
23 |
+
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into token-based chunks and index them in the vector store.

    Each chunk becomes a Document tagged with ``{doc_id, user_id}`` metadata
    and a deterministic id of the form ``"<user_id><doc_id><index>"`` (same
    scheme as the original, so existing entries are overwritten consistently).

    Args:
        text: Full extracted text of the document.
        doc_id: Identifier of the source document (the stored PDF path).
        user_id: Owner of the document.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list[str]

    # Build all Documents locally, then add them in one batch call instead of
    # one store round-trip per chunk. (The original stashed each Document in
    # globals(), leaking module state, and crashed on empty `texts` via
    # `print(type(texts[0]))`.)
    documents = []
    ids = []
    for i, chunk in enumerate(texts):
        vector_id = f"{user_id}{doc_id}{i}"
        documents.append(
            Document(
                page_content=chunk,
                metadata={"doc_id": doc_id, "user_id": user_id},
                id=vector_id,
            )
        )
        ids.append(vector_id)

    if documents:  # add_documents rejects an empty batch
        vector_store.add_documents(documents, ids=ids)
        print(f"Added {len(documents)} chunks for doc {doc_id}")
|
40 |
+
|
41 |
+
def get_chunks(query: str, user_id: str, doc_id: str) -> list:
    """Return the text of the top-5 stored chunks most similar to *query*.

    BUG FIX: the original accepted *doc_id* but never used it, so retrieval
    could mix chunks from all of the user's documents. The filter now matches
    both user_id and doc_id.

    Args:
        query: Natural-language question to search with.
        user_id: Owner whose chunks may be returned.
        doc_id: Document identifier the chunks must belong to.

    Returns:
        List of chunk strings (page_content), best match first.
    """
    results = vector_store.similarity_search(
        query,
        k=5,
        # Chroma requires $and to combine multiple metadata conditions.
        filter={"$and": [{"user_id": user_id}, {"doc_id": doc_id}]},
    )
    return [res.page_content for res in results]
|
response.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Response generation: fill the retrieved context and the user's question
## into a prompt template and query the Mistral chat model for the answer.
## (Previous header comments described the embeddings module and were a
## copy-paste leftover.)
|
5 |
+
# Load MISTRAL_API_KEY (and anything else in .env) into the environment
# before ChatMistralAI below is constructed, since the client reads the
# key from the environment at construction time.
from dotenv import load_dotenv
load_dotenv()

from langchain_mistralai import ChatMistralAI

# Deterministic chat model (temperature=0); retry at most once on failure.
llm = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=1,
)

# Prompt skeleton filled in by get_answer(). NOTE(review): the name keeps
# its original misspelling ("tamplet") because get_answer() references it.
prompt_tamplet = """
You just need to answer the question based on the following context.
QUESTIONS : {question}
CONTEXT : {context}
"""
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
def get_answer(question: str, context: str) -> str:
    """Ask the LLM *question* grounded in *context* and return its text.

    BUG FIX: the original used ``str.format`` on the template, so any
    literal brace ("{" or "}") in the PDF-derived context or question raised
    KeyError/ValueError. ``str.replace`` is immune to brace injection.

    Args:
        question: The user's question.
        context: Retrieved document text to ground the answer in.

    Returns:
        The model's answer text (``response.content``).
    """
    final_prompt = (
        prompt_tamplet
        .replace("{context}", context)
        .replace("{question}", question)
    )
    response = llm.invoke(final_prompt)
    return response.content
|