Aseem Gupta committed on
Commit
35d9362
·
1 Parent(s): bfa0055

Current alpha version: supports PDFs only; for now, a single common DB is shared by all users.

Browse files
Files changed (2) hide show
  1. app.py +137 -0
  2. requirements.txt +18 -0
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ # from langchain_chroma import Chroma
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_groq import ChatGroq
7
+ from langchain.chains import create_retrieval_chain
8
+ from langchain.chains.combine_documents import create_stuff_documents_chain
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ import os
11
+ from dotenv import load_dotenv
12
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
13
+ # from langchain.embeddings import HuggingFaceEmbeddings # open source free embedding
14
+ load_dotenv()
15
+
16
+
17
class PDFQAProcessor:
    """Retrieval-augmented question answering over uploaded PDF files.

    The LLM, prompt, document chain, embedding model and text splitter are
    class attributes: they are built once, when the class body executes, and
    are shared by every instance.  Each instance owns the FAISS vector store
    and retriever that ``process_pdfs`` builds and ``answer_question`` queries.
    """

    # The system prompt comes from the environment (see ``load_dotenv()``).
    # NOTE(review): if SYSTEM_PROMPT is unset this is None, and prompt
    # construction below presumably fails at import time -- confirm.
    # ``create_stuff_documents_chain`` also expects the prompt to expose a
    # ``{context}`` placeholder for the retrieved documents.
    SYSTEM_PROMPT = os.getenv('SYSTEM_PROMPT')

    llm = ChatGroq(
        model_name="llama-3.3-70b-versatile",
        temperature=0.1,
        max_tokens=8000,
        api_key=os.getenv('GROQ_API_KEY'),
    )

    # Prompt: environment-provided system message + the user's question.
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{input}"),
    ])

    # Chain that stuffs the retrieved documents into the prompt and calls the LLM.
    question_answer_chain = create_stuff_documents_chain(llm, prompt)

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    CHUNK_SIZE = 550
    CHUNK_OVERLAP = 80
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    def __init__(self):
        """Start with no index; ``process_pdfs`` must run before questions."""
        self.vectorstore = None
        self.retriever = None

    def process_pdfs(self, pdf_files):
        """Load the given PDFs, split them into chunks and index them in FAISS.

        Args:
            pdf_files: Iterable of uploaded files.  Each item may be an object
                exposing the file path as ``.name`` or a plain path string.

        Returns:
            A human-readable status message (success or error description).
            Errors are reported in the message rather than raised, so the
            string can be shown directly in the UI.
        """
        if not pdf_files:
            return "Please upload PDF files first!"

        try:
            docs = []
            for pdf_file in pdf_files:
                # Accept both upload objects (path in ``.name``) and raw paths.
                pdf_path = getattr(pdf_file, "name", pdf_file)
                docs.extend(PyPDFLoader(pdf_path).load())

            splits = self.text_splitter.split_documents(docs)
            if not splits:
                # Nothing to embed (e.g. PDFs without a text layer); building
                # an index from an empty list would only yield an obscure error.
                return "Error processing PDFs: no extractable text found."

            self.vectorstore = FAISS.from_documents(splits, self.embeddings)
            self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 18})
            return "PDFs processed successfully! Ask your questions now."

        except Exception as e:
            # UI boundary: surface the failure as text instead of crashing.
            return f"Error processing PDFs: {str(e)}"

    def answer_question(self, question):
        """Answer ``question`` from the indexed PDFs and list the sources used.

        Args:
            question: The user's question text.

        Returns:
            A single markdown string on every path: the answer followed by a
            "Sources" section, or an error/instruction message.  (Returning one
            string consistently matters because the UI binds exactly one
            output component to this method.)
        """
        if not self.retriever:
            return "Please process PDFs first!"

        try:
            rag_chain = create_retrieval_chain(self.retriever, self.question_answer_chain)
            response = rag_chain.invoke({"input": question})

            sources = []
            for info in response["context"]:
                meta = info.metadata
                # Not every loader version provides ``page_label``; fall back
                # to ``page`` and finally to a placeholder instead of failing.
                page = meta.get('page_label', meta.get('page', 'unknown'))
                sources.append(
                    f"{info.page_content}<br>"
                    f"Source of Info: {meta.get('source', 'unknown')}<br>"
                    f"At Page No: {page}<br><br>"
                )

            return response["answer"] + "\n\n### Sources\n\n" + "".join(sources)

        except Exception as e:
            # UI boundary: surface the failure as text instead of crashing.
            return f"Error answering question: {str(e)}"
108
+
109
# Single module-level processor: its vector store is common to all users of
# the app (a known limitation of this alpha version).
processor = PDFQAProcessor()

with gr.Blocks(title="PDF QA Assistant") as demo:
    # --- Tab 1: upload PDFs and build the index ---
    with gr.Tab("Upload PDFs"):
        file_input = gr.Files(label="Upload PDFs", file_types=[".pdf"])
        process_btn = gr.Button("Process PDFs")
        status_output = gr.Textbox(label="Processing Status")

    # --- Tab 2: ask questions against the indexed PDFs ---
    with gr.Tab("Ask Questions"):
        question_input = gr.Textbox(label="Your Question")
        # Markdown (not Textbox) so the answer's headings/<br> tags render.
        answer_output = gr.Markdown(label="Answer")
        ask_btn = gr.Button("Ask Question")

    # Indexing workflow: uploaded files -> status message.
    process_btn.click(
        fn=processor.process_pdfs,
        inputs=file_input,
        outputs=status_output,
    )

    # QA workflow: question text -> rendered answer.
    ask_btn.click(
        fn=processor.answer_question,
        inputs=question_input,
        outputs=[answer_output],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.14.0
2
+ groq==0.15.0
3
+ huggingface-hub==0.27.1
4
+ langchain==0.3.15
5
+ langchain-community==0.3.15
6
+ langchain-core==0.3.31
7
+ langchain-experimental==0.3.4
8
+ langchain-google-genai==2.0.9
9
+ langchain-groq==0.2.3
10
+ langchain-text-splitters==0.3.5
11
+ nltk==3.9.1
12
+ python-dotenv==1.0.1
13
+ sentence-transformers==3.4.0
14
+ tokenizers==0.20.3
15
+ torch==2.5.1
16
+ transformers==4.46.3
17
+ unstructured==0.16.15
18
+ faiss-cpu