import os
import logging

import gradio as gr
from PyPDF2 import PdfReader

# NOTE: these import paths target pre-0.1 LangChain; newer releases moved most
# of them to the langchain_community and langchain_openai packages.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ResponseStructureSelector:
    """Asks the LLM to choose one of five response formats for a query."""

    def __init__(self, llm):
        self.llm = llm
        self.structure_prompt = PromptTemplate(
            input_variables=['context', 'query'],
            template="""Analyze the context and query to determine the most appropriate response structure:
Context: {context}
Query: {query}

Select the optimal response format:
1. Markdown with bullet points and headlines
2. Concise paragraph with key insights
3. Numbered list with detailed explanations
4. Technical breakdown with subheadings
5. Quick summary with critical points

Choose the number (1-5) of the most suitable format:"""
        )
        self.structure_chain = LLMChain(llm=self.llm, prompt=self.structure_prompt)

    def select_structure(self, context, query):
        try:
            choice = int(self.structure_chain.run({'context': context, 'query': query}).strip())
            # Guard against out-of-range model output.
            return choice if 1 <= choice <= 5 else 1
        except Exception as e:
            logger.error(f"Structure selection error: {e}")
            return 1


class QueryRefiner:
    """Rewrites a user query into a sharper, retrieval-friendly one."""

    def __init__(self, llm):
        self.refinement_llm = llm
        self.refinement_prompt = PromptTemplate(
            input_variables=['query', 'context'],
            template="""Refine query for clarity and precision:
Original Query: {query}
Document Context: {context}
Refined, Focused Query:"""
        )
        self.refinement_chain = LLMChain(llm=self.refinement_llm, prompt=self.refinement_prompt)

    def refine_query(self, original_query, context_hints=''):
        try:
            return self.refinement_chain.run({
                'query': original_query,
                'context': context_hints or "General document"
            }).strip()
        except Exception as e:
            logger.error(f"Query refinement error: {e}")
            return original_query

class AdvancedPdfChatbot:
    """Conversational retrieval over a single PDF with query refinement."""

    def __init__(self, openai_api_key):
        if not openai_api_key:
            # os.environ rejects None values, so fail with a clear message instead.
            raise ValueError("An OpenAI API key is required.")
        os.environ["OPENAI_API_KEY"] = openai_api_key
        self.llm = ChatOpenAI(temperature=0, model_name='gpt-4o', max_tokens=1000)

        self.embeddings = OpenAIEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.query_refiner = QueryRefiner(self.llm)
        self.response_selector = ResponseStructureSelector(self.llm)

        self.db = None
        self.chain = None
        self.document_metadata = {}

    def _create_response_prompt(self, structure_choice):
        """Return the prompt template matching a ResponseStructureSelector choice."""
        structure_templates = {
            1: """Markdown Response with Structured Insights:
## {title}
### Key Highlights
{content}
### Conclusion
{conclusion}""",
            2: """{title}: {content}. Key Takeaway: {conclusion}""",
            3: """Structured Breakdown:
1. {title}
   - Main Point: {content}
2. Implications
   - {conclusion}""",
            4: """Technical Analysis
## {title}
### Core Concept
{content}
### Technical Implications
{conclusion}""",
            5: """Concise Summary: {title}. Key Points: {content}. Conclusion: {conclusion}."""
        }
        return PromptTemplate(
            template=structure_templates.get(structure_choice, structure_templates[1]),
            input_variables=["title", "content", "conclusion"]
        )
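
    # Illustrative only: given a chatbot instance, the selected template can be
    # rendered with format(); the instance name and values below are hypothetical.
    #   prompt = chatbot._create_response_prompt(2)
    #   text = prompt.format(title="Overview", content="...", conclusion="...")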

    def load_and_process_pdf(self, pdf_path):
        try:
            # Pull title/author for context hints; metadata may be absent entirely.
            reader = PdfReader(pdf_path)
            metadata = reader.metadata or {}
            self.document_metadata = {
                "title": metadata.get("/Title", "Untitled Document"),
                "author": metadata.get("/Author", "Unknown")
            }

            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            texts = self.text_splitter.split_documents(documents)

            # Index only the first 30 chunks to bound embedding cost and latency.
            self.db = FAISS.from_documents(texts[:30], self.embeddings)

            self.chain = ConversationalRetrievalChain.from_llm(
                llm=self.llm,
                retriever=self.db.as_retriever(search_kwargs={"k": 3}),
                memory=self.memory
            )
            return True
        except Exception as e:
            logger.error(f"PDF processing error: {e}")
            return False

    def chat(self, query):
        if not self.chain:
            return "Upload a PDF first."

        context = f"Document: {self.document_metadata.get('title', 'Unknown')}"
        refined_query = self.query_refiner.refine_query(query, context)

        # The selected structure is logged but not yet applied: the chain
        # answers with its default prompt. _create_response_prompt holds the
        # matching templates if post-formatting is wired in later.
        structure_choice = self.response_selector.select_structure(context, refined_query)
        logger.info(f"Selected response structure: {structure_choice}")

        result = self.chain({"question": refined_query})
        return result['answer']
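
    def clear_memory(self):
        # Used by clear_chatbot() below; ConversationBufferMemory.clear()
        # drops the stored chat history so a new conversation starts fresh.
        self.memory.clear()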

pdf_chatbot = AdvancedPdfChatbot(os.environ.get("OPENAI_API_KEY"))


def upload_pdf(pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file."
    file_path = pdf_file.name if hasattr(pdf_file, 'name') else pdf_file
    # load_and_process_pdf catches its own exceptions and returns a boolean,
    # so check the return value rather than wrapping it in try/except.
    if pdf_chatbot.load_and_process_pdf(file_path):
        return f"PDF processed successfully: {file_path}"
    return "Error processing PDF; see the log for details."


def respond(message, history):
    if not message:
        return "", history
    try:
        bot_message = pdf_chatbot.chat(message)
        history.append((message, bot_message))
        return "", history
    except Exception as e:
        logger.error(f"Chat response error: {e}")
        return f"Error: {str(e)}", history


def clear_chatbot():
    pdf_chatbot.clear_memory()
    return []

with gr.Blocks() as demo:
    gr.Markdown("# Advanced PDF Chatbot")
    with gr.Row():
        pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("Process PDF")

    upload_status = gr.Textbox(label="Upload Status")
    upload_button.click(upload_pdf, inputs=[pdf_upload], outputs=[upload_status])
    chatbot_interface = gr.Chatbot()
    msg = gr.Textbox(placeholder="Enter your query...")
    msg.submit(respond, inputs=[msg, chatbot_interface], outputs=[msg, chatbot_interface])
    clear_button = gr.Button("Clear Conversation")
    clear_button.click(clear_chatbot, outputs=[chatbot_interface])


if __name__ == "__main__":
    demo.launch()