File size: 4,335 Bytes
ac04873
 
 
 
f1ba16e
ac04873
 
 
f1ba16e
ac04873
f1ba16e
ac04873
 
68f99dc
9105bd6
9e1463b
f1ba16e
9105bd6
f1ba16e
9105bd6
 
 
 
 
 
ac04873
 
f1ba16e
 
ac04873
 
 
f1ba16e
f32ba7f
ea1c14e
001d160
f1ba16e
f32ba7f
ac04873
633c443
f32ba7f
f1ba16e
84df10e
f1ba16e
f8c8ec1
 
f1ba16e
b98540b
 
ac04873
f8c8ec1
 
b98540b
 
ea1c14e
 
 
b98540b
 
 
ea1c14e
b177750
b98540b
 
 
 
ac04873
b98540b
95e2001
 
b98540b
95e2001
b98540b
ac04873
 
 
f1ba16e
ac04873
 
b98540b
ac04873
f1ba16e
f8c8ec1
 
 
f1ba16e
ac04873
 
f1ba16e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import gradio as gr
import asyncio
from langchain_core.prompts import PromptTemplate
from langchain_community.output_parsers.rail_parser import GuardrailsOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.chains.question_answering import load_qa_chain  # Import load_qa_chain

# Upper bound on how many document chunks are handed to the model as context.
_MAX_CONTEXT_CHUNKS = 30

# How many best-matching pages are reported next to the answer.
_TOP_PAGE_COUNT = 2

# Prompt that encourages precise, concise answers grounded in the context.
_PROMPT_TEMPLATE = """Answer the question precisely and concisely using the provided context. Avoid any additional commentary or system messages.
                          If the answer is not contained in the context, respond with "answer not available in context".
                          
                          Context:
                          {context}
                          
                          Question:
                          {question}
                          
                          Answer:
                        """


def _split_key_phrases(answer):
    """Return the lower-cased, non-empty sentences of *answer*."""
    # Empty phrases must be dropped: "" is a substring of every page and
    # would otherwise give every page a hit.
    return [part.strip().lower() for part in answer.split(". ") if part.strip()]


def _chunk_page_number(chunk, index):
    """Return the 1-based page number for *chunk*.

    Uses the zero-based ``metadata["page"]`` entry when it is an integer and
    falls back to the chunk's position in the list otherwise.
    """
    page = (getattr(chunk, "metadata", None) or {}).get("page")
    if isinstance(page, int):
        return page + 1
    return index + 1


def _rank_pages(chunks, key_phrases, limit):
    """Return up to *limit* page numbers ordered by key-phrase hits."""
    scores = {}
    for index, chunk in enumerate(chunks):
        text = chunk.page_content.lower()
        hits = sum(1 for phrase in key_phrases if phrase in text)
        if hits:
            # Several chunks can come from one page, so accumulate per page.
            page_number = _chunk_page_number(chunk, index)
            scores[page_number] = scores.get(page_number, 0) + hits
    # sorted() is stable, so pages with equal scores stay in document order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [page_number for page_number, _ in ranked[:limit]]


async def initialize(file_path: str, question: str) -> str:
    """Answer *question* from the PDF at *file_path* and cite likely pages.

    The PDF is loaded and split into chunks, the first
    ``_MAX_CONTEXT_CHUNKS`` chunks are sent to the Gemini chat model through
    a "stuff" question-answering chain, and the sentences of the answer are
    then matched against every chunk to find the most relevant pages.

    Args:
        file_path: Filesystem path of the PDF document.
        question: The question to ask about the document.

    Returns:
        A three-line string with the answer, the top relevant page link(s)
        and a link to the document, or an error message when *file_path*
        does not exist.
    """
    # Check the file first so no model client is built for a bad path.
    if not os.path.exists(file_path):
        return "Error: Unable to process the document. Please ensure the PDF file is valid."

    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    prompt = PromptTemplate(template=_PROMPT_TEMPLATE, input_variables=["context", "question"])

    chunks = PyPDFLoader(file_path).load_and_split()

    # The stuff chain fills {context} from `input_documents` itself, so the
    # size cap has to be applied to the documents, not to a separate string.
    stuff_chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    result = await stuff_chain.ainvoke(
        {"input_documents": chunks[:_MAX_CONTEXT_CHUNKS], "question": question}
    )
    answer = result.get('output_text', '').strip()

    top_pages = _rank_pages(chunks, _split_key_phrases(answer), _TOP_PAGE_COUNT)

    # Every link targets the document itself; the label carries the page
    # number so the reader can navigate manually.
    file_name = os.path.basename(file_path)
    document_url = f"file://{os.path.abspath(file_path)}"
    if top_pages:
        page_links_str = ', '.join(f"[Page {p}]({document_url})" for p in top_pages)
        source_str = f"Top relevant page(s): {page_links_str}"
    else:
        source_str = "Top relevant page(s): Not found in specific page"

    source_link = f"[Document: {file_name}]({document_url})"

    return f"Answer: {answer}\n{source_str}\n{source_link}"

# Gradio components: the PDF upload slot, the question box, and the
# textbox that shows the answer together with the cited pages.
input_file = gr.File(
    label="Upload PDF File",
)
input_question = gr.Textbox(
    label="Ask about the document",
)
output_text = gr.Textbox(
    label="Answer and Top Pages",
)

async def pdf_qa(file, question):
    """Gradio callback: answer *question* about the uploaded PDF *file*.

    Returns an error message when nothing was uploaded; otherwise returns
    whatever ``initialize`` produces for the upload's path (``file.name``).
    """
    if file is not None:
        return await initialize(file.name, question)

    return "Error: No file uploaded. Please upload a PDF document."

# Assemble the interface from the components above, then launch it.
# share=True asks Gradio for a public link.
demo = gr.Interface(
    fn=pdf_qa,
    inputs=[input_file, input_question],
    outputs=output_text,
    title="PDF Question Answering System",
    description="Upload a PDF file and ask questions about the content.",
)
demo.launch(share=True)