Spaces:

saritha
/

RAG_with_page_index_gemini

Sleeping

App Files Files Community

RAG_with_page_index_gemini / app.py

saritha

Update app.py

633c443 verified 11 months ago

raw

history blame

5.43 kB

	import os
	import gradio as gr
	import asyncio
	from datetime import datetime
	from langchain_core.prompts import PromptTemplate
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_google_genai import ChatGoogleGenerativeAI
	import google.generativeai as genai
	from langchain.chains.question_answering import load_qa_chain # Import load_qa_chain

	# Module-level conversation state, mutated by initialize().
	# context_history: running "Q: ... / A: ..." transcript of earlier exchanges.
	context_history: str = ""
	# chat_history: one dict per answered question (timestamp, question, answer,
	# source, document_link), read back by get_chat_history().
	chat_history: list = []

	async def initialize(file_path, question, max_context_pages=30):
	    """Answer ``question`` about the PDF at ``file_path`` and name the best-matching pages.

	    The PDF is loaded and split with ``PyPDFLoader``; the first
	    ``max_context_pages`` chunks, preceded by the transcript of earlier
	    exchanges, are stuffed into the prompt's ``{context}`` slot. The answer is
	    then matched sentence by sentence against those chunks to pick up to two
	    source pages.

	    Args:
	        file_path: Path of the PDF file on disk.
	        question: The user's question.
	        max_context_pages: How many leading chunks of the document are sent to
	            the model. ``None`` sends every chunk.

	    Returns:
	        A three-line string ("Answer: ...", the top page(s), a document link),
	        or an error message when ``file_path`` does not exist.

	    Side effects:
	        Appends a record to the module-level ``chat_history`` list and extends
	        the module-level ``context_history`` transcript.
	    """
	    global context_history

	    # Fail fast: no point configuring the model when there is nothing to read.
	    if not os.path.exists(file_path):
	        return "Error: Unable to process the document. Please ensure the PDF file is valid."

	    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
	    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)

	    # Refined prompt template to encourage precise and concise answers
	    prompt_template = """Answer the question precisely and concisely using the provided context. Avoid any additional commentary or system messages.
	If the answer is not contained in the context, respond with "answer not available in context".

	Context:
	{context}

	Question:
	{question}

	Answer:
	"""
	    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

	    pdf_loader = PyPDFLoader(file_path)
	    pages = pdf_loader.load_and_split()
	    context_pages = pages[:max_context_pages]

	    # The "stuff" chain fills {context} itself from ``input_documents`` and
	    # overrides any "context" key passed alongside, so both the page limit and
	    # the earlier conversation have to travel as documents.
	    context_docs = list(context_pages)
	    if context_history.strip():
	        from langchain_core.documents import Document

	        context_docs.insert(0, Document(page_content=context_history.strip()))

	    # Load the question-answering chain and get the answer from the model
	    stuff_chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
	    stuff_answer = await stuff_chain.ainvoke({"input_documents": context_docs, "question": question})
	    answer = stuff_answer.get('output_text', '').strip()

	    # Score only the chunks the model actually saw.
	    top_pages = _rank_pages(answer, context_pages)

	    # Generate links for each top page
	    file_name = os.path.basename(file_path)
	    absolute_path = os.path.abspath(file_path)
	    page_links = [f"[Page {p}](file://{absolute_path})" for p in top_pages]

	    if top_pages:
	        source_str = f"Top relevant page(s): {', '.join(page_links)}"
	    else:
	        source_str = "Top relevant page(s): Not found in specific page"

	    # Create a clickable link for the document
	    source_link = f"[Document: {file_name}](file://{absolute_path})"

	    # Save interaction to chat history
	    chat_history.append({
	        'timestamp': datetime.now().isoformat(),
	        'question': question,
	        'answer': answer,
	        'source': source_str,
	        'document_link': source_link,
	    })

	    # Update context history
	    context_history += f"\nQ: {question}\nA: {answer}"

	    return f"Answer: {answer}\n{source_str}\n{source_link}"


	def _rank_pages(answer, pages, limit=2):
	    """Return up to ``limit`` 1-based page numbers containing the most answer sentences."""
	    # Drop empty fragments: "" is a substring of every text and would make
	    # every page look relevant when the answer is empty.
	    phrases = {part.strip().lower() for part in answer.split(". ")}
	    phrases.discard("")

	    hits_by_page = {}
	    for position, doc in enumerate(pages):
	        text = doc.page_content.lower()
	        found = {phrase for phrase in phrases if phrase in text}
	        if not found:
	            continue
	        # NOTE(review): assumes the loader stores the zero-based PDF page index
	        # in metadata["page"]; falls back to the chunk position otherwise.
	        page_index = (getattr(doc, "metadata", None) or {}).get("page")
	        if not isinstance(page_index, int):
	            page_index = position
	        # A set per page so chunk overlap cannot count one sentence twice.
	        hits_by_page.setdefault(page_index + 1, set()).update(found)

	    ranked = sorted(hits_by_page, key=lambda number: (-len(hits_by_page[number]), number))
	    return ranked[:limit]

	# Gradio components for the question-answering form: a file upload, the
	# question box, and a fixed-height (10 line) box for the result.
	input_file = gr.File(label="Upload PDF File")
	input_question = gr.Textbox(label="Ask about the document")
	output_text = gr.Textbox(
	    label="Answer and Top Pages",
	    lines=10,
	    max_lines=10,
	)

	def get_chat_history():
	    """Render every recorded interaction as a single block of text.

	    Returns:
	        One paragraph per entry of the module-level ``chat_history`` list
	        (question, answer, page summary, document link, timestamp), joined
	        by newlines; an empty string when nothing has been recorded.
	    """
	    rendered = []
	    for record in chat_history:
	        rendered.append(
	            f"Q: {record['question']}\nA: {record['answer']}\n"
	            f"{record['source']}\n{record['document_link']}\n"
	            f"Timestamp: {record['timestamp']}\n"
	        )
	    return "\n".join(rendered)

	async def pdf_qa(file, question):
	    """Gradio handler: validate the inputs, then answer ``question`` about the upload.

	    Args:
	        file: The value produced by the file-upload component, or ``None``
	            when nothing was uploaded.
	        question: The text typed into the question box.

	    Returns:
	        The string produced by ``initialize`` for this file and question, or
	        an error message when the file or the question is missing.
	    """
	    if file is None:
	        return "Error: No file uploaded. Please upload a PDF document."
	    # A blank question would still trigger a full PDF load and a model call.
	    if not question or not question.strip():
	        return "Error: No question provided. Please enter a question about the document."

	    # Accept both an upload wrapper exposing ``.name`` and a plain path string.
	    file_path = getattr(file, "name", file)
	    return await initialize(file_path, question)

	# Create Gradio Interfaces
	qa_interface = gr.Interface(
	    fn=pdf_qa,
	    inputs=[input_file, input_question],
	    outputs=output_text,
	    title="PDF Question Answering System",
	    description="Upload a PDF file and ask questions about the content.",
	)

	history_interface = gr.Interface(
	    fn=get_chat_history,
	    inputs=[],
	    outputs=gr.Textbox(label="Chat History", lines=20, max_lines=20),
	    title="Chat History",
	    description="View the history of interactions.",
	)

	# Serve both interfaces from a single app. launch() blocks the main thread, so
	# launching them one after the other would leave the history view unreachable
	# for as long as the question-answering app is running.
	demo = gr.TabbedInterface(
	    [qa_interface, history_interface],
	    tab_names=["PDF Question Answering System", "Chat History"],
	)
	demo.launch(share=True)