# askdoctor / app.py
# Gradio RAG application: answers questions about a PDF by retrieving
# relevant chunks with FAISS and generating an answer with OpenAI.
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import openai
# SECURITY FIX: the previous revision hard-coded a live OpenAI secret key in
# source control (that key must be revoked). Read the key from the environment
# instead; set OPENAI_API_KEY in the host/Space configuration.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY")
def extract_text_from_pdf(pdf_path):
    """Extract text from every page of a PDF.

    Returns a list of dicts, one per page, each shaped as
    {"page": 1-based page number, "text": extracted page text}.
    """
    pages = []
    with open(pdf_path, "rb") as fh:
        reader = PyPDF2.PdfReader(fh)
        for page_no, page in enumerate(reader.pages, start=1):
            pages.append({"page": page_no, "text": page.extract_text()})
    return pages
def chunk_text(text_data, chunk_size=2000):
    """Split per-page text into sentence-aligned chunks of roughly chunk_size chars.

    text_data is the list produced by extract_text_from_pdf; the result is a
    list of {"chunk": text, "page": page number} dicts. Chunks never span pages.
    """
    out = []

    def _flush(buffer, page):
        # Emit the accumulated buffer (trailing ". " separator stripped).
        out.append({"chunk": buffer.strip(), "page": page})

    for entry in text_data:
        page = entry["page"]
        buffer = ""
        # Split naively on ". " so chunk boundaries fall between sentences.
        for sentence in entry["text"].split(". "):
            if len(buffer) + len(sentence) <= chunk_size:
                buffer += sentence + ". "
            else:
                _flush(buffer, page)
                buffer = sentence + ". "
        if buffer:
            _flush(buffer, page)
    return out
def create_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed each chunk's text and build a flat L2 FAISS index over the embeddings.

    Returns (index, chunks) so callers keep the chunk list aligned with the
    index's vector positions.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode([c["chunk"] for c in chunks])
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks
def retrieve_from_pdf(question, index, chunks, model_name="all-MiniLM-L6-v2"):
    """Retrieve the chunks most similar to the question from the FAISS index.

    Returns (retrieved_chunks, context) where context is the retrieved chunk
    texts joined into a single string for the LLM prompt.
    """
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([question])
    # BUG FIX: asking FAISS for more neighbours than there are indexed vectors
    # pads the result with index -1, which previously selected chunks[-1] by
    # accident. Clamp k and drop any negative padding indices defensively.
    k = min(10, index.ntotal)
    _, top_k_indices = index.search(query_embedding, k=k)
    retrieved_chunks = [chunks[idx] for idx in top_k_indices[0] if idx >= 0]
    # Debug: Print the retrieved chunks
    print("Retrieved Chunks:")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i + 1}: {chunk['chunk'][:200]}... (Page {chunk['page']})")  # Truncate to first 200 chars
    page_numbers = set(chunk["page"] for chunk in retrieved_chunks)
    print(f"Retrieved page numbers: {page_numbers}")  # Debug: Page numbers
    # NOTE: the original "filter by page_numbers" step was a no-op —
    # page_numbers is derived from retrieved_chunks, so every chunk passed.
    # All retrieved chunks are kept, preserving the original behavior.
    context = " ".join(chunk["chunk"] for chunk in retrieved_chunks)
    return retrieved_chunks, context
def gpt_generate_answer(question, context, pages):
    """Ask the OpenAI chat model for a formatted answer grounded in the context.

    Returns the (answer, relevant_text, relevant_pages) triple produced by
    parse_gpt_response.
    """
    pages_text = ", ".join(str(p) for p in set(pages))
    prompt = (
        f"Answer the following question as precisely and concisely as possible based on the provided context. "
        f"Also include the page numbers where the relevant text was found. Please respond in English:\n\n"
        f"Question: {question}\n\n"
        f"Context: {context}\n\n"
        f"Pages: {pages_text}\n\n"
        f"Please strictly follow this format:\n"
        f"- **Answer:** [Your answer]\n"
        f"- **Relevant Text:** [The most relevant portion of the context]\n"
        f"- **Pages:** [Pages of the Relevant Text]\n"
    )
    print("GPT Prompt:", prompt)  # Debug: prompt sent to OpenAI

    # Call the OpenAI chat-completion API with a single user message.
    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    print("GPT Raw Response:", response)  # Debug: full API response

    content = response['choices'][0]['message']['content']
    print("GPT Content:", content)  # Debug: message text only

    # Split the formatted reply into its three labelled sections.
    return parse_gpt_response(content)
def parse_gpt_response(content):
    """Parse GPT's formatted reply into (answer, relevant_text, relevant_pages).

    Expects the "- **Answer:** / - **Relevant Text:** / - **Pages:**" layout
    requested in the prompt; any section that cannot be found falls back to a
    "... not found." placeholder instead of raising.
    """

    def _section(text, marker, next_marker=None):
        # Return the stripped text between marker and next_marker (or the end
        # of text), or None when marker is absent.
        if marker not in text:
            return None
        tail = text.split(marker, 1)[1]
        if next_marker is not None and next_marker in tail:
            tail = tail.split(next_marker, 1)[0]
        return tail.strip()

    # BUG FIX: the original tested for the bare label ("Answer:" in content)
    # but then split on the markdown marker ("- **Answer:**"), raising
    # IndexError whenever GPT replied with plain labels. Test and split on the
    # same marker so malformed replies degrade to the placeholders instead.
    answer = _section(content, "- **Answer:**", "- **Relevant Text:**")
    relevant_text = _section(content, "- **Relevant Text:**", "- **Pages:**")
    relevant_pages = _section(content, "- **Pages:**")

    # Ensure missing information is handled
    if not answer:
        print("Warning: 'Answer' was not parsed correctly.")
        answer = "Answer not found."
    if not relevant_text:
        print("Warning: 'Relevant Text' was not parsed correctly.")
        relevant_text = "Relevant Text not found."
    if not relevant_pages:
        print("Warning: 'Pages' was not parsed correctly.")
        relevant_pages = "Pages not found."

    # Debug: Print parsed content
    print("Parsed Answer:", answer)
    print("Parsed Relevant Text:", relevant_text)
    print("Parsed Relevant Pages:", relevant_pages)
    return answer, relevant_text, relevant_pages
# Cache of pdf_path -> (faiss index, chunk list) so each PDF is parsed and
# embedded once per process instead of on every question.
_INDEX_CACHE = {}


def gradio_rag(question):
    """Gradio handler: retrieve PDF context for the question and generate an answer.

    Returns the (answer, relevant_text, relevant_pages) strings shown in the UI.
    """
    pdf_path = "norms_pacing.pdf"
    # PERF FIX: the original re-read, re-chunked and re-embedded the whole PDF
    # on every single question. Build the index lazily and reuse it.
    if pdf_path not in _INDEX_CACHE:
        text_data = extract_text_from_pdf(pdf_path)
        chunks = chunk_text(text_data)
        _INDEX_CACHE[pdf_path] = create_faiss_index(chunks)
    index, chunk_list = _INDEX_CACHE[pdf_path]

    retrieved_chunks, context = retrieve_from_pdf(question, index, chunk_list)
    if not context.strip():
        answer = "No relevant information found."
        relevant_text = "No relevant text found."
        relevant_pages = "No pages found."
    else:
        pages = [chunk["page"] for chunk in retrieved_chunks]  # Extract relevant pages
        answer, relevant_text, relevant_pages = gpt_generate_answer(question, context, pages)

    print("Final Answer:", answer)  # Debug
    print("Final Relevant Text:", relevant_text)  # Debug
    print("Final Relevant Pages:", relevant_pages)  # Debug
    return answer, relevant_text, relevant_pages
# Wire the RAG pipeline into a simple Gradio web UI: one question box in,
# three labelled text boxes out.
_output_boxes = [
    gr.Textbox(label="Answer"),
    gr.Textbox(label="Relevant Retrieved Text"),
    gr.Textbox(label="Pages Retrieved"),
]
interface = gr.Interface(
    fn=gradio_rag,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=_output_boxes,
    title="RAG PDF Q&A with GPT",
    description="Ask a question, and the system retrieves relevant information from a PDF file and generates a refined answer using GPT.",
)

# Launch the web app only when executed as a script (not on import).
if __name__ == "__main__":
    interface.launch(share=True)