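# RAG-style PDF Q&A demo: extract text from a PDF, chunk it, embed the chunks with a
# SentenceTransformer model, index them in FAISS, and answer questions through a Gradio UI
# by feeding the retrieved context to an OpenAI chat model.
# Assumed dependencies: PyPDF2, sentence-transformers, faiss-cpu (or another FAISS build),
# gradio, and openai<1.0 (the legacy openai.ChatCompletion interface used below).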
import os

import PyPDF2
import faiss
import gradio as gr
import openai
from sentence_transformers import SentenceTransformer

# Read the OpenAI key from the environment instead of hard-coding a secret in the source.
openai.api_key = os.getenv("OPENAI_API_KEY")

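# Pull the raw text out of every PDF page, keeping the 1-based page number for later citation.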
def extract_text_from_pdf(pdf_path):
    text = []
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for i, page in enumerate(reader.pages):
            # extract_text() can return None for image-only pages; fall back to an empty string.
            page_text = page.extract_text() or ""
            text.append({"page": i + 1, "text": page_text})
    return text

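# Split each page's text into chunks of roughly chunk_size characters along sentence
# boundaries, keeping the page number with each chunk.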
def chunk_text(text_data, chunk_size=2000):
    chunks = []
    for data in text_data:
        page_text = data["text"]
        page_num = data["page"]
        sentences = page_text.split(". ")
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= chunk_size:
                current_chunk += sentence + ". "
            else:
                chunks.append({"chunk": current_chunk.strip(), "page": page_num})
                current_chunk = sentence + ". "
        if current_chunk:
            chunks.append({"chunk": current_chunk.strip(), "page": page_num})
    return chunks

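# Embed every chunk and store the vectors in an exact (brute-force) L2 FAISS index.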
def create_faiss_index(chunks, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    embeddings = model.encode([chunk["chunk"] for chunk in chunks])
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, chunks

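# Embed the question, pull the nearest chunks from the FAISS index, and join them into a
# single context string.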
def retrieve_from_pdf(question, index, chunks, model_name="all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([question])
    _, top_k_indices = index.search(query_embedding, k=10)

    # FAISS pads the result with -1 when fewer than k vectors are indexed, so skip those slots.
    retrieved_chunks = [chunks[idx] for idx in top_k_indices[0] if idx != -1]

    print("Retrieved Chunks:")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"Chunk {i + 1}: {chunk['chunk'][:200]}... (Page {chunk['page']})")

    page_numbers = set(chunk["page"] for chunk in retrieved_chunks)
    print(f"Retrieved page numbers: {page_numbers}")

    context = " ".join(chunk["chunk"] for chunk in retrieved_chunks)

    return retrieved_chunks, context

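# Ask the chat model to answer strictly from the retrieved context, in the fixed bullet
# format that parse_gpt_response expects.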
def gpt_generate_answer(question, context, pages):
    pages_text = ", ".join(map(str, sorted(set(pages))))

    prompt = (
        f"Answer the following question as precisely and concisely as possible based on the provided context. "
        f"Also include the page numbers where the relevant text was found. Please respond in English:\n\n"
        f"Question: {question}\n\n"
        f"Context: {context}\n\n"
        f"Pages: {pages_text}\n\n"
        f"Please strictly follow this format:\n"
        f"- **Answer:** [Your answer]\n"
        f"- **Relevant Text:** [The most relevant portion of the context]\n"
        f"- **Pages:** [Pages of the Relevant Text]\n"
    )

    print("GPT Prompt:", prompt)

    # Uses the legacy openai<1.0 ChatCompletion interface; openai>=1.0 requires the
    # newer client-based API instead.
    response = openai.ChatCompletion.create(
        model="o1-mini",
        messages=[{"role": "user", "content": prompt}]
    )

    print("GPT Raw Response:", response)

    content = response['choices'][0]['message']['content']
    print("GPT Content:", content)

    return parse_gpt_response(content)

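# Pull the three labelled sections out of the model's formatted reply, falling back to
# placeholder text when a section is missing.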
def parse_gpt_response(content):
    answer, relevant_text, relevant_pages = None, None, None

    # Check for the same markers that are used for splitting, so a missing section falls
    # through to the fallback below instead of raising an IndexError.
    if "- **Answer:**" in content:
        answer = content.split("- **Answer:**")[1].split("- **Relevant Text:**")[0].strip()
    if "- **Relevant Text:**" in content:
        relevant_text = content.split("- **Relevant Text:**")[1].split("- **Pages:**")[0].strip()
    if "- **Pages:**" in content:
        relevant_pages = content.split("- **Pages:**")[1].strip()

    if not answer:
        print("Warning: 'Answer' was not parsed correctly.")
        answer = "Answer not found."
    if not relevant_text:
        print("Warning: 'Relevant Text' was not parsed correctly.")
        relevant_text = "Relevant Text not found."
    if not relevant_pages:
        print("Warning: 'Pages' was not parsed correctly.")
        relevant_pages = "Pages not found."

    print("Parsed Answer:", answer)
    print("Parsed Relevant Text:", relevant_text)
    print("Parsed Relevant Pages:", relevant_pages)

    return answer, relevant_text, relevant_pages

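# End-to-end handler for one Gradio question: ingest the PDF, retrieve context, and
# generate a formatted answer.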
def gradio_rag(question):
    # NOTE: the PDF is re-read and re-embedded on every question; for anything beyond a demo,
    # build the index once at startup and reuse it across queries.
    pdf_path = "norms_pacing.pdf"
    text_data = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text_data)
    index, chunk_list = create_faiss_index(chunks)

    retrieved_chunks, context = retrieve_from_pdf(question, index, chunk_list)

    if not context.strip():
        answer = "No relevant information found."
        relevant_text = "No relevant text found."
        relevant_pages = "No pages found."
    else:
        pages = [chunk["page"] for chunk in retrieved_chunks]
        answer, relevant_text, relevant_pages = gpt_generate_answer(question, context, pages)

    print("Final Answer:", answer)
    print("Final Relevant Text:", relevant_text)
    print("Final Relevant Pages:", relevant_pages)

    return answer, relevant_text, relevant_pages

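# Simple Gradio UI: one question box in, three text boxes out (answer, supporting text,
# and source pages).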
interface = gr.Interface(
    fn=gradio_rag,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Relevant Retrieved Text"),
        gr.Textbox(label="Pages Retrieved"),
    ],
    title="RAG PDF Q&A with GPT",
    description="Ask a question, and the system retrieves relevant information from a PDF file and generates a refined answer using GPT.",
)

if __name__ == "__main__":
    interface.launch(share=True)
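# To run the demo locally (assuming OPENAI_API_KEY is exported and norms_pacing.pdf sits
# next to this script; the filename below is only a placeholder for wherever this file is saved):
#   python rag_pdf_qa.py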