|
import os |
|
import faiss |
|
import gradio as gr |
|
import numpy as np |
|
import requests |
|
|
|
from pypdf import PdfReader |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_file) -> str:
    """Return the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_file: Value handed straight to ``PdfReader`` (callers in this
            file pass a filesystem path).

    Returns:
        The stripped text of each page, joined with ``"\\n"``. A page whose
        extraction result is falsy contributes an empty string.
    """
    pages = PdfReader(pdf_file).pages
    # A falsy extraction result is normalised to "" before stripping.
    return "\n".join((page.extract_text() or "").strip() for page in pages)
|
|
|
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping windows of whitespace-separated words.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a single-space-joined run of words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero or less would never advance the window, and a
        # negative overlap would silently skip words between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already reached the last word; another window
            # would only repeat words from the overlap region.
            break
    return chunks
|
|
|
|
|
|
|
|
|
# Sentence-embedding model created once at import time; build_faiss_index and
# retrieve_chunks both encode text through this same instance.
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)
|
|
|
|
|
|
|
|
|
def build_faiss_index(chunks):
    """Embed the chunks and load them into a flat L2 FAISS index.

    Args:
        chunks: Text chunks to encode with the module-level embedding model.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding array that was added to it.
    """
    vectors = np.array(
        embedding_model.encode(chunks, show_progress_bar=False),
        dtype="float32",
    )
    # The index dimensionality is read from the embedding array's second axis.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index, vectors
|
|
|
|
|
|
|
|
|
def retrieve_chunks(query, index, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question text; encoded with the module-level embedding model.
        index: Index object searched with the query embedding. The ids it
            returns are used as positions into ``chunks``.
        chunks: Text chunks, in the order their embeddings were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, in the order the index returned them. Returns
        an empty list when there are no chunks or ``top_k`` is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with the label -1, which would index the *last* chunk.
    k = min(top_k, len(chunks))

    query_embedding = np.array(
        embedding_model.encode([query], show_progress_bar=False),
        dtype="float32",
    )
    _, neighbor_ids = index.search(query_embedding, k)

    # Guard against any padding/out-of-range id that still slips through.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
|
|
|
|
|
|
|
|
|
def gemini_generate(prompt, *, model="gemini-1.5-flash", timeout=60):
    """Send a prompt to the Gemini ``generateContent`` REST endpoint.

    The API key is read from the ``GEMINI_API_KEY`` environment variable.

    Args:
        prompt: Text sent as the single content part of the request.
        model: Model identifier placed in the endpoint path.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The text of the first candidate's first part on success. On any
        failure (missing key, network error, non-200 status, unexpected
        payload) a human-readable error string is returned instead of
        raising.
    """
    gemini_api_key = os.environ.get("GEMINI_API_KEY", "")
    if not gemini_api_key:
        return "Error: No GEMINI_API_KEY found in environment variables."

    url = (
        "https://generativelanguage.googleapis.com/"
        f"v1beta/models/{model}:generateContent"
    )
    headers = {
        "Content-Type": "application/json",
        # Sent as a header rather than a query parameter so the key is not
        # part of the URL that request-exception messages may echo.
        "x-goog-api-key": gemini_api_key,
    }
    data = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=timeout
        )
    except requests.RequestException as exc:
        return f"Error: request to Gemini API failed: {exc}"

    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}"

    try:
        r_data = response.json()
    except ValueError:
        # Body was not valid JSON despite the 200 status.
        return (
            "Parsing error or unexpected response structure: "
            f"{response.text}"
        )

    try:
        return r_data["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        return f"Parsing error or unexpected response structure: {r_data}"
|
|
|
|
|
|
|
|
|
def answer_question_with_RAG(user_question, index, chunks, top_k=3):
    """Answer a question from retrieved document chunks.

    Retrieves the chunks most similar to the question, places them in a
    prompt as context, and returns whatever ``gemini_generate`` returns for
    that prompt.

    Args:
        user_question: The question text entered by the user.
        index: Index passed through to ``retrieve_chunks``.
        chunks: Text chunks passed through to ``retrieve_chunks``.
        top_k: Number of chunks to retrieve as context (previously fixed
            at 3 inside the function).

    Returns:
        The string produced by ``gemini_generate`` (model answer or an
        error message).
    """
    relevant_chunks = retrieve_chunks(user_question, index, chunks, top_k=top_k)
    context = "\n\n".join(relevant_chunks)
    prompt = f"""
You are an AI assistant that knows the details from the uploaded research paper.
Answer the user's question accurately using the context below.
If something is not in the context, say you don't know.

Context:
{context}

User's question: {user_question}

Answer:
"""
    return gemini_generate(prompt)
|
|
|
|
|
|
|
|
|
def process_pdf(pdf_file):
    """Turn an uploaded PDF into a searchable index plus a status message.

    Args:
        pdf_file: The upload value from the file component, or ``None`` when
            nothing was uploaded. Either a path string or an object exposing
            the path as ``.name`` is accepted.

    Returns:
        A ``(state, status)`` tuple. ``state`` is ``(faiss_index, chunks)``
        on success and ``None`` on failure; ``status`` is a message for the
        user.
    """
    if pdf_file is None:
        return None, "Please upload a PDF file."

    # Accept both a plain path and a wrapper object carrying ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        text = extract_pdf_text(pdf_path)
    except Exception as exc:
        # UI boundary: report unreadable/corrupt files in the status box
        # instead of letting the handler crash.
        return None, f"Could not read PDF: {exc}"

    # Pages without text still contribute newline separators, so test for
    # visible content rather than plain truthiness.
    if not text.strip():
        return None, "No text found in PDF."

    chunks = chunk_text(text, chunk_size=300, overlap=50)
    if not chunks:
        return None, "No valid text to chunk."

    faiss_index, _ = build_faiss_index(chunks)
    return (faiss_index, chunks), "PDF processed successfully!"
|
|
|
def chat_with_paper(query, state):
    """Answer a user question against the previously processed PDF.

    Args:
        query: The question typed by the user.
        state: The ``(faiss_index, chunks)`` pair stored after processing a
            PDF, or a falsy value when no PDF has been processed yet.

    Returns:
        A prompt to process a PDF or enter a question when input is missing;
        otherwise the result of ``answer_question_with_RAG``.
    """
    if not state:
        return "Please upload and process a PDF first."

    # State is unpacked before the query is validated, as the checks are
    # ordered state-first.
    index, chunks = state

    has_question = bool(query) and bool(query.strip())
    if not has_question:
        return "Please enter a valid question."

    return answer_question_with_RAG(query, index, chunks)
|
|
|
# Custom stylesheet handed to gr.Blocks: page background, the bordered
# central panel, header elements (#icon-container, #app-title, #app-welcome),
# green buttons, and centred text inputs.
# NOTE(review): whitespace inside this literal was reconstructed; CSS does
# not depend on it, but confirm against the original file if exact bytes matter.
css_code = """
body {
    background-color: #E6F7FF !important; /* Lightest blue */
    margin: 0;
    padding: 0;
}

.block > .inside {
    margin: auto !important;
    max-width: 900px !important;
    border: 4px solid black !important;
    border-radius: 10px !important;
    background-color: #FFFFFF !important;
    padding: 20px !important;
}

#icon-container {
    text-align: center !important;
    margin-top: 1rem !important;
    margin-bottom: 1rem !important;
}

#app-title {
    text-align: center !important;
    font-size: 3rem !important;
    font-weight: 900 !important;
    margin-bottom: 0.5rem !important;
    margin-top: 0.5rem !important;
}

#app-welcome {
    text-align: center !important;
    font-size: 1.5rem !important;
    color: #444 !important;
    margin-bottom: 25px !important;
    font-weight: 700 !important;
}

button {
    background-color: #3CB371 !important;
    color: #ffffff !important;
    border: none !important;
    font-weight: 600 !important;
    cursor: pointer;
}

button:hover {
    background-color: #2E8B57 !important;
}

textarea, input[type="text"] {
    text-align: center !important;
}
"""

# Built-in "Soft" theme with a slate primary hue; passed to gr.Blocks.
demo_theme = gr.themes.Soft(primary_hue="slate")
|
|
|
# Header icon markup, kept flush-left so the Markdown renderer is never handed
# indented HTML.
_ICON_HTML = """
<div id="icon-container">
<img src="https://i.ibb.co/3Wp3yBZ/ai-icon.png" alt="AI icon" style="width:100px;">
</div>
"""

with gr.Blocks(theme=demo_theme, css=css_code) as demo:
    # Header: icon, title and welcome line (styled by id in css_code).
    gr.Markdown(_ICON_HTML)
    gr.Markdown("<div id='app-title'>AI-Powered Personal Research Assistant</div>")
    gr.Markdown("<div id='app-welcome'>Welcome! How may I help you?</div>")

    # Carries the (faiss_index, chunks) pair from process_pdf to chat_with_paper.
    state = gr.State()

    # Upload section.
    with gr.Row():
        pdf_input = gr.File(
            label="Upload your research paper (PDF)",
            file_types=[".pdf"],
        )
        process_button = gr.Button("Process PDF")

    status_output = gr.Textbox(label="Status", interactive=False)

    # Question section.
    # NOTE(review): row membership was reconstructed from statement grouping;
    # confirm whether the answer box belongs inside or below this row.
    with gr.Row():
        user_query = gr.Textbox(label="Ask a question about your research paper:")
        ask_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer")

    # Event wiring, registered in the same order as before: processing first,
    # then question answering.
    process_button.click(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=[state, status_output],
    )
    ask_button.click(
        fn=chat_with_paper,
        inputs=[user_query, state],
        outputs=answer_output,
    )

# Start the Gradio server when this file is executed.
demo.launch()
|
|