Create app.py
app.py
ADDED
@@ -0,0 +1,243 @@
import os

import faiss
import gradio as gr
import numpy as np
import requests

from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

################################################################################
# 1. PDF Parsing and Chunking
################################################################################

def extract_pdf_text(pdf_file) -> str:
    """
    Extracts text from each page of the uploaded PDF, then concatenates them.
    """
    reader = PdfReader(pdf_file)
    all_text = []
    for page in reader.pages:
        text = page.extract_text() or ""
        all_text.append(text.strip())
    return "\n".join(all_text)

def chunk_text(text, chunk_size=300, overlap=50):
    """
    Splits text into overlapping chunks of roughly chunk_size words.
    overlap indicates how many words from the previous chunk are included
    again, so sentences straddling a chunk boundary are not lost to retrieval.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = words[start:end]
        chunks.append(" ".join(chunk))
        start += (chunk_size - overlap)
    return chunks

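# Illustrative example of the windowing (hypothetical 8-word input): with
# chunk_size=5 and overlap=2 the start index advances by 3 words per chunk:
#
#   chunk_text("w1 w2 w3 w4 w5 w6 w7 w8", chunk_size=5, overlap=2)
#   -> ["w1 w2 w3 w4 w5", "w4 w5 w6 w7 w8", "w7 w8"]
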
################################################################################
# 2. Embedding Model
################################################################################

# Use a SentenceTransformer from Hugging Face to embed text
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

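# Note: all-MiniLM-L6-v2 encodes each text into a 384-dimensional vector, so
# the FAISS index built below ends up with dimension 384.
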
################################################################################
# 3. Building the FAISS Index
################################################################################

def build_faiss_index(chunks):
    """
    Creates a FAISS index from the text chunks. Returns (index, chunk_embeddings).
    """
    chunk_embeddings = embedding_model.encode(chunks, show_progress_bar=False)
    chunk_embeddings = np.array(chunk_embeddings, dtype='float32')
    dimension = chunk_embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)  # exact search with L2 distance
    index.add(chunk_embeddings)
    return index, chunk_embeddings

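# A minimal sketch of an alternative index (not wired into the app): sentence
# embeddings are often compared by cosine similarity, which in FAISS amounts
# to L2-normalizing the vectors and using an inner-product index instead.
def build_faiss_index_cosine(chunks):
    embs = np.array(embedding_model.encode(chunks, show_progress_bar=False),
                    dtype='float32')
    faiss.normalize_L2(embs)                  # in-place normalization to unit length
    index = faiss.IndexFlatIP(embs.shape[1])  # inner product == cosine on unit vectors
    index.add(embs)
    return index, embs
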
################################################################################
# 4. Retrieval Function
################################################################################

def retrieve_chunks(query, index, chunks, top_k=3):
    """
    Embeds the user query and retrieves the top_k most relevant chunks via FAISS.
    """
    query_embedding = embedding_model.encode([query], show_progress_bar=False)
    query_embedding = np.array(query_embedding, dtype='float32')

    top_k = min(top_k, len(chunks))  # never ask FAISS for more vectors than exist
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads missing results with -1, so keep only valid row ids
    relevant_chunks = [chunks[i] for i in indices[0] if i >= 0]
    return relevant_chunks

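# For reference, index.search returns two arrays of shape (n_queries, top_k);
# the values below are illustrative only:
#
#   distances -> [[0.42, 0.57, 0.61]]   # squared L2 distances, ascending
#   indices   -> [[12, 3, 27]]          # row ids of the matching chunks
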
################################################################################
# 5. Gemini LLM Integration (Parsing 'candidates')
################################################################################

def gemini_generate(prompt):
    """
    Calls Google's Gemini API using the environment variable GEMINI_API_KEY.
    Assumes the 'generateContent' endpoint returns text under:
        r_data["candidates"][0]["content"]["parts"][0]["text"]
    """
    gemini_api_key = os.environ.get("GEMINI_API_KEY", "")
    if not gemini_api_key:
        return "Error: No GEMINI_API_KEY found in environment variables."

    url = (
        "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent"
        f"?key={gemini_api_key}"
    )

    data = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }
    headers = {"Content-Type": "application/json"}

    # A timeout keeps a stalled request from hanging the whole app
    response = requests.post(url, headers=headers, json=data, timeout=60)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}"

    r_data = response.json()
    try:
        return r_data["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        return f"Parsing error or unexpected response structure: {r_data}"

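# For reference, a successful generateContent response carries the text at the
# path parsed above; roughly (other fields omitted, values illustrative):
#
#   {"candidates": [{"content": {"parts": [{"text": "...answer..."}]}}]}
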
################################################################################
# 6. RAG QA Function
################################################################################

def answer_question_with_RAG(user_question, index, chunks):
    """
    Retrieves relevant chunks, builds an augmented prompt, and calls gemini_generate.
    """
    relevant_chunks = retrieve_chunks(user_question, index, chunks, top_k=3)
    context = "\n\n".join(relevant_chunks)

    prompt = f"""
You are an AI assistant that knows the details from the uploaded research paper.
Answer the user's question accurately using the context below.
If something is not in the context, say you don't know.

Context:
{context}

User's question: {user_question}

Answer:
"""
    return gemini_generate(prompt)

################################################################################
# 7. Gradio Interface
################################################################################

def process_pdf(pdf_file):
    """
    Called after the user uploads a PDF and clicks 'Process PDF'.
    Extracts text, chunks it, builds the FAISS index, and returns the new state.
    """
    if pdf_file is None:
        return None, "Please upload a PDF file."

    # Depending on the Gradio version, gr.File yields a tempfile-like object
    # (with a .name path) or a plain filepath string; handle both
    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
    text = extract_pdf_text(pdf_path)
    if not text:
        return None, "No text found in PDF."

    chunks = chunk_text(text, chunk_size=300, overlap=50)
    if not chunks:
        return None, "No valid text to chunk."

    faiss_index, _ = build_faiss_index(chunks)
    return (faiss_index, chunks), "PDF processed successfully!"

def chat_with_paper(query, state):
    """
    Handles user queries after the PDF is processed.
    'state' is a tuple: (faiss_index, doc_chunks).
    """
    if not state:
        return "Please upload and process a PDF first."

    faiss_index, doc_chunks = state
    if not query or not query.strip():
        return "Please enter a valid question."

    return answer_question_with_RAG(query, faiss_index, doc_chunks)

################################################################################
# 8. Gradio App with Sky-Blue Tiles
################################################################################

demo_theme = gr.themes.Soft(primary_hue="slate")

css_code = """
/* Tiled sky-blue background */
body {
    background: url('https://i.ibb.co/gvrZQ1C/sky-blue-tile.png');
    background-repeat: repeat;
    background-size: 150px 150px;
}
/* Centered headings */
#title-heading {
    text-align: center;
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 10px;
}
#welcome-text {
    text-align: center;
    font-size: 1.2rem;
    color: #444;
    margin-bottom: 25px;
    margin-top: 0.5rem;
}
"""

with gr.Blocks(theme=demo_theme, css=css_code) as demo:
    gr.Markdown("<div id='title-heading'>AI-Powered Personalized Research Assistant</div>")
    gr.Markdown("<div id='welcome-text'>Welcome! How may I help you?</div>")

    # State to store (faiss_index, chunks)
    state = gr.State()

    with gr.Row():
        pdf_input = gr.File(label="Upload your research paper (PDF)", file_types=[".pdf"])
        process_button = gr.Button("Process PDF")
        status_output = gr.Textbox(label="Status", interactive=False)

    # When the user clicks "Process PDF", parse the file and build the index
    process_button.click(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=[state, status_output]
    )

    with gr.Row():
        user_query = gr.Textbox(label="Ask a question about your research paper:")
        ask_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer")

    # When the user clicks "Get Answer", run the RAG pipeline on the query
    ask_button.click(
        fn=chat_with_paper,
        inputs=[user_query, state],
        outputs=answer_output
    )

demo.launch()
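# Note: on a hosted Space the bare launch() above is all that is needed; for a
# local run, demo.launch(share=True) would also create a temporary public URL.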