Spaces:

Sunbird
/

acres

Sleeping

App Files Files Community

ak3ra committed on Nov 7, 2024

Commit

7bb0003

1 Parent(s): 5674d87

Refactor chat_response function to include PDF preview generation

Browse files

Files changed (5) hide show

app.py +41 -13
interface.py +1 -0
rag/rag_pipeline.py +20 -7
study_files.json +0 -1
utils/pdf_processor.py +1 -0

app.py CHANGED Viewed

@@ -274,15 +274,36 @@ def process_pdf_uploads(files: List[gr.File], collection_name: str) -> str:
 def chat_response(
-    message: str, history: List[Tuple[str, str]], study_name: str
-) -> Tuple[List[Tuple[str, str]], str]:
     """Generate chat response and update history."""
     if not message.strip():
-        return history, None
-    response = chat_function(message, study_name, "Default")
     history.append((message, response))
-    return history, None
 def create_gr_interface() -> gr.Blocks:
@@ -390,6 +411,9 @@ def create_gr_interface() -> gr.Blocks:
                             upload_btn = gr.Button("Process PDFs", variant="primary")
                         pdf_status = gr.Markdown()
                         current_collection = gr.State(value=None)
         # Event handlers for Study Analysis tab
         process_zotero_btn.click(
             process_zotero_library_items,
@@ -433,24 +457,28 @@ def create_gr_interface() -> gr.Blocks:
             if not message.strip():
                 raise gr.Error("Please enter a message")
             history = history + [(message, None)]
-            return history, ""
         def generate_chat_response(history, collection_id):
             """Generate response for the last message in history."""
             if not collection_id:
                 raise gr.Error("Please upload PDFs first")
             if len(history) == 0:
-                return history
             last_message = history[-1][0]
             try:
-                response = chat_function(last_message, collection_id, "Default")
-                history[-1] = (last_message, response)
             except Exception as e:
                 logger.error(f"Error in generate_chat_response: {str(e)}")
                 history[-1] = (last_message, f"Error: {str(e)}")
-            return history
         # Update PDF event handlers
         upload_btn.click(  # Change from pdf_files.upload to upload_btn.click
@@ -463,11 +491,11 @@ def create_gr_interface() -> gr.Blocks:
         chat_submit_btn.click(
             add_message,
             inputs=[chat_history, query_input],
-            outputs=[chat_history, query_input],
         ).success(
             generate_chat_response,
             inputs=[chat_history, current_collection],
-            outputs=[chat_history],
         )
     return demo

 def chat_response(
+    message: str,
+    history: List[Tuple[str, str]],
+    study_name: str,
+    pdf_processor: PDFProcessor,
+) -> Tuple[List[Tuple[str, str]], str, Any]:
     """Generate chat response and update history."""
     if not message.strip():
+        return history, None, None
+    rag = get_rag_pipeline(study_name)
+    response, source_info = rag.query(message)
     history.append((message, response))
+    # Generate PDF preview if source information is available
+    preview_image = None
+    if (
+        source_info
+        and source_info.get("source_file")
+        and source_info.get("page_numbers")
+    ):
+        try:
+            # Get the first page number from the source
+            page_num = source_info["page_numbers"][0]
+            preview_image = pdf_processor.render_page(
+                source_info["source_file"], int(page_num)
+            )
+        except Exception as e:
+            logger.error(f"Error generating PDF preview: {str(e)}")
+    return history, preview_image
 def create_gr_interface() -> gr.Blocks:
                             upload_btn = gr.Button("Process PDFs", variant="primary")
                         pdf_status = gr.Markdown()
                         current_collection = gr.State(value=None)
+        pdf_processor = PDFProcessor()
         # Event handlers for Study Analysis tab
         process_zotero_btn.click(
             process_zotero_library_items,
             if not message.strip():
                 raise gr.Error("Please enter a message")
             history = history + [(message, None)]
+            return history, "", None  # Return empty preview
         def generate_chat_response(history, collection_id):
             """Generate response for the last message in history."""
             if not collection_id:
                 raise gr.Error("Please upload PDFs first")
             if len(history) == 0:
+                return history, None
             last_message = history[-1][0]
             try:
+                updated_history, preview_image = chat_response(
+                    last_message,
+                    history[:-1],
+                    collection_id,
+                    pdf_processor,
+                )
+                return updated_history, preview_image
             except Exception as e:
                 logger.error(f"Error in generate_chat_response: {str(e)}")
                 history[-1] = (last_message, f"Error: {str(e)}")
+                return history, None
         # Update PDF event handlers
         upload_btn.click(  # Change from pdf_files.upload to upload_btn.click
         chat_submit_btn.click(
             add_message,
             inputs=[chat_history, query_input],
+            outputs=[chat_history, query_input, pdf_preview],
         ).success(
             generate_chat_response,
             inputs=[chat_history, current_collection],
+            outputs=[chat_history, pdf_preview],
         )
     return demo

interface.py CHANGED Viewed

@@ -3,6 +3,7 @@ Gradio interface module for ACRES RAG Platform.
 Defines the UI components and layout.
 """
 import gradio as gr

 Defines the UI components and layout.
 """
+# interface.py
 import gradio as gr

rag/rag_pipeline.py CHANGED Viewed

@@ -10,6 +10,8 @@ from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
 from llama_index.vector_stores.chroma import ChromaVectorStore
 import chromadb
 logging.basicConfig(level=logging.INFO)
@@ -27,7 +29,6 @@ class RAGPipeline:
         self.documents = None
         self.client = chromadb.Client()
         self.collection = self.client.get_or_create_collection(self.collection_name)
-        # Embed and store each node in ChromaDB
         self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
         self.load_documents()
         self.build_index()
@@ -50,9 +51,12 @@ class RAGPipeline:
                     "authors": ", ".join(doc_data.get("authors", [])),
                     "year": doc_data.get("date"),
                     "doi": doc_data.get("doi"),
                 }
-                # Append document data for use in ChromaDB indexing
                 self.documents.append(
                     Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
                 )
@@ -83,7 +87,7 @@ class RAGPipeline:
     def query(
         self, context: str, prompt_template: PromptTemplate = None
-    ) -> Dict[str, Any]:
         if prompt_template is None:
             prompt_template = PromptTemplate(
                 "Context information is below.\n"
@@ -98,9 +102,7 @@ class RAGPipeline:
                 "Ensure that EVERY statement from the context is properly cited."
             )
-        # This is a hack to index all the documents in the store :)
         n_documents = len(self.index.docstore.docs)
-        print(f"n_documents: {n_documents}")
         query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
             similarity_top_k=n_documents if n_documents <= 17 else 15,
@@ -108,7 +110,18 @@ class RAGPipeline:
             llm=OpenAI(model="gpt-4o-mini"),
         )
-        # Perform the query
         response = query_engine.query(context)
-        return response

 from llama_index.llms.openai import OpenAI
 from llama_index.vector_stores.chroma import ChromaVectorStore
 import chromadb
+from typing import Dict, Any, List, Tuple
 logging.basicConfig(level=logging.INFO)
         self.documents = None
         self.client = chromadb.Client()
         self.collection = self.client.get_or_create_collection(self.collection_name)
         self.embedding_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
         self.load_documents()
         self.build_index()
                     "authors": ", ".join(doc_data.get("authors", [])),
                     "year": doc_data.get("date"),
                     "doi": doc_data.get("doi"),
+                    "source_file": doc_data.get("source_file"),  # Add source file path
+                    "page_numbers": list(
+                        doc_data.get("pages", {}).keys()
+                    ),  # Add page numbers
                 }
                 self.documents.append(
                     Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
                 )
     def query(
         self, context: str, prompt_template: PromptTemplate = None
+    ) -> Tuple[str, Dict[str, Any]]:
         if prompt_template is None:
             prompt_template = PromptTemplate(
                 "Context information is below.\n"
                 "Ensure that EVERY statement from the context is properly cited."
             )
         n_documents = len(self.index.docstore.docs)
         query_engine = self.index.as_query_engine(
             text_qa_template=prompt_template,
             similarity_top_k=n_documents if n_documents <= 17 else 15,
             llm=OpenAI(model="gpt-4o-mini"),
         )
         response = query_engine.query(context)
+        # Extract source information from the response nodes
+        source_info = {}
+        if hasattr(response, "source_nodes") and response.source_nodes:
+            source_node = response.source_nodes[0]  # Get the most relevant source
+            metadata = source_node.metadata
+            source_info = {
+                "source_file": metadata.get("source_file"),
+                "page_numbers": metadata.get("page_numbers", []),
+                "title": metadata.get("title"),
+                "authors": metadata.get("authors"),
+            }
+        return response.response, source_info

study_files.json CHANGED Viewed

@@ -10,5 +10,4 @@
     "iom": "data/iom_zotero_items.json",
     "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
     "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
-    "kayongo papers": "data/kayongo-papers_zotero_items.json"
 }

     "iom": "data/iom_zotero_items.json",
     "ExportedRis_file_1_of_1 (1)": "data/exportedris-file-1-of-1-1_zotero_items.json",
     "wb_1813-9450-6689": "data/wb-1813-9450-6689_zotero_items.json",
 }

utils/pdf_processor.py CHANGED Viewed

@@ -3,6 +3,7 @@ PDF processing module for ACRES RAG Platform.
 Handles PDF file processing, text extraction, and page rendering.
 """
 import os
 import fitz
 import logging

 Handles PDF file processing, text extraction, and page rendering.
 """
+# utils/pdf_processor.py
 import os
 import fitz
 import logging