Divyansh12 committed on
Commit 11c7c99 · verified · 1 Parent(s): 756320e

Update app.py

Files changed (1):
  1. app.py +16 -40
app.py CHANGED
@@ -1,12 +1,6 @@
 import os
-import asyncio
 import nest_asyncio
-import pinecone
 import time
-import fitz
-import base64
-from pathlib import Path
-from typing import List, Tuple
 from dotenv import find_dotenv, load_dotenv
 from langchain_groq import ChatGroq
 from langchain_huggingface import HuggingFaceEmbeddings
@@ -17,7 +11,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone, ServerlessSpec
-import gradio as gr
+import gradio as gr
 from langchain import hub
 
 # Allow nested async calls
@@ -61,7 +55,9 @@ docs = text_splitter.split_documents(documents)
 
 def are_documents_indexed(index):
     try:
+        # Create a simple test embedding
         test_embedding = embedding_model.embed_query("test")
+        # Query the index
         results = index.query(vector=test_embedding, top_k=1)
         return len(results.matches) > 0
     except Exception as e:
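
Note: are_documents_indexed only probes whether the index already returns at least one match for an arbitrary embedding, so ingestion can be skipped on restart. A standalone sketch of the same check, assuming a Pinecone index named "rag-index" and a MiniLM embedding model (both names are illustrative, not taken from app.py):

    from pinecone import Pinecone
    from langchain_huggingface import HuggingFaceEmbeddings

    pc = Pinecone(api_key="YOUR_API_KEY")  # hypothetical key
    index = pc.Index("rag-index")          # hypothetical index name
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"  # assumed model
    )

    # Embed a throwaway query and ask for one nearest neighbor; any match
    # at all means the index already holds vectors.
    probe = embedding_model.embed_query("test")
    results = index.query(vector=probe, top_k=1)
    print("already indexed:", len(results.matches) > 0)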
@@ -95,7 +91,7 @@ relevance_prompt_template = PromptTemplate.from_template(
     Return ONLY the numeric score, without any additional text or explanation.
     Question: {question}
     Retrieved Context: {retrieved_context}
-    Relevance Score:"""
+    Relevance Score: """
 )
 
 def format_docs(docs):
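
Note: extract_score and conditional_answer are defined outside this hunk, so their bodies are not shown in the diff. A hypothetical sketch consistent with how they are used below (the real helpers in app.py may differ):

    import re

    def extract_score(text):
        # Pull the first number out of the model's reply; anything
        # unparseable counts as 0 so the gate fails closed.
        match = re.search(r"\d+(\.\d+)?", str(text))
        return float(match.group()) if match else 0.0

    def conditional_answer(x):
        # Same rule the diff shows: below a relevance score of 4,
        # refuse rather than answer from weak context.
        relevance_score = extract_score(x["relevance_score"])
        return "I don't know." if relevance_score < 4 else x["answer"]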
@@ -111,17 +107,6 @@ def conditional_answer(x):
     relevance_score = extract_score(x["relevance_score"])
     return "I don't know." if relevance_score < 4 else x["answer"]
 
-def highlight_pdf(pdf_path: str, text_chunks: List[str]) -> str:
-    doc = fitz.open(pdf_path)
-    for page in doc:
-        for chunk in text_chunks:
-            areas = page.search_for(chunk)
-            for rect in areas:
-                highlight = page.add_highlight_annot(rect)
-    buffer = doc.write()
-    doc.close()
-    return base64.b64encode(buffer).decode()
-
 # RAG pipeline
 rag_chain_from_docs = (
     RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
@@ -145,41 +130,32 @@ rag_chain_from_docs = (
 )
 
 rag_chain_with_source = RunnableParallel(
-    {"context": retriever, "question": RunnablePassthrough()}
+    {"context": retriever,
+     "question": RunnablePassthrough()
+    }
 ).assign(answer=rag_chain_from_docs)
 
-async def process_question(question: str) -> Tuple[str, str, dict]:
+async def process_question(question):
     try:
         result = await rag_chain_with_source.ainvoke(question)
         final_answer = result["answer"]["final_answer"]
-        context_docs = result["context"]
-
-        sources = []
-        highlighted_pdfs = {}
-
-        for doc in context_docs:
-            source = doc.metadata.get("source")
-            if source and source.endswith('.pdf'):
-                sources.append(source)
-                if source not in highlighted_pdfs:
-                    highlighted_pdfs[source] = highlight_pdf(source, [doc.page_content])
-
-        return final_answer, ", ".join(sources), highlighted_pdfs
+        sources = [doc.metadata.get("source") for doc in result["context"]]
+        source_list = ", ".join(sources)
+        return final_answer, source_list
     except Exception as e:
-        return f"Error: {str(e)}", "Error retrieving sources", {}
+        return f"Error: {str(e)}", "Error retrieving sources"
 
-# Gradio interface
-print("Setting up Gradio interface...")
+# Gradio
+print("Gradio interface...")
 demo = gr.Interface(
     fn=process_question,
-    inputs=gr.Textbox(label="Enter your question"),
+    inputs=gr.Textbox(label="Enter your question", value=""),
     outputs=[
         gr.Textbox(label="Answer"),
         gr.Textbox(label="Sources"),
-        gr.Gallery(label="Highlighted PDFs")
     ],
     title="RAG Question Answering",
-    description="Enter a question to get an answer with highlighted relevant sections."
+    description="Enter a question and get an answer from the PDFs.",
 )
 
 if __name__ == "__main__":
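
Note: this change works because Gradio calls async functions directly and maps each element of the returned tuple onto one output component in order; dropping the gr.Gallery therefore requires process_question to return exactly two values. A minimal sketch of that contract, with a stub in place of the real chain:

    import gradio as gr

    async def process_question(question):
        # Stub standing in for rag_chain_with_source.ainvoke(question);
        # first value feeds the "Answer" box, second the "Sources" box.
        return f"Answer to: {question}", "doc1.pdf, doc2.pdf"

    demo = gr.Interface(
        fn=process_question,
        inputs=gr.Textbox(label="Enter your question"),
        outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Sources")],
    )

    if __name__ == "__main__":
        demo.launch()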
 