kaiserpister committed on
Commit
59122b6
·
1 Parent(s): 1a9597f

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Demo Pdfchat
3
- emoji:
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.44.3
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: demo-pdfchat
3
+ app_file: ui.py
 
 
4
  sdk: gradio
5
+ sdk_version: 3.35.2
 
 
6
  ---
 
 
__pycache__/pdfparser.cpython-310.pyc ADDED
Binary file (3.44 kB). View file
 
__pycache__/ui.cpython-310.pyc ADDED
Binary file (1.88 kB). View file
 
pdfparser.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+
4
+ import boto3
5
+ from langchain.document_loaders import PyPDFium2Loader
6
+ from langchain.embeddings.openai import OpenAIEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.vectorstores import FAISS
9
+ from pdf2image import convert_from_path
10
+ from sllim import chat
11
+
12
# Standard Textract client setup
# NOTE(review): assumes AWS credentials/region come from the environment — confirm deployment config
textract_client = boto3.client("textract")
# Prompt template sent to the chat model; {docs} and {query} are filled per request.
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
DOCUMENTS:
{docs}
---
QUERY:
{query}
"""
# Embedding model shared by indexing and querying (presumably reads OPENAI_API_KEY — verify).
embeddings = OpenAIEmbeddings()
22
+
23
+
24
def convert_pdf_to_text(pdf_file_path: str):
    """OCR each page of the PDF with AWS Textract; return one text string per page."""
    # Render every PDF page to an in-memory image.
    pages = convert_from_path(pdf_file_path)

    page_texts = []
    for page_image in pages:
        # Serialize the rendered page to JPEG bytes for the Textract API.
        buffer = io.BytesIO()
        page_image.save(buffer, "JPEG")
        payload = buffer.getvalue()
        buffer.close()

        # Run text detection on this page.
        result = textract_client.detect_document_text(Document={"Bytes": payload})

        # Keep LINE blocks only; each line is newline-terminated, matching
        # the original page-text format.
        lines = [
            block["Text"] + "\n"
            for block in result["Blocks"]
            if block["BlockType"] == "LINE"
        ]
        page_texts.append("".join(lines))
    return page_texts
45
+
46
+
47
def process_file(file_path):
    """Build and persist a FAISS index for *file_path*; no-op if it already exists."""
    index_path = get_index_name(file_path)
    if os.path.exists(index_path):
        # Already indexed — nothing to do.
        return

    # Extract text from the PDF.
    documents = PyPDFium2Loader(file_path).load()

    # Split the extracted text into overlapping ~1000-char chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_documents(documents)
    if not chunks:
        # No extractable text layer (e.g. scanned pages) — fall back to OCR.
        chunks = splitter.create_documents(convert_pdf_to_text(file_path))

    # Embed the chunks and save the index to disk.
    FAISS.from_documents(chunks, embeddings).save_local(index_path)
69
+
70
+
71
def get_index_name(file_path):
    """Map a PDF path to its FAISS index directory name (file stem + '_faiss_index')."""
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return f"{stem}_faiss_index"
75
+
76
+
77
def ask_question_all(history):
    """Answer the newest question in *history* using every PDF uploaded so far.

    History entries are (user, bot) pairs: a non-string user entry is a file
    upload, a string entry with a bot reply is a past exchange, and a string
    entry without a reply is the new question to answer.
    """
    index_names = []
    retrieved = []
    messages = []

    for user_turn, bot_turn in history:
        if not isinstance(user_turn, str):
            # File upload: remember which on-disk index to search later.
            index_names.append(get_index_name(user_turn[0]))
            continue
        if bot_turn:
            # Completed exchange: replay it as conversation context.
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": bot_turn})
            continue
        # New question: gather similar chunks from every uploaded document.
        for name in index_names:
            store = FAISS.load_local(name, embeddings)
            retrieved.extend(store.similarity_search(user_turn))
        context = "\n".join(doc.page_content for doc in retrieved)
        messages.append(
            {
                "role": "user",
                "content": template.format(query=user_turn, docs=context),
            }
        )

    # send similar paragraphs with question to model
    return chat(messages, model="gpt-3.5-turbo")
104
+
105
+
106
def ask_question(query, upload_file, history=None):
    """Answer *query* against a single uploaded PDF.

    Builds (or reuses) the FAISS index for the uploaded file, retrieves the
    chunks most similar to the query, and asks the chat model.

    Args:
        query: the user's question.
        upload_file: object exposing ``.name`` with the PDF's path (gradio upload).
        history: unused; kept for interface compatibility.

    Returns:
        The chat model's answer string.
    """
    file_path = upload_file.name

    # Build the index on first use (no-op if it already exists on disk).
    # Delegates to process_file instead of duplicating its split/embed/save
    # logic, which this function previously copied verbatim.
    process_file(file_path)
    db = FAISS.load_local(get_index_name(file_path), embeddings)

    # Retrieve the chunks most similar to the query.
    docs = db.similarity_search(query)
    messages = [
        {
            "role": "user",
            "content": template.format(
                query=query, docs="\n".join(doc.page_content for doc in docs)
            ),
        }
    ]

    # send similar paragraphs with question to model
    return chat(messages, model="gpt-3.5-turbo")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ sllim
2
+ openai
3
+ faiss-cpu
4
+ tiktoken
5
+ pdf2image
6
+ pypdfium2
7
+ gradio
8
+ boto3
9
+ langchain
ui.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+
5
+ from pdfparser import ask_question_all, process_file
6
+
7
# Shared access password gating the demo; raises KeyError at import time if unset.
PASSWORD = os.environ["OPEN_PASSWORD"]
8
+
9
+
10
def add_text(history, text):
    """Append the user's message as a pending (text, None) turn and lock the textbox."""
    updated = history + [(text, None)]
    # Clear the input and disable it until the bot reply lands.
    return updated, gr.update(value="", interactive=False)
13
+
14
+
15
def add_file(history, file):
    """Append an uploaded file to the chat history as a ((path,), None) turn."""
    # A tuple user entry marks the turn as a file upload rather than text.
    return history + [((file.name,), None)]
18
+
19
+
20
def bot(history):
    """Fill in the bot reply for the newest turn, gating everything on the password."""
    if history[0][0] != PASSWORD:
        response = "Wrong password"
    elif len(history) == 1:
        # Only the password itself has been entered so far.
        response = "Access granted."
    else:
        # Answer using every PDF uploaded after the password turn.
        response = ask_question_all(history[1:])
    # NOTE(review): assumes gradio delivers history turns as mutable lists — confirm
    history[-1][1] = response
    return history
30
+
31
+
32
def bot_upload(history):
    """Index the just-uploaded file (last turn) if the first turn holds the password."""
    if history[0][0] != PASSWORD:
        history[-1][1] = "Wrong password"
        return history
    # The last turn's user entry is ((file_path,), None); build its FAISS index now.
    process_file(history[-1][0][0])
    history[-1][1] = "Ready."
    return history
39
+
40
+
41
# Assemble the Gradio UI: a chatbot pane above a text input and an upload button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=450)

    with gr.Row():
        with gr.Column(scale=0.85):
            # Free-text question input.
            txt = gr.Textbox(
                show_label=False,
                placeholder="First upload a pdf file, then query it",
            ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            # PDF upload button.
            btn = gr.UploadButton("📁", file_types=["pdf"])

    # Submitting text: append the message to the chat, then let the bot answer.
    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
        bot, chatbot, chatbot
    )
    # Re-enable the textbox once the bot reply has been produced.
    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
    # Uploading a file: append it to the chat, then index it.
    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(
        bot_upload, chatbot, chatbot
    )

demo.launch()