hfwittmann committed on
Commit
806ab1d
·
0 Parent(s):

Duplicate from hfwittmann/simple-paper-qa

Browse files
Files changed (5) hide show
  1. .gitattributes +34 -0
  2. .vscode/launch.json +16 -0
  3. README.md +14 -0
  4. app.py +186 -0
  5. requirements.txt +10 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python: Current File",
9
+ "type": "python",
10
+ "request": "launch",
11
+ "program": "${file}",
12
+ "console": "integratedTerminal",
13
+ "justMyCode": true
14
+ }
15
+ ]
16
+ }
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Simple Paper Qa
3
+ emoji: 🏃
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.34.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: hfwittmann/simple-paper-qa
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ import os
5
+
6
# Custom CSS injected into gr.Blocks(css=...) below:
# renders the whole Gradio app in the IBM Plex Mono monospace font.
css_style = """
.gradio-container {
    font-family: "IBM Plex Mono";
}
"""
11
+
12
+
13
def request_pathname(files, data, openai_api_key, index):
    """Register newly uploaded files in the shared dataset and re-validate.

    Gradio `uploaded_files.change` callback.

    Args:
        files: list of uploaded tempfile objects (each with a ``.name`` path),
            or None when the upload widget is cleared.
        data: accumulated rows ``[filepath, citation string, key]`` (gr.State).
        openai_api_key: the API key textbox value.
        index: the current vector-store index (gr.State), possibly None.

    Returns:
        A 5-tuple matching the wired outputs
        ``[stats, data, dataset, buildb, index]``.
    """
    if files is None:
        # BUGFIX: the original returned a single [[]] here, but this callback
        # is wired to 5 outputs; return a value for every output so Gradio
        # does not raise on a cleared upload widget.
        return [[]], data, data, "⚠️Waiting for documents and key⚠️", index

    for file in files:
        # make sure we're not duplicating things in the dataset
        if file.name in [x[0] for x in data]:
            continue
        data.append([file.name, None, None])

    mydataset = pd.DataFrame(data, columns=["filepath", "citation string", "key"])

    # Re-run validation so the status box and the index reflect the new files.
    validation, index = validate_dataset(mydataset, openai_api_key, index)

    return (
        [[len(data), 0]],  # stats: [docs, chunks] (chunk count not tracked)
        data,
        data,
        validation,
        index,
    )
33
+
34
+
35
def validate_dataset(dataset, openapi, index):
    """Report readiness and (re)build the index when docs and key are present.

    Args:
        dataset: pandas DataFrame with a ``filepath`` first column.
        openapi: the OpenAI API key string (may be empty or non-str).
        index: current vector-store index, passed through when not rebuilt.

    Returns:
        (status_message, index) tuple for the ``[buildb, index]`` outputs.
    """
    # BUGFIX: guard against an empty dataframe — the original
    # `dataset.iloc[-1, 0]` raised IndexError before any upload.
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""
    key_ready = type(openapi) is str and len(openapi) > 0

    if docs_ready and key_ready:
        # langchain reads the key from the environment.
        os.environ["OPENAI_API_KEY"] = openapi.strip()
        index = get_index(dataset, openapi, index)
        return "✨Ready✨", index
    elif docs_ready:
        return "⚠️Waiting for key⚠️", index
    elif key_ready:
        return "⚠️Waiting for documents⚠️", index
    else:
        return "⚠️Waiting for documents and key⚠️", index
48
+
49
+
50
def get_index(dataset, openapi, index):
    """Build an in-memory vector-store index over the uploaded documents.

    Args:
        dataset: pandas DataFrame whose ``filepath`` column lists PDF paths.
        openapi: the OpenAI API key string.
        index: the current index; returned unchanged when prerequisites
            are missing.

    Returns:
        A freshly built VectorstoreIndex when docs and key are ready,
        otherwise the ``index`` argument unchanged.
    """
    # Guard against an empty dataframe (iloc[-1] would raise IndexError).
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""

    if docs_ready and type(openapi) is str and len(openapi) > 0:
        # Imported lazily so the app can start before the key is entered.
        from langchain.document_loaders import PyPDFLoader
        from langchain.indexes import VectorstoreIndexCreator
        from langchain.vectorstores import DocArrayInMemorySearch

        # BUGFIX: the original indexed only dataset["filepath"][0] even
        # though the UI accepts multiple uploads — index every document.
        loaders = [PyPDFLoader(file_path=path) for path in dataset["filepath"]]

        index = VectorstoreIndexCreator(
            vectorstore_cls=DocArrayInMemorySearch
        ).from_loaders(loaders)

    return index
70
+
71
+
72
def make_stats(docs):
    """Summarize a doc collection as [[number_of_previews, sum_of_first_fields]]."""
    previews = docs.doc_previews
    total = sum(entry[0] for entry in previews)
    return [[len(previews), total]]
74
+
75
+
76
def do_ask(question, button, openapi, dataset, index):
    """Gradio generator callback: answer `question` against the built index.

    Args:
        question: the user's question text.
        button: the status textbox value; must equal "✨Ready✨" to proceed.
        openapi: the OpenAI API key string.
        dataset: pandas DataFrame of uploaded documents.
        index: the vector-store index built by `get_index`.

    Yields:
        (answer_markdown, index) for the ``[answer, index]`` outputs; the
        answer is "" when prerequisites are not met.
    """
    out = ""
    # BUGFIX: guard against an empty dataframe — the original evaluated
    # `dataset.iloc[-1, 0]` unconditionally and crashed when a question was
    # asked before any upload. (Unused `passages` variable removed.)
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""

    if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
        out = index.query(question)

    yield out, index
92
+
93
+
94
# ---------------------------------------------------------------------------
# UI definition and event wiring.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css_style) as demo:
    # Shared state across callbacks. (The dead `openai_api_key = gr.State("")`
    # from the original was removed: it was immediately shadowed by the
    # Textbox component defined below and never used.)
    docs = gr.State()
    data = gr.State([])

    gr.Markdown(
        """
# Document Question and Answer

*By D8a.ai*

Based on https://huggingface.co/spaces/whitead/paper-qa

Significant advances in langchain have made it possible to simplify the code.

This tool allows you to ask questions of your uploaded text, PDF documents.

It uses OpenAI's GPT models, so you need to enter your API key below. This
tool is under active development and currently uses a lot of tokens - up to 10,000
for a single query. This is $0.10-0.20 per query, so please be careful!

* [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
2. Upload your documents
3. Ask a questions
"""
    )

    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password"
    )
    with gr.Tab("File Upload"):
        uploaded_files = gr.File(
            label="Your Documents Upload (PDF or txt)",
            file_count="multiple",
        )

    with gr.Accordion("See Docs:", open=False):
        dataset = gr.Dataframe(
            headers=["filepath", "citation string", "key"],
            datatype=["str", "str", "str"],
            col_count=(3, "fixed"),
            interactive=False,
            label="Documents and Citations",
            overflow_row_behaviour="paginate",
            max_rows=5,
        )

    buildb = gr.Textbox(
        "⚠️Waiting for documents and key...",
        label="Status",
        interactive=False,
        show_label=True,
        max_lines=1,
    )

    index = gr.State()

    stats = gr.Dataframe(
        headers=["Docs", "Chunks"],
        datatype=["number", "number"],
        col_count=(2, "fixed"),
        interactive=False,
        label="Doc Stats",
    )

    # BUGFIX: `validate_dataset` takes (dataset, openapi, index); the original
    # key-change hook passed only two inputs, raising a TypeError at runtime.
    # Now consistent with the `dataset.change` wiring below.
    openai_api_key.change(
        validate_dataset,
        inputs=[dataset, openai_api_key, index],
        outputs=[buildb, index],
    )
    dataset.change(
        validate_dataset,
        inputs=[dataset, openai_api_key, index],
        outputs=[buildb, index],
    )

    uploaded_files.change(
        request_pathname,
        inputs=[uploaded_files, data, openai_api_key, index],
        outputs=[stats, data, dataset, buildb, index],
    )

    query = gr.Textbox(placeholder="Enter your question here...", label="Question")

    ask = gr.Button("Ask Question")
    answer = gr.Markdown(label="Answer")

    ask.click(
        do_ask,
        inputs=[query, buildb, openai_api_key, dataset, index],
        outputs=[answer, index],
    )

demo.queue(concurrency_count=20)
demo.launch(show_error=True)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv == 1.0.0
2
+ openai==0.27.8
3
+ langchain==0.0.194
4
+ tiktoken==0.4.0
5
+ pandas==2.0.2
6
+ pypdf==3.9.1
7
+ docarray==0.32.1
8
+ gradio == 3.34.0
9
+ jupyter == 1.0.0
10
+ ipykernel == 6.23.1