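"""Ask your PDF: a small Gradio app that indexes an uploaded file (or a file
fetched from a URL) into a FAISS vector store using OpenAI embeddings, then
answers questions by stuffing the most similar chunks into a single LLM call."""
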
import mimetypes
import os
import tempfile

import gradio as gr
import requests
import validators
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def get_empty_state():
    return {"knowledge_base": None}


def on_token_change(user_token):
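    # The OpenAI and OpenAIEmbeddings wrappers read OPENAI_API_KEY from the
    # environment, so storing the user-supplied key there is all that's needed.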
    os.environ["OPENAI_API_KEY"] = user_token


def create_knowledge_base(docs):
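    # Chunks overlap by 200 characters so that answers spanning a chunk
    # boundary are less likely to be cut in half.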
    # split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
    )
    chunks = text_splitter.split_documents(docs)

    # Create embeddings
    embeddings = OpenAIEmbeddings()
    knowledge_base = FAISS.from_documents(chunks, embeddings)
    return knowledge_base


def upload_file(file_obj):
    # UnstructuredFileLoader handles all of the accepted upload types
    # (.txt, .pdf, .doc, .docx); the "fast" strategy favors quick text
    # extraction over model-based parsing.
    loader = UnstructuredFileLoader(file_obj.name, strategy="fast")
    docs = loader.load()

    knowledge_base = create_knowledge_base(docs)
    return file_obj.name, {"knowledge_base": knowledge_base}


def upload_via_url(url):
    if validators.url(url):
        r = requests.get(url)

        if r.status_code != 200:
            raise ValueError(
                f"Check the url of your file; returned status code {r.status_code}"
            )

        # Drop any parameters (e.g. "; charset=utf-8") before guessing the extension
        content_type = r.headers.get("content-type", "").split(";")[0].strip()
        file_extension = mimetypes.guess_extension(content_type)
        temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
        temp_file.write(r.content)
        temp_file.close()  # flush to disk so the loader reads the complete file
        file_path = temp_file.name
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        docs = loader.load()
        knowledge_base = create_knowledge_base(docs)
        return file_path, {"knowledge_base": knowledge_base}
    else:
        raise ValueError("Please enter a valid URL")


def answer_question(question, state):
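    # Retrieve the chunks most similar to the question and let the "stuff"
    # chain concatenate them into a single prompt for the LLM.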
    knowledge_base = state["knowledge_base"]
    if knowledge_base:
        docs = knowledge_base.similarity_search(question)

        llm = OpenAI(temperature=0.4)
        chain = load_qa_chain(llm, chain_type="stuff")
        response = chain.run(input_documents=docs, question=question)
        return response
    else:
        return "Please upload a file first"


with gr.Blocks(css="style.css") as demo:
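    # Per-session state holding the FAISS knowledge base for the uploaded file.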
    state = gr.State(get_empty_state())
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """

            # Ask your PDF 💬

            """
        )
        user_token = gr.Textbox(
            value="",
            label="OpenAI API Key",
            placeholder="OpenAI API Key",
            type="password",
            show_label=True,
        )
        gr.Markdown("**Upload your file**")
        with gr.Row(elem_id="row-flex"):
            with gr.Column(scale=3):
                file_url = gr.Textbox(
                    value="",
                    label="Upload your file",
                    placeholder="Enter a url",
                    show_label=False,
                )
            with gr.Column(scale=1, min_width=160):
                upload_button = gr.UploadButton(
                    "Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"]
                )
        file_output = gr.File()
        user_question = gr.Textbox(value="", label="Ask a question about your file:")
        answer = gr.Textbox(value="", label="Answer:")
        gr.Examples(
            ["What is the main topic of the file?", "Who is the author of the file?"],
            user_question,
        )
        gr.HTML(
            """<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/dragonSwing/langchain-askpdf?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>

                <p><img src="https://visitor-badge.glitch.me/badge?page_id=dragonswing.langchain-askpdf" alt="visitors"></p></center>"""
        )

    # Event wiring: a submitted URL or an uploaded file rebuilds the knowledge
    # base; a submitted question runs the QA chain against it.
    file_url.submit(upload_via_url, file_url, [file_output, state])
    upload_button.upload(upload_file, upload_button, [file_output, state])
    user_token.change(on_token_change, inputs=[user_token], outputs=[])
    user_question.submit(answer_question, [user_question, state], [answer])

# Queueing keeps long-running OpenAI calls from hitting Gradio's request timeout.
demo.queue().launch()