Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,108 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import time
|
|
|
3 |
|
4 |
-
|
5 |
-
# loaders
|
6 |
-
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
|
7 |
-
# splits
|
8 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
-
# embeddings
|
10 |
-
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
|
11 |
-
# vector stores
|
12 |
-
from langchain.vectorstores import Chroma
|
13 |
-
# huggingface hub
|
14 |
-
from huggingface_hub import InferenceClient
|
15 |
-
from langchain import HuggingFaceHub
|
16 |
-
# models
|
17 |
-
from langchain.llms import OpenAI
|
18 |
-
# retrievers
|
19 |
-
from langchain.chains import RetrievalQA
|
20 |
-
import gradio as gr
|
21 |
|
22 |
-
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
global vectordb
|
53 |
-
vectordb = Chroma.from_documents(
|
54 |
-
documents=chunked_documents,
|
55 |
-
embedding=embeddings,
|
56 |
-
)
|
57 |
return "loaded"
|
58 |
|
59 |
-
def
|
60 |
-
|
61 |
-
|
62 |
-
temperature=0, openai_api_key=openai_key, model_name="gpt-3.5-turbo", verbose=False
|
63 |
-
)
|
64 |
-
else:
|
65 |
-
llm = HuggingFaceHub(repo_id='MBZUAI/LaMini-Flan-T5-248M',
|
66 |
-
huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
|
67 |
-
model_kwargs={"max_length":512,"do_sample":True,
|
68 |
-
"temperature":0.2})
|
69 |
-
qa_chain = RetrievalQA.from_chain_type(llm = llm,
|
70 |
-
chain_type = "stuff",
|
71 |
-
retriever = vectordb.as_retriever(search_kwargs = {"k": 10}),
|
72 |
-
return_source_documents = False,
|
73 |
-
verbose = True)
|
74 |
-
result = qa_chain(message)["result"]
|
75 |
-
chat_history.append((message, result))
|
76 |
time.sleep(2)
|
77 |
return "", chat_history
|
78 |
-
|
|
|
|
|
|
|
|
|
79 |
def loading():
|
80 |
return "Loading..."
|
81 |
|
82 |
def clear_chromadb():
|
83 |
-
|
84 |
-
for id in ids:
|
85 |
-
vectordb._collection.delete(ids=id)
|
86 |
|
87 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
|
|
88 |
with gr.Row():
|
89 |
-
openai_key = gr.Textbox(label="
|
90 |
-
|
91 |
-
|
92 |
-
urls = gr.Textbox(label="Enter one of multiple online pdf urls (comma separated if multiple)")
|
93 |
-
with gr.Row():
|
94 |
-
load_docs = gr.Button("Load documents and urls", variant="primary", scale=1)
|
95 |
-
loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
|
96 |
with gr.Row():
|
97 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
msg = gr.Textbox(label="User message")
|
99 |
chatbot = gr.Chatbot()
|
100 |
-
with gr.Row():
|
101 |
clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
|
102 |
-
cleardb = gr.Button(value="Reset context (for loading new documents)", variant="secondary")
|
103 |
load_docs.click(loading, None, loading_status, queue=False)
|
104 |
-
load_docs.click(build_context, inputs=[
|
105 |
-
msg.submit(
|
106 |
cleardb.click(clear_chromadb)
|
107 |
|
108 |
demo.queue(concurrency_count=3)
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from embedchain import App, OpenSourceApp, CustomApp
|
3 |
+
from embedchain.config import CustomAppConfig
|
4 |
+
from embedchain.models import Providers, EmbeddingFunctions
|
5 |
+
import chromadb
|
6 |
import os
|
7 |
import time
|
8 |
+
import subprocess
|
9 |
|
10 |
+
#HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Embedchain application shared by every handler below.
# Both embeddings and chat go through OpenAI (the configured provider); the
# embedding model is pinned to "text-embedding-ada-002".
config = CustomAppConfig(embedding_fn=EmbeddingFunctions.OPENAI, provider=Providers.OPENAI, embedding_fn_model="text-embedding-ada-002")
# Module-level singleton: build_context() adds documents to it and
# llm_respond() queries it.
app = CustomApp(config)
class ContextCreator:
    """Feeds user-supplied document sources into an embedchain app.

    Each source is handed to ``app.add(source, data_type=...)``; embedchain
    takes care of fetching, chunking and embedding it.
    """

    def __init__(self, app):
        # `app` must expose .add(source, data_type=...) (an embedchain App).
        self.app = app

    def create_context(self, pdf_urls="", docx_urls="", youtube_urls="", web_urls="", sitemap_url="", upload_files=None):
        """Add every non-empty source to the app's context.

        The *_urls arguments are comma-separated URL strings; sitemap_url is
        a single URL; upload_files is an iterable of objects exposing a
        ``.name`` file path (Gradio file wrappers). Returns None.
        """
        # (comma-separated input, embedchain data_type) pairs.
        url_groups = (
            (pdf_urls, 'pdf_file'),
            (docx_urls, 'docx_file'),
            (youtube_urls, 'youtube_video'),
            (web_urls, 'web_page'),
        )
        for urls, data_type in url_groups:
            if urls != "":
                for url in urls.split(","):
                    self.app.add(url, data_type=data_type)
        if sitemap_url != "":
            # BUG FIX: the original passed the stale loop variable `x` here
            # (a leftover from the URL loops above), so the sitemap URL was
            # never added — and a NameError was raised if no URL list came
            # first. Pass sitemap_url itself.
            self.app.add(sitemap_url, data_type='sitemap')
        if upload_files is not None:
            for file in upload_files:
                # Local uploads: dispatch on file extension.
                if file.name.endswith('.pdf'):
                    self.app.add(file.name, data_type='pdf_file')
                if file.name.endswith('.docx'):
                    self.app.add(file.name, data_type='docx_file')
def build_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files):
    """Load every supplied source into the shared embedchain app.

    All six arguments are forwarded unchanged to
    ContextCreator.create_context; returns the literal status string
    "loaded" for the loading_status textbox.
    """
    creator = ContextCreator(app)
    creator.create_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files)
    return "loaded"
def llm_respond(query, chat_history):
    """Answer `query` against the loaded context and extend the chat log.

    Appends (query, answer) to chat_history in place and returns
    ("", chat_history) so Gradio clears the input box and refreshes the
    Chatbot widget.
    """
    answer = app.query(query)
    chat_history.append((query, answer))
    # Fixed 2s pause kept from the original — presumably to pace the UI
    # update; TODO confirm it is still wanted.
    time.sleep(2)
    return "", chat_history
def environ_api_key(api_key):
    """Export the user-supplied OpenAI key to the process environment.

    Embedchain/OpenAI clients read OPENAI_API_KEY from the environment, so
    setting it here is enough for later queries. Returns a confirmation
    string for the status textbox.
    """
    os.environ.update(OPENAI_API_KEY=api_key)
    return "OpenAI API key set !"
def loading():
    """Status string shown while document ingestion is in progress."""
    return "Loading..."
def clear_chromadb():
    """Wipe the persisted Chroma vector store so a new context can be loaded.

    Deletes the ./db directory relative to the working directory. A missing
    directory is silently ignored, matching the old `rm -rf` behavior.
    Returns None.
    """
    # Replaces subprocess.call('rm -rf ./db', shell=True): no shell process,
    # portable across OSes, and not a shell-string execution pattern.
    import shutil
    shutil.rmtree("./db", ignore_errors=True)
# ---------------------------------------------------------------------------
# Gradio UI: a key-entry row, a column for building the document context and
# a column for chatting against it.
# NOTE(review): this layout was reconstructed from a diff view — widget
# nesting (in particular whether clearchat/cleardb sit inside the scale-2
# column) should be confirmed against the rendered app.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML("<h2>Query your documents</h2>")
    gr.HTML("<p>Made with the embedchain Framework</p><p>The framework is built to be able to use multiple free or paid LLM (OpenAI GPT, GPT4ALL, llama2...), however open models like GPT4all or llama are very slow on CPU, which is why OpenAI is prefered here (the default embeddings model is text-embedding-ada-002 and the chat model is gpt-3.5-turbo)</p>")
    with gr.Row():
        # Typing a key stores it in os.environ via environ_api_key and
        # echoes the confirmation string into the read-only textbox.
        openai_key = gr.Textbox(label="OpenAI API Key")
        out = gr.Textbox(interactive=False)
        openai_key.change(environ_api_key, openai_key, out)
    with gr.Row():
        with gr.Column(scale=1):
            # Context-building inputs: each URL textbox holds comma-separated
            # values; upload_files accepts local pdf/docx files.
            gr.HTML("<h3>Create your context by combining document formats</h3>")
            pdf_urls = gr.Textbox(label="Online pdf urls (comma separated if multiple)")
            docx_urls = gr.Textbox(label="Online docx urls (comma separated if multiple)")
            youtube_urls = gr.Textbox(label="Youtube video urls (comma separated if multiple)")
            web_urls = gr.Textbox(label="Webpage urls (comma separated if multiple)")
            sitemap_url = gr.Textbox(label="Sitemap url (generally ending with sitemap.xml)")
            upload_files = gr.Files(label="Load local pdf or docx files", file_types=['.pdf','.docx'], type="file")
            load_docs = gr.Button("Load documents and urls", variant="primary")
            loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
        with gr.Column(scale=2):
            # Chat side: message box, history widget, and reset buttons.
            gr.HTML("<h3>Query your context</h3>")
            msg = gr.Textbox(label="User message")
            chatbot = gr.Chatbot()
            clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
            cleardb = gr.Button(value="Reset current documents context (for loading new documents)", variant="secondary")
    # Wiring: clicking "load" first flips the status to "Loading..." (queue
    # bypassed so it shows immediately), then build_context runs and
    # overwrites the status with "loaded".
    load_docs.click(loading, None, loading_status, queue=False)
    load_docs.click(build_context, inputs=[pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files], outputs=[loading_status], queue=False)
    msg.submit(llm_respond, [msg, chatbot], [msg, chatbot])
    cleardb.click(clear_chromadb)

# Allow up to 3 event handlers to run concurrently.
demo.queue(concurrency_count=3)