Geraldine committed
Commit 244aaba · 1 Parent(s): cd73297

Update app.py

Files changed (1)
  1. app.py +65 -83
app.py CHANGED
@@ -1,108 +1,90 @@
 
 
 
 
 
 import os
 import time
 
-import langchain
-# loaders
-from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
-# splits
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-# embeddings
-from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
-# vector stores
-from langchain.vectorstores import Chroma
-# huggingface hub
-from huggingface_hub import InferenceClient
-from langchain import HuggingFaceHub
-# models
-from langchain.llms import OpenAI
-# retrievers
-from langchain.chains import RetrievalQA
-import gradio as gr
-
-HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
 
-def build_context(openai_key,files,urls):
-    if openai_key != "":
-        embeddings = OpenAIEmbeddings(model_name="text-embedding-ada-002", openai_api_key=openai_key)
-    else:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}
-        )
-    documents = []
-    if files is not None:
-        print("files not none")
-        for idx, file in enumerate(files):
-            if file.name.endswith('.pdf'):
-                loader = PyPDFLoader(file.name)
-                documents.extend(loader.load())
-            elif file.name.endswith('.docx'):
-                loader = Docx2txtLoader(file.name)
-                documents.extend(loader.load())
-            elif file.name.endswith('.ppt') or file.name.endswith('.pptx'):
-                loader = UnstructuredPowerPointLoader(file.name)
-                documents.extend(loader.load())
-    if urls != "":
-        print("urls not none")
-        list_urls = urls.split(sep=",")
-        for url in list_urls:
-            loader = OnlinePDFLoader(url)
-            documents.extend(loader.load())
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800,chunk_overlap=0,length_function=len,separators=["\n\n", "\n", " ", ""])
-    chunked_documents = text_splitter.split_documents(documents)
-    global vectordb
-    vectordb = Chroma.from_documents(
-        documents=chunked_documents,
-        embedding=embeddings,
-    )
     return "loaded"
 
-def llm_response(openai_key, message, chat_history):
-    if openai_key != "":
-        llm = OpenAI(
-            temperature=0, openai_api_key=openai_key, model_name="gpt-3.5-turbo", verbose=False
-        )
-    else:
-        llm = HuggingFaceHub(repo_id='MBZUAI/LaMini-Flan-T5-248M',
-                             huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
-                             model_kwargs={"max_length":512,"do_sample":True,
-                                           "temperature":0.2})
-    qa_chain = RetrievalQA.from_chain_type(llm = llm,
-                                           chain_type = "stuff",
-                                           retriever = vectordb.as_retriever(search_kwargs = {"k": 10}),
-                                           return_source_documents = False,
-                                           verbose = True)
-    result = qa_chain(message)["result"]
-    chat_history.append((message, result))
     time.sleep(2)
     return "", chat_history
-
 def loading():
     return "Loading..."
 
 def clear_chromadb():
-    ids = vectordb.get()["ids"]
-    for id in ids:
-        vectordb._collection.delete(ids=id)
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
-        openai_key = gr.Textbox(label="Enter your OpenAI API Key if you want to use the gpt-3.5-turbo-16k model. If not, the open source LaMini-Flan-T5-248M is used")
-    with gr.Row():
-        pdf_docs = gr.Files(label="Load pdf files", file_types=['.pdf','.docx','.ppt','.pptx'], type="file")
-        urls = gr.Textbox(label="Enter one of multiple online pdf urls (comma separated if multiple)")
-    with gr.Row():
-        load_docs = gr.Button("Load documents and urls", variant="primary", scale=1)
-        loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
     with gr.Row():
         with gr.Column(scale=1):
             msg = gr.Textbox(label="User message")
             chatbot = gr.Chatbot()
-            with gr.Row():
                 clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
-                cleardb = gr.Button(value="Reset context (for loading new documents)", variant="secondary")
     load_docs.click(loading, None, loading_status, queue=False)
-    load_docs.click(build_context, inputs=[openai_key,pdf_docs, urls], outputs=[loading_status], queue=False)
-    msg.submit(llm_response, [openai_key, msg, chatbot], [msg, chatbot])
     cleardb.click(clear_chromadb)
 
 demo.queue(concurrency_count=3)
 
+import gradio as gr
+from embedchain import App, OpenSourceApp, CustomApp
+from embedchain.config import CustomAppConfig
+from embedchain.models import Providers, EmbeddingFunctions
+import chromadb
 import os
 import time
+import subprocess
 
+#HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
 
+config = CustomAppConfig(embedding_fn=EmbeddingFunctions.OPENAI, provider=Providers.OPENAI, embedding_fn_model="text-embedding-ada-002")
+app = CustomApp(config)
 
+class ContextCreator:
+    def __init__(self, app):
+        self.app = app
+    def create_context(self, pdf_urls="", docx_urls="", youtube_urls="", web_urls="", sitemap_url="", upload_files=None):
+        if pdf_urls != "":
+            for x in pdf_urls.split(","):
+                self.app.add(x, data_type='pdf_file')
+        if docx_urls != "":
+            for x in docx_urls.split(","):
+                self.app.add(x, data_type='docx_file')
+        if youtube_urls != "":
+            for x in youtube_urls.split(","):
+                self.app.add(x, data_type='youtube_video')
+        if web_urls != "":
+            for x in web_urls.split(","):
+                self.app.add(x, data_type='web_page')
+        if sitemap_url != "":
+            self.app.add(sitemap_url, data_type='sitemap')
+        if upload_files is not None:
+            for idx, file in enumerate(upload_files):
+                if file.name.endswith('.pdf'):
+                    self.app.add(file.name, data_type='pdf_file')
+                if file.name.endswith('.docx'):
+                    self.app.add(file.name, data_type='docx_file')
+
+def build_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files):
+    context_creator = ContextCreator(app)
+    context_creator.create_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files)
     return "loaded"
 
+def llm_respond(query, chat_history):
+    result = app.query(query)
+    chat_history.append((query, result))
     time.sleep(2)
     return "", chat_history
+
+def environ_api_key(api_key):
+    os.environ["OPENAI_API_KEY"] = api_key
+    return "OpenAI API key set!"
+
 def loading():
     return "Loading..."
 
 def clear_chromadb():
+    subprocess.call('rm -rf ./db', shell=True)
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.HTML("<h2>Query your documents</h2>")
+    gr.HTML("<p>Made with the embedchain framework</p><p>The framework can use several free or paid LLMs (OpenAI GPT, GPT4All, Llama 2...); however, open models such as GPT4All or Llama are very slow on CPU, which is why OpenAI is preferred here (the default embedding model is text-embedding-ada-002 and the chat model is gpt-3.5-turbo).</p>")
     with gr.Row():
+        openai_key = gr.Textbox(label="OpenAI API Key")
+        out = gr.Textbox(interactive=False)
+        openai_key.change(environ_api_key, openai_key, out)
     with gr.Row():
         with gr.Column(scale=1):
+            gr.HTML("<h3>Create your context by combining document formats</h3>")
+            pdf_urls = gr.Textbox(label="Online pdf urls (comma separated if multiple)")
+            docx_urls = gr.Textbox(label="Online docx urls (comma separated if multiple)")
+            youtube_urls = gr.Textbox(label="YouTube video urls (comma separated if multiple)")
+            web_urls = gr.Textbox(label="Webpage urls (comma separated if multiple)")
+            sitemap_url = gr.Textbox(label="Sitemap url (generally ending with sitemap.xml)")
+            upload_files = gr.Files(label="Load local pdf or docx files", file_types=['.pdf','.docx'], type="file")
+            load_docs = gr.Button("Load documents and urls", variant="primary")
+            loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
+        with gr.Column(scale=2):
+            gr.HTML("<h3>Query your context</h3>")
             msg = gr.Textbox(label="User message")
             chatbot = gr.Chatbot()
             clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
+            cleardb = gr.Button(value="Reset the current document context (for loading new documents)", variant="secondary")
     load_docs.click(loading, None, loading_status, queue=False)
+    load_docs.click(build_context, inputs=[pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files], outputs=[loading_status], queue=False)
+    msg.submit(llm_respond, [msg, chatbot], [msg, chatbot])
     cleardb.click(clear_chromadb)
 
 demo.queue(concurrency_count=3)
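
For reference, the new embedchain-based flow added in this commit reduces to three calls: configure a `CustomApp`, `add()` each source with its `data_type`, and `query()` the indexed context. Below is a minimal sketch of that flow outside Gradio, assuming the same embedchain version and API used in the code above; the URL, file name, and question are placeholders, not part of the commit.

```python
# Minimal sketch of the embedchain flow used in this commit (not part of app.py).
# Assumes the same embedchain API as above; URL and question are placeholders.
import os

from embedchain import CustomApp
from embedchain.config import CustomAppConfig
from embedchain.models import Providers, EmbeddingFunctions

# Set the key yourself, as environ_api_key() does when the textbox changes.
os.environ["OPENAI_API_KEY"] = "sk-..."

# Same configuration as app.py: OpenAI provider with text-embedding-ada-002 embeddings.
config = CustomAppConfig(
    embedding_fn=EmbeddingFunctions.OPENAI,
    provider=Providers.OPENAI,
    embedding_fn_model="text-embedding-ada-002",
)
app = CustomApp(config)

# Index one source per call, exactly like ContextCreator.create_context().
app.add("https://example.com/report.pdf", data_type="pdf_file")  # placeholder URL
app.add("https://example.com/page.html", data_type="web_page")   # placeholder URL

# Query the indexed context; llm_respond() wraps this call for the chatbot.
print(app.query("What is this document about?"))
```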