Geraldine committed
Commit 244aaba · 1 Parent(s): cd73297

Update app.py

Files changed (1)
  1. app.py +65 -83
app.py CHANGED
@@ -1,108 +1,90 @@
 
 
 
 
 
 import os
 import time
 
-import langchain
-# loaders
-from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader, Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
-# splits
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-# embeddings
-from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
-# vector stores
-from langchain.vectorstores import Chroma
-# huggingface hub
-from huggingface_hub import InferenceClient
-from langchain import HuggingFaceHub
-# models
-from langchain.llms import OpenAI
-# retrievers
-from langchain.chains import RetrievalQA
-import gradio as gr
-
-HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
 
-def build_context(openai_key,files,urls):
-    if openai_key != "":
-        embeddings = OpenAIEmbeddings(model_name="text-embedding-ada-002", openai_api_key=openai_key)
-    else:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}
-        )
-    documents = []
-    if files is not None:
-        print("files not none")
-        for idx, file in enumerate(files):
-            if file.name.endswith('.pdf'):
-                loader = PyPDFLoader(file.name)
-                documents.extend(loader.load())
-            elif file.name.endswith('.docx'):
-                loader = Docx2txtLoader(file.name)
-                documents.extend(loader.load())
-            elif file.name.endswith('.ppt') or file.name.endswith('.pptx'):
-                loader = UnstructuredPowerPointLoader(file.name)
-                documents.extend(loader.load())
-    if urls != "":
-        print("urls not none")
-        list_urls = urls.split(sep=",")
-        for url in list_urls:
-            loader = OnlinePDFLoader(url)
-            documents.extend(loader.load())
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800,chunk_overlap=0,length_function=len,separators=["\n\n", "\n", " ", ""])
-    chunked_documents = text_splitter.split_documents(documents)
-    global vectordb
-    vectordb = Chroma.from_documents(
-        documents=chunked_documents,
-        embedding=embeddings,
-    )
     return "loaded"
 
-def llm_response(openai_key, message, chat_history):
-    if openai_key != "":
-        llm = OpenAI(
-            temperature=0, openai_api_key=openai_key, model_name="gpt-3.5-turbo", verbose=False
-        )
-    else:
-        llm = HuggingFaceHub(repo_id='MBZUAI/LaMini-Flan-T5-248M',
-                             huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
-                             model_kwargs={"max_length":512,"do_sample":True,
-                                           "temperature":0.2})
-    qa_chain = RetrievalQA.from_chain_type(llm = llm,
-                                           chain_type = "stuff",
-                                           retriever = vectordb.as_retriever(search_kwargs = {"k": 10}),
-                                           return_source_documents = False,
-                                           verbose = True)
-    result = qa_chain(message)["result"]
-    chat_history.append((message, result))
     time.sleep(2)
     return "", chat_history
-
 def loading():
     return "Loading..."
 
 def clear_chromadb():
-    ids = vectordb.get()["ids"]
-    for id in ids:
-        vectordb._collection.delete(ids=id)
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
-        openai_key = gr.Textbox(label="Enter your OpenAI API Key if you want to use the gpt-3.5-turbo-16k model. If not, the open source LaMini-Flan-T5-248M is used")
-    with gr.Row():
-        pdf_docs = gr.Files(label="Load pdf files", file_types=['.pdf','.docx','.ppt','.pptx'], type="file")
-        urls = gr.Textbox(label="Enter one of multiple online pdf urls (comma separated if multiple)")
-    with gr.Row():
-        load_docs = gr.Button("Load documents and urls", variant="primary", scale=1)
-        loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
     with gr.Row():
         with gr.Column(scale=1):
             msg = gr.Textbox(label="User message")
             chatbot = gr.Chatbot()
-            with gr.Row():
                 clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
-                cleardb = gr.Button(value="Reset context (for loading new documents)", variant="secondary")
     load_docs.click(loading, None, loading_status, queue=False)
-    load_docs.click(build_context, inputs=[openai_key,pdf_docs, urls], outputs=[loading_status], queue=False)
-    msg.submit(llm_response, [openai_key, msg, chatbot], [msg, chatbot])
     cleardb.click(clear_chromadb)
 
 demo.queue(concurrency_count=3)
 
+import gradio as gr
+from embedchain import App, OpenSourceApp, CustomApp
+from embedchain.config import CustomAppConfig
+from embedchain.models import Providers, EmbeddingFunctions
+import chromadb
 import os
 import time
+import subprocess
 
+#HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
 
+config = CustomAppConfig(embedding_fn=EmbeddingFunctions.OPENAI, provider=Providers.OPENAI, embedding_fn_model="text-embedding-ada-002")
+app = CustomApp(config)
 
+class ContextCreator:
+    def __init__(self, app):
+        self.app = app
+    def create_context(self, pdf_urls="", docx_urls="", youtube_urls="", web_urls="", sitemap_url="", upload_files=None):
+        if pdf_urls != "":
+            for x in pdf_urls.split(","):
+                self.app.add(x, data_type='pdf_file')
+        if docx_urls != "":
+            for x in docx_urls.split(","):
+                self.app.add(x, data_type='docx_file')
+        if youtube_urls != "":
+            for x in youtube_urls.split(","):
+                self.app.add(x, data_type='youtube_video')
+        if web_urls != "":
+            for x in web_urls.split(","):
+                self.app.add(x, data_type='web_page')
+        if sitemap_url != "":
+            self.app.add(sitemap_url, data_type='sitemap')
+        if upload_files is not None:
+            for idx, file in enumerate(upload_files):
+                if file.name.endswith('.pdf'):
+                    self.app.add(file.name, data_type='pdf_file')
+                if file.name.endswith('.docx'):
+                    self.app.add(file.name, data_type='docx_file')
+
+def build_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files):
+    context_creator = ContextCreator(app)
+    context_creator.create_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files)
     return "loaded"
 
+def llm_respond(query, chat_history):
+    result = app.query(query)
+    chat_history.append((query, result))
     time.sleep(2)
     return "", chat_history
+
+def environ_api_key(api_key):
+    os.environ["OPENAI_API_KEY"] = api_key
+    return "OpenAI API key set!"
+
 def loading():
     return "Loading..."
 
 def clear_chromadb():
+    subprocess.call('rm -rf ./db', shell=True)
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.HTML("<h2>Query your documents</h2>")
+    gr.HTML("<p>Made with the embedchain framework</p><p>The framework can use several free or paid LLMs (OpenAI GPT, GPT4All, Llama 2...); however, open models such as GPT4All or Llama are very slow on CPU, which is why OpenAI is preferred here (the default embedding model is text-embedding-ada-002 and the chat model is gpt-3.5-turbo).</p>")
     with gr.Row():
+        openai_key = gr.Textbox(label="OpenAI API Key")
+        out = gr.Textbox(interactive=False)
+        openai_key.change(environ_api_key, openai_key, out)
     with gr.Row():
         with gr.Column(scale=1):
+            gr.HTML("<h3>Create your context by combining document formats</h3>")
+            pdf_urls = gr.Textbox(label="Online pdf urls (comma separated if multiple)")
+            docx_urls = gr.Textbox(label="Online docx urls (comma separated if multiple)")
+            youtube_urls = gr.Textbox(label="YouTube video urls (comma separated if multiple)")
+            web_urls = gr.Textbox(label="Webpage urls (comma separated if multiple)")
+            sitemap_url = gr.Textbox(label="Sitemap url (generally ending with sitemap.xml)")
+            upload_files = gr.Files(label="Load local pdf or docx files", file_types=['.pdf','.docx'], type="file")
+            load_docs = gr.Button("Load documents and urls", variant="primary")
+            loading_status = gr.Textbox(label="Loading status", placeholder="", interactive=False, scale=0)
+        with gr.Column(scale=2):
+            gr.HTML("<h3>Query your context</h3>")
             msg = gr.Textbox(label="User message")
             chatbot = gr.Chatbot()
             clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
+            cleardb = gr.Button(value="Reset the current document context (for loading new documents)", variant="secondary")
     load_docs.click(loading, None, loading_status, queue=False)
+    load_docs.click(build_context, inputs=[pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url, upload_files], outputs=[loading_status], queue=False)
+    msg.submit(llm_respond, [msg, chatbot], [msg, chatbot])
     cleardb.click(clear_chromadb)
 
 demo.queue(concurrency_count=3)
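
For reference, the new embedchain-based flow added in this commit reduces to three calls: configure a `CustomApp`, `add()` each source with its `data_type`, and `query()` the indexed context. Below is a minimal sketch of that flow outside Gradio, assuming the same embedchain version and API used in the code above; the URL, file name, and question are placeholders, not part of the commit.

```python
# Minimal sketch of the embedchain flow used in this commit (not part of app.py).
# Assumes the same embedchain API as above; URL and question are placeholders.
import os

from embedchain import CustomApp
from embedchain.config import CustomAppConfig
from embedchain.models import Providers, EmbeddingFunctions

# Set the key yourself, as environ_api_key() does when the textbox changes.
os.environ["OPENAI_API_KEY"] = "sk-..."

# Same configuration as app.py: OpenAI provider with text-embedding-ada-002 embeddings.
config = CustomAppConfig(
    embedding_fn=EmbeddingFunctions.OPENAI,
    provider=Providers.OPENAI,
    embedding_fn_model="text-embedding-ada-002",
)
app = CustomApp(config)

# Index one source per call, exactly like ContextCreator.create_context().
app.add("https://example.com/report.pdf", data_type="pdf_file")  # placeholder URL
app.add("https://example.com/page.html", data_type="web_page")   # placeholder URL

# Query the indexed context; llm_respond() wraps this call for the chatbot.
print(app.query("What is this document about?"))
```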