Maxime Bourliatoux commited on
Commit
3b6db3d
·
1 Parent(s): 6c2ad63

Initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. README.md +32 -4
  3. app.py +155 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ chroma_db/*
2
+ __pycache__/*
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Chatbot G Pdf
3
- emoji: 📚
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 4.16.0
8
  app_file: app.py
@@ -10,4 +10,32 @@ pinned: false
10
  license: mit
11
  ---
12
 
 
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GAIA Chatbot - level 3
3
+ emoji: 🌍
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 4.16.0
8
  app_file: app.py
 
10
  license: mit
11
  ---
12
 
13
+ # Run on a space
14
+
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
17
+ Simply push your code to a Hugging Face Space.
18
+
19
+ # Run locally
20
+
21
+ You must have [Python 3.8](https://www.python.org/downloads/) installed.
22
+
23
+ Check https://www.gradio.app/guides/quickstart for more details about Gradio.
24
+
25
+ ## Install dependencies
26
+
27
+ `pip install gradio`
28
+
29
+ `pip install -r requirements.txt`
30
+
31
+ ## Add Mistral API Key to your environment variables
32
+
33
+ Add the following line to `~/.profile` or `~/.bashrc`:
34
+
35
+ `export MISTRAL_API_KEY=YOUR_API_KEY`
36
+
37
+ ## Run your code
38
+
39
+ `python3 app.py`
40
+
41
+ ## Open your browser to `http://127.0.0.1:7860`
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import gradio as gr
4
+ from llama_index import (
5
+ VectorStoreIndex,
6
+ download_loader,
7
+ )
8
+ import chromadb
9
+
10
+ from llama_index.llms import MistralAI
11
+ from llama_index.embeddings import MistralAIEmbedding
12
+ from llama_index.vector_stores import ChromaVectorStore
13
+ from llama_index.storage.storage_context import StorageContext
14
+ from llama_index import ServiceContext
15
+
16
# ---------------------------------------------------------------------------
# UI texts and model selection
# ---------------------------------------------------------------------------
title = "Gaia Mistral Chat RAG PDF Demo"
description = "Example of an assistant with Gradio, RAG from PDF documents and Mistral AI via its API"
placeholder = (
    "Vous pouvez me posez une question sur ce contexte, appuyer sur Entrée pour valider"
)
placeholder_url = "Extract text from this url"
llm_model = "mistral-small"

# API key comes from the environment; the value is None when the variable is unset.
env_api_key = os.getenv("MISTRAL_API_KEY")

# Placeholder binding; rebound at the end of this section once the index exists.
query_engine = None

# ---------------------------------------------------------------------------
# Mistral models: one for answering, one for embedding
# ---------------------------------------------------------------------------
llm = MistralAI(api_key=env_api_key, model=llm_model)
embed_model = MistralAIEmbedding(model_name="mistral-embed", api_key=env_api_key)

# ---------------------------------------------------------------------------
# Chroma client rooted at ./chroma_db and the collection used by this app
# ---------------------------------------------------------------------------
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")

# ---------------------------------------------------------------------------
# llama_index plumbing: vector store on top of the Chroma collection, plus the
# service context that carries the chunk size and both Mistral models
# ---------------------------------------------------------------------------
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model,
)

# PDF loader fetched by name, then instantiated once for the whole app.
PDFReader = download_loader("PDFReader")
loader = PDFReader()

# The index is created with no new nodes of its own; documents are inserted
# later, one by one, from the upload handler.
index = VectorStoreIndex(
    [],
    service_context=service_context,
    storage_context=storage_context,
)
query_engine = index.as_query_engine(similarity_top_k=5)
49
+
50
+
51
def get_documents_in_db():
    """Return a Markdown listing of the distinct file names stored in the collection.

    Every entry's metadata is expected to carry a ``_node_content`` JSON
    string whose ``metadata.file_name`` field names the source file. Entries
    that have no metadata, lack that field, or hold invalid JSON are skipped
    so that one bad entry cannot break the whole listing.

    Returns:
        str: ``"**List of files in db:**\\n"`` followed by one `` - <name>``
        line per distinct file name, in alphabetical order.
    """
    print("Fetching documents in DB")
    metadatas = chroma_collection.get(include=["metadatas"])["metadatas"] or []

    file_names = set()
    for item in metadatas:
        try:
            node_content = json.loads(item["_node_content"])
            file_names.add(node_content["metadata"]["file_name"])
        except (KeyError, TypeError, ValueError):
            # TypeError: the entry is None or not a mapping;
            # ValueError: the JSON payload is malformed (JSONDecodeError).
            continue

    # Sort so the list is stable between refreshes (set order is arbitrary).
    docs = sorted(file_names)
    print(f"Found {len(docs)} documents")

    return "**List of files in db:**\n" + "".join(f" - {d}\n" for d in docs)
63
+
64
+
65
def empty_db():
    """Remove every entry from the Chroma collection and return the new listing.

    Returns:
        str: The Markdown listing produced by ``get_documents_in_db()`` after
        the deletion.
    """
    ids = chroma_collection.get()["ids"]
    # Only call delete when there is something to delete.
    # NOTE(review): chromadb is believed to reject a delete with an empty id
    # list and no filter — confirm against the installed version.
    if ids:
        chroma_collection.delete(ids=ids)
    return get_documents_in_db()
69
+
70
+
71
def load_file(file):
    """Parse the uploaded PDF and insert each resulting document into the index.

    Args:
        file: Value of the ``input_file`` component (``type="filepath"``).
            It is ``None``/empty when the button is clicked before any upload.

    Returns:
        tuple: Three values, in the order of the click handler's outputs
        (``file_msg``, ``btn_msg``, ``db_list``): a hidden textbox, a visible
        status textbox, and the refreshed Markdown listing of stored files.
    """
    if not file:
        # Nothing was uploaded: report it instead of passing None to the loader.
        return (
            gr.Textbox(visible=False),
            gr.Textbox(
                value="No file selected. Please upload a PDF first.", visible=True
            ),
            get_documents_in_db(),
        )

    for doc in loader.load_data(file=file):
        index.insert(doc)

    return (
        gr.Textbox(visible=False),
        gr.Textbox(value="Document encoded ! You can ask questions", visible=True),
        get_documents_in_db(),
    )
82
+
83
+
84
def load_document(input_file):
    """Build the "Document loaded" status textbox for a freshly uploaded file.

    Args:
        input_file: The uploaded file value. Either an object exposing the
            path through a ``name`` attribute or a plain path string.
            ``None`` yields a hidden textbox.

    Returns:
        gr.Textbox: A visible textbox showing the file's base name, or a
        hidden one when there is no file.
    """
    if input_file is None:
        return gr.Textbox(visible=False)

    # Fall back to the value itself when it has no ``name`` attribute.
    path = getattr(input_file, "name", input_file)
    # basename handles the platform's path separators, unlike split("/").
    file_name = os.path.basename(str(path))
    return gr.Textbox(value=f"Document loaded: {file_name}", visible=True)
87
+
88
+
89
# Gradio UI: step 1 uploads and encodes a PDF, step 2 chats against the index.
# NOTE(review): the original indentation of this section was not available, so
# the nesting of components inside Row/Column is a reconstruction — confirm
# the layout against the running app.
with gr.Blocks(title=title) as demo:
    gr.Markdown(
        """ # Welcome to Gaia Level 3 Demo

    Add a file before interacting with the Chat.
    This demo allows you to interact with a pdf file and then ask questions to Mistral APIs.
    Mistral will answer with the context extracted from your uploaded file.

    *The files will stay in the database unless there is 48h of inactivty or you re-build the space.*
    """
    )

    gr.Markdown(""" ### 1 / Extract data from PDF """)

    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Load a pdf",
                file_types=[".pdf"],
                file_count="single",
                type="filepath",
                interactive=True,
            )
            file_msg = gr.Textbox(
                label="Loaded documents:", container=False, visible=False
            )

            # Show the uploaded file's name as soon as the upload finishes.
            input_file.upload(
                fn=load_document,
                inputs=[
                    input_file,
                ],
                outputs=[file_msg],
                concurrency_limit=20,
            )

            file_btn = gr.Button(value="Encode file ✅", interactive=True)
            btn_msg = gr.Textbox(container=False, visible=False)

            with gr.Row():
                # Passing the function (not its result) as the value.
                db_list = gr.Markdown(value=get_documents_in_db)
                delete_btn = gr.Button(value="Empty db 🗑️", interactive=True, scale=0)

            # Encode the uploaded file into the index and refresh the listing.
            file_btn.click(
                load_file,
                inputs=[input_file],
                outputs=[file_msg, btn_msg, db_list],
                show_progress="full",
            )
            delete_btn.click(empty_db, outputs=[db_list], show_progress="minimal")

    gr.Markdown(""" ### 2 / Ask a question about this context """)

    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder=placeholder)
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Answer ``message`` with the query engine and extend the chat history.

        Args:
            message: Text typed by the user.
            chat_history: List of ``(user, bot)`` pairs shown in the chatbot;
                treated as empty when falsy.

        Returns:
            tuple: ``("", chat_history)`` — the empty string clears the input
            box, the history (with the new pair appended) updates the chatbot.
            A blank message leaves the history untouched and sends no query.
        """
        chat_history = chat_history or []
        if not message or not message.strip():
            return "", chat_history

        response = query_engine.query(message)
        chat_history.append((message, str(response)))
        return "", chat_history

    # msg is both an input and an output so that it is cleared after sending.
    msg.submit(respond, [msg, chatbot], [msg, chatbot])


# Start the server only when run as a script, so importing this module does
# not launch it.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pypdf
2
+ mistralai
3
+ llama-index
4
+ gradio
5
+ chromadb