initial commit

- README.md +39 -13
- app.py +213 -0
- requirements.txt +5 -0
- tmp_docs/empty.txt +0 -0
README.md
CHANGED
@@ -1,13 +1,39 @@
This is a chatbot that uses LangChain's Conversational Retrieval Chain to generate responses to user input. The chatbot can ingest files and retrieve relevant documents for generating responses using either a Pinecone vector store (Pinecone API key required) or a Chroma vector store (no API key required). An OpenAI API key is also required. The UI is built with Streamlit.
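At its core, the app wires a retriever from the vector store into LangChain's `ConversationalRetrievalChain`. A condensed sketch of what `app.py` (shown further below) does, assuming `llm`, `docsearch`, and `chat_history` have been set up as in that file:

```python
from langchain.chains import ConversationalRetrievalChain

# Build a similarity-search retriever over the vector store (as in app.py)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 4})
CRqa = ConversationalRetrievalChain.from_llm(
    llm, retriever=retriever, return_source_documents=True)
# Each call takes the question plus the running chat history
result = CRqa({"question": "What does this app do?", "chat_history": chat_history})
print(result['answer'])
```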
## Fun fact

This README file was generated by the app itself after ingesting its own Python source file. See the screenshot below.

## Installation

To install the required packages, run:

```
pip install -r requirements.txt
```

## Usage

To run the chatbot, run:

```
streamlit run app.py
```

The chatbot will prompt the user for input and generate a response based on the user's question and the chat history.

## Ingesting Files

To ingest files, select "Yes" when prompted and upload the files. The chatbot will split the files into smaller documents and ingest them into the vector store.
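For reference, a minimal sketch of the splitting step, using the same `RecursiveCharacterTextSplitter` parameters as `app.py` (the file path here is a placeholder):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader

# Same chunking parameters as app.py: 400-character chunks with 50-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
data = UnstructuredFileLoader("tmp_docs/example.txt").load()  # placeholder file
texts = text_splitter.split_documents(data)
print(f"Split into {len(texts)} documents")
```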
## Using Pinecone

To use Pinecone, select "Yes" when prompted and enter the name of the Pinecone index. Make sure to set the `PINECONE_API_KEY` and `PINECONE_API_ENV` environment variables.
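A quick sanity check that the environment is configured (a minimal sketch; these are the exact variable names `app.py` reads):

```python
import os

# app.py reads all three of these at startup and in the UI
for var in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_API_ENV"):
    if not os.environ.get(var):
        raise EnvironmentError(f"{var} is not set")
```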
## Using Chroma

To use Chroma, enter the name of the Chroma collection when prompted. The chatbot will create a Chroma vector store in the `persist_directory` specified in the code (`./vectorstore`).
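To inspect or reuse the persisted store outside the app, the collection can be reopened from the same directory. A minimal sketch using the same calls as `app.py` (`my_collection` is a placeholder for whatever collection name you entered):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Reopen the store that app.py persisted under ./vectorstore
embeddings = OpenAIEmbeddings()
docsearch = Chroma(persist_directory="./vectorstore",
                   embedding_function=embeddings,
                   collection_name="my_collection")  # placeholder collection name
```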
## Screenshot

![chat](https://github.com/eliujl/chatbot_for_files_UI/assets/8711788/1353e575-b813-4d93-9e44-ed625002f0ae)
app.py
ADDED
@@ -0,0 +1,213 @@
# Import required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
    UnstructuredWordDocumentLoader,
    PyMuPDFLoader,
    UnstructuredFileLoader,
)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone, Chroma
from langchain.chains import ConversationalRetrievalChain
import os
import pinecone
import streamlit as st
import shutil

# Set up the OpenAI API key (from .bashrc, Windows environment variables, or .env)
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Set up the Pinecone environment
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_API_ENV = os.environ['PINECONE_API_ENV']
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

pinecone_index_name = ''
chroma_collection_name = ''
persist_directory = ''
chat_history = []
docsearch_ready = False
directory_name = 'tmp_docs'


def save_file(files):
    # Remove existing files in the directory
    if os.path.exists(directory_name):
        for filename in os.listdir(directory_name):
            file_path = os.path.join(directory_name, filename)
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error: {e}")
    # Save the new files with their original filenames
    if files is not None:
        for file in files:
            file_name = file.name
            file_path = os.path.join(directory_name, file_name)
            with open(file_path, 'wb') as f:
                shutil.copyfileobj(file, f)


def load_files():
    all_texts = []
    n_files = 0
    n_char = 0
    n_texts = 0

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=50
    )
    for filename in os.listdir(directory_name):
        file = os.path.join(directory_name, filename)
        if os.path.isfile(file):
            if file.endswith(".docx"):
                loader = UnstructuredWordDocumentLoader(file)
            elif file.endswith(".pdf"):
                loader = PyMuPDFLoader(file)
            else:  # assume a pure text format and attempt to load it
                loader = UnstructuredFileLoader(file)
            data = loader.load()
            texts = text_splitter.split_documents(data)
            n_files += 1
            n_char += len(data[0].page_content)  # counts the first loaded document only
            n_texts += len(texts)
            all_texts.extend(texts)
    st.write(
        f"Loaded {n_files} file(s) with {n_char} characters, and split into {n_texts} split-documents."
    )
    return all_texts, n_texts


def ingest(all_texts, use_pinecone, embeddings, pinecone_index_name, chroma_collection_name, persist_directory):
    if use_pinecone:
        docsearch = Pinecone.from_texts(
            [t.page_content for t in all_texts], embeddings,
            index_name=pinecone_index_name)  # add namespace=pinecone_namespace if provided
    else:
        docsearch = Chroma.from_documents(
            all_texts, embeddings, collection_name=chroma_collection_name,
            persist_directory=persist_directory)
    return docsearch


def setup_retriever(docsearch, k):
    retriever = docsearch.as_retriever(
        search_type="similarity", search_kwargs={"k": k}, include_metadata=True)
    return retriever


def setup_docsearch(use_pinecone, pinecone_index_name, embeddings, chroma_collection_name, persist_directory):
    docsearch = []
    n_texts = 0
    if use_pinecone:
        # Load the pre-created Pinecone index.
        # The index has already been stored in pinecone.io as long-term memory.
        if pinecone_index_name in pinecone.list_indexes():
            docsearch = Pinecone.from_existing_index(
                pinecone_index_name, embeddings)  # add namespace=pinecone_namespace if provided
            index_client = pinecone.Index(pinecone_index_name)
            # Get the index information
            index_info = index_client.describe_index_stats()
            namespace_name = ''
            n_texts = index_info['namespaces'][namespace_name]['vector_count']
        else:
            raise ValueError('''Cannot find the specified Pinecone index.
                Create one in pinecone.io or using, e.g.,
                pinecone.create_index(
                    name=index_name, dimension=1536, metric="cosine", shards=1)''')
    else:
        docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings,
                           collection_name=chroma_collection_name)
        n_texts = docsearch._client._count(
            collection_name=chroma_collection_name)
    return docsearch, n_texts


def get_response(query, chat_history):
    result = CRqa({"question": query, "chat_history": chat_history})
    return result['answer'], result['source_documents']


def setup_em_llm(OPENAI_API_KEY):
    # Set up OpenAI embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    # Use the OpenAI LLM with gpt-3.5-turbo.
    # Keep the temperature at 0 if you do not want it to make things up.
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True,
                     openai_api_key=OPENAI_API_KEY)
    return embeddings, llm


# Get user input on whether to use Pinecone or not
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
# Create the radio buttons and text input fields
with col1:
    r_pinecone = st.radio('Do you want to use a Pinecone index?', ('Yes', 'No'))
with col2:
    r_ingest = st.radio(
        'Do you want to ingest the file(s)?', ('Yes', 'No'))
with col3:
    OPENAI_API_KEY = st.text_input(
        "Enter your OpenAI API key and press Enter", type="password")
with col4:
    if OPENAI_API_KEY:
        embeddings, llm = setup_em_llm(OPENAI_API_KEY)
        if r_pinecone.lower() == 'yes' and PINECONE_API_KEY != '':
            use_pinecone = True
            pinecone_index_name = st.text_input('Enter your Pinecone index')
        else:
            use_pinecone = False
            chroma_collection_name = st.text_input(
                '''Not using Pinecone or empty Pinecone API key provided.
                Using Chroma. Enter a Chroma collection name of 3-63 characters:''')
            persist_directory = "./vectorstore"

if pinecone_index_name or chroma_collection_name:
    if r_ingest.lower() == 'yes':
        files = st.file_uploader('Upload Files', accept_multiple_files=True)
        if files:
            save_file(files)
            all_texts, n_texts = load_files()
            docsearch = ingest(all_texts, use_pinecone, embeddings, pinecone_index_name,
                               chroma_collection_name, persist_directory)
            docsearch_ready = True
    else:
        st.write(
            'No data is to be ingested. Make sure the Pinecone index or Chroma collection name you provided contains data.')
        docsearch, n_texts = setup_docsearch(use_pinecone, pinecone_index_name,
                                             embeddings, chroma_collection_name, persist_directory)
        docsearch_ready = True
if docsearch_ready:
    # Number of sources (split-documents when ingesting files); the default is 4
    k = min([20, n_texts])
    retriever = setup_retriever(docsearch, k)
    CRqa = ConversationalRetrievalChain.from_llm(
        llm, retriever=retriever, return_source_documents=True)

    st.title('Chatbot')
    # Get user input
    query = st.text_input('Enter your question; enter "exit" to exit')
    if query:
        # Generate a reply based on the user input and chat history
        reply, source = get_response(query, chat_history)
        print(chat_history)
        # Update the chat history with the user input and system response
        chat_history.append(('User', query))
        chat_history.append(('Bot', reply))
        chat_history_str = '\n'.join(
            [f'{x[0]}: {x[1]}' for x in chat_history])
        st.text_area('Chat record:', value=chat_history_str, height=250)
        # Display up to two sources, each truncated to 400 characters
        for i, source_i in enumerate(source):
            if i < 2:
                if len(source_i.page_content) > 400:
                    page_content = source_i.page_content[:400]
                else:
                    page_content = source_i.page_content
                if source_i.metadata:
                    metadata_source = source_i.metadata['source']
                    st.write(
                        f"**_Source {i+1}:_** {metadata_source}: {page_content}")
                    st.write(source_i.metadata)
                else:
                    st.write(f"**_Source {i+1}:_** {page_content}")
requirements.txt
ADDED
@@ -0,0 +1,5 @@
langchain
openai
streamlit
pinecone-client
chromadb
tmp_docs/empty.txt
ADDED
File without changes