agutfraind committed
Commit 7baa084 · 1 Parent(s): 863df0d

uploading docs

Files changed (3):
  1. .streamlit/config.toml +6 -0
  2. app.py +87 -30
  3. app_constants.py +6 -1
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+[theme]
+primaryColor="#F63366"
+backgroundColor="#FFFFFF"
+secondaryBackgroundColor="#F0F2F6"
+textColor="#262730"
+font="sans serif"
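
Streamlit picks up the `[theme]` table from `.streamlit/config.toml` automatically at startup, so the theme requires no changes in app.py. As a quick sanity check (a sketch, not part of this commit), the loaded values can be read back with `st.get_option`:

```python
import streamlit as st

# Read back the theme values Streamlit loaded from .streamlit/config.toml
st.write(st.get_option("theme.primaryColor"))     # expected: "#F63366"
st.write(st.get_option("theme.backgroundColor"))  # expected: "#FFFFFF"
```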
app.py CHANGED
@@ -12,31 +12,29 @@ Based on:
 1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
 2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/
 
-
 TODO:
-- document upload
 - customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor)
-- canned questions
-
+- guardrails on
+- prevent answers on facts outside the document (e.g. birthdate of Michael Jordan in the docs vs. the baseball player)
 '''
 
 import os
 import streamlit as st
-from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper
+from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, readers
 from llama_index import StorageContext, load_index_from_storage
 
 from langchain import OpenAI, HuggingFaceHub
 
 import app_constants
 
-index_fpath = "./index.json"
-documents_folder = "./documents"
+index_fpath = "./llamas_index"
+documents_folder = "./documents"  # initial documents - additional can be added via upload
 
 if "dummy" not in st.session_state:
     st.session_state["dummy"] = "dummy"
 
-@st.cache_resource  # st makes this globally available for all users and sessions
-def initialize_index(index_name, documents_folder):
+#@st.cache_resource  # st makes this globally available for all users and sessions
+def initialize_index(index_name, documents_folder, persisted_to_storage=True):
     """
     creates an index of the documents in the folder
     if the index exists, skipped
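
This hunk swaps the `@st.cache_resource` decorator (one shared index for all users and sessions) for per-session storage in `st.session_state`, which the later hunks rely on. A minimal sketch of that pattern, using the names defined in app.py:

```python
import streamlit as st

def get_doc_index():
    # Build the index once per browser session and reuse it afterwards;
    # initialize_index is the function defined above in app.py.
    if "doc_index" not in st.session_state:
        st.session_state["doc_index"] = initialize_index(
            index_fpath, documents_folder, persisted_to_storage=False
        )
    return st.session_state["doc_index"]
```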
@@ -50,8 +48,10 @@ def initialize_index(index_name, documents_folder):
     # set chunk size limit
     chunk_size_limit = 600
 
-    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="text-davinci-003", max_tokens=num_outputs))
-
+    llm_predictor = LLMPredictor(llm=OpenAI(openai_api_key=api_key,  # from env
+                                            temperature=0.5,
+                                            model_name="text-davinci-003",
+                                            max_tokens=num_outputs))
     #wishlist: alternatives
     service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
     if os.path.exists(index_name):
@@ -66,8 +66,10 @@ def initialize_index(index_name, documents_folder):
             documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper,
             chunk_size_limit=512, service_context=service_context
         )
-        doc_index.storage_context.persist(index_fpath)
+        if persisted_to_storage:
+            doc_index.storage_context.persist(index_fpath)
 
+    #avoid this side-effect: st.session_state["doc_index"] = "doc_index"
     return doc_index
 
 #st returns data that's available for future caller
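
When `persisted_to_storage=True`, the index is written under `index_fpath`; the matching reload path uses the `StorageContext` and `load_index_from_storage` imports already at the top of app.py. A minimal sketch, assuming `index_fpath` is the directory written by `storage_context.persist`:

```python
from llama_index import StorageContext, load_index_from_storage

# Rebuild the storage context from the persisted directory, then reload the index
storage_context = StorageContext.from_defaults(persist_dir=index_fpath)
doc_index = load_index_from_storage(storage_context)
```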
@@ -84,17 +86,18 @@ st.title("LLM scanner")
 st.markdown(
     (
         "This app allows you to query documents!\n\n"
-        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html) and supporting multiple LLMs"
+        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html)"
     )
 )
 
-setup_tab, query_tab = st.tabs(
-    ["Setup", "Query"]
+setup_tab, upload_tab, query_tab = st.tabs(
+    ["Setup", "Index", "Query"]
 )
 
 with setup_tab:
     st.subheader("LLM Setup")
     api_key = st.text_input("Enter your OpenAI API key here", type="password")
+
     #wishlist llm_name = st.selectbox(
     #    "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
     #)
@@ -105,6 +108,47 @@ with setup_tab:
     #    "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
     #)
 
+    if api_key is not None and "doc_index" not in st.session_state:
+        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
+
+
+with upload_tab:
+    st.subheader("Upload documents")
+
+    if st.button("Re-initialize index with pre-packaged documents"):
+        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
+        st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))
+
+    if "doc_index" in st.session_state:
+        doc_index = st.session_state["doc_index"]
+        st.markdown(
+            "Either upload a document, or enter the text manually."
+        )
+        uploaded_file = st.file_uploader(
+            "Upload a document (pdf):", type=["pdf"]
+        )
+        document_text = st.text_area("Enter text")
+        if st.button("Add document to index") and (uploaded_file or document_text):
+            with st.spinner("Inserting (large files may be slow)..."):
+                if document_text:
+                    doc_index.refresh([readers.Document(text=document_text)])  # tokenizes new documents
+                    st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))
+
+                    st.session_state["doc_index"] = doc_index
+                if uploaded_file:
+                    uploads_folder = "uploads/"
+                    if not os.path.exists(uploads_folder):
+                        os.mkdir(uploads_folder)
+                    #file_details = {"FileName":uploaded_file.name,"FileType":uploaded_file.type}
+                    with open(uploads_folder + "tmp.pdf", "wb") as f:
+                        f.write(uploaded_file.getbuffer())
+                    documents = SimpleDirectoryReader(uploads_folder).load_data()
+                    doc_index.refresh(documents)  # tokenizes new documents
+                    st.session_state["doc_index"] = doc_index
+                    st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))
+
+                    st.session_state["doc_index"] = doc_index
+                    os.remove(uploads_folder + "tmp.pdf")
 
 with query_tab:
     st.subheader("Query Tab")
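
One caveat in the upload path above: every upload lands in the shared `uploads/tmp.pdf`, so two concurrent sessions can overwrite each other's file. A hedged alternative (not part of this commit) that isolates each upload in a stdlib temporary directory while reusing the same `SimpleDirectoryReader` and `refresh` calls from the diff:

```python
import os
import tempfile

from llama_index import SimpleDirectoryReader

def insert_uploaded_pdf(doc_index, uploaded_file):
    # Write the upload into a per-call temporary directory so concurrent
    # sessions cannot clobber each other; the directory is removed
    # automatically when the with-block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        documents = SimpleDirectoryReader(tmp_dir).load_data()
        doc_index.refresh(documents)  # tokenizes and inserts the new documents
    return doc_index
```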
@@ -114,22 +158,35 @@ with query_tab:
     #api_key = st.text_input("Enter your OpenAI API key here:", type="password")
     if api_key:
         os.environ['OPENAI_API_KEY'] = api_key
-        doc_index = initialize_index(index_fpath, documents_folder)
-
+        #doc_index = initialize_index(index_fpath, documents_folder)
 
     if doc_index is None:
-        st.warning("Please enter your api key first.")
-
-    text = st.text_input("Query text:", value="What did the author do growing up?")
-
-    if st.button("Run Query") and text is not None:
-        response = query_index(doc_index, text)
-        st.markdown(response)
-
-        llm_col, embed_col = st.columns(2)
-        with llm_col:
-            st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
-
-        with embed_col:
-            st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")
+        if "doc_index" in st.session_state:
+            doc_index = st.session_state["doc_index"]
+            st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
+        else:
+            st.warning("Doc index is not available - initialize or upload")
+            #st.warning("Please enter your api key first.")
+
+    if doc_index and api_key:
+        select_type_your_own = 'type your own...'
+        options_for_queries = app_constants.canned_questions + [select_type_your_own]
+        query_selection = st.selectbox("Select option", options=options_for_queries)
+        query_text = None
 
+        if query_selection == select_type_your_own:
+            query_text = st.text_input("Query text")
+        else:
+            query_text = query_selection
+
+        if st.button("Run Query") and (doc_index is not None) and (query_text is not None):
+            response = query_index(doc_index, query_text)
+            st.markdown(response)
+
+            llm_col, embed_col = st.columns(2)
+            with llm_col:
+                st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
+
+            with embed_col:
+                st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")
 
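The Query tab calls `query_index(doc_index, query_text)`, which is defined above these hunks (next to the "#st returns data that's available for future caller" comment) and not shown in the diff. Its body is therefore an assumption; with the `GPTVectorStoreIndex` API used here it would plausibly look like:

```python
@st.cache_data(show_spinner=False)
def query_index(_doc_index, query_text):
    # Assumed helper: the leading underscore tells Streamlit's cache not to
    # hash the (unhashable) index object.
    if _doc_index is None:
        return "Please initialize the index first."
    response = _doc_index.as_query_engine().query(query_text)
    return str(response)
```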
app_constants.py CHANGED
@@ -3,4 +3,9 @@ file for
 - canned prompts
 - constants (other than secrets)
 
-'''
+'''
+
+canned_questions = [
+    "When was Paul Graham born?",
+    "What was his first startup?"
+]