'''
LLM scanner streamlit app
streamlit run .\app.py
Functionality
- tokenize documents
- respond to queries
- generate new documents
Based on:
1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/
TODO:
- customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor)
- guardrails on responses: prevent answers on facts outside the documents (e.g. the birthdate of Michael Jordan in the docs vs. the baseball player)
'''
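#Assumed dependencies (a sketch - versions are not pinned in this file):
#  pip install streamlit llama-index langchain openai pypdf
#the code below uses the pre-0.7 llama_index API (LLMPredictor, ServiceContext, GPTVectorStoreIndex)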
import os
import streamlit as st
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, readers
from llama_index import StorageContext, load_index_from_storage
from langchain import OpenAI, HuggingFaceHub
import app_constants
index_fpath = "./llamas_index"
documents_folder = "./documents" #initial documents - additional can be added via upload
if "dummy" not in st.session_state:
st.session_state["dummy"] = "dummy"
#@st.cache_resource #st makes this globally available for all users and sessions
def initialize_index(index_name, documents_folder, persisted_to_storage=True):
"""
creates an index of the documents in the folder
if the index exists, skipped
"""
# set maximum input size
max_input_size = 4096
# set number of output tokens
num_outputs = 2000
# set maximum chunk overlap
max_chunk_overlap = 20
# set chunk size limit
chunk_size_limit = 600
    llm_predictor = LLMPredictor(llm=OpenAI(openai_api_key=api_key, #module-level value from the Setup tab
temperature=0.5,
model_name="text-davinci-003",
max_tokens=num_outputs))
#wishlist: alternatives
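    #one possible sketch for the alternatives wishlist (assumptions: HUGGINGFACEHUB_API_TOKEN
    #is set and the repo_id is served by the Hugging Face Inference API):
    #llm_predictor = LLMPredictor(llm=HuggingFaceHub(
    #    repo_id="google/flan-t5-xl",
    #    model_kwargs={"temperature": 0.5, "max_length": num_outputs}))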
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
if os.path.exists(index_name):
        storage_context = StorageContext.from_defaults(persist_dir=index_name)
doc_index = load_index_from_storage(service_context=service_context, storage_context=storage_context)
else:
#st.info("Updating the document index")
prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
documents = SimpleDirectoryReader(documents_folder).load_data()
        doc_index = GPTVectorStoreIndex.from_documents(
            documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper,
            chunk_size_limit=chunk_size_limit, service_context=service_context
        )
if persisted_to_storage:
doc_index.storage_context.persist(index_fpath)
#avoid this side-effect: st.session_state["doc_index"] = "doc_index"
return doc_index
#st.cache_data caches query responses across reruns and sessions;
#the leading underscore on _index tells Streamlit not to hash that argument
@st.cache_data(max_entries=200, persist=True)
def query_index(_index, query_text):
query_engine = _index.as_query_engine()
response = query_engine.query(query_text)
#response = _index.query(query_text)
return str(response)
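#possible retrieval tuning (a sketch, not wired in - parameter support depends on the
#llama_index version): the query engine accepts retriever/synthesis options, e.g.
#  query_engine = _index.as_query_engine(similarity_top_k=3, response_mode="compact")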
#page layout is written directly here
st.title("LLM scanner")
st.markdown(
(
"This app allows you to query documents!\n\n"
"Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html)"
)
)
setup_tab, upload_tab, query_tab = st.tabs(
["Setup", "Index", "Query"]
)
with setup_tab:
st.subheader("LLM Setup")
api_key = st.text_input("Enter your OpenAI API key here", type="password")
#wishlist llm_name = st.selectbox(
# "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
#)
#repo_id = "google/flan-t5-xl" # See https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads for some other options
#llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0, "max_length":64})
#model_temperature = st.slider(
# "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
#)
    if api_key and "doc_index" not in st.session_state:
        os.environ["OPENAI_API_KEY"] = api_key  #the embedding model reads the key from the environment
        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
with upload_tab:
st.subheader("Upload documents")
if st.button("Re-initialize index with pre-packaged documents"):
st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
        st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))
if "doc_index" in st.session_state:
doc_index = st.session_state["doc_index"]
st.markdown(
"Either upload a document, or enter the text manually."
)
uploaded_file = st.file_uploader(
"Upload a document (pdf):", type=["pdf"]
)
document_text = st.text_area("Enter text")
if st.button("Add document to index") and (uploaded_file or document_text):
with st.spinner("Inserting (large files may be slow)..."):
                if document_text:
                    doc_index.refresh([readers.Document(text=document_text)])  #embeds and inserts the new text
                    st.session_state["doc_index"] = doc_index
                    st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
if uploaded_file:
uploads_folder = "uploads/"
if not os.path.exists(uploads_folder):
os.mkdir(uploads_folder)
#file_details = {"FileName":uploaded_file.name,"FileType":uploaded_file.type}
with open(uploads_folder + "tmp.pdf", "wb") as f:
f.write(uploaded_file.getbuffer())
                    documents = SimpleDirectoryReader(uploads_folder).load_data()
                    doc_index.refresh(documents)  #embeds and inserts the uploaded document
                    st.session_state["doc_index"] = doc_index
                    st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
                    os.remove(uploads_folder + "tmp.pdf")
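                    #note (an assumption about desired behavior): to keep uploads across app
                    #restarts, the refreshed index could be re-persisted here, e.g.
                    #  doc_index.storage_context.persist(index_fpath)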
with query_tab:
st.subheader("Query Tab")
st.write("Enter a query about the included documents. Find [documentation here](https://huggingface.co/spaces/agutfraind/llmscanner)")
doc_index = None
#api_key = st.text_input("Enter your OpenAI API key here:", type="password")
if api_key:
os.environ['OPENAI_API_KEY'] = api_key
#doc_index = initialize_index(index_fpath, documents_folder)
if doc_index is None:
if "doc_index" in st.session_state:
doc_index = st.session_state["doc_index"]
            st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
else:
st.warning("Doc index is not available - initialize or upload")
#st.warning("Please enter your api key first.")
if doc_index and api_key:
select_type_your_own = 'type your own...'
options_for_queries = app_constants.canned_questions + [select_type_your_own]
query_selection = st.selectbox("Select option", options=options_for_queries)
query_text = None
if query_selection == select_type_your_own:
query_text = st.text_input("Query text")
else:
query_text = query_selection
if st.button("Run Query") and (doc_index is not None) and (query_text is not None):
response = query_index(doc_index, query_text)
st.markdown(response)
llm_col, embed_col = st.columns(2)
with llm_col:
st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
with embed_col:
st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")