File size: 8,035 Bytes
863df0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7baa084
 
863df0d
 
 
 
7baa084
863df0d
 
 
 
 
 
7baa084
 
863df0d
 
 
 
7baa084
 
863df0d
 
 
 
 
 
 
 
 
 
 
 
 
7baa084
 
 
 
863df0d
 
 
 
 
 
 
 
 
 
 
 
 
 
7baa084
 
863df0d
7baa084
863df0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7baa084
863df0d
 
 
7baa084
 
863df0d
 
 
 
 
7baa084
863df0d
 
 
 
 
 
 
 
 
 
7baa084
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863df0d
 
 
 
 
 
 
 
 
7baa084
863df0d
 
7baa084
 
 
 
 
 
 
 
 
 
 
 
863df0d
7baa084
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863df0d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
'''
LLM scanner streamlit app

streamlit run .\app.py

Functionality
- tokenize documents
- respond to queries
- generate new documents

Based on: 
1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/

TODO:
- customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor) 
- guardrails on 
- prevent answers on facts outside the document (e.g. birthdate of Michael Jordan in the docs vs. the baseball player)
'''

import os
import streamlit as st
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, readers
from llama_index import StorageContext, load_index_from_storage

from langchain import OpenAI, HuggingFaceHub

import app_constants

# Default location where the vector index is persisted on disk.
index_fpath = "./llamas_index"
documents_folder = "./documents" #initial documents - additional can be added via upload

# Touch session_state once so streamlit creates the per-session store up front.
if "dummy" not in st.session_state:
    st.session_state["dummy"] = "dummy"

#@st.cache_resource  #st makes this globally available for all users and sessions 
def initialize_index(index_name, documents_folder, persisted_to_storage=True):
    """
    Create (or load) a vector index over the documents in ``documents_folder``.

    If a persisted index already exists at ``index_name`` it is loaded from
    storage; otherwise a fresh index is built from the documents.

    Parameters
    ----------
    index_name : str
        Directory where the index is (or will be) persisted.
    documents_folder : str
        Folder containing the documents to index when building from scratch.
    persisted_to_storage : bool, default True
        When building a fresh index, also persist it to ``index_name``.

    Returns
    -------
    The loaded or newly built GPTVectorStoreIndex.
    """
    # Prompt/chunking parameters for the LLM predictor.
    max_input_size = 4096    # maximum input size
    num_outputs = 2000       # number of output tokens
    max_chunk_overlap = 20   # maximum chunk overlap
    chunk_size_limit = 600   # chunk size limit

    # NOTE(review): `api_key` is a module-level global set in the Setup tab;
    # callers must only invoke this function after a key has been entered.
    llm_predictor = LLMPredictor(llm=OpenAI(openai_api_key=api_key, #from env
                                            temperature=0.5, 
                                            model_name="text-davinci-003", 
                                            max_tokens=num_outputs))    
    #wishlist: alternatives
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    if os.path.exists(index_name):
        # Bug fix: load from `index_name` (the path actually checked above)
        # instead of the module global `index_fpath`, so the parameter is
        # honored consistently.  Callers pass `index_fpath`, so behavior for
        # existing callers is unchanged.
        storage_context = StorageContext.from_defaults(persist_dir=index_name)
        doc_index = load_index_from_storage(service_context=service_context, storage_context=storage_context)
    else:
        #st.info("Updating the document index")
        prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

        documents = SimpleDirectoryReader(documents_folder).load_data()
        doc_index = GPTVectorStoreIndex.from_documents(
            documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper, 
            chunk_size_limit=512, service_context=service_context
        )
        if persisted_to_storage:
            # Persist to the same path the existence check looks at.
            doc_index.storage_context.persist(index_name)

    #avoid this side-effect: st.session_state["doc_index"] = "doc_index"
    return doc_index

#st returns data that's available for future caller
@st.cache_data(max_entries=200, persist=True)  
def query_index(_index, query_text):
    """Run ``query_text`` against ``_index`` and return the answer as a string.

    The leading underscore on ``_index`` tells streamlit's cache not to try
    hashing the (unhashable) index object; results are cached per query text.
    """
    engine = _index.as_query_engine()
    answer = engine.query(query_text)
    return str(answer)


# Page layout is written directly here (no main() wrapper), streamlit-style.
st.title("LLM scanner")
st.markdown(
    (
        "This app allows you to query documents!\n\n"
        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html)"
    )
)

# Three tabs: API-key setup, document indexing/upload, and querying.
setup_tab, upload_tab, query_tab = st.tabs(
    ["Setup", "Index", "Query"]
)

with setup_tab:
    st.subheader("LLM Setup")
    # The key is stored in a module-level name read by initialize_index().
    api_key = st.text_input("Enter your OpenAI API key here", type="password")
    
    #wishlist llm_name = st.selectbox(
    #    "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
    #)
    #repo_id = "google/flan-t5-xl" # See https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads for some other options
    #llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0, "max_length":64})
    
    #model_temperature = st.slider(
    #    "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
    #)

# Build the index once per session.  Bug fix: st.text_input returns "" (never
# None) when empty, so test truthiness rather than `is not None` — otherwise
# initialize_index() is called with an empty API key on first render.
if api_key and "doc_index" not in st.session_state:
    st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)


with upload_tab:
    st.subheader("Upload documents")
    
    if st.button("Re-initialize index with pre-packaged documents"):
        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
        st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))

    if "doc_index" in st.session_state:
        doc_index = st.session_state["doc_index"]
        st.markdown(
            "Either upload a document, or enter the text manually."
        )
        uploaded_file = st.file_uploader(
            "Upload a document (pdf):", type=["pdf"]
        )
        document_text = st.text_area("Enter text")
        if st.button("Add document to index") and (uploaded_file or document_text):
            with st.spinner("Inserting (large files may be slow)..."):
                if document_text:
                    # refresh() tokenizes and inserts the new document.
                    doc_index.refresh([readers.Document(text=document_text)])
                    st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
                    st.session_state["doc_index"] = doc_index
                if uploaded_file:
                    # Write the upload to a temp file so SimpleDirectoryReader
                    # can parse it from disk.
                    uploads_folder = "uploads/"
                    if not os.path.exists(uploads_folder):
                        os.mkdir(uploads_folder)
                    tmp_pdf = uploads_folder + "tmp.pdf"
                    with open(tmp_pdf, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    try:
                        documents = SimpleDirectoryReader(uploads_folder).load_data()
                        doc_index.refresh(documents) #tokenizes new documents
                        st.session_state["doc_index"] = doc_index
                        st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
                    finally:
                        # Clean up the temp file even if indexing raises,
                        # so a stale tmp.pdf is never re-indexed later.
                        os.remove(tmp_pdf)

with query_tab:
    st.subheader("Query Tab")
    st.write("Enter a query about the included documents. Find [documentation here](https://huggingface.co/spaces/agutfraind/llmscanner)")

    doc_index = None
    #api_key = st.text_input("Enter your OpenAI API key here:", type="password")
    if api_key:
        # Downstream llama_index/langchain calls read the key from the env.
        os.environ['OPENAI_API_KEY'] = api_key
        #doc_index = initialize_index(index_fpath, documents_folder)   

    if doc_index is None:
        if "doc_index" in st.session_state:
            doc_index = st.session_state["doc_index"]
            st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
        else:
            st.warning("Doc index is not available - initialize or upload")
        #st.warning("Please enter your api key first.")

    if doc_index and api_key:
        select_type_your_own = 'type your own...'
        options_for_queries = app_constants.canned_questions + [select_type_your_own]
        query_selection = st.selectbox("Select option", options=options_for_queries)

        # Either a canned question or free-form text.
        if query_selection == select_type_your_own:
            query_text = st.text_input("Query text")
        else:
            query_text = query_selection

        # Truthiness check skips empty free-form queries ("" as well as None);
        # the redundant `doc_index is not None` guard is already implied above.
        if st.button("Run Query") and query_text:
            response = query_index(doc_index, query_text)
            st.markdown(response)
            
            llm_col, embed_col = st.columns(2)
            with llm_col:
                st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
            
            with embed_col:
                st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")