Zwea Htet committed · Commit b83dc9c
Parent(s): 3455cec

updated llama index demo

Changed files:
- app.py +21 -18
- models/llamaCustom.py +7 -3
- models/vector_database.py +34 -0
- pages/llama_custom_demo.py +32 -3
- requirements.txt +3 -1
app.py
CHANGED
@@ -1,13 +1,9 @@
-# https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
-
 import os
-
 import openai
 import requests
 import streamlit as st
 
 from utils.util import *
-
 from langchain.memory import ConversationBufferMemory
 
 SAVE_DIR = "uploaded_files"
@@ -17,30 +13,24 @@ os.makedirs(SAVE_DIR, exist_ok=True)
 def init_session_state():
     if "openai_api_key" not in st.session_state:
         st.session_state.openai_api_key = ""
-
     if "uploaded_files" not in st.session_state:
         st.session_state.uploaded_files = os.listdir(SAVE_DIR)
+    if "huggingface_token" not in st.session_state:
+        st.session_state.huggingface_token = ""
 
 
 init_session_state()
 
 st.set_page_config(page_title="RegBotBeta", page_icon="📜🤖")
-
 st.title("Welcome to RegBotBeta2.0")
-st.header("Powered by `LlamaIndex🦙`, `Langchain
+st.header("Powered by `LlamaIndex🦙`, `Langchain🦜🔗` and `OpenAI API`")
 
 
-def init_session_state():
-    if "huggingface_token" not in st.session_state:
-        st.session_state.huggingface_token = ""
-
-
-init_session_state()
-
 uploaded_files = st.file_uploader(
     "Upload Files",
     accept_multiple_files=True,
     type=["pdf", "docx", "txt", "csv"],
+    label_visibility="hidden",
 )
 
 if uploaded_files:
@@ -48,14 +38,27 @@ if uploaded_files:
         if file not in st.session_state.uploaded_files:
             # add the file to session state
             st.session_state.uploaded_files.append(file.name)
-
             # save the file to the sample_data directory
             with open(os.path.join(SAVE_DIR, file.name), "wb") as f:
                 f.write(file.getbuffer())
-
     st.success("File(s) uploaded successfully!")
 
+
+def delete_file(filename):
+    """Delete file from session state and local filesystem."""
+    if filename in st.session_state.uploaded_files and os.path.exists(
+        os.path.join(SAVE_DIR, filename)
+    ):
+        st.session_state.uploaded_files.remove(filename)
+        os.remove(os.path.join(SAVE_DIR, filename))
+        st.success(f"Deleted {filename}!")
+        st.rerun()
+
+
 if st.session_state.uploaded_files:
     st.write("Uploaded Files:")
-    for
-        st.
+    for index, filename in enumerate(st.session_state.uploaded_files):
+        col1, col2 = st.columns([4, 1])
+        col1.write(filename)
+        if col2.button("Delete", key=f"delete_{index}"):
+            delete_file(filename)
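Note (app.py): in the upload loop above, file is a Streamlit UploadedFile object while st.session_state.uploaded_files holds filename strings, so the unchanged context check "if file not in st.session_state.uploaded_files" likely never matches and every upload is re-appended and re-saved on each rerun. A minimal sketch of a name-based check, assuming the session list keeps storing names:

for file in uploaded_files:
    # Compare names to names; an UploadedFile never equals a string.
    if file.name not in st.session_state.uploaded_files:
        st.session_state.uploaded_files.append(file.name)
        # persist the upload to disk
        with open(os.path.join(SAVE_DIR, file.name), "wb") as f:
            f.write(file.getbuffer())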
models/llamaCustom.py
CHANGED
@@ -54,7 +54,7 @@ Use the following example format for your answer:
 Answer:
 The answer to the user question.
 Reference:
-The list of references to the specific sections of the documents that support your answer.
+The list of references (such as page number, title, chapter, section) to the specific sections of the documents that support your answer.
 [END_FORMAT]
 """
 
@@ -184,9 +184,13 @@ class LlamaCustom:
 
     def get_response(self, query_str: str, chat_history: List[ChatMessage]):
         # https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/
+        # https://docs.llamaindex.ai/en/stable/examples/query_engine/citation_query_engine/
+        # https://docs.llamaindex.ai/en/stable/examples/query_engine/knowledge_graph_rag_query_engine/
         query_engine = self.index.as_query_engine(
-            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE),
-            refine_template=PromptTemplate(
+            text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE + ANSWER_FORMAT),
+            refine_template=PromptTemplate(
+                QUERY_ENGINE_REFINE_TEMPLATE
+            ),  # passing ANSWER_FORMAT here will not give the desired output, need to use the output parser from llama index?
             verbose=self.verbose,
         )
        # chat_engine = self.index.as_chat_engine(
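Note (models/llamaCustom.py): the "+" concatenation works because QUERY_ENGINE_QA_TEMPLATE and ANSWER_FORMAT are plain strings; PromptTemplate only parses the {context_str}/{query_str} placeholders after they are joined. A minimal sketch with stand-in strings (the real templates live in this module):

from llama_index.core import PromptTemplate

QA_TEMPLATE = "Context:\n{context_str}\n\nQuestion: {query_str}\n"  # stand-in
ANSWER_FORMAT = "Answer:\n...\nReference:\n...\n[END_FORMAT]"  # stand-in

# Join first, then let PromptTemplate pick up the placeholders.
qa_prompt = PromptTemplate(QA_TEMPLATE + ANSWER_FORMAT)
print(qa_prompt.format(context_str="<retrieved chunks>", query_str="<user question>"))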
models/vector_database.py
ADDED
@@ -0,0 +1,34 @@
+from pinecone import Pinecone, ServerlessSpec
+from llama_index.vector_stores.pinecone import PineconeVectorStore
+from dotenv import load_dotenv
+
+import os
+
+load_dotenv()
+
+# Pinecone Vector Database
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+pc_index_name = "llama-integration-pinecone"
+# pc_index_name = "openai-embeddings"
+pc_indexes = pc.list_indexes()
+
+# Check if the index already exists
+def index_exists(index_name):
+    for index in pc_indexes:
+        if index["name"] == index_name:
+            return True
+    return False
+
+# Create the index if it doesn't exist
+if not index_exists(pc_index_name):
+    pc.create_index(
+        name=pc_index_name,
+        dimension=1536,
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )
+
+# Initialize your index
+pinecone_index = pc.Index(pc_index_name)
+
+# Define the vector store
+pinecone_vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
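Note (models/vector_database.py): a minimal usage sketch for the exported pinecone_vector_store, mirroring the commented-out get_pinecone_index in pages/llama_custom_demo.py below (build_pinecone_index is a hypothetical helper name). The dimension=1536 above matches the output size of OpenAI's text-embedding-ada-002, which lines up with the switch to openai_embed_model in that page:

from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex

from models.vector_database import pinecone_vector_store

def build_pinecone_index(path: str) -> VectorStoreIndex:
    docs = SimpleDirectoryReader(input_files=[path]).load_data()
    # Route embeddings to Pinecone instead of the default in-memory store.
    storage_context = StorageContext.from_defaults(vector_store=pinecone_vector_store)
    return VectorStoreIndex.from_documents(docs, storage_context=storage_context)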
pages/llama_custom_demo.py
CHANGED
@@ -7,6 +7,8 @@ from typing import List
 from models.llms import load_llm, integrated_llms
 from models.embeddings import hf_embed_model, openai_embed_model
 from models.llamaCustom import LlamaCustom
+
+# from models.vector_database import pinecone_vector_store
 from utils.chatbox import show_previous_messages, show_chat_input
 from utils.util import validate_openai_api_key
 
@@ -30,7 +32,8 @@ VECTOR_STORE_DIR = "vectorStores"
 HF_REPO_ID = "zhtet/RegBotBeta"
 
 # global
-Settings.embed_model = hf_embed_model
+# Settings.embed_model = hf_embed_model
+Settings.embed_model = openai_embed_model
 
 # huggingface api
 hf_api = HfApi()
@@ -62,9 +65,10 @@ def init_session_state():
 
 
 # @st.cache_resource
-def index_docs(
+def get_index(
     filename: str,
 ) -> VectorStoreIndex:
+    """This function loads the index from storage if it exists, otherwise it creates a new index from the document."""
     try:
         index_path = pathlib.Path(f"{VECTOR_STORE_DIR}/{filename.replace('.', '_')}")
         if pathlib.Path.exists(index_path):
@@ -89,6 +93,23 @@ def index_docs(
     return index
 
 
+# def get_pinecone_index(filename: str) -> VectorStoreIndex:
+#     """Thie function loads the index from Pinecone if it exists, otherwise it creates a new index from the document."""
+#     reader = SimpleDirectoryReader(input_files=[f"{SAVE_DIR}/{filename}"])
+#     docs = reader.load_data(show_progress=True)
+#     storage_context = StorageContext.from_defaults(vector_store=pinecone_vector_store)
+#     index = VectorStoreIndex.from_documents(
+#         documents=docs, show_progress=True, storage_context=storage_context
+#     )
+
+#     return index
+
+
+def get_chroma_index(filename: str) -> VectorStoreIndex:
+    """This function loads the index from Chroma if it exists, otherwise it creates a new index from the document."""
+    pass
+
+
 def check_api_key(model_name: str, source: str):
     if source.startswith("openai"):
         if not st.session_state.openai_api_key:
@@ -164,6 +185,13 @@ with tab1:
         label="Choose a file to chat with: ", options=os.listdir(SAVE_DIR)
     )
 
+    if st.button("Clear all api keys"):
+        st.session_state.openai_api_key = ""
+        st.session_state.replicate_api_token = ""
+        st.session_state.hf_token = ""
+        st.success("All API keys cleared!")
+        st.rerun()
+
     if st.button("Submit", key="submit", help="Submit the form"):
         with st.status("Loading ...", expanded=True) as status:
             try:
@@ -176,7 +204,8 @@ with tab1:
                 Settings.llm = llama_llm
 
                 st.write("Processing Data ...")
-                index =
+                index = get_index(selected_file)
+                # index = get_pinecone_index(selected_file)
 
                 st.write("Finishing Up ...")
                 llama_custom = LlamaCustom(model_name=selected_llm_name, index=index)
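Note (pages/llama_custom_demo.py): switching Settings.embed_model from hf_embed_model to openai_embed_model keeps the embedding width consistent with the Pinecone index created with dimension=1536; a different embed model would need an index with a matching dimension. A quick sanity check, assuming openai_embed_model is a LlamaIndex embedding:

from models.embeddings import openai_embed_model

# len(vec) must equal the dimension used in models/vector_database.py.
vec = openai_embed_model.get_text_embedding("dimension sanity check")
assert len(vec) == 1536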
requirements.txt
CHANGED
@@ -7,11 +7,13 @@ langchain_pinecone
 openai
 faiss-cpu
 python-dotenv
-streamlit
+streamlit>=1.24.0
 huggingface_hub<0.21.0
 pypdf
 llama-index-llms-huggingface>=0.1.4
 llama-index-embeddings-langchain>=0.1.2
+llama-index-vector-stores-pinecone
+pinecone-client>=3.0.0
 replicate>=0.25.1
 llama-index-llms-replicate
 sentence-transformers>=2.6.1
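Note (requirements.txt): both app.py and pages/llama_custom_demo.py call st.rerun(), which, if memory serves, first shipped in Streamlit 1.27; if so, the floor pinned here would need to be raised, e.g.:

streamlit>=1.27.0  # assumed minimum for st.rerun()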