File size: 2,098 Bytes
6872416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import streamlit as st
#from langchain.retrievers import KNNRetriever
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain.vectorstores import FAISS
#from streamapp import *
from PIL import Image

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Logo displayed in the sidebar of this page.
st.sidebar.image(Image.open("./test-logo.png"), use_column_width=True)


print("Loading Index Page!!")

# This page does not build the vector store itself; it expects one to be
# present in the Streamlit session state already (presumably created by
# another page of the app -- confirm). Without this guard a missing key
# surfaces as a raw KeyError traceback, so show a readable message and halt
# the script run instead.
if 'vectorstore' not in st.session_state:
    st.error(
        "Vector store has not been initialized in this session, "
        "so documents cannot be indexed yet."
    )
    st.stop()

vectorstore = st.session_state['vectorstore']

def _text_splitter(doc):
    """Split documents into overlapping chunks.

    A ``RecursiveCharacterTextSplitter`` is configured with a chunk size of
    600 and an overlap of 50, both measured with ``len``.

    Args:
        doc: The documents to split; handed unchanged to
            ``transform_documents`` (presumably a sequence of LangChain
            ``Document`` objects -- confirm against callers).

    Returns:
        Whatever ``transform_documents`` returns for the given input, i.e.
        the chunked documents.
    """
    splitter_options = {
        "chunk_size": 600,
        "chunk_overlap": 50,
        "length_function": len,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.transform_documents(doc)
    return chunks

def _load_docs(path: str):
    """Load the web page at *path* and return it already split into chunks.

    Args:
        path: Web link handed to ``WebBaseLoader``.

    Returns:
        The loaded page content after it has been passed through
        ``_text_splitter``.
    """
    # Load first, then chunk; the result is ready to be added to a store.
    return _text_splitter(WebBaseLoader(path).load())


# Form that takes a web link, loads and chunks the page, and adds the chunks
# to the vector store held in session state.
with st.form("Index documents to Vector Store"):

    file_path = st.text_input(label="Enter the web link", value="", placeholder="", label_visibility="visible", disabled=False)
    print("file_path  " ,file_path)

    submitted = st.form_submit_button("Submit")

    if submitted:
        if not file_path.strip():
            # An empty link would only fail later inside the loader with an
            # HTTP-layer error; tell the user what is wrong instead.
            st.warning("Please enter a web link before submitting.")
        else:
            st.write("Submitted web link: " + file_path)

            # _load_docs already returns split chunks, so the documents must
            # not be run through _text_splitter a second time here.
            webpage_chunks = _load_docs(file_path)

            # store embeddings in vector store
            print("vectorstore length before addition,  ", len(vectorstore.serialize_to_bytes()))
            vectorstore.add_documents(webpage_chunks)
            print("vectorstore length after addition,  ", len(vectorstore.serialize_to_bytes()))

            # Publish the updated store and a retriever built from it so the
            # rest of the session can pick them up.
            st.session_state['vectorstore'] = vectorstore
            st.session_state['retriever'] = vectorstore.as_retriever()
            # NOTE(review): presumably a flag telling other pages that a
            # document was added -- confirm where 'docadd' is read.
            st.session_state['docadd'] = 1

            st.markdown('<h2 style="color:#100170;font-size:24px;">Document loaded to vector store successfully!!</h2>', unsafe_allow_html=True)