File size: 4,957 Bytes
8ec2781
 
 
 
 
 
 
 
c47da58
8ec2781
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddec797
8ec2781
ddec797
 
 
 
8ec2781
ddec797
 
 
 
 
8ec2781
 
14a1b3e
 
 
abb41df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ec2781
ddec797
 
2865e6a
c47da58
 
ddec797
8ec2781
 
 
 
 
 
 
9c7beb9
abb41df
8ec2781
 
ddec797
8ec2781
 
 
 
 
 
 
 
 
 
 
 
 
 
c47da58
8ec2781
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddec797
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import streamlit as st
from streamlit_chat import message
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Replicate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import os
from dotenv import load_dotenv
import tempfile

# Load settings from a local .env file (if present) into the process
# environment before any function below reads os.environ.
load_dotenv()

def initialize_session_state():
    """Seed ``st.session_state`` with the chat keys this app relies on.

    Each key is only written when it is absent, so values already stored
    in the session are never overwritten.

    Keys:
        history: list handed to the chain as chat history.
        generated: assistant replies, starting with a greeting.
        past: user messages, starting with a greeting.
    """
    initial_values = {
        'history': [],
        'generated': ["Hello! Ask me about your file"],
        'past': ["Hey! 👋"],
    }
    for state_key, initial in initial_values.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial

def conversation_chat(query, chain, history):
    """Send one question through the chain and record the exchange.

    Args:
        query: The user's question.
        chain: Callable invoked with a dict holding ``"question"`` and
            ``"chat_history"``; its result must provide an ``"answer"``
            entry.
        history: List of ``(question, answer)`` tuples. It is passed to
            the chain as-is and then extended in place with the new pair.

    Returns:
        The ``"answer"`` value from the chain's result.
    """
    payload = {"question": query, "chat_history": history}
    answer = chain(payload)["answer"]
    history.append((query, answer))
    return answer

def display_chat_history(chain):
    """Render the question form and the conversation so far.

    The left column holds the input form; the right column lists every
    stored user message next to the matching generated reply. When the
    form is submitted with a non-empty question, the question is sent
    through ``conversation_chat`` and both the question and the answer
    are appended to the session state before the transcript is drawn.

    Args:
        chain: Conversational chain passed on to ``conversation_chat``.
    """
    container = st.container()

    with container:
        col1, col2 = st.columns(2)

        with col1:
            with st.form(key='my_form', clear_on_submit=True):
                user_input = st.text_input("Question:", placeholder="Ask about your Documents", key='input')
                submit_button = st.form_submit_button(label='Send')

            # Answer the question before drawing the transcript so the new
            # exchange is visible in the same script run.
            if submit_button and user_input:
                with st.spinner('Generating response...'):
                    output = conversation_chat(user_input, chain, st.session_state['history'])
                st.session_state['past'].append(user_input)
                st.session_state['generated'].append(output)

        with col2:
            exchanges = zip(st.session_state['past'], st.session_state['generated'])
            for i, (question, answer) in enumerate(exchanges):
                message(question, is_user=True, key=str(i) + '_user', avatar_style="thumbs")
                message(answer, key=str(i), avatar_style="fun-emoji")

def create_conversational_chain(vector_store):
    """Build a conversational retrieval chain on top of ``vector_store``.

    Args:
        vector_store: An already-populated vector store; its
            ``as_retriever`` method supplies the retriever (top 2 hits
            per query).

    Returns:
        A ``ConversationalRetrievalChain`` using a Replicate-hosted
        Llama 2 chat model and a ``ConversationBufferMemory`` keyed on
        ``"chat_history"``.

    Raises:
        RuntimeError: If ``REPLICATE_API_TOKEN`` is not set in the
            environment.
    """
    # The token is a secret: it must come from the environment (a .env
    # file is loaded at module import), never from a literal in source.
    replicate_api_token = os.environ.get("REPLICATE_API_TOKEN")
    if not replicate_api_token:
        raise RuntimeError(
            "REPLICATE_API_TOKEN is not set; add it to the environment or a .env file."
        )

    llm = Replicate(
        streaming=True,
        model="replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
        callbacks=[StreamingStdOutCallbackHandler()],
        input={"temperature": 0.01, "max_length": 500, "top_p": 1},
        replicate_api_token=replicate_api_token,
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Use the store the caller built instead of re-embedding the documents.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type='stuff',
        retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
        memory=memory,
    )
    return chain



def main():
    """Run the Streamlit app: upload documents, index them, and chat.

    Uploaded files are written to a temporary file, loaded with the
    loader matching their extension (.pdf, .docx/.doc, .txt), split into
    chunks, embedded into a FAISS store, and handed to the conversational
    chain. Nothing past the uploader is rendered until at least one file
    has been uploaded.
    """
    initialize_session_state()
    st.title("Chat With Your Doc")
    st.sidebar.title("Document Processing")
    uploaded_files = st.sidebar.file_uploader("Upload files", accept_multiple_files=True)

    if not uploaded_files:
        return

    loaders_by_extension = {
        ".pdf": PyPDFLoader,
        ".docx": Docx2txtLoader,
        ".doc": Docx2txtLoader,
        ".txt": TextLoader,
    }

    documents = []
    for uploaded_file in uploaded_files:
        # Lower-case so that e.g. "REPORT.PDF" is recognised as well.
        extension = os.path.splitext(uploaded_file.name)[1].lower()
        loader_cls = loaders_by_extension.get(extension)
        if loader_cls is None:
            st.sidebar.warning(f"Skipping unsupported file type: {uploaded_file.name}")
            continue

        # The loaders take a filesystem path, so spill the upload to disk.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name
        try:
            documents.extend(loader_cls(temp_file_path).load())
        finally:
            # Always clean up, even when the loader raises.
            os.remove(temp_file_path)

    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
    text_chunks = text_splitter.split_documents(documents)

    # Stop before building the index when there is nothing to embed.
    if not text_chunks:
        st.warning("No readable text was found in the uploaded files.")
        return

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={'device': 'cpu'})
    vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)
    chain = create_conversational_chain(vector_store)
    display_chat_history(chain)

# Start the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()