File size: 4,570 Bytes
8ec2781
 
 
 
 
 
 
 
a948408
 
 
8ec2781
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a948408
8ec2781
 
 
 
ddec797
8ec2781
ddec797
 
 
 
8ec2781
ddec797
 
 
 
 
8ec2781
a948408
14a1b3e
 
 
a948408
 
 
 
 
 
8ec2781
ddec797
 
2865e6a
c47da58
 
ddec797
8ec2781
 
 
 
 
 
 
 
a948408
8ec2781
ddec797
8ec2781
 
 
 
 
 
 
 
 
 
 
 
 
 
a948408
8ec2781
 
 
 
 
 
 
 
 
 
 
a948408
8ec2781
a948408
 
 
 
8ec2781
 
ddec797
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
from streamlit_chat import message
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Replicate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import os
import tempfile

load_dotenv()

def initialize_session_state():
    if 'history' not in st.session_state:
        st.session_state['history'] = []

    if 'generated' not in st.session_state:
        st.session_state['generated'] = ["Hello! Ask me about your file"]

    if 'past' not in st.session_state:
        st.session_state['past'] = ["Hey! 👋"]

def conversation_chat(query, chain, history):
    result = chain({"question": query, "chat_history": history})
    history.append((query, result["answer"]))
    return result["answer"]

def display_chat_history():
    reply_container = st.container()
    container = st.container()

    with container:
        col1, col2 = st.columns(2)

        with col1:
            with st.form(key='my_form', clear_on_submit=True):
                user_input = st.text_input("Question:", placeholder="Ask about your Documents", key='input')
                submit_button = st.form_submit_button(label='Send')

        with col2:
            if st.session_state['generated']:
                for i in range(len(st.session_state['generated'])):
                    message(st.session_state["past"][i], is_user=True, key=str(i) + '_user', avatar_style="thumbs")
                    message(st.session_state["generated"][i], key=str(i), avatar_style="fun-emoji")

def create_conversational_chain(vector_store, text_chunks, embeddings):
    replicate_api_token = "r8_AA3K1fhDykqLa5M74E5V0w5ss1z0P9S3foWJl"  # Replace with your actual token
    os.environ["REPLICATE_API_TOKEN"] = replicate_api_token

    print("Length of text_chunks:", len(text_chunks))
    print("Content of text_chunks:", text_chunks)

    print("Length of embeddings:", len(embeddings))
    print("Content of embeddings:", embeddings)

    llm = Replicate(
        streaming=True,
        model="replicate/llama-2-70b-chat:58d078176e02c219e11eb4da5a02a7830a283b14cf8f94537af893ccff5ee781",
        callbacks=[StreamingStdOutCallbackHandler()],
        input={"temperature": 0.01, "max_length": 500, "top_p": 1},
        replicate_api_token=replicate_api_token
    )
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    chain = ConversationalRetrievalChain.from_llm(llm=llm, chain_type='stuff',
                                                 retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
                                                 memory=memory)
    return chain

def main():
    load_dotenv()
    initialize_session_state()
    st.title("Chat With Your Doc")
    st.sidebar.title("Document Processing")
    uploaded_files = st.sidebar.file_uploader("Upload files", accept_multiple_files=True)

    if uploaded_files:
        text = []
        for file in uploaded_files:
            file_extension = os.path.splitext(file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                temp_file.write(file.read())
                temp_file_path = temp_file.name

            loader = None
            if file_extension == ".pdf":
                loader = PyPDFLoader(temp_file_path)
            elif file_extension == ".docx" or file_extension == ".doc":
                loader = Docx2txtLoader(temp_file_path)
            elif file_extension == ".txt":
                loader = TextLoader(temp_file_path)

            if loader:
                text.extend(loader.load())
                os.remove(temp_file_path)

        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
        text_chunks = text_splitter.split_documents(text)

        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
        vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)

        chain = create_conversational_chain(vector_store, text_chunks, embeddings)

        display_chat_history()

if __name__ == "__main__":
    main()