File size: 7,575 Bytes
1f6b1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0361dbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f6b1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8f09fc
 
1f6b1f0
 
b90e071
1f6b1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ddad88
b90e071
74f61e5
b90e071
74f61e5
1f6b1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0361dbf
 
1f6b1f0
 
 
b90e071
1f6b1f0
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64
import textwrap
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from streamlit_chat import message
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import os

# Page-level Streamlit settings: browser-tab title, tab icon, and wide layout.
# NOTE(review): the page_icon text looks mis-encoded (probably meant to be an
# emoji) — confirm the file's encoding before relying on it.
st.set_page_config(page_title="pdf-GPT", page_icon="πŸ“–", layout="wide")
@st.cache_resource
def get_model():
    """Load the LaMini-T5 seq2seq model and its tokenizer on the CPU.

    Wrapped in ``st.cache_resource`` so the loaded objects are reused instead
    of being rebuilt on every call.

    Returns:
        A ``(base_model, tokenizer)`` tuple for the
        ``MBZUAI/LaMini-T5-738M`` checkpoint.
    """
    # Inference is pinned to the CPU; swap in a CUDA device here if one exists.
    device = torch.device('cpu')

    # Hub identifier of the checkpoint (the earlier local-path value was dead
    # code: it was overwritten before ever being used).
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        device_map=device,
        torch_dtype=torch.float32,
    )
    return base_model, tokenizer

@st.cache_resource
def llm_pipeline():
    """Wrap the cached model in a LangChain-compatible text2text pipeline.

    Returns:
        A ``HuggingFacePipeline`` built around a ``text2text-generation``
        transformers pipeline that samples with the settings below.
    """
    model, tok = get_model()

    # Generation settings forwarded verbatim to the transformers pipeline.
    generation_settings = {
        "max_length": 512,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    text2text = pipeline(
        'text2text-generation',
        model=model,
        tokenizer=tok,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=text2text)

@st.cache_resource
def qa_llm():
    """Build the retrieval-QA chain over the persisted Chroma vector store.

    The store is opened from ``uploaded/db`` — the directory that
    ``data_ingestion`` writes its embeddings to — so questions are answered
    from the uploaded PDF. (It previously opened ``db``, a directory nothing
    in this file populates.)

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` that also returns the
        source documents it retrieved.
    """
    llm = llm_pipeline()
    # Must be the same embedding model that was used when ingesting.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="uploaded/db", embedding_function=embeddings)
    retriever = db.as_retriever()
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
    return qa


def process_answer(instruction):
    """Run the retrieval-QA chain on *instruction* and return its answer.

    Args:
        instruction: Input handed unchanged to the chain; ``main`` passes a
            mapping of the form ``{"query": <user text>}``.

    Returns:
        A ``(answer, generated_text)`` tuple, where ``generated_text`` is the
        chain's full output and ``answer`` is its ``'result'`` entry.
    """
    generated_text = qa_llm()(instruction)
    return generated_text['result'], generated_text

# Render the chat transcript with streamlit_chat message bubbles
def display_conversation(history):
    """Draw every user/assistant exchange stored in *history*.

    Args:
        history: Mapping with parallel ``"past"`` (user messages) and
            ``"generated"`` (replies) sequences. A reply is either a plain
            string or an indexable whose first item is the reply text; any
            further items (e.g. source documents) are not rendered.
    """
    for idx, reply in enumerate(history["generated"]):
        # User bubble first, keyed separately from the matching reply.
        message(history["past"][idx], is_user=True, key=f"{idx}_user")

        reply_text = reply if isinstance(reply, str) else reply[0]
        message(reply_text, key=str(idx))


# Show a PDF inline by embedding it as a base64 data URI inside an iframe
@st.cache_data
def displayPDF(file,file_name):
    """Render the PDF stored at *file* in the current Streamlit container.

    Args:
        file: Path of the PDF on disk.
        file_name: Not used by the body; kept as part of the signature.
    """
    # Read the raw bytes, then encode them so they can live in a data URI.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    base64_pdf = base64.b64encode(raw_bytes).decode('utf-8')

    # The iframe is raw HTML, so Streamlit must be told to allow it.
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="900" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

@st.cache_resource
def data_ingestion(file_path,persist_directory):
    """Split a PDF into chunks, embed them, and persist them to Chroma.

    Files whose path does not end in ``.pdf`` are ignored.

    Args:
        file_path: Path of the PDF to ingest.
        persist_directory: Accepted for interface compatibility but not used;
            the vector store is always written to ``uploaded/db``.

    Returns:
        None.
    """
    if not file_path.endswith(".pdf"):
        return

    print(file_path)
    documents = PDFMinerLoader(file_path).load()

    # Keep the overlap well below the chunk size: with overlap == chunk_size
    # (the previous 500/500 setting) neighbouring chunks are near-duplicates,
    # which bloats the index and makes retrieved passages redundant.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and write the vector store to disk.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_documents(texts, embeddings, persist_directory="uploaded/db")
    db.persist()
  
def main():
    """Streamlit entry point: upload a PDF, preview it, and chat about it.

    Flow: render the page headers, wait for a PDF upload, save it under
    ``<cwd>/uploaded/``, show a preview in the left column, and in the right
    column ingest the file into the vector store and run the chat loop, whose
    history lives in ``st.session_state`` under ``"past"`` / ``"generated"``.
    """
    st.markdown("<h1 style='text-align:center; color: blue;'>Chat with Your PDF πŸ“‘</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align:center; color: grey;'>Built by Vicky</h3>", unsafe_allow_html=True)
    st.markdown("<h2 style='text-align:center; color: red;'>Upload your PDF</h2>", unsafe_allow_html=True)

    uploaded_file = st.file_uploader("",type=["pdf"])
    if uploaded_file is None:
        return

    # Make sure the target directory exists; opening the file below would
    # otherwise fail on a fresh checkout.
    upload_dir = os.path.join(os.getcwd(), "uploaded")
    os.makedirs(upload_dir, exist_ok=True)

    # The file name comes from the client: keep only its final component so
    # it cannot point outside the upload directory.
    filepath = os.path.join(upload_dir, os.path.basename(uploaded_file.name))
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole buffer regardless of the current
        # stream position, unlike read().
        temp_file.write(uploaded_file.getvalue())

    col1, col2 = st.columns([1,1])
    with col1:
        st.markdown("<h2 style='text-align:center; color: grey;'>PDF Preview</h2>", unsafe_allow_html=True)
        displayPDF(filepath,uploaded_file.name)
    with col2:
        with st.spinner("Embeddings are in process......."):
            # The second argument is not used by data_ingestion.
            data_ingestion(filepath,filepath)
        st.success('Embeddings are created Successfully!')
        st.markdown("<h2 style='text-align:center; color: grey;'>Chat Here</h2>", unsafe_allow_html=True)

        user_input = st.text_input(label="Message",key="input")

        # Seed the chat history on first run with a canned greeting exchange.
        if "generated" not in st.session_state:
            st.session_state["generated"] = ["I am ready to help you"]
        if "past" not in st.session_state:
            st.session_state["past"] = ["Hey There!"]

        # Query the QA chain and record the exchange. The stored reply is the
        # (answer, full_output) tuple returned by process_answer.
        if user_input:
            response = process_answer({"query" : user_input})
            st.session_state["past"].append(user_input)
            st.session_state["generated"].append(response)

        # Render the accumulated conversation.
        if st.session_state["generated"]:
            display_conversation(st.session_state)



# Run the app only when this file is executed as the main script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()