mikepastor11 commited on
Commit
7db4ca0
·
verified ·
1 Parent(s): 2ef6911

Delete PennwickFileAnalyzer.py

Browse files
Files changed (1) hide show
  1. PennwickFileAnalyzer.py +0 -168
PennwickFileAnalyzer.py DELETED
@@ -1,168 +0,0 @@
1
- ##############################################################
2
- # PDF Chat
3
- #
4
- # Mike Pastor February 2024
5
-
6
-
7
- import streamlit as st
8
- from dotenv import load_dotenv
9
-
10
- from PyPDF2 import PdfReader
11
- from langchain.text_splitter import CharacterTextSplitter
12
-
13
- from InstructorEmbedding import INSTRUCTOR
14
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
15
- from langchain.vectorstores import FAISS
16
- from langchain.chat_models import ChatOpenAI
17
- from langchain.memory import ConversationBufferMemory
18
- from langchain.chains import ConversationalRetrievalChain
19
- from htmlTemplates import css, bot_template, user_template
20
- from langchain.llms import HuggingFaceHub
21
-
22
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader.

    Returns:
        str: all page text joined into a single string (empty when there
        are no documents or no extractable text).
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages with no extractable
            # text (older PyPDF2 releases); guard so the concatenation
            # never raises TypeError.
            text += page.extract_text() or ""
    return text
29
-
30
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunk size and overlap must not exceed the embedding model's capacity.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,  # reduced from 1000 to stay within model capacity
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
41
-
42
-
43
def get_vectorstore(text_chunks):
    """Embed the text chunks and index them in a FAISS vector store.

    Uses the hkunlp/instructor-xl instructor embedding model (requires the
    InstructorEmbedding and sentence-transformers packages to be installed).
    """
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    print('have Embeddings: ')

    # FAISS was chosen here; Chroma or another vector database would also work.
    store = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    print('FAISS succeeds: ')

    return store
66
-
67
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    The LLM is the hosted google/flan-t5-xxl model on the HuggingFace Hub;
    chat history is kept in a buffer memory under the key 'chat_history'.
    """
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
81
-
82
def handle_userinput(user_question):
    """Run the user's question through the conversation chain and render
    the updated chat history with the user/bot HTML templates.

    Args:
        user_question: the question string typed into the text input.
    """
    # Guard: main() initializes st.session_state.conversation to None; it
    # only becomes a chain after the user has processed documents. Calling
    # None would raise TypeError.
    if st.session_state.conversation is None:
        st.warning("Please upload and process your documents first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # History alternates user (even index) / bot (odd index) messages.
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
99
-
100
-
101
-
102
-
103
def main():
    """Streamlit entry point: page setup, the question box, and the sidebar
    PDF upload / vectorization workflow."""
    load_dotenv()
    st.set_page_config(page_title="MLP Chat with multiple PDFs",
                       page_icon=":books:")

    st.write(css, unsafe_allow_html=True)

    # Initialize session state on the first run of the script.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Mike's PDF Chat :books:")

    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True)

        # Upon button press
        if st.button("Process these files"):
            # Guard: clicking Process with no uploads would run the whole
            # pipeline on empty input and fail inside the vector store.
            if not pdf_docs:
                st.warning("Please upload at least one PDF first.")
            else:
                with st.spinner("Processing..."):
                    #############################################################
                    # Track the overall time for file processing into vectors.
                    from datetime import datetime
                    global_now = datetime.now()
                    st.write("Vectorizing Files - Current Time =",
                             global_now.strftime("%H:%M:%S"))

                    # PDF bytes -> raw text -> chunks -> vector store -> chain
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = \
                        get_conversation_chain(vectorstore)

                    # Mission complete!
                    global_later = datetime.now()
                    st.write("Files Vectorized - Total EXECUTION Time =",
                             (global_later - global_now), global_later)
163
-
164
-
165
if __name__ == '__main__':
    # Run the Streamlit app when executed as a script.
    main()
167
-
168
-