cokoli1 committed on
Commit a07690c · 1 Parent(s): 92bd3ca

Update app.py

Browse files
Files changed (1)
  1. app.py +471 -2
app.py CHANGED
@@ -1,4 +1,473 @@
  import streamlit as st
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ import openai
+ import os
+ import base64
+ import glob
+ import json
+ import mistune
+ import pytz
+ import math
+ import requests
+ import time
+ import re
+ import textract
+
+ from datetime import datetime
+ from openai import ChatCompletion
+ from xml.etree import ElementTree as ET
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from audio_recorder_streamlit import audio_recorder
+
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from templates import css, bot_template, user_template
+
+
+ def generate_filename(prompt, file_type):
+     central = pytz.timezone('US/Central')
+     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")  # date and time as MMDD_HHMM
+     safe_prompt = "".join(x for x in prompt if x.isalnum())[:90]  # keep alphanumerics only and limit length
+     return f"{safe_date_time}_{safe_prompt}.{file_type}"  # return a safe file name
+
+
+ def transcribe_audio(openai_key, file_path, model):
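+     # POSTs the audio file as multipart form data straight to OpenAI's HTTP
+     # transcription endpoint, rather than going through the openai client library.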
+     OPENAI_API_URL = "https://api.openai.com/v1/audio/transcriptions"
+     headers = {
+         "Authorization": f"Bearer {openai_key}",
+     }
+     with open(file_path, 'rb') as f:
+         data = {'file': f}
+         response = requests.post(OPENAI_API_URL, headers=headers, files=data, data={'model': model})
+     if response.status_code == 200:
+         st.write(response.json())
+         transcript = response.json().get('text')
+         chatResponse = chat_with_model(transcript, '')
+         filename = generate_filename(transcript, 'txt')
+         create_file(filename, transcript, chatResponse)
+         return transcript
+     else:
+         st.write(response.json())
+         st.error("Error in API call.")
+         return None
+
+ def save_and_play_audio(audio_recorder):
+     audio_bytes = audio_recorder()
+     if audio_bytes:
+         filename = generate_filename("Recording", "wav")
+         with open(filename, 'wb') as f:
+             f.write(audio_bytes)
+         st.audio(audio_bytes, format="audio/wav")
+         return filename
+     return None
+
+ def create_file(filename, prompt, response):
+     if filename.endswith(".txt"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt}\n{response}")
+     elif filename.endswith(".htm"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt} {response}")
+     elif filename.endswith(".md"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt}\n\n{response}")
+
+ def truncate_document(document, length):
+     return document[:length]
+
+ def divide_document(document, max_length):
+     return [document[i:i+max_length] for i in range(0, len(document), max_length)]
+
+ def get_table_download_link(file_path):
+     try:
+         with open(file_path, 'r') as file:
+             data = file.read()
+     except:
+         st.write('')
+         return file_path
+     b64 = base64.b64encode(data.encode()).decode()
+     file_name = os.path.basename(file_path)
+     ext = os.path.splitext(file_name)[1]  # get the file extension
+     if ext in ('.txt', '.py', '.xlsx', '.csv'):
+         mime_type = 'text/plain'
+     elif ext == '.htm':
+         mime_type = 'text/html'
+     elif ext == '.md':
+         mime_type = 'text/markdown'
+     else:
+         mime_type = 'application/octet-stream'  # general binary data type
+     href = f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
+     return href
+
+ def CompressXML(xml_text):
+     root = ET.fromstring(xml_text)
+     # ElementTree elements carry no parent pointer, so build a child->parent map
+     # first in order to remove comment nodes.
+     parent_map = {child: parent for parent in root.iter() for child in parent}
+     for elem in list(root.iter()):
+         if isinstance(elem.tag, str) and 'Comment' in elem.tag:
+             parent_map[elem].remove(elem)
+     return ET.tostring(root, encoding='unicode', method="xml")
+
+ def read_file_content(file, max_length):
+     if file.type == "application/json":
+         content = json.load(file)
+         return str(content)
+     elif file.type == "text/html" or file.type == "text/htm":
+         content = BeautifulSoup(file, "html.parser")
+         return content.text
+     elif file.type == "application/xml" or file.type == "text/xml":
+         tree = ET.parse(file)
+         root = tree.getroot()
+         xml = CompressXML(ET.tostring(root, encoding='unicode'))
+         return xml
+     elif file.type == "text/markdown" or file.type == "text/md":
+         md = mistune.create_markdown()
+         content = md(file.read().decode())
+         return content
+     elif file.type == "text/plain":
+         return file.getvalue().decode()
+     else:
+         return ""
+
+ def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
+     model = model_choice
+     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
+     conversation.append({'role': 'user', 'content': prompt})
+     if len(document_section) > 0:
+         conversation.append({'role': 'assistant', 'content': document_section})
+
+     start_time = time.time()
+     report = []
+     res_box = st.empty()
+     collected_chunks = []
+     collected_messages = []
+
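+     # With stream=True, each chunk carries an incremental 'delta'; the deltas are
+     # accumulated below so the reply can be rendered as it arrives.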
+     for chunk in openai.ChatCompletion.create(
+         model=model,
+         messages=conversation,
+         temperature=0.5,
+         stream=True
+     ):
+         collected_chunks.append(chunk)  # save the event response
+         chunk_message = chunk['choices'][0]['delta']  # extract the message delta
+         collected_messages.append(chunk_message)  # save the message
+
+         content = chunk_message.get('content')
+         if content:  # the final chunk carries no content
+             report.append(content)
+             result = "".join(report).strip()
+             res_box.markdown(f'*{result}*')
+
+     full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
+     st.write("Elapsed time:")
+     st.write(time.time() - start_time)
+     return full_reply_content
+
+ def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
+     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
+     conversation.append({'role': 'user', 'content': prompt})
+     if len(file_content) > 0:
+         conversation.append({'role': 'assistant', 'content': file_content})
+     response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
+     return response['choices'][0]['message']['content']
+
+ def extract_mime_type(file):
+     # Check if the input is a string
+     if isinstance(file, str):
+         pattern = r"type='(.*?)'"
+         match = re.search(pattern, file)
+         if match:
+             return match.group(1)
+         else:
+             raise ValueError(f"Unable to extract MIME type from {file}")
+     # Otherwise assume a Streamlit UploadedFile object, which exposes .type
+     elif hasattr(file, 'type'):
+         return file.type
+     else:
+         raise TypeError("Input should be a string or a Streamlit UploadedFile object")
+
+ from io import BytesIO
+
+ def extract_file_extension(file):
+     # get the file name directly from the UploadedFile object
+     file_name = file.name
+     pattern = r".*?\.(.*?)$"
+     match = re.search(pattern, file_name)
+     if match:
+         return match.group(1)
+     else:
+         raise ValueError(f"Unable to extract file extension from {file_name}")
+
+ def pdf2txt(docs):
+     text = ""
+     for file in docs:
+         file_extension = extract_file_extension(file)
+         st.write(f"File type extension: {file_extension}")
+         # read the file according to its extension
+         try:
+             if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
+                 text += file.getvalue().decode('utf-8')
+             elif file_extension.lower() == 'pdf':
+                 pdf = PdfReader(BytesIO(file.getvalue()))
+                 for page in pdf.pages:
+                     text += page.extract_text()  # new PyPDF2 syntax
+         except Exception as e:
+             st.write(f"Error processing file {file.name}: {e}")
+     return text
+
+ def pdf2txt_old(pdf_docs):
+     st.write(pdf_docs)
+     for file in pdf_docs:
+         mime_type = extract_mime_type(file)
+         st.write(f"MIME type of file: {mime_type}")
+
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+ def txt2chunks(text):
+     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
+     return text_splitter.split_text(text)
+
+ def vector_store(text_chunks):
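+     # Embed each text chunk with OpenAI embeddings and index them in an
+     # in-memory FAISS store for similarity search.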
+     key = os.getenv('OPENAI_API_KEY')
+     embeddings = OpenAIEmbeddings(openai_api_key=key)
+     return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+
+ def get_chain(vectorstore):
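+     # Combine the chat model, the FAISS retriever, and conversation memory into
+     # a retrieval-augmented conversational chain.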
+     llm = ChatOpenAI()
+     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+     return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)
+
+ def process_user_input(user_question):
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+     for i, message in enumerate(st.session_state.chat_history):
+         template = user_template if i % 2 == 0 else bot_template
+         st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+         # Save file output from PDF query results
+         filename = generate_filename(user_question, 'txt')
+         create_file(filename, user_question, message.content)
+
+ def divide_prompt(prompt, max_length):
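+     # Greedily pack whole words into chunks of at most max_length characters,
+     # so no word is ever split across chunks.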
+     words = prompt.split()
+     chunks = []
+     current_chunk = []
+     current_length = 0
+     for word in words:
+         if len(word) + current_length <= max_length:
+             current_length += len(word) + 1  # add 1 to account for the space
+             current_chunk.append(word)
+         else:
+             chunks.append(' '.join(current_chunk))
+             current_chunk = [word]
+             current_length = len(word)
+     chunks.append(' '.join(current_chunk))  # append the final chunk
+     return chunks
+
+ def main():
+     # Sidebar and global
+     openai.api_key = os.getenv('OPENAI_API_KEY')
+     st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")
+
+     # File type for output, model choice
+     menu = ["txt", "htm", "xlsx", "csv", "md", "py"]
+     choice = st.sidebar.selectbox("Output File Type:", menu)
+     model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))
+
+     # Audio, transcribe, GPT:
+     filename = save_and_play_audio(audio_recorder)
+     if filename is not None:
+         transcription = transcribe_audio(openai.api_key, filename, "whisper-1")
+         st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+         filename = None  # transcription is saved, so skip re-transcribing on the next run
+
+     # prompt interfaces
+     user_prompt = st.text_area("Enter prompts, instructions & questions:", '', height=100)
+
+     # file section interface for prompts against large documents as context
+     collength, colupload = st.columns([2, 3])  # adjust the ratio as needed
+     with collength:
+         max_length = st.slider("File section length for large files", min_value=1000, max_value=128000, value=12000, step=1000)
+     with colupload:
+         uploaded_file = st.file_uploader("Add a file for context:", type=["pdf", "xml", "json", "xlsx", "csv", "html", "htm", "md", "txt"])
+
+     # Document section chat
+     document_sections = deque()
+     document_responses = {}
+     if uploaded_file is not None:
+         file_content = read_file_content(uploaded_file, max_length)
+         document_sections.extend(divide_document(file_content, max_length))
+     if len(document_sections) > 0:
+         if st.button("👁️ View Upload"):
+             st.markdown("**Sections of the uploaded file:**")
+             for i, section in enumerate(list(document_sections)):
+                 st.markdown(f"**Section {i+1}**\n{section}")
+         st.markdown("**Chat with the model:**")
+         for i, section in enumerate(list(document_sections)):
+             if i in document_responses:
+                 st.markdown(f"**Section {i+1}**\n{document_responses[i]}")
+             else:
+                 if st.button(f"Chat about Section {i+1}"):
+                     st.write('Reasoning with your inputs...')
+                     response = chat_with_model(user_prompt, section, model_choice)
+                     st.write('Response:')
+                     st.write(response)
+                     document_responses[i] = response
+                     filename = generate_filename(f"{user_prompt}_section_{i+1}", choice)
+                     create_file(filename, user_prompt, response)
+                     st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+
+     if st.button('💬 Chat'):
+         st.write('Reasoning with your inputs...')
+
+         # Divide the user prompt into sections small enough for the model
+         user_prompt_sections = divide_prompt(user_prompt, max_length)
+         full_response = ''
+         for prompt_section in user_prompt_sections:
+             # Process each section with the model, passing the document sections as context
+             response = chat_with_model(prompt_section, ''.join(list(document_sections)), model_choice)
+             full_response += response + '\n'  # combine the responses
+
+         response = full_response
+         st.write('Response:')
+         st.write(response)
+
+         filename = generate_filename(user_prompt, choice)
+         create_file(filename, user_prompt, response)
+         st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+
+     all_files = glob.glob("*.*")
+     all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 20]  # exclude files with short names
+     all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by file type, then name, descending
+
+     # sidebar of files
+     file_contents = ''
+     next_action = ''
+     for file in all_files:
+         col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])  # adjust the ratio as needed
+         with col1:
+             if st.button("🌐", key="md_" + file):  # render-as-markdown button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'md'
+         with col2:
+             st.markdown(get_table_download_link(file), unsafe_allow_html=True)
+         with col3:
+             if st.button("📂", key="open_" + file):  # open button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'open'
+         with col4:
+             if st.button("🔍", key="read_" + file):  # search button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'search'
+         with col5:
+             if st.button("🗑", key="delete_" + file):  # delete button
+                 os.remove(file)
+                 st.experimental_rerun()
+
+     if len(file_contents) > 0:
+         if next_action == 'open':
+             file_content_area = st.text_area("File Contents:", file_contents, height=500)
+         if next_action == 'md':
+             st.markdown(file_contents)
+         if next_action == 'search':
+             file_content_area = st.text_area("File Contents:", file_contents, height=500)
+             st.write('Reasoning with your inputs...')
+             response = chat_with_model(user_prompt, file_contents, model_choice)
+             filename = generate_filename(file_contents, choice)
+             create_file(filename, file_contents, response)
+
+             st.experimental_rerun()
+
+ if __name__ == "__main__":
+     main()
+
+ load_dotenv()
+ st.write(css, unsafe_allow_html=True)
+
+ st.header("Chat with documents :books:")
+ user_question = st.text_input("Ask a question about your documents:")
+ if user_question:
+     process_user_input(user_question)
 
+ with st.sidebar:
+     st.subheader("Your documents")
+     docs = st.file_uploader("import documents", accept_multiple_files=True)
+     with st.spinner("Processing"):
+         raw = pdf2txt(docs)
+         if len(raw) > 0:
+             length = str(len(raw))
+             text_chunks = txt2chunks(raw)
+             vectorstore = vector_store(text_chunks)
+             st.session_state.conversation = get_chain(vectorstore)
+             st.markdown('# AI Search Index of Length:' + length + ' Created.')  # add timing
+             filename = generate_filename(raw, 'txt')
+             create_file(filename, raw, '')