Mohamed-BC committed on
Commit
20b1f3c
1 Parent(s): c766880

docubot-v1.5.3

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [server]
2
+ enableXsrfProtection = false
3
+ enableCORS = false
__pycache__/utilities.cpython-310.pyc ADDED
Binary file (1.39 kB). View file
 
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import base64
3
+ import os
4
+ import tempfile
5
+ import time
6
+ import utilities as util
7
+
8
def main():
    """Render the DocuBot page: PDF upload/preview in the sidebar, chat in the main area.

    The chat history lives in ``st.session_state.messages`` as a list of
    ``{'role': ..., 'content': ...}`` dicts and is re-rendered on every run.

    NOTE(review): the "summarize" button is rendered unconditionally but is
    disabled until a PDF has been uploaded, so it can never run without
    extracted text. Confirm this matches the intended placement of the button.
    """
    st.set_page_config(page_title="PDF Viewer", layout="wide", page_icon='./logo.png')
    st.title(":blue[DocuBot]", anchor=False)
    st.write("View and chat with your PDF")

    if 'messages' not in st.session_state:
        st.session_state.messages = [
            {'role': 'assistant', "content": "Hello! Upload a document and let's get started."}
        ]

    uploaded_file = st.sidebar.file_uploader("Upload your PDF File", type="pdf")
    has_document = uploaded_file is not None
    pdf_text = ""  # stays empty until a document has been uploaded and parsed

    if has_document:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Use only the base name so a client-supplied file name cannot
            # point outside the temporary directory.
            file_path = os.path.join(tmp_dir, os.path.basename(uploaded_file.name))
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getvalue())  # Write the PDF content
            pdf_text = util.get_pdf_text(file_path)
            pdf_frame = util.display_pdf(file_path)
        # The iframe markup is generated by util.display_pdf, so raw HTML is required here.
        st.sidebar.markdown(pdf_frame, unsafe_allow_html=True)

    # Chat input and the summarize button are disabled until a document exists.
    user_prompt = st.chat_input(
        "What do you wanna know about the document?", disabled=not has_document
    )

    if st.sidebar.button(label="summarize", disabled=not has_document):
        st.session_state.messages.append({'role': 'user', "content": "Summarize the document"})
        with st.spinner("..."):
            summary = util.summarize(pdf_text, max_length=200)
        # A markdown paragraph break replaces the former "<br>" because chat
        # messages are no longer rendered as raw HTML.
        st.session_state.messages.append(
            {'role': 'assistant', "content": "Summary of " + uploaded_file.name + ":\n\n" + summary}
        )

    if user_prompt:
        st.session_state.messages.append({'role': 'user', "content": user_prompt})
        # Placeholder answer: echoes the prompt back after a short delay.
        response = "You asked: " + user_prompt
        with st.spinner("..."):
            time.sleep(2)
        st.session_state.messages.append({'role': 'assistant', "content": response})

    for message in st.session_state.messages:
        with st.chat_message(message['role']):
            # Messages contain user input and PDF-derived text; render them as
            # plain markdown rather than raw HTML to avoid HTML injection.
            st.markdown(message['content'])


if __name__ == "__main__":
    main()
logo.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ pdfplumber
utilities.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import pdfplumber
3
+ from transformers import pipeline
4
def get_pdf_text(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Value passed straight to ``pdfplumber.open`` (the caller in
            this project passes a file-system path).

    Returns:
        The text of all pages joined with newlines. A page that yields no
        text contributes an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Depending on the pdfplumber version, extract_text() can return None
        # for a page without a text layer; ``or ""`` avoids a TypeError.
        # Joining with "\n" keeps the last word of one page from fusing with
        # the first word of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
12
+
13
def display_pdf(file_path):
    """Build an HTML ``<iframe>`` snippet that embeds the PDF at *file_path*.

    The file is read as raw bytes, base64-encoded and placed in a ``data:``
    URI, so the returned markup carries the whole document inline.
    """
    with open(file_path, "rb") as pdf_handle:
        encoded = base64.b64encode(pdf_handle.read()).decode("utf-8")
    return f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600px"></iframe>'
22
+
23
def split_text(text, max_length):
    """Break *text* into consecutive chunks of at most *max_length* words.

    Words are whitespace-delimited; each chunk is re-joined with single spaces.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_length):
        pieces.append(' '.join(tokens[start:start + max_length]))
    return pieces
28
+
29
_summarizer = None  # lazily created summarization pipeline, reused across calls


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline(task="summarization", model='facebook/bart-large-cnn')
    return _summarizer


def summarize(text, max_length):
    """Summarize *text* chunk by chunk and return the combined summary.

    Args:
        text: The text to summarize.
        max_length: Maximum number of words per chunk handed to the model
            (this is the chunk size, not the length of the summary).

    Returns:
        The per-chunk summaries joined with single spaces, or an empty
        string when *text* is empty or whitespace-only.
    """
    # Nothing to summarize: skip loading the model altogether.
    if not text or not text.strip():
        return ""
    summarizer = _get_summarizer()
    text_chunks = split_text(text, max_length=max_length)
    # truncation=True keeps an oversized chunk from exceeding the model's
    # input limit instead of raising inside the pipeline.
    summaries = [
        summarizer(chunk, truncation=True)[0]['summary_text'] for chunk in text_chunks
    ]
    return ' '.join(summaries)