Spaces:
Sleeping
Sleeping
Mohamed-BC
committed on
Commit
•
20b1f3c
1
Parent(s):
c766880
docubot-v1.5.3
Browse files- .streamlit/config.toml +3 -0
- __pycache__/utilities.cpython-310.pyc +0 -0
- app.py +46 -0
- logo.png +0 -0
- requirements.txt +3 -0
- utilities.py +37 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[server]
|
2 |
+
enableXsrfProtection = false
|
3 |
+
enableCORS = false
|
__pycache__/utilities.cpython-310.pyc
ADDED
Binary file (1.39 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import base64
|
3 |
+
import os
|
4 |
+
import tempfile
|
5 |
+
import time
|
6 |
+
import utilities as util
|
7 |
+
|
8 |
+
def main():
    """Render the DocuBot page: a PDF upload/preview sidebar and a chat panel.

    The chat history lives in ``st.session_state.messages`` as a list of
    ``{'role': ..., 'content': ...}`` dicts and is re-rendered at the end of
    every call.
    """
    st.set_page_config(page_title="PDF Viewer", layout="wide", page_icon='./logo.png')
    st.title(":blue[DocuBot]", anchor=False)
    st.write("View and chat with your PDF")

    if 'messages' not in st.session_state:
        st.session_state.messages = [{'role': 'assistant', "content": "Hello! Upload a document and let's get started."}]

    # ``state`` is the "disabled" flag for the chat input and the summarize
    # button: both stay disabled until a PDF has been uploaded.
    state = True
    # Bound up front so the summarize branch never reads an undefined name.
    pdf_text = ""
    uploaded_file = st.sidebar.file_uploader("Upload your PDF File", type="pdf")
    if uploaded_file:
        state = False
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The file name comes from the client; keep only its last path
            # component so the write cannot land outside tmp_dir.
            safe_name = os.path.basename(uploaded_file.name) or "upload.pdf"
            file_path = os.path.join(tmp_dir, safe_name)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getvalue())  # Write the PDF content
            pdf_text = util.get_pdf_text(file_path)
            pdf_frame = util.display_pdf(file_path)
            # The iframe markup is generated by util.display_pdf (base64
            # payload only), so raw HTML is needed and acceptable here.
            st.sidebar.markdown(pdf_frame, unsafe_allow_html=True)

    user_prompt = st.chat_input("What do you wanna know about the document?", disabled=state)

    # The button is disabled without a document, and the extra
    # ``uploaded_file`` check guards the branch should a click still arrive.
    if st.sidebar.button(label="summarize", disabled=state) and uploaded_file:
        st.session_state.messages.append({'role': 'user', "content": "Summarize the document"})
        with st.spinner("..."):
            summary = util.summarize(pdf_text, max_length=200)
        # A Markdown paragraph break replaces the former "<br>" so that no
        # message needs to be rendered as raw HTML.
        st.session_state.messages.append({'role': 'assistant', "content": "Summary of " + uploaded_file.name + ":\n\n" + summary})

    if user_prompt:
        st.session_state.messages.append({'role': 'user', "content": user_prompt})
        response = "You asked: " + user_prompt
        with st.spinner("..."):
            time.sleep(2)
        st.session_state.messages.append({'role': 'assistant', "content": response})

    for message in st.session_state.messages:
        with st.chat_message(message['role']):
            # Messages contain user input, the upload's file name and model
            # output, so they are rendered as plain Markdown, never as HTML.
            st.markdown(message['content'])


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
logo.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
pdfplumber
|
utilities.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import pdfplumber
|
3 |
+
from transformers import pipeline
|
4 |
+
# Function to extract the text of a PDF
|
5 |
+
def get_pdf_text(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts; the caller in this
            project passes a file-system path.

    Returns:
        The per-page texts joined with a newline. Pages that yield no text
        contribute an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # ``extract_text()`` may give None for a page without a text layer
        # (e.g. a scanned image); ``or ""`` keeps the join from failing.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining with a newline stops the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page_texts)
|
12 |
+
|
13 |
+
def display_pdf(file_path):
    """Build an HTML ``<iframe>`` that embeds the PDF at ``file_path``.

    The file is read in binary mode and inlined as a base64 ``data:`` URI,
    so the returned markup carries the whole document.

    Args:
        file_path: Path of the PDF file to embed.

    Returns:
        The iframe markup as a string.
    """
    with open(file_path, "rb") as pdf_handle:
        encoded_pdf = base64.b64encode(pdf_handle.read()).decode("utf-8")
    return f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" height="600px"></iframe>'
|
22 |
+
|
23 |
+
def split_text(text, max_length):
    """Split text into chunks of at most ``max_length`` words each.

    Words are the whitespace-separated tokens of ``text``; every chunk is
    its words re-joined with single spaces, so original whitespace is not
    preserved.

    Args:
        text: The string to split.
        max_length: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_length`` is not a positive number. A negative
            value would otherwise produce no chunks and silently drop the
            whole text.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive word count, got {max_length!r}")
    words = text.split()
    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
|
28 |
+
|
29 |
+
# Summarization pipeline shared by all calls; created on first use.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline(task="summarization", model='facebook/bart-large-cnn')
    return _SUMMARIZER


def summarize(text, max_length):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize.
        max_length: Number of words per chunk handed to the model. This is
            the chunk size, not a limit on the length of the summary.

    Returns:
        The chunk summaries joined with single spaces, or an empty string
        when ``text`` contains no words.
    """
    # Nothing to summarize: return early instead of loading the model.
    if not text or not text.strip():
        return ""
    text_chunks = split_text(text, max_length=max_length)
    # Reuse one pipeline instead of constructing a new one on every call.
    summarizer = _get_summarizer()
    # The pipeline returns a list of dicts; take the text of the first result.
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in text_chunks]
    return ' '.join(summaries)
|
37 |
+
# return text_chunks[0]
|