GianJSX Tuana committed on
Commit
577c82f
·
0 Parent(s):

Duplicate from Tuana/PDF-Summarizer

Browse files

Co-authored-by: Tuana Celik <[email protected]>

Files changed (6) hide show
  1. .gitattributes +27 -0
  2. README.md +13 -0
  3. app.py +70 -0
  4. header-image.png +0 -0
  5. packages.txt +2 -0
  6. requirements.txt +1 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PDF Summarizer
3
+ emoji: 🌅
4
+ colorFrom: green
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.9.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: Tuana/PDF-Summarizer
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from haystack.document_stores import InMemoryDocumentStore
3
+ from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
4
+ from haystack.schema import Document
5
+ import logging
6
+ import base64
7
+ from PIL import Image
8
+ import validators
9
+
10
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app relies on.

    The function is wrapped in ``st.cache`` so the components are built once
    and reused instead of being recreated on every script rerun.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple: an in-memory
        document store, a summarizer backed by ``facebook/bart-large-cnn``,
        and a preprocessor that cleans text and splits it into chunks of
        200 words while respecting sentence boundaries.
    """
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    text_preprocessor = PreProcessor(**preprocessing_options)
    bart_summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")
    return store, bart_summarizer, text_preprocessor
23
+
24
+
25
def pdf_to_document_store(pdf_file):
    """Convert an uploaded PDF to text and load it into the document store.

    Documents already held by the module-level ``document_store`` are deleted
    first, so afterwards the store contains only the preprocessed chunks of
    ``pdf_file``.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            raw PDF bytes (here, the Streamlit upload).
    """
    import os
    import tempfile

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # The converter is given a file path, so the upload is spooled to disk.
    # A unique temporary file keeps concurrent sessions from overwriting each
    # other's PDF, and the bytes are written as-is (no base64 round trip).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(pdf_file.read())
        temp_path = temp_file.name

    # Convert only after the file is closed, so all bytes are flushed to disk;
    # always remove the temporary file, even when conversion fails.
    try:
        doc = converter.convert(file_path=temp_path, meta=None)
    finally:
        os.remove(temp_path)

    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
35
+
36
def summarize(content):
    """Summarize an uploaded PDF.

    Loads ``content`` into the document store via ``pdf_to_document_store``
    and asks the summarizer for a single summary over every stored document.

    Args:
        content: The uploaded PDF, passed through to ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns for the stored documents.
    """
    pdf_to_document_store(content)
    return summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
    )
40
+
41
def set_state_if_absent(key, value):
    """Initialise ``st.session_state[key]`` to ``value`` unless it is already set.

    Args:
        key: Session-state key to initialise.
        value: Value stored under ``key`` when the key is missing.
    """
    if key in st.session_state:
        # An existing entry is left untouched.
        return
    st.session_state[key] = value
44
+
45
# Make sure the session slot that holds the latest summaries exists.
set_state_if_absent("summaries", None)

# Components shared by the functions above (start_haystack is cached).
document_store, summarizer, preprocessor = start_haystack()

# Page header: title followed by the banner image.
st.title('TL;DR with Haystack')
image = Image.open('header-image.png')
st.image(image)

# Introductory text explaining what the demo does and its limitations.
intro_markdown = """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top.
"""
st.markdown(intro_markdown, unsafe_allow_html=True)
56
+
57
# Only PDFs can be converted, so restrict the picker to that file type.
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        with st.spinner("📚 &nbsp;&nbsp; Please wait while we produce a summary..."):
            try:
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                # Top-level UI boundary: keep the traceback in the logs, but
                # also tell the user instead of failing silently, and drop any
                # summary left over from a previous document.
                logging.exception(e)
                st.session_state.summaries = None
                st.error(
                    "Sorry, the document could not be summarized. "
                    "Please check that it is a valid PDF and try again."
                )

# Summaries live in session state so they survive Streamlit reruns.
if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)
header-image.png ADDED
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ xpdf
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ farm-haystack[ocr,crawler]==1.4.0