Spaces:
Build error
Build error
File size: 2,035 Bytes
28ec4f0 a3fdd99 6e57c67 836e16d a3fdd99 f6cc0cb a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 6e57c67 9a54394 a3fdd99 6c152f9 5fdc2d5 f6cc0cb 5fdc2d5 5f91d5b f6cc0cb a7fa548 abcc1dd eff1d2d d42a71a 9a54394 9097656 28ec4f0 a3fdd99 28ec4f0 bfb2bfb a4300de a7fa548 a4300de 3dfe2a3 f3a61e0 9097656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import base64
import logging
import tempfile
from pathlib import Path

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components used by the app.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the result
    across script reruns instead of rebuilding the components every time.
    ``SwigPyObject`` instances are hashed to a constant (``None``), and the
    cache is configured with ``allow_output_mutation=True``.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting settings: chunks of 200 words that respect
    # sentence boundaries.
    splitter_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    splitter = PreProcessor(**splitter_options)

    pegasus = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
    return store, pegasus, splitter
def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to text, split them, and index the chunks.

    Each upload is written to a file on disk because the converter is given
    a path rather than a stream. The file lives in a private temporary
    directory, so concurrent sessions do not overwrite each other's upload
    and nothing is left behind in the working directory afterwards.

    Args:
        pdf_files: Iterable of binary file-like objects exposing ``read()``
            (the app passes the value returned by ``st.file_uploader``).

    Side effects:
        Writes the preprocessed documents into the module-level
        ``document_store`` using the module-level ``preprocessor``.
    """
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_path = Path(scratch_dir) / "temp-path.pdf"
        for pdf in pdf_files:
            # write_bytes() closes (and therefore flushes) the file before the
            # converter opens it, so the converter always sees the full PDF.
            pdf_path.write_bytes(pdf.read())
            doc = converter.convert(file_path=str(pdf_path), meta=None)[0]
            preprocessed_docs = preprocessor.process([doc])
            document_store.write_documents(preprocessed_docs)
def summarize(files):
    """Index the given PDFs and render a single summary of the whole store.

    Args:
        files: Iterable of uploaded PDF file objects, forwarded unchanged to
            ``pdf_to_document_store``.

    Side effects:
        Adds documents to the module-level ``document_store`` and writes the
        summary (or a warning when there is nothing to summarize) to the
        Streamlit page.
    """
    pdf_to_document_store(files)

    documents = document_store.get_all_documents()
    if not documents:
        # Avoid handing the summarizer an empty list; tell the user instead.
        st.warning("No documents to summarize. Upload at least one PDF first.")
        return

    summaries = summarizer.predict(documents=documents, generate_single_summary=True)
    st.write('Summary')
    for summary in summaries:
        st.write(summary.content)
# Build the shared Haystack components (start_haystack is cached by Streamlit).
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)
# NOTE(review): the nesting of the buttons under this check is reconstructed
# from a whitespace-stripped copy of the file - confirm against the original.
if uploaded_files is not None:
    # Show how many files are currently selected.
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        summarize(uploaded_files)
    if st.button('Calculate num of docs'):
        st.write(document_store.get_document_count())
    if st.button('Clear DocumentStore'):
        document_store.delete_all_documents()