# Spaces: Build error
import base64
import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
def start_haystack():
    """Return the Haystack components used by the app, creating them once per session.

    Streamlit re-executes the whole script on every widget interaction.
    Building the components unconditionally would reload the summarization
    model and replace the document store with a new, empty one on each rerun,
    so documents written after one button click would be gone by the next.
    The components are therefore kept in ``st.session_state`` and reused.

    Returns:
        tuple: ``(document_store, summarizer, preprocessor)``.
    """
    if "haystack_components" not in st.session_state:
        document_store = InMemoryDocumentStore()
        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="word",
            split_length=200,
            split_respect_sentence_boundary=True,
        )
        summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
        # Stored as one tuple so the three components are always created and
        # retrieved together.
        st.session_state["haystack_components"] = (
            document_store,
            summarizer,
            preprocessor,
        )
    return st.session_state["haystack_components"]
def pdf_to_document_store(pdf_files):
    """Convert PDFs to text, split them, and write the chunks to the document store.

    Uses the module-level ``preprocessor`` and ``document_store``.

    Args:
        pdf_files: Iterable of binary file-like objects (anything with a
            ``read()`` returning the PDF bytes).
    """
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    for pdf in pdf_files:
        # The converter takes a path, so the bytes go through a temporary file.
        # A unique name avoids clashes between concurrent sessions, and
        # delete=False lets the handle be closed (flushing every byte to disk)
        # before the converter opens the file by path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            temp_file.write(pdf.read())
            temp_path = temp_file.name
        try:
            doc = converter.convert(file_path=temp_path, meta=None)[0]
        finally:
            # Always clean up, even when conversion fails.
            os.remove(temp_path)
        preprocessed_docs = preprocessor.process([doc])
        document_store.write_documents(preprocessed_docs)
def summarize(files):
    """Index the given PDFs and render a summary of the document store.

    Uses the module-level ``document_store`` and ``summarizer``.

    Args:
        files: Iterable of binary file-like objects holding PDF data; passed
            unchanged to ``pdf_to_document_store``.
    """
    pdf_to_document_store(files)
    documents = document_store.get_all_documents()
    if not documents:
        # Nothing was uploaded/indexed; tell the user instead of handing the
        # summarizer an empty list, which it does not accept.
        st.warning('No documents to summarize. Upload at least one PDF first.')
        return
    # NOTE(review): `generate_single_summary` and reading the summary from
    # `.content` depend on the installed Haystack 1.x release - confirm
    # against the pinned version.
    summaries = summarizer.predict(documents=documents, generate_single_summary=True)
    st.write('Summary')
    for summary in summaries:
        st.write(summary.content)
# --- Streamlit page -------------------------------------------------------
# Components are module-level because the helper functions above read
# `document_store`, `summarizer` and `preprocessor` as globals.
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

if uploaded_files is not None:
    # Show how many files are currently selected.
    st.write(len(uploaded_files))

    # Each button is created and handled in turn so the widgets and any
    # output they produce keep their top-to-bottom order on the page.
    summarize_clicked = st.button('Summarize Documents')
    if summarize_clicked:
        summarize(uploaded_files)

    count_clicked = st.button('Calculate num of docs')
    if count_clicked:
        st.write(document_store.get_document_count())

    clear_clicked = st.button('Clear DocumentStore')
    if clear_clicked:
        document_store.delete_all_documents()