"""Gradio app that ingests uploaded PDFs into an in-memory Haystack store.

Each uploaded PDF is converted to text, cleaned and split into passages of
roughly 100 words, and written to a module-level ``InMemoryDocumentStore``.
The UI currently reports how many passages were stored.
"""

import logging

import gradio as gr
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PDFToTextConverter, PreProcessor, TfidfRetriever

# Shared store; it is emptied and refilled on every upload.
document_store = InMemoryDocumentStore()

# Cleans the extracted text and splits it into ~100-word passages that
# respect sentence boundaries, overlapping by 3 words.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)


def pdf_to_document_store(pdf_files) -> None:
    """Replace the contents of ``document_store`` with the given PDFs.

    Args:
        pdf_files: Iterable of uploaded file objects exposing a ``name``
            attribute that holds the path of the PDF on disk. ``None`` or an
            empty iterable simply leaves the store empty.
    """
    # Always start from a clean store so earlier uploads do not leak into
    # the result of the current request.
    document_store.delete_documents()
    if not pdf_files:
        return

    converter = PDFToTextConverter(
        remove_numeric_tables=True,
        valid_languages=["en"],
    )
    documents = []
    for pdf in pdf_files:
        logging.info(pdf.name)
        converted = converter.convert(file_path=pdf.name, meta=None)
        # Haystack 1.x converters return a list of documents per file;
        # appending that list would nest it and break the preprocessor.
        # A single (non-list) result is still accepted for older releases.
        if isinstance(converted, list):
            documents.extend(converted)
        else:
            documents.append(converted)

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)


def summarize(files):
    """Gradio callback: ingest ``files`` and return the stored document count.

    NOTE(review): despite the name and the UI title, no summarisation is
    performed yet; the return value is ``document_store.get_document_count()``.
    """
    print('Got files')
    pdf_to_document_store(files)
    return document_store.get_document_count()


title = "Summarize one or more PDFs with a Haystack Summariser pipeline"

iface = gr.Interface(
    fn=summarize,
    inputs="files",
    outputs="text",
    title=title,
    theme="default",
)
iface.launch()