File size: 1,097 Bytes
a3fdd99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import logging
import tempfile
from pathlib import Path

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever

# Store that receives the preprocessed documents; created at module import.
document_store = InMemoryDocumentStore()

# Cleaning and splitting options handed to the PreProcessor: text is cleaned
# (empty lines, whitespace, headers/footers) and split by word into pieces of
# length 100 with an overlap of 3, respecting sentence boundaries.
_preprocessor_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
    "split_overlap": 3,
}
preprocessor = PreProcessor(**_preprocessor_options)

# Upload widget. Uploads are restricted to PDFs because every uploaded file is
# later handed to a PDF-to-text converter; accept_multiple_files=True makes the
# widget return a collection of uploads rather than a single one.
uploaded_files = st.file_uploader(
    label='Upload a PDF Document',
    type=["pdf"],
    accept_multiple_files=True,
)
logging.info(uploaded_files)

def pdf_to_document_store(pdf_files):
    """Replace the contents of ``document_store`` with the given PDFs.

    The store is cleared, each PDF is converted to text, the converted
    documents are run through the module-level ``preprocessor`` and the
    result is written back to the store.

    Args:
        pdf_files: Iterable of uploaded files. Each item must expose ``name``;
            items that also expose ``getvalue()`` (in-memory uploads) are
            written to a temporary file first, because their ``name`` is only
            the original filename and not a readable path. Items without
            ``getvalue()`` are converted directly from the path in ``name``.

    Returns:
        None.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    # The converter reads from a filesystem path, so in-memory uploads are
    # spooled into a scratch directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, pdf in enumerate(pdf_files):
            read_bytes = getattr(pdf, "getvalue", None)
            if read_bytes is None:
                # No in-memory payload: keep treating ``name`` as a path.
                pdf_path = Path(pdf.name)
            else:
                # Keep only the basename of the client-supplied name and
                # prefix the index so same-named uploads cannot collide.
                pdf_path = Path(scratch_dir) / f"{index}_{Path(pdf.name).name}"
                pdf_path.write_bytes(read_bytes())
            converted = converter.convert(file_path=pdf_path, meta=None)
            # NOTE(review): convert() returns a list in recent Haystack 1.x
            # releases and a single document in older ones - accept both so
            # ``documents`` always stays a flat list.
            if isinstance(converted, list):
                documents.extend(converted)
            else:
                documents.append(converted)
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None

# Index only when at least one file was actually uploaded. A multi-file
# uploader yields an empty collection (not None) when nothing has been
# uploaded, so a truthiness check covers both the empty and the None case.
if uploaded_files:
    # pdf_to_document_store() clears the document store itself before writing.
    pdf_to_document_store(uploaded_files)