File size: 1,375 Bytes
f3a61e0
a3fdd99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3572c4
a3fdd99
 
 
 
f3a61e0
 
819ac67
84af4a0
a6970fe
f3a61e0
 
 
3986348
 
 
 
0389e9a
3986348
f3a61e0
 
fc419f1
f3a61e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import gradio as gr
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever
import logging

# Single in-memory index shared by every request; it is cleared and refilled
# by pdf_to_document_store() on each upload.
document_store = InMemoryDocumentStore()

# Cleaning and splitting options applied to converted PDFs before indexing:
# documents are split by word into pieces of length 100 with an overlap of 3,
# and the splitter is asked to respect sentence boundaries.
_PREPROCESSOR_SETTINGS = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
    "split_overlap": 3,
}
preprocessor = PreProcessor(**_PREPROCESSOR_SETTINGS)


def pdf_to_document_store(pdf_files):
    """Replace the document store's contents with the uploaded PDF(s).

    The module-level ``document_store`` is emptied first, so every call
    starts from a clean index. Each upload is converted to text, run through
    the module-level ``preprocessor`` and written to the store.

    Args:
        pdf_files: A single uploaded file object, or a list/tuple of them.
            Each object must expose the on-disk path of the PDF through a
            ``name`` attribute. ``None`` or an empty sequence leaves the
            store empty.

    Returns:
        None. The result is observable through ``document_store``.
    """
    document_store.delete_documents()

    # The UI's multi-file upload hands over a list, while older callers may
    # still pass one file object; normalise both shapes to a list.
    if pdf_files is None:
        uploads = []
    elif isinstance(pdf_files, (list, tuple)):
        uploads = list(pdf_files)
    else:
        uploads = [pdf_files]

    if not uploads:
        # Nothing to index; the store has already been cleared above.
        return None

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    # convert() returns a list; only its first element is kept per file.
    documents = [
        converter.convert(file_path=upload.name, meta=None)[0]
        for upload in uploads
    ]
    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None


def summarize(files):
    """Index the uploaded files and return the resulting document count.

    Args:
        files: The upload value handed over by the Gradio interface; it is
            forwarded unchanged to ``pdf_to_document_store``.

    Returns:
        The number of documents held by ``document_store`` after indexing.
    """
    # Debug output showing what kind of object the UI delivered.
    print('Got files')
    received_type = type(files)
    print(received_type)

    pdf_to_document_store(files)

    stored_count = document_store.get_document_count()
    return stored_count

# Heading passed to the Gradio interface.
title = "Summarize one or more PDFs with a Haystack Summariser pipeline"

# Expose summarize() through a web UI. "files" and "text" are Gradio
# component shortcut strings; "files" is expected to resolve to a multi-file
# upload component (confirm against the installed Gradio version).
iface = gr.Interface(
    fn=summarize,
    inputs="files",
    outputs="text",
    title=title,
    theme="default",
)
iface.launch()