import streamlit as st
from typing import Callable, Dict, List, Optional

import re
import os
import pdfplumber
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from haystack.schema import Answer, Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever, PreProcessor
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter, DocxToTextConverter
import logging
from markdown import markdown
from annotated_text import annotation
from PIL import Image

logger = logging.getLogger(__name__)

os.environ["TOKENIZERS_PARALLELISM"] = "false"



#def load_and_write_data(document_store):
#    doc_dir = './article_txt_got'
#    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#    document_store.write_documents(docs)

def basic(s):
    """
    :param s: string to be processed
    :return: processed string; currently only URLs are removed and the
        result is stripped. Further cleanup steps are kept below as
        commented-out options.
    """
    # Text Lowercase
    #s = s.lower() 
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation) 
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    #s = re.sub('\n', ' ', s) 
  
    # Remove distracting single quotes
    #s = re.sub("\'", " ", s) 
    # Remove all remaining numbers and non alphanumeric characters
    #s = re.sub(r'\d+', ' ', s) 
    #s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    
    return s.strip()
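
# Illustrative example of basic() (URLs collapse to whitespace; only the
# ends of the string are stripped, inner spaces remain):
#
#   basic("Read http://example.org now")  ->  "Read   now"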
    
def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes docx, txt, and pdf files as input and extracts the text as well
    as the filename as metadata. Since Haystack's converters cannot handle
    every pdf layout, pdfplumber is attached as a fallback in case the
    text extraction via Haystack fails.

    Returns a list of type haystack.schema.Document.
    """
    st.write(file_name)
    if file_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif file_name.endswith('.txt'):
        converter = TextConverter()
    elif file_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        raise ValueError("unsupported file type: {}".format(file_name))

    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter 
    # return a list containing a single Document
    document = converter.convert(
                file_path=file_path, meta=None, 
                encoding=encoding, id_hash_keys=id_hash_keys
                )[0]
    text = document.content
    documents.append(Document(content=text, 
                              meta={"name": file_name}, 
                              id_hash_keys=id_hash_keys))
    
    # check if the extracted text is empty and apply a different pdf
    # processor; this can happen with certain pdf types
    for i in documents:
        if i.content == "":
            st.write("using pdfplumber")
            text = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() returns None for pages without text
                    text.append(page.extract_text() or "")
            i.content = ' '.join(text)
    
    return documents
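
# Usage sketch (hypothetical caller, not wired into the app here): Streamlit's
# file_uploader returns an in-memory buffer, while the converters above expect
# a path on disk, so the upload is persisted to a temp file first (assumes
# `import tempfile`):
#
#   uploaded = st.file_uploader("Upload a document", type=["pdf", "txt", "docx"])
#   if uploaded is not None:
#       tmp_path = os.path.join(tempfile.gettempdir(), uploaded.name)
#       with open(tmp_path, "wb") as f:
#           f.write(uploaded.getbuffer())
#       docs = load_document(tmp_path, uploaded.name)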

 
def preprocessing(document):
    """
    Takes a list of Haystack Document objects, splits them into paragraphs,
    and applies simple cleaning.

    Returns the cleaned list of Haystack Document objects, one paragraph per
    object. (A pandas DataFrame and a joined-text list can also be returned;
    that code is kept below, commented out.)
    """

    # split into windows of 3 sentences, overlapping by 1 sentence
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    # process the whole list at once; processing documents one by one in a
    # loop would overwrite docs_processed on every iteration and keep only
    # the paragraphs of the last document
    docs_processed = preprocessor.process(document)
    for item in docs_processed:
        item.content = basic(item.content)

    st.write("your document has been split into", len(docs_processed), "paragraphs")
    
    # create dataframe of text and list of all text
    #df = pd.DataFrame(docs_processed)
    #all_text = " ".join(df.content.to_list())
    #par_list = df.content.to_list()

    return docs_processed #, df, all_text, par_list
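
# End-to-end sketch (hypothetical, showing how the helpers above could feed
# the Haystack pipeline imports at the top of this file; the reader model
# name is an assumption, not something this module pins):
#
#   docs = load_document("example.pdf", "example.pdf")
#   paragraphs = preprocessing(docs)
#   document_store = InMemoryDocumentStore()
#   document_store.write_documents(paragraphs)
#   retriever = TfidfRetriever(document_store=document_store)
#   reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
#   pipe = ExtractiveQAPipeline(reader, retriever)
#   prediction = pipe.run(
#       query="What is the document about?",
#       params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}},
#   )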