File size: 4,898 Bytes
e1b1d60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import logging
import pandas as pd
import numpy as np
import string  
import nltk 
import spacy
import en_core_web_sm
import re
import streamlit as st

from haystack.nodes import PreProcessor

'''basic cleaning - suitable for transformer models'''
def basic(s,SDG = False):
    """
    Light-weight text cleaning.

    URLs are always removed. The heavier normalisation (lowercasing,
    punctuation/digit/non-word stripping, newline flattening) is applied
    only when ``SDG`` is True; by default case and punctuation are kept.

    :param s: string to be processed
    :param SDG: when True, additionally apply the aggressive normalisation
    :return: processed string with leading/trailing whitespace stripped
    """
    # Pass 1 drops whole lines that start with a URL (including the line
    # break); pass 2 blanks out any URL token left inside running text.
    url_passes = (
        (r'^https?:\/\/.*[\r\n]*', re.MULTILINE),
        (r"http\S+", 0),
    )
    for url_pattern, url_flags in url_passes:
        s = re.sub(url_pattern, ' ', s, flags=url_flags)

    if SDG == True:
        # Lowercase, then delete every punctuation character in one pass.
        punctuation_table = str.maketrans(' ', ' ', string.punctuation)
        normalised = s.lower().translate(punctuation_table)
        # In order: newlines, single quotes, digit runs, and finally any
        # remaining run of non-word characters each collapse to one space.
        for noise_pattern in ('\n', "\'", r'\d+', r'\W+'):
            normalised = re.sub(noise_pattern, ' ', normalised)
        s = normalised

    return s.strip()


def preprocessingForSDG(document):

    """
    Split haystack documents into paragraphs and clean them for the SDG model.

    Every input document is passed through a haystack ``PreProcessor``
    configured to split by word with a split length of 120 (sentence
    boundaries are not respected), and each resulting paragraph is cleaned
    with ``basic(..., SDG=True)``.

    Parameters
    ----------
    document : iterable of haystack Document objects
        The documents to split. An empty iterable is allowed.

    Returns
    -------
    docs_processed : list
        Cleaned paragraph documents from ALL input documents, in input order.
    df : pandas.DataFrame
        DataFrame built from ``docs_processed``.
    all_text : str
        All paragraph texts joined with a single space.
    par_list : list of str
        The paragraph texts, one entry per paragraph.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
        #split_overlap=1
    )

    # Accumulate across documents: rebinding the result on each iteration
    # would keep only the last document's paragraphs and leave the name
    # undefined for empty input.
    docs_processed = []
    # Run the actual work inside the spinner so it is visible while splitting.
    with st.spinner("👑 document being splitted into paragraphs"):
        for doc in document:
            for item in preprocessor.process([doc]):
                item.content = basic(item.content, SDG = True)
                docs_processed.append(item)
        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    # Read the texts from the documents themselves so that an empty result
    # (a DataFrame without a "content" column) does not raise.
    par_list = [item.content for item in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list

def preprocessing(document):

    """
    Split haystack documents into short paragraphs and apply simple cleaning.

    Every input document is passed through a haystack ``PreProcessor``
    configured to split by sentence with a split length of 3 and an overlap
    of 1, and each resulting paragraph is cleaned with ``basic(...)``
    (URL removal only; case and punctuation are kept).

    Parameters
    ----------
    document : iterable of haystack Document objects
        The documents to split. An empty iterable is allowed.

    Returns
    -------
    docs_processed : list
        Cleaned paragraph documents from ALL input documents, in input order.
    df : pandas.DataFrame
        DataFrame built from ``docs_processed``.
    all_text : str
        All paragraph texts joined with a single space.
    par_list : list of str
        The paragraph texts, one entry per paragraph.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )

    # Accumulate across documents: rebinding the result on each iteration
    # would keep only the last document's paragraphs and leave the name
    # undefined for empty input.
    docs_processed = []
    # Run the actual work inside the spinner so it is visible while splitting.
    with st.spinner("👑 document being splitted into paragraphs"):
        for doc in document:
            for item in preprocessor.process([doc]):
                item.content = basic(item.content)
                docs_processed.append(item)
        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    # Read the texts from the documents themselves so that an empty result
    # (a DataFrame without a "content" column) does not raise.
    par_list = [item.content for item in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list

'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
    """
    Clean and tokenise a string using Spacy. Keeps only alphabetic tokens,
    removes stopwords and filters out all but proper nouns, nouns, verbs and
    adjectives.

    Parameters
    ----------
    alpha : str
        The input string. When ``use_nlp`` is False this must instead be an
        already-processed iterable of Spacy tokens (e.g. a ``Doc``).

    use_nlp : bool, default True
        Indicates whether Spacy needs to run its pipeline on ``alpha``.
        Enable this when using this function on its own. Should be set to
        False if used inside nlp.pipeline, in which case no model is loaded.

    Returns
    -------
    str
        The lemmas of the retained tokens, joined by single spaces and
        lowercased.

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets
    large. Use together with nlp.pipeline for batch processing.
    """
    if use_nlp:
        # Only touch the model when it is actually needed, and reuse the
        # already-loaded pipeline instead of reloading it on every call.
        alpha = _get_spacy_pipeline()(alpha)

    kept_pos = ('PROPN', 'NOUN', 'VERB', 'ADJ')
    lemmas = [
        tok.lemma_
        for tok in alpha
        if tok.is_alpha and not tok.is_stop and tok.pos_ in kept_pos
    ]
    return ' '.join(lemmas).lower()


# Holds the loaded pipeline after first use so later calls skip spacy.load.
_SPACY_PIPELINE_CACHE = {}


def _get_spacy_pipeline():
    """Return the en_core_web_sm pipeline, loading it on first use only."""
    if "nlp" not in _SPACY_PIPELINE_CACHE:
        _SPACY_PIPELINE_CACHE["nlp"] = spacy.load(
            "en_core_web_sm", disable=["parser", "ner", "textcat"]
        )
    return _SPACY_PIPELINE_CACHE["nlp"]