File size: 797 Bytes
fbf7e95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from unidecode import unidecode
import numpy as np
import pandas as pd


def remove_diacritics(series):
    """Transliterate each element of ``series`` to ASCII using ``unidecode``.

    The result keeps the index and name of ``series``, matching the other
    normalizers in this module, so it can be assigned back to the frame it
    came from without misalignment.

    Args:
        series: pandas Series whose non-missing elements are strings.

    Returns:
        A new Series with ``unidecode`` applied to every non-missing element.
        Missing values are passed through unchanged.
    """
    # Series.map preserves index and name and is a no-op on an empty Series,
    # whereas np.vectorize without ``otypes`` raises on size-0 input.
    # na_action="ignore" keeps NaN/None out of unidecode, which expects str.
    return series.map(unidecode, na_action="ignore")


def lowercase(series):
    """Return ``series`` with every string element converted to lower case."""
    lowered = series.str.lower()
    return lowered


def remove_punctuation(series):
    r"""Delete every character that is neither a word character nor whitespace.

    The pattern ``[^\w\s]`` is a regular expression, so ``regex=True`` is
    passed explicitly. Since pandas 2.0 ``Series.str.replace`` defaults to
    ``regex=False``; without the flag the pattern would be searched for as a
    literal string and nothing would be removed.

    Because ``\w`` includes the underscore, ``_`` is kept.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series with the matched characters removed.
    """
    return series.str.replace(r"[^\w\s]", "", regex=True)


def normalize_whitespace(series):
    r"""Collapse whitespace runs to one space and trim both ends.

    ``regex=True`` is passed explicitly: since pandas 2.0
    ``Series.str.replace`` defaults to ``regex=False``, under which ``\s+``
    would be treated as a literal string and no whitespace would be
    normalized.

    Args:
        series: pandas Series of strings.

    Returns:
        A new Series in which every run of whitespace (spaces, tabs,
        newlines, ...) is a single space and no element starts or ends
        with whitespace.
    """
    # One pass both converts non-space whitespace and removes repeats.
    collapsed = series.str.replace(r"\s+", " ", regex=True)
    # At most a single space can remain at either end; drop it.
    return collapsed.str.strip()


def substring(series, start, end):
    """Return the ``[start:end]`` slice of every string in ``series``.

    ``start`` and ``end`` follow ordinary Python slice semantics; either may
    be ``None`` or negative.
    """
    return series.str.slice(start, end)


def apply_normalizers(series, transforms):
    """Run ``series`` through each callable in ``transforms``, in order.

    Each transform receives the output of the previous one; the output of
    the last transform is returned. With no transforms, ``series`` itself
    is returned.
    """
    current = series
    for step in transforms:
        current = step(current)
    return current