Projeto committed on
Commit
042e42d
·
1 Parent(s): c34187f

Delete legalnlp/clean_functions.py

Browse files
Files changed (1) hide show
  1. legalnlp/clean_functions.py +0 -94
legalnlp/clean_functions.py DELETED
@@ -1,94 +0,0 @@
1
-
2
- import re
3
- import ftfy
4
- from legalnlp.mask_functions import *
5
-
6
-
7
def clean_bert(text):
    """
    Repair bad Unicode in a text and flatten it onto a single line.

    Parameters
    -----------
    text: str
        A piece of text

    Returns
    -----------
    str
        The text returned by ``ftfy.fix_text``, with every newline replaced
        by a space and every run of spaces collapsed into a single space
    """
    repaired = ftfy.fix_text(text)
    single_line = repaired.replace("\n", " ")
    return re.sub(' +', ' ', single_line)
24
-
25
-
26
def clean(text, lower=True, return_masked=False):
    """
    Clean a text by masking and normalizing general patterns: URLs, emails,
    acronyms, symbols, optional plural/gender markers and other
    Portuguese-specific constructions.

    Parameters
    -----------
    text: str
        A piece of text
    lower: bool
        Whether to lowercase the text (Default: True)
    return_masked: bool
        If False, only the clean text is returned. Otherwise a dictionary
        holding the clean text and the information extracted by each masking
        step is returned (Default: False)

    Returns
    -----------
    dict or str
        The clean text, or a dictionary with the keys 'txt', 'url', 'email',
        'oab', 'data', 'processo', 'valor' and 'numero'
    """
    # NOTE: every regex below is a raw string. The patterns contain `\.`,
    # `\/`, `\W`, `\d` and `\w`, which are invalid escape sequences in a
    # normal string literal and trigger a warning on recent Python versions.

    # General cleaning
    txt, urls = mask_url(text)  # Mask URLs
    txt, emails = mask_email(txt)  # Mask emails
    # Acronyms (e.g., C.P.F => CPF): drop the dot after each capital letter
    txt = re.sub(r"([A-Z])\.", r"\1", txt)
    if lower:
        txt = txt.lower()
    # s.a or s/a => sa (case-insensitive, so it also works when lower=False)
    txt = re.sub(r"s[\/\.]a", " sa ", txt, flags=re.I)
    # Collapse an existing " - - " first, then rewrite every " - " as " - - "
    txt = txt.replace(" - - ", " - ")
    txt = txt.replace(" - ", " - - ")
    # Put a space on both sides of every non-word character
    txt = re.sub(r"(\W)", r" \1 ", txt)
    txt = txt.replace("\n", " ")
    txt = txt.replace("\t", " ")

    # Optional plural and gender markers, split apart by the spacing step
    # above, are glued back together
    txt = txt.replace("( s )", "(s)")
    txt = txt.replace("( a )", "(a)")
    txt = txt.replace("( as )", "(as)")
    txt = txt.replace("( o )", "(o)")
    txt = txt.replace("( os )", "(os)")

    # Re-joining some strings
    # A spaced "-" or "." between two digits is removed entirely
    txt = re.sub(r"(?<=\d) [-\.] (?=\d)", '', txt)
    # A spaced "," between two digits is kept, without the spaces
    txt = re.sub(r"(?<=\d) , (?=\d)", ',', txt)
    txt = txt.replace("[ email ]", "[email]")
    txt = txt.replace("[ url ]", "[url]")
    # (e.g., arquivem - se => arquivem-se)
    txt = re.sub(r"(\w) - (\w)", r"\1-\2", txt)
    txt = re.sub(' +', ' ', txt)

    # Masking
    txt, oabs = mask_oab(txt)
    txt, dates = mask_data(txt)
    txt, lawsuits = mask_processo(txt)
    # Decimal places are assumed to be separated by a comma
    txt, values = mask_valor(txt)
    txt, numbers = mask_numero(txt)

    # Extra spaces
    txt = re.sub(' +', ' ', txt)
    txt = txt.strip()

    # Output
    if not return_masked:
        return txt
    return {
        'txt': txt,
        'url': urls,
        'email': emails,
        'oab': oabs,
        'data': dates,
        'processo': lawsuits,
        'valor': values,
        'numero': numbers,
    }