Projeto committed on
Commit
5af74e5
·
1 Parent(s): 042e42d

Delete legalnlp/mask_functions.py

Browse files
Files changed (1) hide show
  1. legalnlp/mask_functions.py +0 -161
legalnlp/mask_functions.py DELETED
@@ -1,161 +0,0 @@
1
- import re
2
-
3
def mask_email(txt):
    """
    Replace every e-mail-like token in a text with the ' [email] ' tag.

    A token is treated as an e-mail when it is a run of non-whitespace
    characters containing an '@' with at least one character on each side.

    Parameters
    -----------
    txt: str
        Text to be searched for e-mail patterns

    Returns
    -----------
    str
        The text with each match replaced by ' [email] '
    list
        The matched substrings, in order of appearance
    """
    email_re = re.compile(r'[^\s]+@[^\s]+', re.I)

    # Collect the matches from the untouched input, then mask that same input.
    found = email_re.findall(txt)
    masked = email_re.sub(' [email] ', txt)

    return masked, found
24
-
25
def mask_url(txt):
    """
    Find URL patterns in a text and mask them.

    Two case-insensitive patterns are applied in sequence: first any run of
    non-whitespace characters starting with 'http', then (on the already
    masked text) any run starting with 'www'.

    Parameters
    -----------
    txt: str
        A piece of text possibly containing URLs

    Returns
    -----------
    str
        The text with each match replaced by ' [url] '
    list
        The matched substrings: all 'http...' matches first, followed by
        the 'www...' matches that remained after the first pass
    """
    sub = ' [url] '
    found = []

    # Raw strings keep '\S' as a regex escape instead of an (invalid) string
    # escape. The second pattern runs on the output of the first so that
    # 'http://www...' is only reported once.
    for pattern in (r'http\S+', r'www\S+'):
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, sub, txt, flags=re.I)

    return txt, found
49
-
50
def mask_oab(txt):
    """
    Find OAB (Order of Attorneys of Brazil) registration patterns and mask them.

    Two case-insensitive patterns are applied in sequence: the first expects
    the number before the optional two-letter suffix (e.g. 'OAB 123456/SP'),
    the second expects the optional letters before the number
    (e.g. 'OAB/SP 123456') and runs on the output of the first pass.

    Parameters
    -----------
    txt: str
        A piece of text possibly containing OAB patterns

    Returns
    -----------
    str
        The text with each match replaced by ' [oab] '
    list
        The matched substrings: matches of the number-first pattern,
        followed by matches of the letters-first pattern
    """
    sub = ' [oab] '
    patterns = (
        r'OAB\s?[:-]?\s?\d+\s?/?\s?[A-Z]?[A-Z]?',  # number first
        r'OAB\s?/?\s?[A-Z]?[A-Z]?\s?[:-]?\s?\d+',  # letters first
    )

    # NOTE(review): because of re.I, '[A-Z]' also matches lowercase letters,
    # so the first pattern can absorb up to two letters of a following word
    # ('OAB 123 presente' -> 'OAB 123 pr'). Kept as-is to preserve behaviour.
    found = []
    for pattern in patterns:
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, sub, txt, flags=re.I)

    return txt, found
75
-
76
def mask_data(txt):
    """
    Find date patterns in a text and mask them.

    A date is two digits, a slash, two digits, a slash and four digits, with
    at most one whitespace character allowed on either side of each slash
    (e.g. '01/02/2020' or '01 / 02 / 2020').

    Parameters
    -----------
    txt: str
        A piece of text possibly containing dates

    Returns
    -----------
    str
        The text with each match replaced by ' [data] '
    list
        The matched substrings, in order of appearance
    """
    # Raw string: '\d', '\s' and '\/' are regex escapes, not string escapes.
    pattern = r"\d{2}\s?\/\s?\d{2}\s?\/\s?\d{4}"
    sub = ' [data] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
97
-
98
def mask_processo(txt, num=15):
    """
    Find lawsuit-number patterns in a text and mask them.

    Any run of at least `num` consecutive digits is treated as a lawsuit
    number.

    Parameters
    -----------
    txt: str
        A piece of text possibly containing lawsuit numbers
    num: int
        Minimum number of consecutive digits for a run to be masked
        (default 15)

    Returns
    -----------
    str
        The text with each match replaced by ' [processo] '
    list
        The matched substrings, in order of appearance
    """
    # Digit runs with `num` or more digits are considered lawsuit numbers.
    # Raw fragments keep '\d' as a regex escape rather than a string escape.
    pattern = r"\d{" + str(num) + r",}"
    sub = ' [processo] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
118
-
119
def mask_numero(txt):
    """
    Find digit runs in a text and mask them.

    Parameters
    -----------
    txt: str
        A piece of text possibly containing numbers

    Returns
    -----------
    str
        The text with each run of digits replaced by ' [numero] '
    list
        The matched digit runs, in order of appearance
    """
    # Raw string keeps '\d' as a regex escape rather than a string escape.
    pattern = r"\d+"
    sub = ' [numero] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
140
-
141
def mask_valor(txt):
    """
    Find monetary values in Brazilian reais and mask them.

    A value is 'R$' (case-insensitive, with at most one whitespace character
    allowed after the 'R' and after the '$') followed by a group of digits
    and any number of further digit groups, each introduced by a single '.'
    or ','. Examples: 'R$ 50', 'R$5,00', 'R$ 1.234,56', 'R$ 1.000.000,00'.

    Parameters
    -----------
    txt: str
        A piece of text possibly containing monetary values

    Returns
    -----------
    str
        The text with each match replaced by ' [valor] '
    list
        The matched substrings, in order of appearance
    """
    # The digit part is "digits, then zero or more (separator + digits)".
    # This accepts amounts with fewer than three digits and with any number
    # of thousands/decimal separators, and never consumes a trailing '.' or
    # ',' that is not followed by a digit. The group is non-capturing so
    # that re.findall returns whole matches.
    pattern = r"R\s?\$\s?\d+(?:[.,]\d+)*"
    sub = ' [valor] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)