ValadisCERTH committed on
Commit
ce611b0
·
1 Parent(s): e60af45

Create magnitudeIdentification

Browse files
Files changed (1) hide show
  1. magnitudeIdentification +320 -0
magnitudeIdentification ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+ from word2number import w2n
4
+
5
+ # Load the spacy model with GloVe embeddings
6
+ nlp = spacy.load("en_core_web_lg")
7
+
8
+
9
def capture_numbers(input_sentence):
    '''
    Capture every number mentioned in a sentence, in numeric or free-text form.

    Three passes run in order. Whatever a pass captures is blanked out of the
    sentence, so that later passes do not report fragments of it again (for
    "seven point five" spaCy would otherwise also report "seven" and "five"):

    1. "<left> decimal|point|dot|comma <right>" phrases such as "six point 5".
       Both sides accept any run of words, so a capture may carry the words
       surrounding the number. Each capture is returned with a trailing
       " $pattern" marker so it can be told apart from the other captures.
    2. Multi-word free-text numbers such as "thirty eight" or
       "one million and two" (returned without surrounding whitespace).
    3. Every remaining token that spaCy flags as number-like (token.like_num).

    Returns a list of strings: the de-duplicated captures of the three passes,
    concatenated in that order, each pass in first-seen order. If anything
    fails, the integer 0 is returned instead of a list.
    '''

    try:
        # "<left> decimal|point|dot|comma <right>" phrases
        pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        # normalise each capture to single spaces; dict.fromkeys removes
        # duplicates while keeping the order in which they were found
        decimal_phrases = dict.fromkeys(
            " ".join(parts) for parts in re.findall(pattern1, input_sentence)
        )

        # add the $pattern string to easily specify them in a subsequent step
        pattern_final_numbers = ["{} {}".format(phrase, '$pattern') for phrase in decimal_phrases]

        # blank out exactly the spans that were matched (this also covers
        # phrases whose words are separated by more than one space)
        input_sentence = re.sub(pattern1, " ", input_sentence)

        # multi-word free-text numbers: thirty eight, one million and two, etc.
        number_word = (
            "zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|"
            "thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|"
            "thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|"
            "million|billion|trillion"
        )
        pattern2 = (
            r"(?<!\w)(?:(?:" + number_word + r")"
            r"(?:\s(?:and\s)?(?:" + number_word + r"))+\s?)+(?!\w*pennies)"
        )

        # a match may end with a whitespace character; strip it so the same
        # number found twice is recognised as a duplicate
        multinumber_final_numbers = list(dict.fromkeys(
            found.strip() for found in re.findall(pattern2, input_sentence)
        ))
        input_sentence = re.sub(pattern2, " ", input_sentence)

        # whatever is left: ints, floats and single number words (eight, hundred)
        doc = nlp(input_sentence)
        spacy_final_numbers = list(dict.fromkeys(
            token.text for token in doc if token.like_num
        ))

        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # best-effort contract: a failure is reported as 0, never raised
        return 0
86
+
87
+
88
# Values of the number words accepted on either side of the separator word.
_NUMBER_WORD_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}


def _resolve_magnitude_operand(operand, cast):
    '''
    Turn one side of a "<left> point <right>" phrase into a number.

    The attempts, in order: exact number-word lookup, cast(operand) (float or
    int), the w2n library (for text such as "bla bla seven"), and finally a
    single run of digits embedded in the text (such as "bla bla 7").

    Returns the number, or a (0, 'MAGNITUDE', reason) tuple when the text
    holds several digit runs ('more_magnitude') or none ('no_magnitude').
    '''
    if operand in _NUMBER_WORD_VALUES:
        return _NUMBER_WORD_VALUES[operand]

    try:
        return cast(operand)
    except ValueError:
        pass

    try:
        return w2n.word_to_num(operand)
    except Exception:
        # w2n could not read the text; fall through to the digit search
        pass

    digit_runs = re.findall(r'\d+', operand)
    if len(digit_runs) == 1:
        return int(digit_runs[0])
    if digit_runs:
        return (0, 'MAGNITUDE', 'more_magnitude')
    return (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert phrases like '6 point five', 'six point 5' or 'six point zero five'
    into a float.

    The text is searched for "<left> decimal|point|dot|comma <right>". The left
    side becomes the whole part and the right side the fractional digits.
    Fractions written as digits ("05") or spoken digit by digit ("zero five")
    keep their leading zeros, so '6 point 05' is 6.05 and not 6.5.

    Returns:
        float: the decimal value on success.
        tuple: (0, 'MAGNITUDE', 'more_magnitude' | 'no_magnitude' |
            'unknown_error') when one of the sides cannot be resolved.
        int: 0 when the text does not contain the phrase or an unexpected
            error occurs.
    '''

    try:
        pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
        match = re.search(pattern, text)

        if not match:
            # the input text does not contain the expected phrase
            return 0

        whole_text = match.group(1)
        fraction_text = match.group(2)

        whole = _resolve_magnitude_operand(whole_text, float)
        if isinstance(whole, tuple):
            return whole

        # work out the fractional digits as text, so leading zeros survive
        fraction_words = fraction_text.split()
        if re.fullmatch(r'\d+', fraction_text):
            fraction_digits = fraction_text
        elif all(_NUMBER_WORD_VALUES.get(word, 10) < 10 for word in fraction_words):
            # digits spoken one by one: "zero five" -> "05"
            fraction_digits = "".join(
                str(_NUMBER_WORD_VALUES[word]) for word in fraction_words
            )
        else:
            fraction_digits = None
            fraction = _resolve_magnitude_operand(fraction_text, int)
            if isinstance(fraction, tuple):
                return fraction

        try:
            if fraction_digits is not None:
                return float(whole) + int(fraction_digits) / (10 ** len(fraction_digits))

            # any other fraction (e.g. "twenty five") is scaled by the number
            # of characters of its numeric form
            return float(whole) + float(fraction) / (10 ** len(str(fraction)))
        except (ValueError, TypeError, OverflowError):
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        # best-effort contract: unexpected failures are reported as 0
        return 0
221
+
222
+
223
def convert_into_numeric(num_list):
    '''
    Convert the single captured number of num_list into a numeric result.

    num_list is expected to hold exactly one captured string. Depending on its
    form the capture is handled as follows:

    - a string float() accepts ("6.5", "7"): returned unchanged, as a string
    - a string containing a comma ("6,5"): rejected with 'format_error'
    - a capture carrying the "$pattern" marker ("six point 5 $pattern"):
      converted with numeric_number_dot_freetext
    - any other free text ("two", "twenty three", "million and two"):
      converted with the w2n library, retrying once with "one " prepended

    Returns:
        dict: {'Number': value} on success.
        tuple: (0, 'MAGNITUDE', reason) otherwise, where reason is
            'no_magnitude' (empty input), 'more_magnitude' (several captures),
            'format_error' or 'unknown_error'.
    '''

    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # only one number should exist
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitude')

    target_num = num_list[0]

    # case it is already an integer or float written with digits
    try:
        float(target_num)
    except (TypeError, ValueError):
        pass
    else:
        # NOTE(review): the capture is returned as given (a string), not as a
        # float; kept as-is because callers may rely on it - confirm
        return {'Number': target_num}

    if not isinstance(target_num, str):
        return (0, 'MAGNITUDE', 'unknown_error')

    # cases like 6,5 are reported as a format error
    if ',' in target_num:
        try:
            float(target_num.replace(",", "."))
        except ValueError:
            return (0, 'MAGNITUDE', 'unknown_error')
        return (0, 'MAGNITUDE', 'format_error')

    # freetext/numeric combinations around point/dot/etc
    # (6 point 5, 6 point five, six point 5)
    if "$pattern" in target_num:
        phrase = target_num.rsplit("$", 1)[0]
        num_conversion = numeric_number_dot_freetext(phrase)

        # a float is the only success value; error tuples are passed through
        # unchanged instead of being wrapped as a number
        if isinstance(num_conversion, float):
            return {'Number': num_conversion}
        if isinstance(num_conversion, tuple):
            return num_conversion
        return (0, 'MAGNITUDE', 'unknown_error')

    # freetext numbers without patterns (e.g. two, million, twenty three, etc)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        pass

    # "million and two" or "a million and two": drop any ' a ' reference and
    # insert the word 'one' at the beginning, so that the w2n library can
    # handle the text instead of immediately throwing an error
    try:
        retry_text = "one " + target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num(retry_text)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')
287
+
288
+
289
def magnitude_binding(input_text):
    '''
    Bind together the subcomponents of the magnitude number identification,
    while also controlling for multiple, or zero magnitude references.

    The numbers mentioned in input_text are captured with capture_numbers and,
    when exactly one was found, converted with convert_into_numeric.

    Returns whatever convert_into_numeric returns for a single capture, or one
    of these codes (to aid returning the correct prompt):
        (0, 'MAGNITUDE', 'no_magnitude')   - no number was found
        (0, 'MAGNITUDE', 'more_magnitude') - more than one number was found
        (0, 'MAGNITUDE', 'unknown_error')  - capturing or converting failed
    '''

    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)

        # capture_numbers reports a failure with 0 instead of a list
        if not isinstance(target_numbers, list):
            return (0, 'MAGNITUDE', 'unknown_error')

        if not target_numbers:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # we only accept one magnitude reference
        if len(target_numbers) > 1:
            return (0, 'MAGNITUDE', 'more_magnitude')

        return convert_into_numeric(target_numbers)

    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')
319
+
320
+