ValadisCERTH committed on
Commit
902d4bf
·
1 Parent(s): 13c4417

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +151 -153
helper.py CHANGED
@@ -7,14 +7,14 @@ spacy.cli.download("en_core_web_lg")
7
  nlp = spacy.load("en_core_web_lg")
8
 
9
 
10
- def capture_numbers (input_sentence):
11
  '''
12
  This is a function to capture cases of refered numbers either in numeric or free-text form
13
  '''
14
 
15
  try:
16
  # Define the regular expression patterns
17
- pattern1 = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"
18
 
19
  # Find all matches in the text
20
  matches = re.findall(pattern1, input_sentence)
@@ -31,95 +31,61 @@ def capture_numbers (input_sentence):
31
  input_sentence = input_sentence.replace(elem, " ")
32
 
33
  if pattern_numbers:
34
-
35
  # Remove duplicates with set and convert back to list
36
- final_numbers = list(set(pattern_numbers))
37
- return final_numbers
38
-
39
  else:
 
40
 
41
- # Parse the input sentence with Spacy
42
- doc = nlp(input_sentence)
 
 
 
 
43
 
44
- # This is to capture all the numbers in int and float form, as well as numbers like eight, two, hunded
45
- numbers = [token.text for token in doc if token.like_num]
46
 
47
- # Remove duplicates with set and convert back to list
48
- final_numbers = list(set(numbers))
49
 
50
- # Print the extracted numbers
51
- if final_numbers:
52
- return final_numbers
53
- else:
54
- return 0
55
 
56
- except:
57
- return 0
 
 
58
 
59
# Values of the number words accepted on either side of the decimal separator.
_FREETEXT_NUMBER_WORDS = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}

# "<words> point|decimal|dot|comma <words>"; the separator is matched case-sensitively.
_FREETEXT_DECIMAL_PATTERN = re.compile(
    r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)'
)


def _freetext_words_to_digits(words):
    '''
    Return the decimal digit string spelled by a list of lower-case number words.

    A run of single-digit words is read digit by digit ("one two" -> "12",
    "zero five" -> "05"); anything else is read as a compound number
    ("twenty three" -> "23", "one hundred" -> "100").
    Raises KeyError when a word is not a known number word.
    '''
    values = [_FREETEXT_NUMBER_WORDS[word] for word in words]

    # Digit-by-digit reading keeps leading zeros, which matter after the separator.
    if all(value < 10 for value in values):
        return ''.join(str(value) for value in values)

    total = 0
    current = 0
    for value in values:
        if value < 100:
            current += value
        elif value == 100:
            # a bare "hundred" counts as one hundred
            current = max(current, 1) * 100
        else:
            # thousand / million / ... close the current group
            total += max(current, 1) * value
            current = 0
    return str(total + current)


def numeric_freetext_dot_freetext(text):
    '''
    This is a function to convert cases of 'six point five'

    Both sides of the separator (point, decimal, dot or comma) must consist only
    of number words. Returns the numeric value as a float, or 0 when the text
    does not match the pattern, contains a word that is not a number word, or is
    not a string. The falsy 0 lets a caller fall through to another converter.
    '''
    try:
        match = _FREETEXT_DECIMAL_PATTERN.search(text)

        # Return 0 if the input text doesn't match the regular expression
        if not match:
            return 0

        whole_digits = _freetext_words_to_digits(match.group(1).lower().split())
        decimal_digits = _freetext_words_to_digits(match.group(3).lower().split())

        # The decimal digits are scaled by their own length so "zero five" -> 0.05
        return int(whole_digits) + int(decimal_digits) / (10 ** len(decimal_digits))

    except (KeyError, TypeError):
        # KeyError: a word outside the number vocabulary; TypeError: non-string input
        return 0
 
 
 
 
122
 
 
 
 
 
 
 
 
 
123
 
124
 
125
def numeric_number_dot_freetext(text):
    '''
    Convert a decimal phrase such as "6 point 5", "6 point five" or "six point 5"
    into a float.

    Each side of the separator (decimal, point, dot or comma) is a single token:
    either digits or one of the words zero..nine. Returns the float value, or 0
    when the text does not match the pattern, a side cannot be read as a number,
    or the input is not a string.
    '''
    # Map single-digit words to numbers
    num_dict = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8, "nine": 9}

    # Extract the part before and the part after the separator
    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"

    try:
        match = re.search(pattern, text)
    except TypeError:
        # non-string input
        return 0

    # If input text doesn't match the expected pattern, return 0
    if not match:
        return 0

    # Words are mapped to their value; digit strings are kept as written so that
    # the length of the decimal part (including leading zeros) sets its scale.
    num1 = num_dict.get(match.group(1), match.group(1))
    num2 = num_dict.get(match.group(2), match.group(2))

    try:
        return float(num1) + float(num2) / (10 ** len(str(num2)))
    except (ValueError, OverflowError):
        # a token that is neither digits nor a known word, or a value too large for float
        return 0
164
 
 
 
 
 
165
 
166
- def convert_into_numeric(num_list):
167
- '''
168
- This is a function to convert the identified numbers into a numeric form
169
- '''
170
 
171
- if num_list:
172
-
173
- # at first we examine how many numbers were captured. Only one number should exist
174
- if len(num_list) > 1:
175
- return 0
176
-
177
- else:
178
- target_num = num_list[0]
179
-
180
- # case it is an integer or float, convert it, otherwise move to following cases
181
- try:
182
- target_num_float = float(target_num)
183
- return {'Number' : target_num}
184
-
185
- except:
186
- # case that it belongs to one of the patterns of freetext number followed by numeric form etc (all the combinations)
187
- if "$pattern" in target_num:
188
- num, _ = target_num.split("$")
189
 
190
- # try at first with that function for the case of six point five
191
- num_conversion = numeric_freetext_dot_freetext(num)
 
192
 
193
- if num_conversion:
194
- return {'Number' : num_conversion}
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
- # if not, try with this function for all the rest of cases (6 point 5, 6 point five, six point 5)
197
  else:
198
- num_conversion = numeric_number_dot_freetext(num)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- if num_conversion:
201
- return {'Number' : num_conversion}
202
 
203
- # if none of the above has worked, then examine the case of freetext numbers without patterns (e.g. two, million, twenty three, etc)
204
- else:
205
  try:
206
- num_conversion = w2n.word_to_num(target_num)
207
- return {'Number' : num_conversion}
208
-
209
- # if none of the above, error.
210
  except:
211
- return 0
212
-
213
- else:
214
- return 0
215
-
216
-
217
 
218
def magnitude_binding(input_text):
    '''
    Run the number pipeline on a sentence.

    Passes input_text to capture_numbers and hands the captured list to
    convert_into_numeric, returning whatever that call returns. Returns 0 if
    either step raises.
    '''
    try:
        target_numbers = capture_numbers(input_text)
        return convert_into_numeric(target_numbers)

    except Exception:
        # Best-effort contract: failures are reported as 0. Unlike a bare except,
        # this lets KeyboardInterrupt and SystemExit propagate.
        return 0
 
7
  nlp = spacy.load("en_core_web_lg")
8
 
9
 
10
+ def capture_numbers(input_sentence):
11
  '''
12
  This is a function to capture cases of refered numbers either in numeric or free-text form
13
  '''
14
 
15
  try:
16
  # Define the regular expression patterns
17
+ pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
18
 
19
  # Find all matches in the text
20
  matches = re.findall(pattern1, input_sentence)
 
31
  input_sentence = input_sentence.replace(elem, " ")
32
 
33
  if pattern_numbers:
 
34
  # Remove duplicates with set and convert back to list
35
+ pattern_final_numbers = list(set(pattern_numbers))
 
 
36
  else:
37
+ pattern_final_numbers = []
38
 
39
+ # we delete the captured references from the sentence, because if we capture something like seven point five
40
+ # then spacy will also identify seven and five, which we do not want it to
41
+ for element in pattern_final_numbers:
42
+ target_elem = element.replace("$pattern","").strip()
43
+ if target_elem in input_sentence:
44
+ input_sentence = input_sentence.replace(target_elem, " ")
45
 
 
 
46
 
47
+ # This is for cases of thirty eight or one million and two, etc.
 
48
 
49
+ # Define a regular expression to match multiword free-text numbers
50
+ pattern2 = r"(?<!\w)(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)(?:\s(?:and\s)?(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion))+\s?)+(?!\w*pennies)"
51
+
52
+ # Find all multiword free-text number matches in the sentence
53
+ multi_numbers = re.findall(pattern2, input_sentence)
54
 
55
+ if multi_numbers:
56
+ multinumber_final_numbers = list(set(multi_numbers))
57
+ else:
58
+ multinumber_final_numbers = []
59
 
60
+ for elem in multinumber_final_numbers:
61
+ if elem in input_sentence:
62
+ input_sentence = input_sentence.replace(elem, " ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ # we also delete the captured references from the sentence in this case
65
+ for element in multinumber_final_numbers:
66
+ target_elem = element.replace("$pattern","").strip()
67
+ if target_elem in input_sentence:
68
+ input_sentence = input_sentence.replace(target_elem, " ")
69
+
70
+
71
+ # Parse the input sentence with Spacy
72
+ doc = nlp(input_sentence)
 
73
 
74
+ # This is to capture all the numbers in int and float form, as well as numbers like eight, two, hundred
75
+ s_numbers = [token.text for token in doc if token.like_num]
76
+
77
+ if s_numbers:
78
+ # Remove duplicates with set and convert back to list
79
+ spacy_final_numbers = list(set(s_numbers))
80
 
81
+ else:
82
+ spacy_final_numbers = []
83
+
84
+ # return the extracted numbers
85
+ return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers
86
+
87
+ except:
88
+ return 0
89
 
90
 
91
# Values of the number words that may stand alone on either side of the separator.
_MAGNITUDE_WORD_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}


def _resolve_magnitude_part(token, parse_numeric):
    '''
    Turn one side of a decimal phrase into a number.

    Tries, in order: a direct word lookup, parse_numeric(token) (float for the
    whole part, int for the decimal part), w2n.word_to_num, and finally the
    digits embedded in the token ("bla bla 7").

    Returns (value, width, error). On success error is None and width is the
    number of digits the value was written with, so leading zeros are kept;
    on failure error is the tuple the public function must return.
    '''
    if token in _MAGNITUDE_WORD_VALUES:
        value = _MAGNITUDE_WORD_VALUES[token]
        return value, len(str(value)), None

    # A plain number: count the digits as written so "05" keeps width 2
    try:
        value = parse_numeric(token)
    except ValueError:
        pass
    else:
        return value, sum(ch.isdigit() for ch in token), None

    # this will handle cases like "bla bla bla seven"
    try:
        value = w2n.word_to_num(token)
    except Exception:
        # any failure here just means the token is not a spelled-out number
        pass
    else:
        return value, len(str(value)), None

    # this is to handle cases like "bla bla bla 7"
    try:
        digit_runs = re.findall(r'\d+', token)

        # if there is exactly one number then we cope with that
        if len(digit_runs) == 1:
            return int(digit_runs[0]), len(digit_runs[0]), None
    except (TypeError, ValueError):
        return None, 0, (0, 'MAGNITUDE', 'unknown_error')

    # in any other case report an error
    if len(digit_runs) > 1:
        return None, 0, (0, 'MAGNITUDE', 'more_magnitude')
    return None, 0, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert a decimal phrase whose two sides are digits and/or number words
    (e.g. "6 point 5", "6 point five", "six point 5") into a float.

    Returns:
      - the float value on success;
      - (0, 'MAGNITUDE', reason) when a side cannot be resolved to exactly one
        number, with reason one of 'more_magnitude', 'no_magnitude' or
        'unknown_error';
      - 0 when the text does not contain the "<part> decimal|point|dot|comma <part>"
        pattern or is not a string.
    '''
    # Extract the part before and the part after the separator
    pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

    try:
        match = re.search(pattern, text)
    except TypeError:
        # non-string input
        return 0

    # If input text doesn't match the expected pattern, return 0
    if not match:
        return 0

    whole, _, error = _resolve_magnitude_part(match.group(1), float)
    if error:
        return error

    fraction, width, error = _resolve_magnitude_part(match.group(2), int)
    if error:
        return error

    try:
        # Scale the decimal part by the number of digits it was written with,
        # so "6 point 05" is 6.05 rather than 6.5
        return float(whole) + float(fraction) / (10 ** width)
    except (TypeError, ValueError, OverflowError):
        return (0, 'MAGNITUDE', 'unknown_error')