ValadisCERTH committed on
Commit
13c4417
·
1 Parent(s): 3f018ec

Create helper.py

Browse files
Files changed (1) hide show
  1. helper.py +227 -0
helper.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+ from word2number import w2n
4
+
5
# Load the spaCy model. Download it only when it is not installed yet, so that
# importing this module does not hit the network on every start-up.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
8
+
9
+
10
def capture_numbers(input_sentence):
    """
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Two strategies are tried in order:

    1. "<word> point|decimal|dot|comma <word>" patterns, e.g. "six point five",
       "5 point five", "six point 5", "5 point 5". Every hit is returned as
       "<word> <separator> <word> $pattern"; the "$pattern" suffix marks the
       entry so that a subsequent step can recognise it easily.
    2. Only when no such pattern is present: the text of every token that the
       module-level spaCy pipeline ``nlp`` flags as ``like_num`` (ints, floats
       and words such as "eight", "two", "hundred").

    Returns a list of unique strings in order of first appearance, or 0 when
    no number was found or the sentence could not be processed.
    """
    pattern = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"

    try:
        pattern_numbers = [
            "{} {} {} {}".format(whole, separator, fraction, "$pattern")
            for whole, separator, fraction in re.findall(pattern, input_sentence)
        ]

        if pattern_numbers:
            # dict.fromkeys removes duplicates while keeping a stable order
            return list(dict.fromkeys(pattern_numbers))

        # No decimal pattern: fall back to the tokens spaCy considers numbers
        numbers = [token.text for token in nlp(input_sentence) if token.like_num]

    except Exception:
        # Best effort: any processing failure is reported as "nothing found"
        return 0

    return list(dict.fromkeys(numbers)) or 0
58
+
59
# Free-text number words understood by numeric_freetext_dot_freetext
_FREETEXT_NUMBER_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}


def _freetext_words_to_digits(words):
    """Turn number words into a digit string; raise KeyError on an unknown word."""
    values = [_FREETEXT_NUMBER_VALUES[word] for word in words]

    # Words that are all single digits are read positionally:
    # "one two" -> "12", "zero five" -> "05"
    if all(value < 10 for value in values):
        return "".join(str(value) for value in values)

    # Otherwise compose the words as a spoken number:
    # "twenty three" -> 23, "two hundred five" -> 205
    total = 0
    current = 0
    for value in values:
        if value >= 1000:
            total += max(current, 1) * value
            current = 0
        elif value == 100:
            current = max(current, 1) * 100
        else:
            current += value
    return str(total + current)


def numeric_freetext_dot_freetext(text):
    """
    Convert a fully free-text decimal such as 'six point five' into a float.

    The text must look like "<words> point|decimal|dot|comma <words>", where
    every word is a known number word. The words before the separator form the
    whole part and the words after it form the decimal digits, e.g.
    "twenty three point five" -> 23.5 and "six point twelve" -> 6.12.

    Returns the value as a float, or 0 when the text does not match, contains
    an unknown word (e.g. a numeric token such as "6") or is not a string.
    Note that "zero point zero" yields 0.0, which is also falsy.
    """
    pattern = re.compile(r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)')

    try:
        match = pattern.search(text)
        if not match:
            return 0

        whole_digits = _freetext_words_to_digits(match.group(1).lower().split())
        decimal_digits = _freetext_words_to_digits(match.group(3).lower().split())

    except (KeyError, TypeError):
        # KeyError: unknown word; TypeError: text is not a string
        return 0

    # Building the literal avoids the rounding drift of summing powers of 0.1
    return float("{}.{}".format(whole_digits, decimal_digits))
122
+
123
+
124
+
125
def numeric_number_dot_freetext(text):
    """
    Convert mixed decimals such as '6 point five' or 'six point 5' into a float.

    The text must contain "<part> decimal|point|dot|comma <part>", where each
    part is either a run of digits or a single number word (zero to nineteen
    and the tens up to ninety, matched case-insensitively). The first part is
    the whole number and the second part supplies the decimal digits, so
    "6 point 05" -> 6.05 and "6 point twelve" -> 6.12.

    Returns the value as a float, or 0 when the text does not match, a part is
    not a number, or the text is not a string.
    """
    # Map number words to the digits they contribute
    num_dict = {"zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
                "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
                "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13",
                "fourteen": "14", "fifteen": "15", "sixteen": "16",
                "seventeen": "17", "eighteen": "18", "nineteen": "19",
                "twenty": "20", "thirty": "30", "forty": "40", "fifty": "50",
                "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90"}

    # Pattern extracting the part before and the part after the separator
    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"

    try:
        match = re.search(pattern, text)
    except TypeError:
        # text is not a string
        return 0

    if not match:
        return 0

    digit_parts = []
    for raw_part in match.groups():
        part = num_dict.get(raw_part.lower(), raw_part)
        # Reject tokens float() would otherwise accept, e.g. "nan" or "inf"
        if not part.isdecimal():
            return 0
        digit_parts.append(part)

    # Building the literal keeps leading zeros of the decimal part ("05")
    return float("{}.{}".format(*digit_parts))
164
+
165
+
166
def convert_into_numeric(num_list):
    """
    Convert the single captured number into a numeric form.

    ``num_list`` is the output of capture_numbers: a list of strings, or 0.
    Exactly one captured number is expected. The conversion tries, in order:

    1. A token that float() can parse (e.g. "5", "3.5"). It is returned
       unchanged, i.e. still as the original token.
    2. A "$pattern" entry (e.g. "six point five $pattern"), converted with
       numeric_freetext_dot_freetext and then numeric_number_dot_freetext.
    3. Any other free-text number (e.g. "two", "million", "twenty three"),
       converted with w2n.word_to_num.

    Returns {'Number': value} on success and 0 otherwise (empty input, more
    than one captured number, or a number that could not be converted).
    """
    # Only one number should exist; nothing or several numbers is an error
    if not num_list or len(num_list) > 1:
        return 0

    target_num = num_list[0]

    # Case 1: the token is already an integer or float literal
    try:
        float(target_num)
    except (TypeError, ValueError):
        pass
    else:
        # NOTE: the original token is returned, not the parsed float, to keep
        # the result identical for existing callers
        return {'Number': target_num}

    if not isinstance(target_num, str):
        return 0

    # Case 2: one of the "<part> point <part>" patterns (all combinations)
    if "$pattern" in target_num:
        num = target_num.split("$pattern")[0]

        # First 'six point five', then '6 point 5', '6 point five', 'six point 5'
        for converter in (numeric_freetext_dot_freetext, numeric_number_dot_freetext):
            num_conversion = converter(num)
            if num_conversion:
                return {'Number': num_conversion}

        # Neither converter understood the pattern
        return 0

    # Case 3: free-text numbers without patterns
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        # Best effort: any failure inside the third-party parser means "no number"
        return 0
215
+
216
+
217
+
218
def magnitude_binding(input_text):
    """
    Extract the number referred to in a text and convert it into numeric form.

    Runs capture_numbers on the text and passes the captured numbers to
    convert_into_numeric.

    Returns the result of convert_into_numeric (a {'Number': value} dict on
    success), or 0 when no single number could be extracted or any step failed.
    """
    try:
        numeric_target_numbers = convert_into_numeric(capture_numbers(input_text))
    except Exception:
        # Best effort: a failure in either step is reported as "no number"
        return 0

    # Never leak None to callers; 0 is the failure value used across this module
    return 0 if numeric_target_numbers is None else numeric_target_numbers