ValadisCERTH committed on
Commit
0493ed8
·
1 Parent(s): f89c921

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -0
app.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+ from word2number import w2n
4
+
5
# Load the spaCy model, downloading it only when it is not installed yet.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the model package cannot be found;
    # fetch it once and retry instead of re-downloading on every start-up.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
8
+
9
+
10
def capture_numbers(input_sentence):
    '''
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Decimal phrases such as "six point five", "5 point five", "six point 5"
    or "5 point 5" are looked for first. When at least one is found, the
    de-duplicated phrases are returned, each suffixed with " $pattern" so that
    a subsequent step can tell them apart from plain numbers.

    Otherwise the sentence is parsed with the module-level spaCy pipeline
    ``nlp`` and the text of every token whose ``like_num`` flag is set is
    returned, de-duplicated.

    Returns a list of strings, or 0 when nothing was captured or an error
    occurred.
    '''

    try:
        # <whole> <separator word> <fraction>, each side being a single word
        decimal_phrase = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"

        # This is the only pattern the module defines; looking up further,
        # undefined patterns would raise NameError and make every call return 0.
        pattern_numbers = [
            "{} {} {} {}".format(whole, separator, fraction, '$pattern')
            for whole, separator, fraction in re.findall(decimal_phrase, input_sentence)
        ]

        if pattern_numbers:
            # dict.fromkeys removes duplicates while keeping first-seen order
            return list(dict.fromkeys(pattern_numbers))

        # No decimal phrase: fall back to spaCy's number detection
        doc = nlp(input_sentence)
        numbers = [token.text for token in doc if token.like_num]
        final_numbers = list(dict.fromkeys(numbers))

        return final_numbers if final_numbers else 0

    except Exception:
        # Best effort: any failure (e.g. inside the NLP pipeline) means "no number"
        return 0
64
+
65
+
66
# Spelled-out English number words understood by the free-text converter
_FREETEXT_NUMBER_MAP = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000
}


def _freetext_words_to_int(words):
    '''
    Fold a sequence of number words (e.g. ['twenty', 'three']) into one integer.

    Raises KeyError when a word is missing from _FREETEXT_NUMBER_MAP.
    '''
    total = 0
    current = 0
    for word in words:
        value = _FREETEXT_NUMBER_MAP[word]
        if value == 100:
            # "hundred" multiplies the group collected so far ("two hundred")
            current = max(current, 1) * value
        elif value >= 1000:
            # thousand/million/... close the current group and scale it
            total += max(current, 1) * value
            current = 0
        else:
            current += value
    return total + current


def numeric_freetext_dot_freetext(text):
    '''
    Convert a fully spelled-out decimal such as 'six point five' into a float.

    The words before the separator (point/decimal/dot/comma) are combined as
    one number ('twenty three' -> 23). The words after it form the decimal
    digits: single digits are concatenated ('one four' -> .14), anything else
    is read as one number ('twenty five' -> .25).

    Returns the float value, or 0 when the text has no such phrase or contains
    a word that is not a known number word (digits such as '6' included).
    Note that a genuine value of zero is indistinguishable from that failure
    result.
    '''

    # Same phrase shape as before: <words> <separator> <words>
    pattern = re.compile(r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)')

    try:
        match = pattern.search(text)
        if not match:
            return 0

        whole_words = match.group(1).lower().split()
        fraction_words = match.group(3).lower().split()

        whole = _freetext_words_to_int(whole_words)

        fraction_values = [_FREETEXT_NUMBER_MAP[word] for word in fraction_words]
        if all(value < 10 for value in fraction_values):
            digits = "".join(str(value) for value in fraction_values)
        else:
            digits = str(_freetext_words_to_int(fraction_words))

        # Parsing the decimal text avoids the rounding noise that summing
        # powers of 0.1 introduces (6 * 0.1 == 0.6000000000000001)
        return float("{}.{}".format(whole, digits))

    except (KeyError, TypeError):
        # KeyError: unknown word; TypeError: text is not a string
        return 0
129
+
130
+
131
+
132
def numeric_number_dot_freetext(text):
    '''
    Convert mixed decimals such as '6 point five', 'six point 5' or '6 point 5'.

    Each side of the separator (decimal/point/dot/comma) is a single token:
    either digits or one of the words zero..nine (matched case-insensitively).

    Returns the float value, or 0 when the text has no such phrase or one of
    the sides is not a plain number. Note that a genuine value of zero is
    indistinguishable from that failure result.
    '''

    # Map single-digit words to numbers
    num_dict = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8, "nine": 9}

    # <token> <separator> <token>
    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"

    try:
        match = re.search(pattern, text)
        if not match:
            return 0

        # Normalise both sides to digit strings
        sides = []
        for token in match.groups():
            token = str(num_dict.get(token.lower(), token))
            if not token.isdigit():
                # rejects free words as well as float() oddities like "inf"/"nan"
                return 0
            sides.append(token)

        # Parsing "<whole>.<fraction>" keeps leading zeros of the fraction
        # ("05" -> .05) and avoids float division noise
        return float("{}.{}".format(sides[0], sides[1]))

    except (TypeError, ValueError):
        # TypeError: text is not a string; ValueError: digits float() cannot parse
        return 0
171
+
172
+
173
def convert_into_numeric(num_list):
    '''
    Convert the single captured number into numeric form.

    num_list is the result of the capture step: a list of strings, or a falsy
    value (0 / empty list) when nothing was captured. Exactly one captured
    number is accepted.

    Conversion is attempted in this order:
      1. plain int / float text ('6', '8.23')
      2. decimal phrases tagged with '$pattern' ('six point five $pattern'),
         via numeric_freetext_dot_freetext and then numeric_number_dot_freetext
      3. free-text numbers ('four', 'twenty three') via w2n.word_to_num

    Returns {'Number': <int or float>} on success and 0 otherwise.
    '''

    # Nothing captured, or more than one number: ambiguous, give up
    if not num_list or len(num_list) > 1:
        return 0

    target_num = num_list[0]

    # Plain numeric text: return the parsed value (not the raw string), so
    # every branch yields a number. int is tried first to keep '6' as 6.
    for parse in (int, float):
        try:
            return {'Number': parse(target_num)}
        except (TypeError, ValueError):
            continue

    # Decimal phrase tagged by the capture step (all word/digit combinations)
    if "$pattern" in target_num:
        phrase = target_num.split("$")[0]

        # 'six point five' first, then '6 point 5', '6 point five', 'six point 5'
        for converter in (numeric_freetext_dot_freetext, numeric_number_dot_freetext):
            num_conversion = converter(phrase)
            if num_conversion:
                return {'Number': num_conversion}

        # Neither converter understood the phrase
        return 0

    # Free-text number without a pattern (e.g. two, million, twenty three)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        # Best effort: anything the word parser rejects counts as "no number"
        return 0
222
+
223
+
224
+
225
def magnitude_binding(input_text):
    '''
    Extract the magnitude number mentioned in a sentence.

    Runs capture_numbers on the text and passes its result to
    convert_into_numeric. Returns whatever convert_into_numeric returns,
    or 0 when either step raises.
    '''

    try:
        target_numbers = capture_numbers(input_text)
        return convert_into_numeric(target_numbers)

    except Exception:
        # Best effort for the demo: report "no number" instead of crashing,
        # while still letting KeyboardInterrupt / SystemExit propagate
        return 0
235
+
236
+
237
+
238
+ from transformers import pipeline
239
+ import gradio as gr
240
+
241
# Texts shown at the top of the Gradio demo page
title = "Natural Language module Demo for Magnitude numbers identification"
description = (
    "This is a simple demo just for demonstration purposes, so that Serco team might "
    "have the chance to validate the results of the Natural Language module "
    "concerning magnitude number identification, while in progress \n\n "
    "NOTE: DO NOT ENTER DATES FOR THIS TESTING"
)

# Sample queries offered as clickable examples; each one is a single-input row
examples = [
    [sentence]
    for sentence in (
        "Earthquake located in Ishkoshim, Tajikistan with magnitude greater than 6",
        "Give me all the earthquakes with magnitude above than 8.23 in the region of Athens",
        "Earthquake happened in Rome with a magnitude of four",
        "I want all earthquakes larger than five point six that occurred in Rome",
        "I want all earthquakes larger than five point 6 that occurred in Rome",
        "I want all earthquakes larger than 5 point six that occurred in Rome",
        "I want all earthquakes larger than 5 point 6 that occurred in Rome",
    )
]

# Build the text-in / text-out interface around magnitude_binding and serve it
demo = gr.Interface(
    fn=magnitude_binding,
    inputs="text",
    outputs="text",
    title=title,
    description=description,
    examples=examples,
    enable_queue=True,
)
demo.launch()