Spaces:
Sleeping
Sleeping
Commit
·
3f018ec
1
Parent(s):
6a57ce1
Update app.py
Browse files
app.py
CHANGED
@@ -1,247 +1,19 @@
|
|
1 |
-
import spacy
|
2 |
-
import re
|
3 |
-
from word2number import w2n
|
4 |
-
|
5 |
-
# Load the spaCy model, downloading it only when it is not installed yet.
# The download used to run unconditionally on every start-up.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # NOTE(review): spaCy reports a missing model package as OSError (E050);
    # confirm against the installed spaCy version.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
|
8 |
-
|
9 |
-
|
10 |
-
def capture_numbers(input_sentence):
    """Capture the numbers mentioned in a sentence, numeric or free-text.

    Spelled-out decimals ("six point five", "6 point five", "six point 5",
    "6 point 5") take priority: when at least one is found, only those
    phrases are returned, each suffixed with the marker ``$pattern`` so that
    a later step can recognise them. Otherwise the sentence is parsed with
    the module-level spaCy pipeline ``nlp`` and the text of every token whose
    ``like_num`` flag is set is returned.

    Args:
        input_sentence: The sentence to inspect.

    Returns:
        A list of unique strings in order of first appearance, or ``0`` when
        nothing was found or the sentence could not be processed.
    """
    try:
        # "<word> <separator> <word>"; both words may be digits or letters.
        # NOTE(review): the words are not checked to be numbers, so a phrase
        # like "the point is" is captured too; conversion is left to callers.
        decimal_phrase = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"
        matches = re.findall(decimal_phrase, input_sentence)

        if matches:
            tagged = [
                "{} {} {} {}".format(whole, separator, fraction, '$pattern')
                for whole, separator, fraction in matches
            ]
            # dict.fromkeys drops duplicates while keeping a stable order
            return list(dict.fromkeys(tagged))

        # No spelled-out decimal: collect ints, floats and number words
        doc = nlp(input_sentence)
        numbers = [token.text for token in doc if token.like_num]
        unique_numbers = list(dict.fromkeys(numbers))

        return unique_numbers if unique_numbers else 0

    except Exception:
        # Best-effort contract: any processing failure is reported as 0.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return 0
|
58 |
-
|
59 |
-
# Free-text number words and their numeric values
_NUMBER_WORDS = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}

# "<words> <separator> <words>", e.g. "twenty five point five"
_FREETEXT_DECIMAL_RE = re.compile(
    r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)'
)


def _number_words_to_digits(words):
    """Return the decimal digit string spelled by *words*.

    Raises KeyError when a word is not a known number word.
    """
    values = [_NUMBER_WORDS[word] for word in words]

    # Digit-by-digit reading ("two five" -> "25", "zero five" -> "05")
    if all(value < 10 for value in values):
        return "".join(str(value) for value in values)

    # Regular composition ("twenty five" -> 25, "one hundred five" -> 105)
    total = 0
    current = 0
    for value in values:
        if value < 100:
            current += value
        elif value == 100:
            current = max(current, 1) * 100
        else:
            total += max(current, 1) * value
            current = 0
    return str(total + current)


def numeric_freetext_dot_freetext(text):
    """Convert a fully spelled-out decimal such as 'six point five'.

    Both sides of the separator (point, decimal, dot or comma) must consist
    only of known number words. Multi-word parts are composed as numbers
    ("twenty five point five" -> 25.5) rather than weighted by position,
    which used to yield 205.5.

    Args:
        text: Text containing the spelled-out decimal.

    Returns:
        The value as a float, or ``0`` when the text does not match, contains
        a word that is not a number word, or is not a string.
    """
    try:
        match = _FREETEXT_DECIMAL_RE.search(text)
        if not match:
            return 0

        whole_digits = _number_words_to_digits(match.group(1).lower().split())
        fraction_digits = _number_words_to_digits(match.group(3).lower().split())

        # Parsing the assembled literal avoids accumulating float error
        return float("{}.{}".format(int(whole_digits), fraction_digits))

    except (KeyError, TypeError):
        # KeyError: unknown word (e.g. "6"); TypeError: text is not a string
        return 0
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
def numeric_number_dot_freetext(text):
    """Convert mixed decimals such as '6 point five', 'six point 5' or '6 point 5'.

    Each side of the separator (decimal, point, dot or comma) is a single
    token: either digits or one of the words zero..nine. Word lookup is
    case-insensitive, in line with numeric_freetext_dot_freetext.

    Args:
        text: Text containing the decimal expression.

    Returns:
        The value as a float, or ``0`` when the text does not match, a side
        cannot be read as a number, or the text is not a string.
    """
    # Single-digit words accepted on either side of the separator
    digit_words = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
                   "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9}

    # Whole part, separator word, fractional part
    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"

    try:
        match = re.search(pattern, text)
    except TypeError:
        # text is not a string
        return 0

    if not match:
        return 0

    whole, fraction = match.group(1), match.group(2)

    # Map digit words to their value; anything else is parsed by float()
    whole = digit_words.get(whole.lower(), whole)
    fraction = digit_words.get(fraction.lower(), fraction)

    try:
        # The length of the fractional token sets its scale, so "05" -> 0.05
        return float(whole) + float(fraction) / (10 ** len(str(fraction)))
    except (ValueError, OverflowError):
        # ValueError: a side is not a number (e.g. "ten");
        # OverflowError: absurdly long digit strings
        return 0
|
164 |
-
|
165 |
-
|
166 |
-
def convert_into_numeric(num_list):
    """Convert the single captured number into a numeric form.

    Args:
        num_list: The candidates produced by capture_numbers: a list of
            strings, or a falsy value such as ``0`` when nothing was captured.

    Returns:
        ``{'Number': value}`` on success, otherwise ``0``. ``0`` is returned
        when nothing was captured, when more than one candidate was captured
        (exactly one number is expected), or when the candidate cannot be
        converted. The last case used to fall through and return ``None``
        for ``$pattern`` candidates.
    """
    # Exactly one candidate is expected
    if not num_list or len(num_list) > 1:
        return 0

    target_num = num_list[0]

    # Case 1: already an int/float literal.
    # NOTE(review): the original string is returned here, not the parsed
    # float, unlike the other branches; kept as-is for backward compatibility.
    try:
        float(target_num)
    except (TypeError, ValueError):
        pass
    else:
        return {'Number': target_num}

    # Case 2: a "<x> point <y>" phrase tagged by capture_numbers
    if "$pattern" in target_num:
        num = target_num.split("$", 1)[0]

        # Try the all-words form first ("six point five"), then the mixed
        # forms ("6 point 5", "6 point five", "six point 5")
        num_conversion = (
            numeric_freetext_dot_freetext(num)
            or numeric_number_dot_freetext(num)
        )
        if num_conversion:
            return {'Number': num_conversion}
        return 0

    # Case 3: plain number words (e.g. two, million, twenty three)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        # The error types raised by w2n are not visible here; any failure
        # means the candidate is not a number.
        return 0
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
def magnitude_binding(input_text):
    """Extract the single magnitude number mentioned in a text.

    Runs capture_numbers on the text and passes its result to
    convert_into_numeric.

    Args:
        input_text: The sentence to analyse.

    Returns:
        Whatever convert_into_numeric returns for the captured candidates,
        or ``0`` when either step raises.
    """
    try:
        return convert_into_numeric(capture_numbers(input_text))
    except Exception:
        # Best-effort contract: failures are reported as 0. Unlike a bare
        # except, KeyboardInterrupt/SystemExit still propagate.
        return 0
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
from transformers import pipeline
|
232 |
import gradio as gr
|
233 |
|
234 |
-
|
235 |
-
|
|
|
|
|
236 |
|
237 |
examples = [
|
238 |
-
["
|
239 |
-
["
|
240 |
-
["
|
241 |
-
["I want all earthquakes
|
242 |
-
["I want all earthquakes
|
243 |
-
["I want all earthquakes
|
244 |
-
["I want all earthquakes
|
245 |
]
|
246 |
|
247 |
gr.Interface(
|
@@ -252,4 +24,4 @@ gr.Interface(
|
|
252 |
description=description,
|
253 |
examples=examples,
|
254 |
enable_queue=True,
|
255 |
-
).launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from transformers import pipeline
|
2 |
import gradio as gr
|
3 |
|
4 |
+
from helper import magnitude_binding
|
5 |
+
|
6 |
+
title = "Magnitudes Identification Demo"
|
7 |
+
description = "This is a simple demo just for demonstration purposes, so that Serco team might have the chance to validate the first results of the Natural Language module concerning the identification of magnitude numbers \n\n NOTE: DO NOT ENTER DATES FOR THIS TESTING"
|
8 |
|
9 |
examples = [
|
10 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than 6"],
|
11 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than 6.5"],
|
12 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than six"],
|
13 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than six point five"],
|
14 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than 6 point five"],
|
15 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than six point 5"],
|
16 |
+
["I want all earthquakes that are located in Ishkoshim, Tajikistan with a magnitude greater than 6 point 5"],
|
17 |
]
|
18 |
|
19 |
gr.Interface(
|
|
|
24 |
description=description,
|
25 |
examples=examples,
|
26 |
enable_queue=True,
|
27 |
+
).launch()
|