Spaces:

ValadisCERTH
/

NumbersModuleSerco

Sleeping

App Files Files Community

ValadisCERTH committed on Mar 23, 2023

Commit

0493ed8

1 Parent(s): f89c921

Create app.py

Browse files

Files changed (1) hide show

app.py +262 -0

app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+import spacy
+import re
+from word2number import w2n
# Load the spaCy model once at import time. The model is downloaded only when
# it is not installed yet, so that a restart does not re-fetch the large
# package on every start-up.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
def capture_numbers(input_sentence):
    """
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Spoken-decimal expressions such as "six point five", "5 point five",
    "six point 5" or "5 point 5" take priority: when at least one is found,
    only those expressions are returned, each rebuilt as
    "<left> <separator> <right> $pattern". The trailing "$pattern" marker lets
    a later step recognise them.

    When no such expression is found, the sentence is parsed with the
    module-level spaCy pipeline ``nlp`` and the text of every token flagged
    ``like_num`` is returned.

    Returns:
        A list of unique strings in order of first appearance, or 0 when no
        number is found or the sentence cannot be processed.
    """
    # A single pattern covers every digit/word combination because \w+ matches
    # both digits and letters.
    # NOTE(review): \w+ also matches ordinary words ("the point of"); such
    # captures are only rejected later, at conversion time.
    decimal_pattern = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"

    try:
        matches = re.findall(decimal_pattern, input_sentence)

        # Append the $pattern marker to easily identify these in a later step.
        pattern_numbers = [
            "{} {} {} {}".format(left, separator, right, '$pattern')
            for left, separator, right in matches
        ]
        if pattern_numbers:
            # dict.fromkeys removes duplicates while keeping a stable order.
            return list(dict.fromkeys(pattern_numbers))

        # Capture ints and floats as well as words like "eight" or "hundred".
        doc = nlp(input_sentence)
        numbers = [token.text for token in doc if token.like_num]
        final_numbers = list(dict.fromkeys(numbers))
        return final_numbers if final_numbers else 0
    except Exception:
        # Deliberate best effort: any processing failure means "no number".
        return 0
# Free-text number words understood by numeric_freetext_dot_freetext.
_NUMBER_WORDS = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}

# "<words> point|decimal|dot|comma <words>"
_FREETEXT_DECIMAL_RE = re.compile(
    r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)'
)


def _number_words_to_digits(words):
    """Return the digit string spelled out by a list of number words.

    Raises KeyError when a word is not a known number word.
    """
    values = [_NUMBER_WORDS[word] for word in words]

    # Digit-by-digit reading ("two five" -> "25"); keeps leading zeros, which
    # matter on the decimal side ("zero five" -> "05").
    if all(value < 10 for value in values):
        return "".join(str(value) for value in values)

    # Compositional reading ("twenty three" -> "23", "two hundred" -> "200").
    total = 0
    current = 0
    for value in values:
        if value == 100:
            current = max(current, 1) * 100
        elif value > 100:
            total += max(current, 1) * value
            current = 0
        else:
            current += value
    return str(total + current)


def numeric_freetext_dot_freetext(text):
    """
    Convert a fully spelled-out decimal such as 'six point five' to a float.

    Both sides of the separator (point, decimal, dot or comma) must consist
    only of known number words. The value is assembled from digits rather
    than by multiplying with powers of 0.1, so 'five point six' yields
    exactly 5.6.

    Returns:
        The float value, or 0 when the text does not match the expected shape
        or contains a word that is not a number word.
    """
    try:
        match = _FREETEXT_DECIMAL_RE.search(text)
        if not match:
            return 0
        whole_digits = _number_words_to_digits(match.group(1).lower().split())
        decimal_digits = _number_words_to_digits(match.group(3).lower().split())
    except (KeyError, TypeError):
        # KeyError: a word is not a number word (e.g. "5" or "larger").
        # TypeError: text is not a string.
        return 0
    return float(whole_digits + "." + decimal_digits)
def numeric_number_dot_freetext(text):
    """
    Convert mixed spoken decimals such as '6 point five', 'six point 5' or
    '6 point 5' to a float.

    Each side of the separator (decimal, point, dot or comma) is either a run
    of digits or one of the words zero..nine (matched case-insensitively).
    The value is assembled from the digit strings, so '3 point 05' yields
    3.05.

    Returns:
        The float value, or 0 when the text does not match the expected shape
        or a side is neither digits nor a known digit word.
    """
    # Map the single-digit words to their digit.
    num_dict = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8, "nine": 9}

    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"

    try:
        match = re.search(pattern, text)
    except TypeError:
        # text is not a string
        return 0
    if not match:
        return 0

    digit_parts = []
    for raw_part in match.groups():
        part = raw_part.lower()
        part = str(num_dict.get(part, part))
        # Reject anything that is not pure digits; float() alone would also
        # accept tokens such as "inf", "nan" or "1e5".
        if not part.isdecimal():
            return 0
        digit_parts.append(part)

    whole_digits, decimal_digits = digit_parts
    return float(whole_digits + "." + decimal_digits)
def convert_into_numeric(num_list):
    """
    Convert the single captured number into numeric form.

    Args:
        num_list: the captured numbers as a list of strings, or a falsy value
            (such as 0 or an empty list) when nothing was captured.

    Returns:
        {'Number': value} with a numeric value on success, or 0 when nothing
        was captured, more than one number was captured, or the captured
        string cannot be converted.
    """
    # Exactly one number must have been captured; anything else is ambiguous.
    if not num_list or len(num_list) > 1:
        return 0

    target_num = num_list[0]

    # Case 1: already written as an integer or a float ("6", "8.23").
    # int is tried first so that whole numbers stay whole.
    for cast in (int, float):
        try:
            return {'Number': cast(target_num)}
        except (TypeError, ValueError):
            continue

    # Case 2: a spoken decimal tagged with the "$pattern" marker.
    if "$pattern" in target_num:
        num = target_num.split("$")[0]
        # Try the fully spelled-out form first ("six point five"), then the
        # mixed forms ("6 point 5", "6 point five", "six point 5").
        for converter in (numeric_freetext_dot_freetext, numeric_number_dot_freetext):
            num_conversion = converter(num)
            if num_conversion:
                return {'Number': num_conversion}
        # Neither converter recognised the text.
        return 0

    # Case 3: free-text numbers without a pattern ("two", "million",
    # "twenty three", ...).
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except ValueError:
        return 0
def magnitude_binding(input_text):
    """
    Extract the single number mentioned in a sentence and return it in
    numeric form.

    Chains capture_numbers and convert_into_numeric.

    Returns:
        Whatever convert_into_numeric returns for the captured numbers, or 0
        when any step raises.
    """
    try:
        return convert_into_numeric(capture_numbers(input_text))
    except Exception:
        # Best effort for the demo UI: report "no number" instead of crashing,
        # but let KeyboardInterrupt / SystemExit propagate.
        return 0
+from transformers import pipeline
+import gradio as gr
# ---------------------------------------------------------------------------
# Gradio demo: one text box in, the result of magnitude_binding out as text.
# ---------------------------------------------------------------------------

# Heading and explanatory text shown at the top of the demo page.
title = "Natural Language module Demo for Magnitude numbers identification"
description = (
    "This is a simple demo just for demonstration purposes, so that Serco team might have the chance to validate the results of the Natural Language module concerning magnitude number identification, while in progress \n\n NOTE: DO NOT ENTER DATES FOR THIS TESTING"
)

# Sample sentences offered in the UI; each example is a one-element row
# because the interface has a single input.
examples = [
    [sentence]
    for sentence in (
        "Earthquake located in Ishkoshim, Tajikistan with magnitude greater than 6",
        "Give me all the earthquakes with magnitude above than 8.23 in the region of Athens",
        "Earthquake happened in Rome with a magnitude of four",
        "I want all earthquakes larger than five point six that occurred in Rome",
        "I want all earthquakes larger than five point 6 that occurred in Rome",
        "I want all earthquakes larger than 5 point six that occurred in Rome",
        "I want all earthquakes larger than 5 point 6 that occurred in Rome",
    )
]

# Build the interface first, then start serving it.
demo = gr.Interface(
    fn=magnitude_binding,
    inputs="text",
    outputs="text",
    title=title,
    description=description,
    examples=examples,
    enable_queue=True,
)
demo.launch()