Spaces:

cdactvm
/

Punjabi_ASR_Demo

Sleeping

App Files Files Community

cdactvm committed on Mar 3

Commit

c8bb448

verified ·

1 Parent(s): dbb14d8

Update text2int.py

Browse files

Files changed (1) hide show

text2int.py +65 -50

text2int.py CHANGED Viewed

@@ -1,35 +1,41 @@
-from isNumber import is_number  # Remove or replace this if unnecessary
 def text_to_int(textnum, numwords={}):
-    # Define units, tens, and scales including "lac"
-    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
-            'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
-            'sixteen', 'seventeen', 'eighteen', 'nineteen']
     tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
-    scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion']  # "lac" added
-    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
     ordinal_endings = [('ieth', 'y'), ('th', '')]
     if not numwords:
         numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
-        # Add units, tens, and scales to numwords
         for idx, word in enumerate(units):
             numwords[word] = (1, idx)
         for idx, word in enumerate(tens):
-            numwords[word] = (1, idx * 10)
         for idx, word in enumerate(scales):
-            numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0)  # Handle "lac" as 10^5
-    # Remove hyphens and normalize input
-    textnum = textnum.replace('-', ' ')
     current = result = 0
     curstring = ''
     onnumber = False
     lastunit = False
     lastscale = False
     def is_numword(x):
         return is_number(x) or x in numwords
@@ -38,8 +44,16 @@ def text_to_int(textnum, numwords={}):
         if is_number(x):
             return 0, int(x.replace(',', ''))
         return numwords[x]
-    for word in textnum.split():
         if word in ordinal_words:
             scale, increment = (1, ordinal_words[word])
             current = current * scale + increment
@@ -49,40 +63,41 @@ def text_to_int(textnum, numwords={}):
             onnumber = True
             lastunit = False
             lastscale = False
-        else:
-            for ending, replacement in ordinal_endings:
-                if word.endswith(ending):
-                    word = f"{word[:-len(ending)]}{replacement}"
-            if not is_numword(word) or (word == 'and' and not lastscale):
-                if onnumber:
-                    curstring += repr(result + current) + " "
-                curstring += word + " "
-                result = current = 0
-                onnumber = False
-                lastunit = False
-                lastscale = False
-            else:
-                scale, increment = from_numword(word)
-                onnumber = True
-                if lastunit and word not in scales:
-                    curstring += repr(result + current) + " "
-                    result = current = 0
-                if scale > 1:
-                    current = max(1, current)
-                current = current * scale + increment
-                if scale >= 100:
-                    result += current
-                    current = 0
-                lastscale = word in scales
-                lastunit = word in units
     if onnumber:
-        curstring += repr(result + current)
     return curstring.strip()

+import re
+def is_number(s):
+    try:
+        float(s.replace(',', ''))  # Handles numbers with commas
+        return True
+    except ValueError:
+        return False
 def text_to_int(textnum, numwords={}):
+    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
+             'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
     tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
+    scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion']  # "lac" handled as 10^5
+    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fourth': 4, 'fifth': 5, 'sixth': 6,
+                     'seventh': 7, 'eighth': 8, 'ninth': 9, 'tenth': 10, 'eleventh': 11, 'twelfth': 12}
     ordinal_endings = [('ieth', 'y'), ('th', '')]
     if not numwords:
         numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
         for idx, word in enumerate(units):
             numwords[word] = (1, idx)
         for idx, word in enumerate(tens):
+            if word:
+                numwords[word] = (1, idx * 10)
         for idx, word in enumerate(scales):
+            numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0)
+    textnum = textnum.lower().replace('-', ' ')  # Normalize input
+    words = textnum.split()
     current = result = 0
     curstring = ''
     onnumber = False
     lastunit = False
     lastscale = False
+    decimal_part = []
+    is_decimal = False
     def is_numword(x):
         return is_number(x) or x in numwords
         if is_number(x):
             return 0, int(x.replace(',', ''))
         return numwords[x]
+    for word in words:
+        if word == 'point':
+            is_decimal = True
+            continue
+        for ending, replacement in ordinal_endings:
+            if word.endswith(ending):
+                word = f"{word[:-len(ending)]}{replacement}"
         if word in ordinal_words:
             scale, increment = (1, ordinal_words[word])
             current = current * scale + increment
             onnumber = True
             lastunit = False
             lastscale = False
+        elif is_numword(word):
+            scale, increment = from_numword(word)
+            onnumber = True
+            if is_decimal:
+                decimal_part.append(str(increment))
+                continue
+            if lastunit and word not in scales:
+                curstring += str(result + current) + " "
+                result = current = 0
+            if scale > 1:
+                current = max(1, current)
+            current = current * scale + increment
+            if scale >= 100:
+                result += current
+                current = 0
+            lastscale = word in scales
+            lastunit = word in units
+        elif word == 'and' and lastscale:
+            continue  # Ignore "and" when used in valid contexts
+        else:
+            if onnumber:
+                curstring += str(result + current) + " "
+            curstring += word + " "
+            result = current = 0
+            onnumber = False
+            lastunit = False
+            lastscale = False
     if onnumber:
+        curstring += str(result + current)
+    if decimal_part:
+        curstring += '.' + ''.join(decimal_part)
     return curstring.strip()