cdactvm committed on
Commit
c8bb448
·
verified ·
1 Parent(s): dbb14d8

Update text2int.py

Browse files
Files changed (1) hide show
  1. text2int.py +65 -50
text2int.py CHANGED
@@ -1,35 +1,41 @@
1
- from isNumber import is_number # Remove or replace this if unnecessary
 
 
 
 
 
 
 
2
 
3
  def text_to_int(textnum, numwords={}):
4
- # Define units, tens, and scales including "lac"
5
- units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
6
- 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
7
- 'sixteen', 'seventeen', 'eighteen', 'nineteen']
8
  tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
9
- scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # "lac" added
10
- ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
 
11
  ordinal_endings = [('ieth', 'y'), ('th', '')]
12
-
13
  if not numwords:
14
  numwords['and'] = (1, 0) # Handle "one hundred and twenty"
15
-
16
- # Add units, tens, and scales to numwords
17
  for idx, word in enumerate(units):
18
  numwords[word] = (1, idx)
19
  for idx, word in enumerate(tens):
20
- numwords[word] = (1, idx * 10)
21
-
22
  for idx, word in enumerate(scales):
23
- numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle "lac" as 10^5
24
-
25
- # Remove hyphens and normalize input
26
- textnum = textnum.replace('-', ' ')
27
 
 
 
 
28
  current = result = 0
29
  curstring = ''
30
  onnumber = False
31
  lastunit = False
32
  lastscale = False
 
 
33
 
34
  def is_numword(x):
35
  return is_number(x) or x in numwords
@@ -38,8 +44,16 @@ def text_to_int(textnum, numwords={}):
38
  if is_number(x):
39
  return 0, int(x.replace(',', ''))
40
  return numwords[x]
41
-
42
- for word in textnum.split():
 
 
 
 
 
 
 
 
43
  if word in ordinal_words:
44
  scale, increment = (1, ordinal_words[word])
45
  current = current * scale + increment
@@ -49,40 +63,41 @@ def text_to_int(textnum, numwords={}):
49
  onnumber = True
50
  lastunit = False
51
  lastscale = False
52
- else:
53
- for ending, replacement in ordinal_endings:
54
- if word.endswith(ending):
55
- word = f"{word[:-len(ending)]}{replacement}"
56
-
57
- if not is_numword(word) or (word == 'and' and not lastscale):
58
- if onnumber:
59
- curstring += repr(result + current) + " "
60
- curstring += word + " "
61
- result = current = 0
62
- onnumber = False
63
- lastunit = False
64
- lastscale = False
65
- else:
66
- scale, increment = from_numword(word)
67
- onnumber = True
68
-
69
- if lastunit and word not in scales:
70
- curstring += repr(result + current) + " "
71
- result = current = 0
72
-
73
- if scale > 1:
74
- current = max(1, current)
75
-
76
- current = current * scale + increment
77
 
78
- if scale >= 100:
79
- result += current
80
- current = 0
81
 
82
- lastscale = word in scales
83
- lastunit = word in units
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  if onnumber:
86
- curstring += repr(result + current)
87
-
 
 
 
88
  return curstring.strip()
 
1
+ import re
2
+
3
def is_number(s):
    """Return True if *s* is a string spelling a finite numeric literal.

    Thousands separators (commas) are stripped before parsing, so
    ``"1,000"`` counts as a number.

    Args:
        s: The token to test; must be a string (``str.replace`` is called
            on it).

    Returns:
        True when the comma-stripped text parses as a finite float,
        False otherwise.
    """
    import math  # local import keeps this block self-contained

    try:
        value = float(s.replace(',', ''))  # Handles numbers with commas
    except ValueError:
        return False
    # float() also accepts "nan", "inf" and "infinity" (any case, optional
    # sign) and overflows literals such as "1e999" to inf. Those are words or
    # unusable values, not numerals, and cannot be converted to int later.
    return math.isfinite(value)
9
 
10
  def text_to_int(textnum, numwords={}):
11
+ units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
12
+ 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen']
 
 
13
  tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
14
+ scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # "lac" handled as 10^5
15
+ ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fourth': 4, 'fifth': 5, 'sixth': 6,
16
+ 'seventh': 7, 'eighth': 8, 'ninth': 9, 'tenth': 10, 'eleventh': 11, 'twelfth': 12}
17
  ordinal_endings = [('ieth', 'y'), ('th', '')]
18
+
19
  if not numwords:
20
  numwords['and'] = (1, 0) # Handle "one hundred and twenty"
 
 
21
  for idx, word in enumerate(units):
22
  numwords[word] = (1, idx)
23
  for idx, word in enumerate(tens):
24
+ if word:
25
+ numwords[word] = (1, idx * 10)
26
  for idx, word in enumerate(scales):
27
+ numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0)
 
 
 
28
 
29
+ textnum = textnum.lower().replace('-', ' ') # Normalize input
30
+ words = textnum.split()
31
+
32
  current = result = 0
33
  curstring = ''
34
  onnumber = False
35
  lastunit = False
36
  lastscale = False
37
+ decimal_part = []
38
+ is_decimal = False
39
 
40
  def is_numword(x):
41
  return is_number(x) or x in numwords
 
44
  if is_number(x):
45
  return 0, int(x.replace(',', ''))
46
  return numwords[x]
47
+
48
+ for word in words:
49
+ if word == 'point':
50
+ is_decimal = True
51
+ continue
52
+
53
+ for ending, replacement in ordinal_endings:
54
+ if word.endswith(ending):
55
+ word = f"{word[:-len(ending)]}{replacement}"
56
+
57
  if word in ordinal_words:
58
  scale, increment = (1, ordinal_words[word])
59
  current = current * scale + increment
 
63
  onnumber = True
64
  lastunit = False
65
  lastscale = False
66
+ elif is_numword(word):
67
+ scale, increment = from_numword(word)
68
+ onnumber = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
+ if is_decimal:
71
+ decimal_part.append(str(increment))
72
+ continue
73
 
74
+ if lastunit and word not in scales:
75
+ curstring += str(result + current) + " "
76
+ result = current = 0
77
 
78
+ if scale > 1:
79
+ current = max(1, current)
80
+ current = current * scale + increment
81
+ if scale >= 100:
82
+ result += current
83
+ current = 0
84
+ lastscale = word in scales
85
+ lastunit = word in units
86
+ elif word == 'and' and lastscale:
87
+ continue # Ignore "and" when used in valid contexts
88
+ else:
89
+ if onnumber:
90
+ curstring += str(result + current) + " "
91
+ curstring += word + " "
92
+ result = current = 0
93
+ onnumber = False
94
+ lastunit = False
95
+ lastscale = False
96
+
97
  if onnumber:
98
+ curstring += str(result + current)
99
+
100
+ if decimal_part:
101
+ curstring += '.' + ''.join(decimal_part)
102
+
103
  return curstring.strip()