File size: 3,498 Bytes
c8bb448
 
 
 
 
 
 
 
2b75669
 
c8bb448
 
2b75669
c8bb448
 
 
2b75669
c8bb448
2b75669
 
 
 
 
c8bb448
 
2b75669
c8bb448
2b75669
c8bb448
 
 
2b75669
 
 
 
 
c8bb448
 
2b75669
 
 
 
 
 
 
 
c8bb448
 
 
 
 
 
 
 
 
 
2b75669
 
 
 
 
 
 
 
 
c8bb448
 
 
2b75669
c8bb448
 
 
2b75669
c8bb448
 
 
2b75669
c8bb448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b75669
c8bb448
 
 
 
 
2b75669
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re

def is_number(s):
    """Return True when ``s`` parses as a float once commas are removed.

    Commas are stripped first so that thousands-separated strings such as
    ``"1,234.5"`` are accepted. Anything ``float`` itself accepts (for
    example ``"1e3"`` or ``"nan"``) is therefore reported as a number.
    """
    without_commas = s.replace(',', '')
    try:
        float(without_commas)
    except ValueError:
        return False
    else:
        return True

def text_to_int(textnum, numwords=None):
    """Rewrite the spelled-out numbers inside ``textnum`` as digits.

    The input is lower-cased, hyphens are turned into spaces, and the text is
    split on whitespace. Runs of number words ("one hundred and twenty
    three"), ordinals ("twenty-first", "fifth", "5th") and digit tokens
    ("2 thousand 500", "1,000") are collapsed into a single integer; every
    other token is copied through unchanged. Ordinals are emitted as plain
    integers (the ordinal suffix is not preserved).

    "point" followed by digit words starts a decimal fraction
    ("three point one four" -> "3.14"); otherwise it is kept as a normal
    word. "and" is dropped only when it sits between a scale word and another
    number word ("one hundred and five"); otherwise it is kept. A scale word
    directly after a decimal fraction cannot be applied with integer
    arithmetic and is left as text ("two point five million" ->
    "2.5 million").

    Args:
        textnum: The text to convert.
        numwords: Optional mapping of number word -> ``(scale, increment)``.
            When omitted (or empty) the built-in English table is used; an
            empty dict passed by the caller is populated in place.

    Returns:
        The converted, lower-cased text with tokens separated by one space.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
             'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen',
             'eighteen', 'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    # Explicit powers of ten: deriving them from a list index breaks as soon
    # as an irregular scale such as "lac" (10^5) is inserted.
    scale_exponents = {'hundred': 2, 'thousand': 3, 'lac': 5,
                       'million': 6, 'billion': 9, 'trillion': 12}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fourth': 4, 'fifth': 5, 'sixth': 6,
                     'seventh': 7, 'eighth': 8, 'ninth': 9, 'tenth': 10, 'eleventh': 11, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]
    unit_words = set(units)

    if numwords is None:
        numwords = {}
    if not numwords:
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:
                numwords[word] = (1, idx * 10)
        for word, exponent in scale_exponents.items():
            numwords[word] = (10 ** exponent, 0)

    words = textnum.lower().replace('-', ' ').split()

    pieces = []      # output tokens, joined with single spaces at the end
    decimals = []    # digit strings collected after "point"
    current = result = 0
    onnumber = in_decimal = lastunit = lastscale = False

    def as_cardinal(token):
        """Return (scale, increment, is_unit) for a cardinal token, else None."""
        if token in numwords:
            scale, increment = numwords[token]
            return scale, increment, token in unit_words
        try:
            # Scale 0 marks a literal digit token such as "42" or "1,000".
            return 0, int(token.replace(',', '')), True
        except ValueError:
            return None

    def resolve(token):
        """Return (scale, increment, is_unit, is_ordinal) or None for a plain word."""
        if token in ordinal_words:
            return 1, ordinal_words[token], True, True
        cardinal = as_cardinal(token)
        if cardinal is not None:
            return cardinal + (False,)
        # Only accept a stripped ordinal suffix when the remainder really is
        # a number, so words like "with" or "month" are left untouched.
        for ending, replacement in ordinal_endings:
            if token.endswith(ending):
                cardinal = as_cardinal(token[:-len(ending)] + replacement)
                if cardinal is not None:
                    return cardinal + (True,)
        return None

    def peek(pos):
        """Resolve the token after position ``pos`` (None at end of input)."""
        return resolve(words[pos + 1]) if pos + 1 < len(words) else None

    def flush():
        """Emit the number being accumulated, if any, and reset all state."""
        nonlocal current, result, onnumber, in_decimal, lastunit, lastscale
        if onnumber:
            text = str(result + current)
            if decimals:
                text += '.' + ''.join(decimals)
            pieces.append(text)
        current = result = 0
        decimals.clear()
        onnumber = in_decimal = lastunit = lastscale = False

    for pos, word in enumerate(words):
        if word == 'point':
            upcoming = peek(pos)
            if upcoming is not None and upcoming[0] <= 1 and not upcoming[3]:
                if in_decimal:
                    flush()  # "one point two point three" -> two numbers
                onnumber = in_decimal = True
                lastunit = lastscale = False
            else:
                flush()
                pieces.append(word)
            continue

        if word == 'and':
            # Filler only between a scale and another number word.
            if not (lastscale and peek(pos) is not None):
                flush()
                pieces.append(word)
            continue

        entry = resolve(word)
        if entry is None:
            flush()
            pieces.append(word)
            continue

        scale, increment, is_unit, is_ordinal = entry

        if in_decimal:
            if scale > 1:
                # A fraction cannot be scaled with integer arithmetic.
                flush()
                pieces.append(word)
                continue
            if not is_ordinal:
                decimals.append(str(increment))
                continue
            flush()  # an ordinal ends the fraction and starts a new number

        if scale == 0:
            # Digit tokens only join a number right after a scale word
            # ("2 thousand 500"); otherwise they start a new number.
            if onnumber and not lastscale:
                flush()
            current += increment
        else:
            if lastunit and scale == 1:
                flush()  # "one two" is two separate numbers
            if scale > 1:
                current = max(1, current)
            current = current * scale + increment
            # "hundred" stays open so "three hundred thousand" multiplies.
            if scale > 100:
                result += current
                current = 0

        onnumber = True
        lastunit = is_unit
        lastscale = scale > 1

    flush()
    return ' '.join(pieces)