duyv commited on
Commit
a926f18
·
verified ·
1 Parent(s): d704306

Upload 128 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +27 -0
  2. Vinorm/LICENSE +21 -0
  3. Vinorm/MANIFEST.in +9 -0
  4. Vinorm/README.md +36 -0
  5. Vinorm/build/lib/vinorm/Dict/Popular.txt +0 -0
  6. Vinorm/build/lib/vinorm/Mapping/Acronyms.txt +0 -0
  7. Vinorm/build/lib/vinorm/Mapping/Acronyms_shorten.txt +0 -0
  8. Vinorm/build/lib/vinorm/Mapping/BaseUnit.txt +109 -0
  9. Vinorm/build/lib/vinorm/Mapping/CurrencyUnit.txt +10 -0
  10. Vinorm/build/lib/vinorm/Mapping/LetterSoundEN.txt +33 -0
  11. Vinorm/build/lib/vinorm/Mapping/LetterSoundVN.txt +89 -0
  12. Vinorm/build/lib/vinorm/Mapping/Number.txt +10 -0
  13. Vinorm/build/lib/vinorm/Mapping/PrefixUnit.txt +20 -0
  14. Vinorm/build/lib/vinorm/Mapping/Symbol.txt +54 -0
  15. Vinorm/build/lib/vinorm/Mapping/Teencode.txt +484 -0
  16. Vinorm/build/lib/vinorm/RegexRule/Codenumber.txt +1 -0
  17. Vinorm/build/lib/vinorm/RegexRule/Date_1.txt +3 -0
  18. Vinorm/build/lib/vinorm/RegexRule/Date_2.txt +1 -0
  19. Vinorm/build/lib/vinorm/RegexRule/Date_3.txt +1 -0
  20. Vinorm/build/lib/vinorm/RegexRule/Date_From_To_1.txt +2 -0
  21. Vinorm/build/lib/vinorm/RegexRule/Date_From_To_2.txt +2 -0
  22. Vinorm/build/lib/vinorm/RegexRule/Email.txt +1 -0
  23. Vinorm/build/lib/vinorm/RegexRule/FootballOther.txt +3 -0
  24. Vinorm/build/lib/vinorm/RegexRule/FootballUnder.txt +1 -0
  25. Vinorm/build/lib/vinorm/RegexRule/Measurement.txt +4 -0
  26. Vinorm/build/lib/vinorm/RegexRule/Measurement_1.txt +4 -0
  27. Vinorm/build/lib/vinorm/RegexRule/Month.txt +1 -0
  28. Vinorm/build/lib/vinorm/RegexRule/NormalNumber.txt +4 -0
  29. Vinorm/build/lib/vinorm/RegexRule/Office.txt +1 -0
  30. Vinorm/build/lib/vinorm/RegexRule/PhoneNumber.txt +4 -0
  31. Vinorm/build/lib/vinorm/RegexRule/PoliticalDivision.txt +1 -0
  32. Vinorm/build/lib/vinorm/RegexRule/RomanNumber.txt +1 -0
  33. Vinorm/build/lib/vinorm/RegexRule/Street.txt +1 -0
  34. Vinorm/build/lib/vinorm/RegexRule/Time.txt +3 -0
  35. Vinorm/build/lib/vinorm/RegexRule/Website.txt +2 -0
  36. Vinorm/build/lib/vinorm/__init__.py +38 -0
  37. Vinorm/build/lib/vinorm/input.txt +1 -0
  38. Vinorm/build/lib/vinorm/lib/icu/64.2/Makefile.inc +292 -0
  39. Vinorm/build/lib/vinorm/lib/icu/64.2/pkgdata.inc +17 -0
  40. Vinorm/build/lib/vinorm/lib/libicudata.so +3 -0
  41. Vinorm/build/lib/vinorm/lib/libicudata.so.64 +3 -0
  42. Vinorm/build/lib/vinorm/lib/libicudata.so.64.2 +3 -0
  43. Vinorm/build/lib/vinorm/lib/libicui18n.so +3 -0
  44. Vinorm/build/lib/vinorm/lib/libicui18n.so.64 +3 -0
  45. Vinorm/build/lib/vinorm/lib/libicui18n.so.64.2 +3 -0
  46. Vinorm/build/lib/vinorm/lib/libicuio.so +0 -0
  47. Vinorm/build/lib/vinorm/lib/libicuio.so.64 +0 -0
  48. Vinorm/build/lib/vinorm/lib/libicuio.so.64.2 +0 -0
  49. Vinorm/build/lib/vinorm/lib/libicutest.so +0 -0
  50. Vinorm/build/lib/vinorm/lib/libicutest.so.64 +0 -0
.gitattributes CHANGED
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Vinorm/build/lib/vinorm/lib/libicudata.so filter=lfs diff=lfs merge=lfs -text
37
+ Vinorm/build/lib/vinorm/lib/libicudata.so.64 filter=lfs diff=lfs merge=lfs -text
38
+ Vinorm/build/lib/vinorm/lib/libicudata.so.64.2 filter=lfs diff=lfs merge=lfs -text
39
+ Vinorm/build/lib/vinorm/lib/libicui18n.so filter=lfs diff=lfs merge=lfs -text
40
+ Vinorm/build/lib/vinorm/lib/libicui18n.so.64 filter=lfs diff=lfs merge=lfs -text
41
+ Vinorm/build/lib/vinorm/lib/libicui18n.so.64.2 filter=lfs diff=lfs merge=lfs -text
42
+ Vinorm/build/lib/vinorm/lib/libicutu.so filter=lfs diff=lfs merge=lfs -text
43
+ Vinorm/build/lib/vinorm/lib/libicutu.so.64 filter=lfs diff=lfs merge=lfs -text
44
+ Vinorm/build/lib/vinorm/lib/libicutu.so.64.2 filter=lfs diff=lfs merge=lfs -text
45
+ Vinorm/build/lib/vinorm/lib/libicuuc.so filter=lfs diff=lfs merge=lfs -text
46
+ Vinorm/build/lib/vinorm/lib/libicuuc.so.64 filter=lfs diff=lfs merge=lfs -text
47
+ Vinorm/build/lib/vinorm/lib/libicuuc.so.64.2 filter=lfs diff=lfs merge=lfs -text
48
+ Vinorm/build/lib/vinorm/main filter=lfs diff=lfs merge=lfs -text
49
+ Vinorm/dist/vinorm-2.0.7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
50
+ Vinorm/vinorm/lib/libicudata.so filter=lfs diff=lfs merge=lfs -text
51
+ Vinorm/vinorm/lib/libicudata.so.64 filter=lfs diff=lfs merge=lfs -text
52
+ Vinorm/vinorm/lib/libicudata.so.64.2 filter=lfs diff=lfs merge=lfs -text
53
+ Vinorm/vinorm/lib/libicui18n.so filter=lfs diff=lfs merge=lfs -text
54
+ Vinorm/vinorm/lib/libicui18n.so.64 filter=lfs diff=lfs merge=lfs -text
55
+ Vinorm/vinorm/lib/libicui18n.so.64.2 filter=lfs diff=lfs merge=lfs -text
56
+ Vinorm/vinorm/lib/libicutu.so filter=lfs diff=lfs merge=lfs -text
57
+ Vinorm/vinorm/lib/libicutu.so.64 filter=lfs diff=lfs merge=lfs -text
58
+ Vinorm/vinorm/lib/libicutu.so.64.2 filter=lfs diff=lfs merge=lfs -text
59
+ Vinorm/vinorm/lib/libicuuc.so filter=lfs diff=lfs merge=lfs -text
60
+ Vinorm/vinorm/lib/libicuuc.so.64 filter=lfs diff=lfs merge=lfs -text
61
+ Vinorm/vinorm/lib/libicuuc.so.64.2 filter=lfs diff=lfs merge=lfs -text
62
+ Vinorm/vinorm/main filter=lfs diff=lfs merge=lfs -text
Vinorm/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2019 The Python Packaging Authority
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
20
+
21
+ This project belongs to AILAB, Ho Chi Minh University of Science
Vinorm/MANIFEST.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ include LICENSE
2
+ include README.md
3
+ include vinorm/input.txt
4
+ include vinorm/output.txt
5
+ include vinorm/main
6
+ recursive-include vinorm/Dict *
7
+ recursive-include vinorm/lib *
8
+ recursive-include vinorm/Mapping *
9
+ recursive-include vinorm/RegexRule *
Vinorm/README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Install ViNorm package
2
+ ```
3
+ pip install vinorm
4
+ ```
5
+ ### Using in python script
6
+ ```python
7
+ from vinorm import TTSnorm
8
+ S=TTSnorm("Hàm này được phát triển từ 8/2019. Có phải tháng 12/2020 đã có vaccine phòng ngừa Covid-19 xmz ?")
9
+ ```
10
+ Some options
11
+ ```python
12
+ TTSnorm(text, punc = False, unknown = True, lower = True, rule = False )
13
+ ```
14
+ - **lower**: If true, get normalization with lowercase
15
+ - **rule**: If true, normalize with Regex rules only, without Dictionary Checking (this flag is not used together with other flags)
16
+ - **punc**: If true, do not replace punctuation with dot and comma
17
+ - **unknown**: If true, replace unknown words: discard undefined words that do not contain a vowel, and do not spell out words that contain a vowel
18
+
19
+ From version 2.0, unknown words are not replaced; they are left for espeak to handle in the phonetization step
20
+ - This version does not parse case: "Tổ chức WTO"
21
+ WTO is not in the dictionary -> unknown -> kept as-is and not spelled out as in version 1.0. This is intended for use with espeak, letting espeak handle it; the drawback is that espeak's output for this case is "ve1kɛɜpte1ɔ7", which is not split into syllables.
22
+ - For new entity, need to update in the dictionary
23
+
24
+ For the latest version, see: https://github.com/NoahDrisort/vinorm
25
+
26
+ For version 1.0 (unknown words are spelled out character by character), check previous commits
27
+
28
+ For mac version: https://github.com/v-nhandt21/Vinorm/tree/vinorm_mac
29
+
30
+ For C++ version: https://github.com/NoahDrisort/vinorm_cpp_version
31
+
32
+ ### Update pypi
33
+ ```sh
34
+ python setup.py sdist bdist_wheel
35
+ twine upload dist/*
36
+ ```
Vinorm/build/lib/vinorm/Dict/Popular.txt ADDED
Binary file (69.4 kB). View file
 
Vinorm/build/lib/vinorm/Mapping/Acronyms.txt ADDED
The diff for this file is too large to render. See raw diff
 
Vinorm/build/lib/vinorm/Mapping/Acronyms_shorten.txt ADDED
The diff for this file is too large to render. See raw diff
 
Vinorm/build/lib/vinorm/Mapping/BaseUnit.txt ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ s#giây
2
+ giây#giây
3
+ ngày#ngày
4
+ m#mét
5
+ mét#mét
6
+ nm#na nô mét
7
+ g#gam
8
+ ampe#am be
9
+ mol#mon
10
+ cd#can đê la
11
+ N#Niu tơn
12
+ Pa#bát can
13
+ atm#át mót phe
14
+ đ#đồng
15
+ ha#héc ta
16
+ h#giờ
17
+ giờ#giờ
18
+ tháng#tháng
19
+ năm#năm
20
+ kB#kí lô bay
21
+ s#giây
22
+ giây#giây
23
+ đồng#đồng
24
+ ml#mi li lít
25
+ l#lít
26
+ lít#lít
27
+ gb#ghi ga bay
28
+ mb#mê ga bay
29
+ kb#kí lô bít
30
+ kg#kí lô gam
31
+ mg#mi li gam
32
+ m3#mét khối
33
+ km3#kí lô mét khối
34
+ dm3#đề xi mét khối
35
+ cc#xen ti mét khối
36
+ cm3#xen ti mét khối
37
+ m2#mét vuông
38
+ km2#kí lô mét vuông
39
+ dm2#đề xi mét vuông
40
+ cm2#xen ti mét vuông
41
+ mm2#mi li mét vuông
42
+ km#kí lô mét
43
+ dm#đề xi mét
44
+ cm#xen ti mét
45
+ mm#mi li mét
46
+ nm#na nô mét
47
+ mph#dặm một giờ
48
+ ft#phuốt
49
+ kn#hải lí một giờ
50
+ Nm#niu tơn mét
51
+ GN#gi ga niu tơn
52
+ MN#mê ga niu tơn
53
+ kN#ki lô niu tơn
54
+ mN#mi li niu tơn
55
+ j#giun
56
+ kj#ki lô giun
57
+ mj#mê ga giun
58
+ gj#gi ga giun
59
+ mw#mê ga oát
60
+ kW#ki lo oát
61
+ w#oát
62
+ Wh#oát giờ
63
+ MWh#mê ga oát giờ
64
+ kWh#ki lo oát giờ
65
+ MeV#mê ga e lét tron vôn
66
+ eV#e lét tron vôn
67
+ Cal#ca lo
68
+ kcal#ki lô ca lo
69
+ °C#độ xê
70
+ °F#độ ép
71
+ °K#độ kê vin
72
+ dp#đi ốp
73
+ độ#độ
74
+ Wb#ve bê
75
+ điôp#đi ốp
76
+ Bq#bét cơ ren
77
+ dB#đề xi ben
78
+ min#phút
79
+ sec#giây
80
+ mmHg#mi li mét thủy ngân
81
+ mA#mi li âm be
82
+ rad#ra đi an
83
+ radian#ra đi an
84
+ hz#héc
85
+ tấn#tấn
86
+ lb#bao
87
+ oz#ao
88
+ ounce#ao
89
+ pound#bao
90
+ carat#ca ra
91
+ carat#ca ra
92
+ gallon#ga lon
93
+ gal#ga lon
94
+ inch#in
95
+ vnđ#việt nam đồng
96
+ vnd#việt nam đồng
97
+ rm#ring git
98
+ rub#rúp
99
+ chỉ#chỉ
100
+ lượng#lượng
101
+ sào#sào
102
+ công#công
103
+ mẫu#mẫu
104
+ yến#yến
105
+ tạ#tạ
106
+ vg#vòng
107
+ vòng#vòng
108
+ ph#phút
109
+ s2#giây bình phương
Vinorm/build/lib/vinorm/Mapping/CurrencyUnit.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ %#phần trăm
2
+ $#đô la
3
+ €#ê rô
4
+ £#bảng
5
+ ¥#yên
6
+ ₩#won
7
+ ₭#kíp
8
+ ₱#bê xô
9
+ ฿#bạc
10
+ Ω#ôm
Vinorm/build/lib/vinorm/Mapping/LetterSoundEN.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a#ây
2
+ ă#á
3
+ â#ớ
4
+ b#bi
5
+ c#si
6
+ d#đi
7
+ đ#đê
8
+ e#i
9
+ ê#ê
10
+ f#ép
11
+ g#giy
12
+ h#ếch
13
+ i#ai
14
+ j#giây
15
+ k#cây
16
+ l#eo
17
+ m#em
18
+ n#en
19
+ o#âu
20
+ ô#ô
21
+ ơ#ơ
22
+ p#pi
23
+ q#kiu
24
+ r#a
25
+ s#ét
26
+ t#ti
27
+ u#diu
28
+ ư#ư
29
+ v#vi
30
+ w#đắp liu
31
+ x#ít
32
+ y#quai
33
+ z#giét
Vinorm/build/lib/vinorm/Mapping/LetterSoundVN.txt ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ f#ép
2
+ j#giây
3
+ z#giét
4
+ a#a
5
+ ă#á
6
+ â#ớ
7
+ b#bê
8
+ c#xê
9
+ d#dê
10
+ đ#đê
11
+ e#e
12
+ ê#ê
13
+ g#giê
14
+ h#hát
15
+ i#i
16
+ k#ca
17
+ l#lờ
18
+ m#mờ
19
+ n#nờ
20
+ o#o
21
+ ô#ô
22
+ ơ#ơ
23
+ p#pê
24
+ q#quy
25
+ r#rờ
26
+ s#ét
27
+ t#tê
28
+ u#u
29
+ ư#ư
30
+ v#vê
31
+ x#ích
32
+ y#i
33
+ w#vê kép
34
+ à#
35
+ ả#
36
+ ã#
37
+ á#
38
+ ạ#
39
+ ằ#
40
+ ẳ#
41
+ ẵ#
42
+ ầ#
43
+ ẩ#
44
+ ẫ#
45
+ ấ#
46
+ ậ#
47
+ è#
48
+ ẻ#
49
+ ẽ#
50
+ é#
51
+ ẹ#
52
+ ề#
53
+ ể#
54
+ ễ#
55
+ ế#
56
+ ệ#
57
+ ì#
58
+ ỉ#
59
+ ĩ#
60
+ í#
61
+ ị#
62
+ ò#
63
+ ỏ#
64
+ õ#
65
+ ó#
66
+ ọ#
67
+ ồ#
68
+ ổ#
69
+ ỗ#
70
+ ố#
71
+ ộ#
72
+ ỡ#
73
+ ớ#
74
+ ợ#
75
+ ù#
76
+ ủ#
77
+ ũ#
78
+ ú#
79
+ ụ#
80
+ ừ#
81
+ ử#
82
+ ữ#
83
+ ứ#
84
+ ự#
85
+ ỳ#
86
+ ỷ#
87
+ ỹ#
88
+ ý#
89
+ ỵ#
Vinorm/build/lib/vinorm/Mapping/Number.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ 0#không
2
+ 1#một
3
+ 2#hai
4
+ 3#ba
5
+ 4#bốn
6
+ 5#năm
7
+ 6#sáu
8
+ 7#bảy
9
+ 8#tám
10
+ 9#chín
Vinorm/build/lib/vinorm/Mapping/PrefixUnit.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ d#đề xi
2
+ c#xăng ti
3
+ m#mi li
4
+ μ#mi cờ rô
5
+ n#na nô
6
+ p#pi cô
7
+ f#phêm tô
8
+ a#át tô
9
+ z#zép tô
10
+ y#dóc tô
11
+ Y#dót ta
12
+ Z#giết ta
13
+ E#ét tra
14
+ P#pê ta
15
+ T#tê ra
16
+ G#ghi ga
17
+ M#mê ga
18
+ k#ki lô
19
+ h#héc tô
20
+ da#đề ca
Vinorm/build/lib/vinorm/Mapping/Symbol.txt ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @#a còng
2
+ $#đô la
3
+ %#phần trăm
4
+ ^#mũ
5
+ &#và
6
+ *#sao
7
+ +#cộng
8
+ =#bằng
9
+ \#xuyệt
10
+ <#nhỏ hơn
11
+ >#lớn hơn
12
+ <>#khác
13
+ ≠#khác
14
+ /#xuyệt
15
+ ±#cộng trừ
16
+ ×#nhân
17
+ ÷#chia
18
+ ∀#với mọi
19
+ ∏#tích
20
+ ∑#tổng
21
+ ∩#giao
22
+ ∪#hội
23
+ ≈#tương đương
24
+ ≤#nhỏ hơn hoặc bằng
25
+ ≥#lớn hơn hoặc bằng
26
+ ¬#phủ định
27
+ ∞#vô cùng
28
+ α#an pha
29
+ β#bê ta
30
+ γ#ga ma
31
+ δ#đen ta
32
+ ε#ép si lon
33
+ ϵ#thuộc
34
+ ζ#de ta
35
+ η#ê ta
36
+ θ#thê ta
37
+ ι#i ô ta
38
+ κ#cáp ba
39
+ λ#lam đa
40
+ ᴧ#và
41
+ μ#muy
42
+ ν#nu
43
+ ξ#xi xi
44
+ ο#o mi ron
45
+ π#pi
46
+ ρ#ro
47
+ σ#xích ma
48
+ τ#tao
49
+ υ#úp si lon
50
+ φ#phi
51
+ χ#chi
52
+ ψ#si
53
+ ω#ô me ga
54
+ ©#bản quyền
Vinorm/build/lib/vinorm/Mapping/Teencode.txt ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bạk#bạch
2
+ bưc#bức
3
+ bùl#buồn
4
+ nếnh#nến
5
+ byt#buýt
6
+ cak#cách
7
+ cẻ#kể
8
+ cẻ#kẻ
9
+ cik#kinh
10
+ oxi#ô xi
11
+ oxy#ô xi
12
+ axit#a xít
13
+ cín#kín
14
+ cìw#kiều
15
+ côt#cốt
16
+ cũg#cũng
17
+ dáh#đánh
18
+ dấy#đấy
19
+ dìw#điều
20
+ dồ#đồ
21
+ dôg#đông
22
+ dọg#đọng
23
+ dớn#đớn
24
+ dừz#đường
25
+ fẳg#phẳng
26
+ fải#phải
27
+ fân#phân
28
+ fố#phố
29
+ fog#phong
30
+ fưz#phương
31
+ fựz#phượng
32
+ ge#ghe
33
+ gế#ghế
34
+ gen#ghen
35
+ hồg#hồng
36
+ ja#gia
37
+ já#giá
38
+ jà#già
39
+ jấy#giấy
40
+ jì#gì
41
+ jìn#gìn
42
+ jờ#giờ
43
+ jở#giở
44
+ jòg#giòng
45
+ jù#dù
46
+ jữ#giữ
47
+ kăn#khăn
48
+ ken#khen
49
+ ki#khi
50
+ kó#khó
51
+ kôg#không
52
+ lặg#lặng
53
+ lòg#lòng
54
+ lỵ#luỵ
55
+ mệh#mệnh
56
+ mih#minh
57
+ mừi#mười
58
+ mul#muôn
59
+ net#nét
60
+ nge#nghe
61
+ ngề#nghề
62
+ ngĩ#nghĩ
63
+ ngil#nghiên
64
+ H’leo#hờ leo
65
+ Ea#e a
66
+ ngòj#ngoài
67
+ ngọj#ngoại
68
+ ngừj#người
69
+ nhiw#nhiêu
70
+ nhưg#nhưng
71
+ nhữg#những
72
+ nup#núp
73
+ ôg#ông
74
+ qa#qua
75
+ qan#quan
76
+ qay#quay
77
+ qen#quen
78
+ qên#quên
79
+ qệt#quyệt
80
+ qý#quý
81
+ rồg#rồng
82
+ tăc#tắc
83
+ tak#tách
84
+ thừz#thường
85
+ thý#thuý
86
+ tih#tinh
87
+ tìh#tình
88
+ tĩh#tĩnh
89
+ trìw#triều
90
+ trog#trong
91
+ trồg#trồng
92
+ trug#trung
93
+ trưs#trước
94
+ trỳl#truyền
95
+ tyd#tuyết
96
+ vàg#vàng
97
+ vắg#vắng
98
+ vid#viết
99
+ vil#viên
100
+ vữg#vững
101
+ vưz#vương
102
+ xah#xanh
103
+ zâu#dâu
104
+ zì#dì
105
+ zo#do
106
+ zự#dự
107
+ roy#rồi
108
+ oh#ồ
109
+ ko#không
110
+ k#không
111
+ trc#trước
112
+ ơiii#ơi
113
+ bít#biết
114
+ fb#facebook
115
+ chứt#chức
116
+ khmer#khờ me
117
+ brâu#bờ râu
118
+ bru#bờ ru
119
+ kor#co
120
+ k'ho#cờ ho
121
+ hrê#hờ rê
122
+ m'nông#mờ nông
123
+ xtiêng#ti en
124
+ h'mong#hờ mông
125
+ péo#béo
126
+ rhadé#ra dé
127
+ raglai#ra lai
128
+ raglei#ra lây
129
+ noang#non
130
+ laoang#lao ang
131
+ paleng#ba len
132
+ xơlang#sơ lang
133
+ toong#tông
134
+ pọng#bọng
135
+ glơ#lơ
136
+ lar#la
137
+ plây#bờ lây
138
+ pleiku#bờ lây cu
139
+ yang#giang
140
+ thưr#thư
141
+ thớp#thớt
142
+ yung#giun
143
+ blui#bờ lui
144
+ blinh#bờ linh
145
+ nhiêng#nhiên
146
+ đim#dim
147
+ đech#đếch
148
+ tôc#tóc
149
+ brạo#bờ rạo
150
+ kontum#con tum
151
+ srúc#sờ rúc
152
+ cùa#cùa
153
+ chro#chờ ro
154
+ chrau#chờ rau
155
+ bicu#bi cu
156
+ sree#sờ re
157
+ bondưng#bon dưng
158
+ đazur#đa giu
159
+ đakriêng#đa riêng
160
+ đưngur#đư gu
161
+ kơ#cơ
162
+ kdun#cờ du
163
+ kơpa#cơ ba
164
+ kơsa#cơ sa
165
+ kra#cờ ra
166
+ zanh#gianh
167
+ k'tol#cờ to
168
+ liênghót#liêng hót
169
+ paungtin#bau tin
170
+ rglê#rờ lê
171
+ rơô#rơ ô
172
+ rơon#rơ on
173
+ sarem#sa rem
174
+ sơao#sơ ao
175
+ srê#sờ rê
176
+ lơmu#lơ mu
177
+ kroong#cờ rong
178
+ tuprông#tu rông
179
+ nim#nim
180
+ mơlam#mơ lam
181
+ achuếch#a chết
182
+ adốt#a dốt
183
+ nđnok#đờ nóc
184
+ prông#bờ rong
185
+ proong#bờ rong
186
+ bluông#bờ luông
187
+ bruốt#bờ ruốc
188
+ đuốt#đuốc
189
+ kriêng#cờ riêng
190
+ bloong#bờ lông
191
+ choong#chông
192
+ unh#ừ
193
+ plei#bờ lây
194
+ pooc#bốc
195
+ lêp#lép
196
+ h'nei#hờ nây
197
+ ah#a
198
+ kúm#cúm
199
+ kmụ#cờ mụ
200
+ rvai#rờ vai
201
+ veng#ven
202
+ lắk#lắc
203
+ tlắp#tờ lắp
204
+ đêr#đê
205
+ tmoong#tờ mông
206
+ tiác#ti ác
207
+ goi#goi
208
+ oivê#ô vê
209
+ mar#ma
210
+ kưmbur#cơ bu
211
+ thràng#tờ ràng
212
+ tgooc#tờ gốc
213
+ xlooc#sờ lóc
214
+ tlê#tờ lê
215
+ chưnđre#chưng re
216
+ tloc#tờ lóc
217
+ klảng#cờ lãng
218
+ glava#la va
219
+ tvạ#tờ vạ
220
+ tờrông#tờ rông
221
+ blai#bờ lai
222
+ niễu#niễu
223
+ soong#song
224
+ lip#líp
225
+ ôc#ốc
226
+ tlăp#tờ lắp
227
+ krlự#cờ lự
228
+ đnẹ#đờ nẹ
229
+ blao#bờ lao
230
+ sôp#sốp
231
+ huây#hay
232
+ k'va#cờ va
233
+ k'bình#cờ bình
234
+ h'mông#hờ mông
235
+ gioảng#gioảng
236
+ chìn#chình
237
+ nệnh#nện
238
+ lot#lót
239
+ pàn#bàn
240
+ kây#cây
241
+ gar#ga
242
+ kuyênh#khuy en
243
+ prong#bờ rông
244
+ riăm#ri am
245
+ dak#đắc
246
+ pnông#bờ nông
247
+ lak#lắc
248
+ sưr#sư
249
+ bing#binh
250
+ busor#bu so
251
+ đôp#đốp
252
+ krong#cờ rong
253
+ đrang#đờ rang
254
+ ja#gia
255
+ lieng#li en
256
+ hot#hót
257
+ đing#đinh
258
+ lưk#lư
259
+ mbuôn#mờ buôn
260
+ mđrang#mờ rang
261
+ mok#móc
262
+ nđu#nờ đu
263
+ n'tor#nờ to
264
+ pang#bang
265
+ pé#bé
266
+ sur#su
267
+ ting#tinh
268
+ tsong#tờ song
269
+ mbre#mờ re
270
+ phok#phóc
271
+ rche#rờ che
272
+ rchil#rờ chiêu
273
+ lăk#lắc
274
+ sruk#sờ ru
275
+ tơr#tơ
276
+ triek#trét
277
+ vmăk#vờ mắc
278
+ ntor#nờ to
279
+ pột#bột
280
+ kêr#kê
281
+ hăr#ha
282
+ plup#lu
283
+ khạ#cạ
284
+ chại#chạy
285
+ poọng#bọng
286
+ k'xing#cờ xinh
287
+ đrá#ra
288
+ kmang#cờ mang
289
+ zàng#giàng
290
+ vủ#vũ
291
+ pầu#bầu
292
+ pu#bu
293
+ sủn#sủng
294
+ hua#hua
295
+ sli#sờ li
296
+ pờ#bờ
297
+ sèn#xèn
298
+ dín#dính
299
+ slử#sờ lử
300
+ slay#sờ lây
301
+ phén#phéng
302
+ tẻn#tẽn
303
+ mùn#mùng
304
+ mán#máng
305
+ lya#li a
306
+ pạ#bạ
307
+ paseng#ba sen
308
+ pế#bế
309
+ sìn#xìn
310
+ pún#bún
311
+ chéng#chén
312
+ zhang#giang
313
+ déo#déo
314
+ đện#điện
315
+ lăp#lắp
316
+ cru#cờ ru
317
+ cado#ca do
318
+ kôt#cốt
319
+ b'nahria#bờ na ri a
320
+ lơng#lơn
321
+ k'bao#cờ bao
322
+ m'hơi#mờ hơi
323
+ rchom#rờ chôm
324
+ rsiu#rờ siu
325
+ rmah#rờ ma
326
+ ksor#cờ so
327
+ rahlan#ra lan
328
+ hieo#hi eo
329
+ kpă#cờ pa
330
+ kbla#cờ la
331
+ ksơr#cờ sơ
332
+ kreng#cờ reng
333
+ bobo#bo bo
334
+ pupo#bu bo
335
+ kazá#ca giá
336
+ camau#ca mau
337
+ chama#cha ma
338
+ chip#chíp
339
+ catơ#ca tơ
340
+ anu#a nu
341
+ xeng#xen
342
+ séng#xén
343
+ saư#sa ư
344
+ chom#chôm
345
+ dẹ#de
346
+ pươi#bươi
347
+ lok-ly#lo ly
348
+ pá#bá
349
+ duyền#duyên
350
+ pâu#bâu
351
+ páo#báo
352
+ pố#bố
353
+ dửn#dửng
354
+ nìu#nhìu
355
+ nghiã#nghĩa
356
+ cởng#cỡn
357
+ băc#bắc
358
+ đăt#đắc
359
+ tich#tích
360
+ viêt#viết
361
+ thưở#thuở
362
+ lich#lít
363
+ niê#ni ê
364
+ kđăm#cờ đăm
365
+ h'hen#hờ hen
366
+ hoà#hòa
367
+ rađe#ra đe
368
+ krung#cờ run
369
+ ktul#cờ tu
370
+ đlie#đờ li ê
371
+ epan#ê ban
372
+ êđê#ê đê
373
+ thày#thầy
374
+ yàng#giàng
375
+ choá#cho á
376
+ gah#ga
377
+ êđe#ê đê
378
+ mớí#mới
379
+ adrâng#a rân
380
+ ayun#a un
381
+ tul#tu
382
+ atul#a tu
383
+ buon#buôn
384
+ yah#da
385
+ duot#dượt
386
+ eban#ê ban
387
+ rah#ra
388
+ eman#ê man
389
+ emo#ê mo
390
+ enoul#ê no
391
+ hđok#hờ đo
392
+ hrue#hờ ru
393
+ hmok#hờ móc
394
+ jdrong#dờ rong
395
+ ktub#cờ tu
396
+ knul#cờ nu
397
+ kpor#cờ bo
398
+ ktla#cờ ta la
399
+ mjao#mi ao
400
+ mlo#mờ lo
401
+ duon#dương
402
+ hut#hút
403
+ ksei#cờ xây
404
+ nie#ni ê
405
+ blo#bờ lo
406
+ dap#dáp
407
+ rit#rít
408
+ mla#mờ la
409
+ sieng#si yên
410
+ sor#so
411
+ sok#sóc
412
+ hlinh#hờ linh
413
+ palei#ba lây
414
+ samách#sa mách
415
+ maha#ma ha
416
+ rudra#ru ra
417
+ inra#in ra
418
+ patra#pa tra
419
+ inđra#in ra
420
+ sara#sa ra
421
+ cri#cờ ri
422
+ lờy#lời
423
+ lỗu#lỗ
424
+ đựoc#được
425
+ klăn#cờ lăn
426
+ thăk#thắc
427
+ thak#then
428
+ eli#e li
429
+ đwai#đờ quai
430
+ lòai#loài
431
+ won#quan
432
+ kut#cút
433
+ pui#bui
434
+ đắk#đắc
435
+ đăk#đắc
436
+ ali#a li
437
+ mousa#mô sa
438
+ ysa#i sa
439
+ nguỵ#ngụy
440
+ uc#úc
441
+ xoè#xòe
442
+ pặp#bặp
443
+ khun#cờ hun
444
+ khuống#cờ huống
445
+ khằm#cờ hằm
446
+ lềm#lềnh
447
+ phia#khuya
448
+ tuỳ#tùy
449
+ cu-mưng#cu mưng
450
+ phìâ#phì
451
+ thuỷ#thủy
452
+ hoóng#hóng
453
+ lọai#loại
454
+ jan#gian
455
+ tuúp#túp
456
+ nưóc#nước
457
+ noóc#nóc
458
+ roong#rong
459
+ đăc#đắc
460
+ đuổng#đuông
461
+ thằm#thằm
462
+ nhâp#nhấp
463
+ nủ#nhủ
464
+ pú#bú
465
+ sếnh#sến
466
+ đựơc#được
467
+ krom#cờ rom
468
+ khêmara#khê ma ra
469
+ phum#phun
470
+ sampôt#sam bốt
471
+ khrâm#khờ râm
472
+ balamon#bà la môn
473
+ trớc#trước
474
+ xatra#sa tra
475
+ chnăm#chờ năm
476
+ oóc#óc
477
+ boóc#bóc
478
+ dơng#dương
479
+ hiùnh#huỳnh
480
+ neang#nang
481
+ nuth#nút
482
+ panth#ban
483
+ pem#bem
484
+ nêang#nê ang
Vinorm/build/lib/vinorm/RegexRule/Codenumber.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)(\b|^)[^\s]*\d[^\s]*\b
Vinorm/build/lib/vinorm/RegexRule/Date_1.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ (?i)\b[0-3]?[0-9]\s?\/\s?[01]?\d\s?\/\s?[12]\d{3}\b
2
+ (?i)\b[0-3]?[0-9]\s?-\s?[01]?\d\s?-\s?[12]\d{3}\b
3
+ (?i)\b[0-3]?[0-9]\s?\.\s?[01]?\d\s?\.\s?[12]\d{3}\b
Vinorm/build/lib/vinorm/RegexRule/Date_2.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)\s+([ivx]+)[\s-\/]([12]\d{3})\b
Vinorm/build/lib/vinorm/RegexRule/Date_3.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)\b(ngày|sáng|trưa|chiều|tối|đêm|hôm|nay|hai|ba|tư|năm|sáu|bảy|nhật|qua|lúc|từ|đến)\s+[0-3]?[0-9]\s?[\/.-]\s?[01]?\d\b
Vinorm/build/lib/vinorm/RegexRule/Date_From_To_1.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ (?i)(từ|ngày) [0-3]?[0-9]\s?-\s?[0-3]?[0-9][.\/][01]?\d\b
2
+ (?i)(từ|ngày) [0-3]?[0-9][.\/][01]?\d\s?(-|đến)\s?[0-3]?[0-9][.\/][01]?\d\b
Vinorm/build/lib/vinorm/RegexRule/Date_From_To_2.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ (?i)(từ|tháng) [01]?\d\s?-\s?[01]?\d[.\/][12]\d{3}\b
2
+ (?i)(từ|tháng) [01]?\d[.\/][12]\d{3}\s?(-|đến)\s?[01]?\d[.\/][12]\d{3}
Vinorm/build/lib/vinorm/RegexRule/Email.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ [a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*
Vinorm/build/lib/vinorm/RegexRule/FootballOther.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ (?i)[đ]ội hình \b\d\s?-\s?\d\s?-\s?\d(-\s?\d)?\b
2
+ (?i)[t]ỉ số \b\d{1,2}\s?[-|]\s?\d{1,2}\b
3
+ (?i)[h]ạ\s(\w+(\s)?){1,4}\d{1,2}\s?[-|]\s?\d{1,2}\b
Vinorm/build/lib/vinorm/RegexRule/FootballUnder.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)\b[u][.]?\d{2}\b
Vinorm/build/lib/vinorm/RegexRule/Measurement.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ (?i)\b(\d+(?:\.\d{3})+(?:,\d+)?)\s?([°|\p{Alphabetic}]+[2|3]?)(?:\/(\p{Alphabetic}+[2|3]?))?(?:\b|$)(\s?-?)
2
+ (?i)\b(\d+(?:,\d{3})+(?:\.\d+)?)\s?([°|\p{Alphabetic}]+[2|3]?)(?:\/(\p{Alphabetic}+[2|3]?))?(?:\b|$)(\s?-?)
3
+ (?i)\b(\d+(?:,\d+))\s?([°|\p{Alphabetic}]+[2|3]?)(?:\/(\p{Alphabetic}+[2|3]?))?(?:\b|$)(\s?-?)
4
+ (?i)\b(\d+(?:\.\d+)?)\s?([°|\p{Alphabetic}]+[2|3]?)(?:\/(\p{Alphabetic}+[2|3]?))?(?:\b|$)(\s?-?)
Vinorm/build/lib/vinorm/RegexRule/Measurement_1.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ (?i)(?:\b|^)(\d+(?:\.\d{3})+(?:,\d+)?)\s?(\%|\$|฿|₱|₭|₩|¥|€|£|Ω)(\s-|$|-|\s)
2
+ (?i)(?:\b|^)(\d+(?:,\d{3})+(?:\.\d+)?)\s?(\%|\$|฿|₱|₭|₩|¥|€|£|Ω)(\s-|$|-|\s)
3
+ (?i)(?:\b|^)(\d+(?:,\d+))\s?(\%|\$|฿|₱|₭|₩|¥|€|£|Ω)(\s-|$|-|\s)
4
+ (?i)(?:\b|^)(\d+(?:\.\d+)?)\s?(\%|\$|฿|₱|₭|₩|¥|€|£|Ω)(\s-|$|-|\s)
Vinorm/build/lib/vinorm/RegexRule/Month.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)tháng \d{1,2}\s?[\/.-]\s?\d{4}\b
Vinorm/build/lib/vinorm/RegexRule/NormalNumber.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ (?i)(\s|^)\d+(\.\d{3})+(,\d+)?(:?\b|$)
2
+ (?i)(\s|^)\d+(,\d{3})+(\.\d+)?(:?\b|$)
3
+ (?i)(\s|^|\s-)\d+(,\d+)(:?\b|$)
4
+ (?i)(\s|^|\s-)\d+(\.\d+)?(:?\b|$)
Vinorm/build/lib/vinorm/RegexRule/Office.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)(phòng|lớp|đơn vị)\s[^\s]*\d[^\s]*(\b|$)
Vinorm/build/lib/vinorm/RegexRule/PhoneNumber.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ([^(\w|\d|\.)]|^)((\+\d{1,3})|0)[-\s.]?\d{1,3}[-\s.]?\d{3}[-\s.]?\d{4}\b
2
+ ([^(\w|\d|\.)]|^)((\+\d{1,3})|0)[-\s.]?\d{2,3}[-\s.]?\d{2}[- .]?\d{2}[- .]?\d{2}\b
3
+ ([^(\w|\d|\.)]|^)((\+\d{1,3})|0)[-\s.]?\d{1,3}[-\s.]?\d{1,2}[-\s.]?\d{2,3}[-\s.]?\d{3}\b
4
+ \b1[89]00[\s\.]?[\d\s\.]{4,8}\b
Vinorm/build/lib/vinorm/RegexRule/PoliticalDivision.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)\b(kp|q|p|h|tx|tp|x)\s?[.]\s?
Vinorm/build/lib/vinorm/RegexRule/RomanNumber.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)(\b|^)(thứ|lần|kỷ|kỉ|kì|kỳ|khóa)\s+[V|I|X]{1,5}(\b|$)
Vinorm/build/lib/vinorm/RegexRule/Street.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ (?i)\b(đường|số|số nhà|nhà|địa chỉ|tọa lạc|xã|thôn|ấp|khu phố|căn hộ|cư xá|Đ\/c)[\s:]\s?[^\s]*\d[^\s]*(\b|$)
Vinorm/build/lib/vinorm/RegexRule/Time.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ (?i)\b\d{1,2}\s?[hg]\s?\d{1,2}\s?[ap]?[m]?\b(\s?-?)
2
+ (?i)\b\d{1,2}\s?[hg]\b(\s?-?)
3
+ (?i)\b(?:2[0-4]|[01]?[1-9])\s?[:hg]\s?[0-5][0-9]\s?[ap]?[m]?\b(\s?-?)
Vinorm/build/lib/vinorm/RegexRule/Website.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ (?i)\b(https?:\/\/|ftp:\/\/|www\.|[^\s:=]+@www\.)?((\w+)\.)+(?:com|au\.uk|co\.in|net|org|info|coop|int|co\.uk|org\.uk|ac\.uk|uk)([\.\/][^\s]*)*([^(w|\d)]|$)
2
+ (?i)\b((https?:\/\/|ftp:\/\/|sftp:\/\/|www\.|[^\s:=]+@www\.))(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\d]+-[a-z\d])*[a-z\d]+)(?:\.(?:[a-z\d]+-?)*[a-z\d]+)*(?:\.(?:[a-z]{2,})))(?::\d{2,5})?(?:\/[^\s]*)?([^(w|\d)]|$)
Vinorm/build/lib/vinorm/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import subprocess,os
3
+ import imp
4
def TTSnorm(text, punc = False, unknown = True, lower = True, rule = False ):
    """Normalize ``text`` by running the ``main`` executable bundled with this package.

    The text is written to ``input.txt`` inside the package directory, the
    executable is run with that directory as its working directory, and the
    result is read back from ``output.txt`` in the same directory.
    (Presumably the executable reads ``input.txt`` and writes ``output.txt``;
    that contract lives in the binary and is not visible from this module.)

    Args:
        text: The string to normalize; written to ``input.txt`` as UTF-8.
        punc: When truthy, pass ``-punc`` to the executable.
        unknown: When truthy, pass ``-unknown`` to the executable.
        lower: When truthy, pass ``-lower`` to the executable.
        rule: When truthy, pass ``-rule`` to the executable.

    Returns:
        The content of ``output.txt`` split on the ``#line#`` marker, with
        empty segments dropped and every remaining segment followed by ``". "``.

    Raises:
        subprocess.CalledProcessError: If the executable exits with a
            non-zero status.
        OSError: If the input/output files or the executable cannot be accessed.

    Note:
        Every call uses the same fixed ``input.txt``/``output.txt`` paths, so
        overlapping calls would read and write the same files.
    """
    # Resolve the package directory from this module's own location instead of
    # imp.find_module('vinorm'): `imp` was removed in Python 3.12, and a
    # sys.path search can resolve to an unrelated `vinorm` directory.
    # NOTE(review): the module-level `import imp` is no longer needed by this
    # function and must be dropped for the module to import on Python 3.12+.
    package_dir = os.path.dirname(os.path.abspath(__file__))

    with open(os.path.join(package_dir, "input.txt"), mode="w", encoding="utf-8") as fw:
        fw.write(text)

    # Put the bundled shared libraries first on the loader path, but keep any
    # LD_LIBRARY_PATH the caller already had as a fallback rather than
    # discarding it.
    env = os.environ.copy()
    lib_dir = os.path.join(package_dir, "lib")
    inherited = env.get("LD_LIBRARY_PATH")
    env["LD_LIBRARY_PATH"] = (lib_dir + os.pathsep + inherited) if inherited else lib_dir

    # Command-line switches are appended in the same fixed order as before.
    command = [os.path.join(package_dir, "main")]
    for enabled, switch in (
        (punc, "-punc"),
        (unknown, "-unknown"),
        (lower, "-lower"),
        (rule, "-rule"),
    ):
        if enabled:
            command.append(switch)
    subprocess.check_call(command, env=env, cwd=package_dir)

    with open(os.path.join(package_dir, "output.txt"), mode="r", encoding="utf-8") as fr:
        raw = fr.read()

    # "#line#" separates the output segments; skip empty ones and terminate
    # each remaining segment with ". ".
    return "".join(segment + ". " for segment in raw.split("#line#") if segment)
Vinorm/build/lib/vinorm/input.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nhan
Vinorm/build/lib/vinorm/lib/icu/64.2/Makefile.inc ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2016 and later: Unicode, Inc. and others.
2
+ # License & terms of use: http://www.unicode.org/copyright.html
3
+ ## -*-makefile-*-
4
+ #******************************************************************************
5
+ # Copyright (C) 1999-2014, International Business Machines
6
+ # Corporation and others. All Rights Reserved.
7
+ #******************************************************************************
8
+ # This Makefile.inc is designed to be included into projects which make use
9
+ # of the ICU.
10
+
11
+ # CONTENTS OF THIS FILE
12
+ # 1). Base configuration information and linkage
13
+ # 2). Variables giving access to ICU tools
14
+ # 3). Host information
15
+ # 4). Compiler flags and settings
16
+ # 5). Data Packaging directives
17
+ # 6). Include of platform make fragment (mh-* file)
18
+
19
+ ##################################################################
20
+ ##################################################################
21
+ #
22
+ # *1* base configuration information and linkage
23
+ #
24
+ ##################################################################
25
+ # The PREFIX is the base of where ICU is installed.
26
+ # Inside this directory you should find bin, lib, include/unicode,
27
+ # etc. If ICU is not installed in this directory, you must change the
28
+ # following line. There should exist $(prefix)/include/unicode/utypes.h
29
+ # for example.
30
+ prefix = /usr/local
31
+ exec_prefix = ${prefix}
32
+ libdir = ${exec_prefix}/lib
33
+ libexecdir = ${exec_prefix}/libexec
34
+ bindir = ${exec_prefix}/bin
35
+ datarootdir = ${prefix}/share
36
+ datadir = ${datarootdir}
37
+ sbindir = ${exec_prefix}/sbin
38
+
39
+ # about the ICU version
40
+ VERSION = 64.2
41
+ UNICODE_VERSION = 12.1
42
+
43
+ # The prefix for ICU libraries, normally 'icu'
44
+ ICUPREFIX = icu
45
+ PACKAGE = icu
46
+ LIBICU = lib$(ICUPREFIX)
47
+
48
+ # Static library prefix and file extension
49
+ STATIC_PREFIX = s
50
+ LIBSICU = lib$(STATIC_PREFIX)$(ICUPREFIX)
51
+ A = a
52
+
53
+ # Suffix at the end of libraries. Usually empty.
54
+ ICULIBSUFFIX =
55
+ # ICULIBSUFFIX_VERSION is non-empty if it is to contain a library
56
+ # version. For example, if it is 21, it means libraries are named
57
+ # libicuuc21.so for example.
58
+
59
+ # rpath links a library search path right into the binaries.
60
+ ## mh-files MUST NOT override RPATHLDFLAGS unless they provide
61
+ ## equivalent '#SH#' lines for icu-config fixup
62
+ ENABLE_RPATH = NO
63
+ ifeq ($(ENABLE_RPATH),YES)
64
+ RPATHLDFLAGS = $(LD_RPATH)$(LD_RPATH_PRE)$(libdir)
65
+ endif
66
+
67
+ #SH## icu-config version of above 'if':
68
+ #SH#case "x$ENABLE_RPATH" in
69
+ #SH# x[yY]*)
70
+ #SH# ENABLE_RPATH=YES
71
+ #SH# RPATHLDFLAGS="${LD_RPATH}${LD_RPATH_PRE}${libdir}"
72
+ #SH# ;;
73
+ #SH#
74
+ #SH# x[nN]*)
75
+ #SH# ENABLE_RPATH=NO
76
+ #SH# RPATHLDFLAGS=""
77
+ #SH# ;;
78
+ #SH#
79
+ #SH# x)
80
+ #SH# ENABLE_RPATH=NO
81
+ #SH# RPATHLDFLAGS=""
82
+ #SH# ;;
83
+ #SH#
84
+ #SH# *)
85
+ #SH# echo $0: Unknown --enable-rpath value ${ENABLE_RPATH} 1>&2
86
+ #SH# exit 3
87
+ #SH# ;;
88
+ #SH#esac
89
+
90
+ # Name flexibility for the library naming scheme. Any modifications should
91
+ # be made in the mh- file for the specific platform.
92
+ DATA_STUBNAME = data
93
+ COMMON_STUBNAME = uc
94
+ I18N_STUBNAME = i18n
95
+ LAYOUTEX_STUBNAME = lx
96
+ IO_STUBNAME = io
97
+ TOOLUTIL_STUBNAME = tu
98
+ CTESTFW_STUBNAME = test
99
+
100
+
101
+
102
+ ### To link your application with ICU:
103
+ # 1. use LDFLAGS, CFLAGS, etc from above
104
+ # 2. link with $(ICULIBS)
105
+ # 3. optionally, add one or more of:
106
+ # - $(ICULIBS_I18N) - i18n library, formatting, etc.
107
+ # - $(ICULIBS_ICUIO) - ICU stdio equivalent library
108
+
109
+ ICULIBS_COMMON = -l$(ICUPREFIX)uc$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
110
+ ICULIBS_DATA = -l$(ICUPREFIX)$(DATA_STUBNAME)$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
111
+ ICULIBS_I18N = -l$(ICUPREFIX)$(I18N_STUBNAME)$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
112
+ ICULIBS_TOOLUTIL = -l$(ICUPREFIX)tu$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
113
+ ICULIBS_CTESTFW = -l$(ICUPREFIX)ctestfw$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
114
+ ICULIBS_ICUIO = -l$(ICUPREFIX)io$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
115
+ ICULIBS_OBSOLETE = -l$(ICUPREFIX)obsolete$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
116
+ ICULIBS_LAYOUTEX = -l$(ICUPREFIX)lx$(ICULIBSUFFIX)$(ICULIBSUFFIX_VERSION)
117
+ ICULIBS_BASE = -L$(libdir)
118
+
119
+ # for icu-config to test with
120
+ ICULIBS_COMMON_LIB_NAME = ${LIBICU}${COMMON_STUBNAME}${ICULIBSUFFIX}${ICULIBSUFFIX_VERSION}.${SO}
121
+ ICULIBS_COMMON_LIB_NAME_A = ${LIBICU}${COMMON_STUBNAME}${ICULIBSUFFIX}.${A}
122
+
123
+ # ICULIBS is the set of libraries your application should link
124
+ # with usually. Many applications will want to add $(ICULIBS_I18N) as well.
125
+ ICULIBS = $(ICULIBS_BASE) $(ICULIBS_I18N) $(ICULIBS_COMMON) $(ICULIBS_DATA)
126
+
127
+ # Proper echo newline handling is needed in icu-config
128
+ ECHO_N=-n
129
+ ECHO_C=
130
+ # Not currently being used but good to have for proper tab handling
131
+ ECHO_T=
132
+
133
+ ##################################################################
134
+ ##################################################################
135
+ #
136
+ # *2* access to ICU tools
137
+ #
138
+ ##################################################################
139
+ # Environment variable to set a runtime search path
140
+ # (Overridden when necessary in -mh files)
141
+ LDLIBRARYPATH_ENVVAR = LD_LIBRARY_PATH
142
+
143
+ # Versioned target for a shared library
144
+ FINAL_SO_TARGET = $(SO_TARGET).$(SO_TARGET_VERSION)
145
+ MIDDLE_SO_TARGET = $(SO_TARGET).$(SO_TARGET_VERSION_MAJOR)
146
+
147
+ # Access to important ICU tools.
148
+ # Use as follows: $(INVOKE) $(GENRB) arguments ..
149
+ INVOKE = $(LDLIBRARYPATH_ENVVAR)=$(libdir):$$$(LDLIBRARYPATH_ENVVAR) $(LEAK_CHECKER)
150
+ GENCCODE = $(sbindir)/genccode
151
+ ICUPKG = $(sbindir)/icupkg
152
+ GENCMN = $(sbindir)/gencmn
153
+ GENRB = $(bindir)/genrb
154
+ PKGDATA = $(bindir)/pkgdata
155
+
156
+ # moved here because of dependencies
157
+ pkgdatadir = $(datadir)/$(PACKAGE)$(ICULIBSUFFIX)/$(VERSION)
158
+ pkglibdir = $(libdir)/$(PACKAGE)$(ICULIBSUFFIX)/$(VERSION)
159
+
160
+ ##################################################################
161
+ ##################################################################
162
+ #
163
+ # *3* Information about the host
164
+ #
165
+ ##################################################################
166
+
167
+ # Information about the host that 'configure' was run on.
168
+ host = x86_64-pc-linux-gnu
169
+ host_alias =
170
+ host_cpu = x86_64
171
+ host_vendor = pc
172
+ host_os = linux-gnu
173
+ # Our platform canonical name (as determined by configure)
174
+ # this is a #define value (i.e. U_XXXX or XXXX)
175
+ platform = U_LINUX
176
+
177
+ ##################################################################
178
+ ##################################################################
179
+ #
180
+ # *4* compiler flags and misc. options
181
+ #
182
+ ##################################################################
183
+ AR = ar
184
+ # initial tab keeps it out of the shell version.
185
+ ARFLAGS := $(ARFLAGS)
186
+ #SH#ARFLAGS=" ${ARFLAGS}"
187
+ CC = gcc
188
+ CPP = gcc -E
189
+ CFLAGS =
190
+ CPPFLAGS = -I$(prefix)/include
191
+ CXXFLAGS = -std=c++11
192
+ CXX = g++
193
+ DEFAULT_MODE = dll
194
+ DEFS = -DPACKAGE_NAME=\"ICU\" -DPACKAGE_TARNAME=\"International\ Components\ for\ Unicode\" -DPACKAGE_VERSION=\"64.2\" -DPACKAGE_STRING=\"ICU\ 64.2\" -DPACKAGE_BUGREPORT=\"http://icu-project.org/bugs\" -DPACKAGE_URL=\"http://icu-project.org\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DSIZEOF_VOID_P=8 -DHAVE_LIBM=1 -DHAVE_ELF_H=1 -DHAVE_DLFCN_H=1 -DHAVE_DLOPEN=1 -DHAVE_GETTIMEOFDAY=1 -DHAVE_LIBPTHREAD=1 -DHAVE_INTTYPES_H=1 -DHAVE_DIRENT_H=1 -DHAVE_WCHAR_H=1 -DSIZEOF_WCHAR_T=4
195
+ # use a consistent INSTALL
196
+ INSTALL = $(SHELL) $(pkgdatadir)/install-sh -c
197
+ INSTALL_DATA = ${INSTALL} -m 644
198
+ INSTALL_DATA = ${INSTALL} -m 644
199
+ INSTALL_PROGRAM = ${INSTALL}
200
+ INSTALL_PROGRAM = ${INSTALL}
201
+ INSTALL_SCRIPT = ${INSTALL}
202
+ LDFLAGS = $(RPATHLDFLAGS)
203
+ LIBS = -lpthread -ldl -lm
204
+ LIB_M =
205
+ LIB_VERSION = 64.2
206
+ LIB_VERSION_MAJOR = 64
207
+ MKINSTALLDIRS = $(SHELL) $(pkgdatadir)/mkinstalldirs
208
+ RANLIB = ranlib
209
+ RMV = rm -rf
210
+ SHELL = /bin/bash
211
+ SHLIB.c= $(CC) $(DEFS) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) -shared
212
+ SHLIB.cc= $(CXX) $(DEFS) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -shared
213
+ U_IS_BIG_ENDIAN = 0
214
+ includedir = ${prefix}/include
215
+ infodir = ${datarootdir}/info
216
+ localstatedir = ${prefix}/var
217
+ mandir = ${datarootdir}/man
218
+ oldincludedir = /usr/include
219
+ program_transform_name = s,x,x,
220
+ sharedstatedir = ${prefix}/com
221
+ sysconfdir = ${prefix}/etc
222
+ INSTALL-L = ${INSTALL_DATA}
223
+
224
+ # for derivative builds - don't bother with VERBOSE/NONVERBOSE SILENT_COMPILE
225
+ SILENT_COMPILE=$(1) #M#
226
+ ICU_MSG=@echo " $(1) " #M#
227
+
228
+ ##################################################################
229
+ ##################################################################
230
+ #
231
+ # *5* packaging options and directories
232
+ #
233
+ ##################################################################
234
+
235
+
236
+ # The basename of the ICU data file (i.e. icudt21b )
237
+ ICUDATA_CHAR = l
238
+ ICUDATA_NAME = icudt64l
239
+
240
+ # Defaults for pkgdata's mode and directories
241
+ # The default data dir changes depending on what packaging mode
242
+ # is being used
243
+ ifeq ($(strip $(PKGDATA_MODE)),)
244
+ #SH# if [ "x$PKGDATA_MODE" = "x" ];
245
+ #SH# then
246
+ PKGDATA_MODE=dll
247
+ #SH# fi
248
+ endif
249
+
250
+ #SH# case "$PKGDATA_MODE" in
251
+ ifeq ($(PKGDATA_MODE),common)
252
+ #SH# common)
253
+ ICUDATA_DIR = $(pkgdatadir)
254
+ ICUPKGDATA_DIR = $(ICUDATA_DIR)
255
+ #SH# ;;
256
+ else
257
+ ifeq ($(PKGDATA_MODE),dll)
258
+ #SH# dll)
259
+ ICUDATA_DIR = $(pkgdatadir)
260
+ ICUPKGDATA_DIR = $(libdir)
261
+ #SH# ;;
262
+ else
263
+ #SH# *)
264
+ ICUDATA_DIR = $(pkgdatadir)
265
+ ICUPKGDATA_DIR = $(ICUDATA_DIR)
266
+ #SH# ;;
267
+ endif
268
+ endif
269
+
270
+ #SH# esac
271
+
272
+ GENCCODE_ASSEMBLY = -a gcc
273
+
274
+ ##################################################################
275
+ ##################################################################
276
+ #
277
+ # *6* Inclusion of platform make fragment (mh-* file)
278
+ #
279
+ ##################################################################
280
+ # The mh- file ("make fragment") for the platform is included here.
281
+ # It may override the above settings.
282
+ # It is put last so that the mh-file can override anything.
283
+ # The selfcheck is just a sanity check that this makefile is
284
+ # parseable. The mh fragment is only included if this does not occur.
285
+
286
+ ifeq (selfcheck,$(MAKECMDGOALS)) #M#
287
+ selfcheck: #M#
288
+ @echo passed #M#
289
+ else #M#
290
+ include $(pkgdatadir)/config/mh-linux
291
+ endif #M#
292
+
Vinorm/build/lib/vinorm/lib/icu/64.2/pkgdata.inc ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GENCCODE_ASSEMBLY_TYPE=-a gcc
2
+ SO=so
3
+ SOBJ=so
4
+ A=a
5
+ LIBPREFIX=lib
6
+ LIB_EXT_ORDER=.64.2
7
+ COMPILE=gcc -D_REENTRANT -DU_HAVE_ELF_H=1 -DU_HAVE_STRTOD_L=1 -DU_HAVE_XLOCALE_H=0 -DU_ATTRIBUTE_DEPRECATED= -O2 -std=c11 -Wall -pedantic -Wshadow -Wpointer-arith -Wmissing-prototypes -Wwrite-strings -c
8
+ LIBFLAGS=-I/usr/local/include -DPIC -fPIC
9
+ GENLIB=gcc -O2 -std=c11 -Wall -pedantic -Wshadow -Wpointer-arith -Wmissing-prototypes -Wwrite-strings -shared -Wl,-Bsymbolic
10
+ LDICUDTFLAGS=-nodefaultlibs -nostdlib
11
+ LD_SONAME=-Wl,-soname -Wl,
12
+ RPATH_FLAGS=
13
+ BIR_LDFLAGS=-Wl,-Bsymbolic
14
+ AR=ar
15
+ ARFLAGS=r
16
+ RANLIB=ranlib
17
+ INSTALL_CMD=/usr/bin/install -c
Vinorm/build/lib/vinorm/lib/libicudata.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57aa7951c4335334ba9a97d4cc090ab7f39d68ee2e63cccb0d89c3f75c51d035
3
+ size 27534480
Vinorm/build/lib/vinorm/lib/libicudata.so.64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57aa7951c4335334ba9a97d4cc090ab7f39d68ee2e63cccb0d89c3f75c51d035
3
+ size 27534480
Vinorm/build/lib/vinorm/lib/libicudata.so.64.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57aa7951c4335334ba9a97d4cc090ab7f39d68ee2e63cccb0d89c3f75c51d035
3
+ size 27534480
Vinorm/build/lib/vinorm/lib/libicui18n.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0773031c81b988dd0d77cfb78de52ca8aa1bc8b32c5210b10803d5f2301e2e8a
3
+ size 3781216
Vinorm/build/lib/vinorm/lib/libicui18n.so.64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0773031c81b988dd0d77cfb78de52ca8aa1bc8b32c5210b10803d5f2301e2e8a
3
+ size 3781216
Vinorm/build/lib/vinorm/lib/libicui18n.so.64.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0773031c81b988dd0d77cfb78de52ca8aa1bc8b32c5210b10803d5f2301e2e8a
3
+ size 3781216
Vinorm/build/lib/vinorm/lib/libicuio.so ADDED
Binary file (70.2 kB). View file
 
Vinorm/build/lib/vinorm/lib/libicuio.so.64 ADDED
Binary file (70.2 kB). View file
 
Vinorm/build/lib/vinorm/lib/libicuio.so.64.2 ADDED
Binary file (70.2 kB). View file
 
Vinorm/build/lib/vinorm/lib/libicutest.so ADDED
Binary file (93.1 kB). View file
 
Vinorm/build/lib/vinorm/lib/libicutest.so.64 ADDED
Binary file (93.1 kB). View file