nguyen1207 committed on
Commit
7e94173
·
1 Parent(s): bbc8379

initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +229 -0
  2. app.py +92 -0
  3. preprocessing.py +126 -0
  4. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models
2
+
3
+ .ipynb_checkpoints
4
+ */.ipynb_checkpoints/*
5
+
6
+ # IPython
7
+ profile_default/
8
+ ipython_config.py
9
+
10
+ # Remove previous ipynb_checkpoints
11
+ # git rm -r .ipynb_checkpoints/
12
+
13
+ ### macOS ###
14
+ # General
15
+ .DS_Store
16
+ .AppleDouble
17
+ .LSOverride
18
+
19
+ # Icon must end with two \r
20
+ Icon
21
+
22
+
23
+ # Thumbnails
24
+ ._*
25
+
26
+ # Files that might appear in the root of a volume
27
+ .DocumentRevisions-V100
28
+ .fseventsd
29
+ .Spotlight-V100
30
+ .TemporaryItems
31
+ .Trashes
32
+ .VolumeIcon.icns
33
+ .com.apple.timemachine.donotpresent
34
+
35
+ # Directories potentially created on remote AFP share
36
+ .AppleDB
37
+ .AppleDesktop
38
+ Network Trash Folder
39
+ Temporary Items
40
+ .apdisk
41
+
42
+ ### macOS Patch ###
43
+ # iCloud generated files
44
+ *.icloud
45
+
46
+ ### Python ###
47
+ # Byte-compiled / optimized / DLL files
48
+ __pycache__/
49
+ *.py[cod]
50
+ *$py.class
51
+
52
+ # C extensions
53
+ *.so
54
+
55
+ # Distribution / packaging
56
+ .Python
57
+ build/
58
+ develop-eggs/
59
+ dist/
60
+ downloads/
61
+ eggs/
62
+ .eggs/
63
+ lib/
64
+ lib64/
65
+ parts/
66
+ sdist/
67
+ var/
68
+ wheels/
69
+ share/python-wheels/
70
+ *.egg-info/
71
+ .installed.cfg
72
+ *.egg
73
+ MANIFEST
74
+
75
+ # PyInstaller
76
+ # Usually these files are written by a python script from a template
77
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
78
+ *.manifest
79
+ *.spec
80
+
81
+ # Installer logs
82
+ pip-log.txt
83
+ pip-delete-this-directory.txt
84
+
85
+ # Unit test / coverage reports
86
+ htmlcov/
87
+ .tox/
88
+ .nox/
89
+ .coverage
90
+ .coverage.*
91
+ .cache
92
+ nosetests.xml
93
+ coverage.xml
94
+ *.cover
95
+ *.py,cover
96
+ .hypothesis/
97
+ .pytest_cache/
98
+ cover/
99
+
100
+ # Translations
101
+ *.mo
102
+ *.pot
103
+
104
+ # Django stuff:
105
+ *.log
106
+ local_settings.py
107
+ db.sqlite3
108
+ db.sqlite3-journal
109
+
110
+ # Flask stuff:
111
+ instance/
112
+ .webassets-cache
113
+
114
+ # Scrapy stuff:
115
+ .scrapy
116
+
117
+ # Sphinx documentation
118
+ docs/_build/
119
+
120
+ # PyBuilder
121
+ .pybuilder/
122
+ target/
123
+
124
+ # Jupyter Notebook
125
+
126
+ # IPython
127
+
128
+ # pyenv
129
+ # For a library or package, you might want to ignore these files since the code is
130
+ # intended to run in multiple environments; otherwise, check them in:
131
+ # .python-version
132
+
133
+ # pipenv
134
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
135
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
136
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
137
+ # install all needed dependencies.
138
+ #Pipfile.lock
139
+
140
+ # poetry
141
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
142
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
143
+ # commonly ignored for libraries.
144
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
145
+ #poetry.lock
146
+
147
+ # pdm
148
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
149
+ #pdm.lock
150
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
151
+ # in version control.
152
+ # https://pdm.fming.dev/#use-with-ide
153
+ .pdm.toml
154
+
155
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
156
+ __pypackages__/
157
+
158
+ # Celery stuff
159
+ celerybeat-schedule
160
+ celerybeat.pid
161
+
162
+ # SageMath parsed files
163
+ *.sage.py
164
+
165
+ # Environments
166
+ .env
167
+ .venv
168
+ env/
169
+ venv/
170
+ ENV/
171
+ env.bak/
172
+ venv.bak/
173
+
174
+ # Spyder project settings
175
+ .spyderproject
176
+ .spyproject
177
+
178
+ # Rope project settings
179
+ .ropeproject
180
+
181
+ # mkdocs documentation
182
+ /site
183
+
184
+ # mypy
185
+ .mypy_cache/
186
+ .dmypy.json
187
+ dmypy.json
188
+
189
+ # Pyre type checker
190
+ .pyre/
191
+
192
+ # pytype static type analyzer
193
+ .pytype/
194
+
195
+ # Cython debug symbols
196
+ cython_debug/
197
+
198
+ # PyCharm
199
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
200
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
201
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
202
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
203
+ #.idea/
204
+
205
+ ### Python Patch ###
206
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
207
+ poetry.toml
208
+
209
+ # ruff
210
+ .ruff_cache/
211
+
212
+ # LSP config files
213
+ pyrightconfig.json
214
+
215
+ ### VisualStudioCode ###
216
+ .vscode
217
+
218
+ # Local History for Visual Studio Code
219
+ .history/
220
+
221
+ # Built Visual Studio Code Extensions
222
+ *.vsix
223
+
224
+ ### VisualStudioCode Patch ###
225
+ # Ignore all local history of files
226
+ .history
227
+ .ionide
228
+
229
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,macos
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from langchain_community.llms import VLLMOpenAI
6
+
7
+ from preprocessing import preprocess_pipeline
8
+
9
+ load_dotenv()
10
+
11
+
12
def disable_input():
    """Mark a translation as in progress.

    Registered as the chat input's ``on_submit`` callback; the chat input
    reads ``st.session_state.translating`` for its ``disabled`` argument.
    """
    st.session_state["translating"] = True
14
+
15
+
16
def translate(prompt, top_p, temperature, repetition_penalty):
    """Stream the model's completion for ``prompt``, one chunk at a time.

    A ``VLLMOpenAI`` client is built on every call and pointed at the URL
    stored in the ``VISTRAL_7B_MT_SERVER`` environment variable.

    Args:
        prompt: Fully formatted prompt string sent to the model.
        top_p: Nucleus-sampling value forwarded to the client.
        temperature: Sampling temperature forwarded to the client.
        repetition_penalty: Forwarded as the client's ``frequency_penalty``.

    Yields:
        An empty string for each of the first three chunks, then every later
        chunk unchanged. Every chunk (including the first three) is also
        printed to stdout.

    NOTE(review): ``repetition_penalty`` is sent as ``frequency_penalty``;
    confirm the server interprets the value range the UI offers as intended.
    """
    client = VLLMOpenAI(
        openai_api_key="EMPTY",
        openai_api_base=os.getenv("VISTRAL_7B_MT_SERVER"),
        model_name="nguyen1207/Vistral-7B-MT",
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=repetition_penalty,
    )

    # Number of leading chunks that are replaced by "" in the output.
    blanked_chunks = 3

    for position, chunk in enumerate(client.stream(prompt)):
        print(chunk)
        yield chunk if position >= blanked_chunks else ""
39
+
40
+
41
# --- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="Vietnamese to English Translation")
st.title(
    "🇻🇳 Vietnamese to 🇺🇸 English Translation but with Teencode and Slang understanding 🤯"
)

# --- Sampling controls (read later when a translation is requested) --------
st.sidebar.header("Translation Parameters")
top_p = st.sidebar.slider("Top p", 0.0, 1.0, 0.95)
temperature = st.sidebar.slider("Temperature", 0.0, 2.0, 0.3)
repetition_penalty = st.sidebar.slider("Repetition Penalty", 1.0, 3.0, 1.05)

# --- Per-session state ------------------------------------------------------
# Both keys are created together the first time the script runs in a session.
if "messages" not in st.session_state:
    st.session_state["messages"] = []
    st.session_state["translating"] = False

# --- Replay the conversation so far ----------------------------------------
for entry in st.session_state.messages:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])
61
+
62
+
63
# Handle one submitted message: preprocess it, stream the translation into the
# page, store both turns in the session history, then rerun so the chat box is
# drawn enabled again.
if user_input := st.chat_input(
    "Vietnamese text goes here... 🇻🇳",
    disabled=st.session_state.translating,
    on_submit=disable_input,
):
    try:
        # Whitespace-only submissions are ignored, but still fall through to
        # the flag reset below so the input never stays locked.
        if user_input.strip():
            st.session_state.translating = True
            preprocessed_input = preprocess_pipeline(user_input)

            st.session_state.messages.append({"role": "user", "content": user_input})
            with st.chat_message("user"):
                st.markdown(user_input)

            with st.chat_message("assistant"):
                # NOTE(review): continuation lines of the template are assumed
                # to be flush-left (no leading spaces inside the literal) —
                # confirm against the deployed prompt format.
                prompt_template = """<s> [INST] Dịch câu sau từ tiếng Việt sang tiếng Anh:

Tiếng Việt: {} [/INST] """

                prompt = prompt_template.format(preprocessed_input)

                stream = translate(prompt, top_p, temperature, repetition_penalty)

                # write_stream renders the chunks as they arrive and returns
                # the concatenated text, so no second render call is needed.
                translation = st.write_stream(stream)

            st.session_state.messages.append({"role": "assistant", "content": translation})
    finally:
        # Always release the input, even if the request failed or the run was
        # interrupted; otherwise the chat box stays disabled for the session.
        st.session_state.translating = False

    # Redraw the page so the chat input is rendered enabled and the new turns
    # are shown from the stored history.
    st.rerun()
preprocessing.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+ from string import punctuation
4
+
5
+
6
# "Eyes" character followed by one or more "mouth" characters.  The letter
# mouths (v, D) only count when no word character follows them, so ordinary
# text such as "e-visa" or "name@vn.com" is not mistaken for an emoticon.
_EMOTICON_PATTERN = re.compile(r"[:;=\-@](?:[)\](><@]|[vD]+(?!\w))+")


def remove_emoticon(text: str) -> str:
    """Strip ASCII emoticons such as ``:)``, ``:))``, ``=D``, ``:v`` or ``@@``.

    An emoticon is one of ``: ; = - @`` followed by one or more of
    ``) ] ( > < @ v D``.  A ``v``/``D`` mouth is ignored when it is directly
    followed by a letter, digit or underscore, because in that position it is
    the start of a word rather than part of an emoticon.  A letter mouth that
    ends a token (for example the ``-D`` in ``vitamin-D``) is still removed.

    Args:
        text: Input string.

    Returns:
        ``text`` with every matched emoticon deleted; surrounding whitespace
        is left untouched.
    """
    return _EMOTICON_PATTERN.sub("", text)
10
+
11
+
12
def remove_emoji(text: str):
    """Delete emoji and other pictographic code points from ``text``.

    The character class below is the union of the listed ranges.  Because it
    contains U+24C2-U+1F251 and U+10000-U+10FFFF, it effectively matches every
    code point from U+24C2 upwards (which also covers CJK, Hangul and
    full-width forms) plus the four individually listed symbols below that.
    Latin text, including Vietnamese letters, is not affected.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # broad sweep across most of the upper BMP
        "\U0001f926-\U0001f937",  # gesture emoji
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",  # gender signs
        "\u2600-\u2B55",  # miscellaneous symbols
        "\u200d",  # zero-width joiner
        "\u23cf",  # eject symbol
        "\u23e9",  # fast-forward symbol
        "\u231a",  # watch
        "\ufe0f",  # variation selector-16
        "\u3030",  # wavy dash
    )
    matcher = re.compile("[" + "".join(stripped_ranges) + "]+", re.UNICODE)
    return matcher.sub("", text)
38
+
39
+
40
def remove_consecutive_whitespace(text: str):
    """Collapse each run of whitespace into one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
42
+
43
+
44
def remove_consecutive_punctuation(text: str):
    """Shrink a run of one repeated ASCII punctuation mark to a single mark.

    Only repeats of the *same* character are collapsed ("!!!" -> "!");
    a mixed sequence such as "?!" is left as it is.
    """
    mark_class = "[{}]".format(re.escape(punctuation))
    # Group 1 captures one mark; \1+ then consumes its immediate repeats.
    return re.sub("(" + mark_class + r")\1+", r"\1", text)
48
+
49
+
50
def normalize_unicode(text: str):
    """Return ``text`` converted to Unicode normalization form NFKC."""
    nfkc_text = unicodedata.normalize("NFKC", text)
    return nfkc_text
52
+
53
+
54
def _build_tone_shift_table():
    """Map old-style diphthongs (tone on the first vowel) to the new style.

    Covers the vowel pairs oa, oe and uy with each of the five Vietnamese tone
    marks, in all four upper/lower-case combinations, e.g. "òa" -> "oà",
    "ÒA" -> "OÀ", "Úy" -> "Uý".  Keys and values are precomposed (NFC).
    """
    tone_marks = (
        "\u0300",  # grave (huyền)
        "\u0301",  # acute (sắc)
        "\u0309",  # hook above (hỏi)
        "\u0303",  # tilde (ngã)
        "\u0323",  # dot below (nặng)
    )
    vowel_pairs = (("o", "a"), ("o", "e"), ("u", "y"))
    casings = (str.lower, str.upper)

    table = {}
    for first, second in vowel_pairs:
        for mark in tone_marks:
            toned_first = unicodedata.normalize("NFC", first + mark)
            toned_second = unicodedata.normalize("NFC", second + mark)
            for case_first in casings:
                for case_second in casings:
                    old_form = case_first(toned_first) + case_second(second)
                    new_form = case_first(first) + case_second(toned_second)
                    table[old_form] = new_form
    return table


_TONE_SHIFT_TABLE = _build_tone_shift_table()
_TONE_SHIFT_PATTERN = re.compile("|".join(map(re.escape, _TONE_SHIFT_TABLE)))


def normalize_accents(text: str) -> str:
    """Move the tone mark of oa/oe/uy from the first vowel to the second.

    Matching is case-sensitive and each character keeps its own case, so
    "hòa" -> "hoà" and "HÒA" -> "HOÀ".  (A case-insensitive substitution would
    rewrite upper-case input with the lower-case replacement.)

    The lookup expects precomposed characters, so apply NFC/NFKC
    normalization to ``text`` before calling this function.

    Args:
        text: Input string.

    Returns:
        ``text`` with every old-style pair replaced by its new-style form.
    """
    return _TONE_SHIFT_PATTERN.sub(
        lambda match: _TONE_SHIFT_TABLE[match.group()], text
    )
107
+
108
+
109
def preprocess_pipeline(text):
    """Run ``text`` through every cleaning step, in a fixed order.

    Order: emoticon removal, emoji removal, Unicode normalization, accent
    normalization, whitespace collapsing, punctuation collapsing.  Each step
    receives the previous step's output.
    """
    stages = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for stage in stages:
        text = stage(text)
    return text
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ huggingface_hub==0.20.3
2
+ streamlit==1.32.1
3
+ llama-cpp-python==0.2.84