nguyen1207 committed on
Commit
6136947
1 Parent(s): ff6187d

initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +229 -0
  2. app.py +112 -0
  3. preprocessing.py +126 -0
  4. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ models
2
+
3
+ .ipynb_checkpoints
4
+ */.ipynb_checkpoints/*
5
+
6
+ # IPython
7
+ profile_default/
8
+ ipython_config.py
9
+
10
+ # Remove previous ipynb_checkpoints
11
+ # git rm -r .ipynb_checkpoints/
12
+
13
+ ### macOS ###
14
+ # General
15
+ .DS_Store
16
+ .AppleDouble
17
+ .LSOverride
18
+
19
+ # Icon must end with two \r
20
+ Icon
21
+
22
+
23
+ # Thumbnails
24
+ ._*
25
+
26
+ # Files that might appear in the root of a volume
27
+ .DocumentRevisions-V100
28
+ .fseventsd
29
+ .Spotlight-V100
30
+ .TemporaryItems
31
+ .Trashes
32
+ .VolumeIcon.icns
33
+ .com.apple.timemachine.donotpresent
34
+
35
+ # Directories potentially created on remote AFP share
36
+ .AppleDB
37
+ .AppleDesktop
38
+ Network Trash Folder
39
+ Temporary Items
40
+ .apdisk
41
+
42
+ ### macOS Patch ###
43
+ # iCloud generated files
44
+ *.icloud
45
+
46
+ ### Python ###
47
+ # Byte-compiled / optimized / DLL files
48
+ __pycache__/
49
+ *.py[cod]
50
+ *$py.class
51
+
52
+ # C extensions
53
+ *.so
54
+
55
+ # Distribution / packaging
56
+ .Python
57
+ build/
58
+ develop-eggs/
59
+ dist/
60
+ downloads/
61
+ eggs/
62
+ .eggs/
63
+ lib/
64
+ lib64/
65
+ parts/
66
+ sdist/
67
+ var/
68
+ wheels/
69
+ share/python-wheels/
70
+ *.egg-info/
71
+ .installed.cfg
72
+ *.egg
73
+ MANIFEST
74
+
75
+ # PyInstaller
76
+ # Usually these files are written by a python script from a template
77
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
78
+ *.manifest
79
+ *.spec
80
+
81
+ # Installer logs
82
+ pip-log.txt
83
+ pip-delete-this-directory.txt
84
+
85
+ # Unit test / coverage reports
86
+ htmlcov/
87
+ .tox/
88
+ .nox/
89
+ .coverage
90
+ .coverage.*
91
+ .cache
92
+ nosetests.xml
93
+ coverage.xml
94
+ *.cover
95
+ *.py,cover
96
+ .hypothesis/
97
+ .pytest_cache/
98
+ cover/
99
+
100
+ # Translations
101
+ *.mo
102
+ *.pot
103
+
104
+ # Django stuff:
105
+ *.log
106
+ local_settings.py
107
+ db.sqlite3
108
+ db.sqlite3-journal
109
+
110
+ # Flask stuff:
111
+ instance/
112
+ .webassets-cache
113
+
114
+ # Scrapy stuff:
115
+ .scrapy
116
+
117
+ # Sphinx documentation
118
+ docs/_build/
119
+
120
+ # PyBuilder
121
+ .pybuilder/
122
+ target/
123
+
124
+ # Jupyter Notebook
125
+
126
+ # IPython
127
+
128
+ # pyenv
129
+ # For a library or package, you might want to ignore these files since the code is
130
+ # intended to run in multiple environments; otherwise, check them in:
131
+ # .python-version
132
+
133
+ # pipenv
134
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
135
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
136
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
137
+ # install all needed dependencies.
138
+ #Pipfile.lock
139
+
140
+ # poetry
141
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
142
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
143
+ # commonly ignored for libraries.
144
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
145
+ #poetry.lock
146
+
147
+ # pdm
148
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
149
+ #pdm.lock
150
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
151
+ # in version control.
152
+ # https://pdm.fming.dev/#use-with-ide
153
+ .pdm.toml
154
+
155
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
156
+ __pypackages__/
157
+
158
+ # Celery stuff
159
+ celerybeat-schedule
160
+ celerybeat.pid
161
+
162
+ # SageMath parsed files
163
+ *.sage.py
164
+
165
+ # Environments
166
+ .env
167
+ .venv
168
+ env/
169
+ venv/
170
+ ENV/
171
+ env.bak/
172
+ venv.bak/
173
+
174
+ # Spyder project settings
175
+ .spyderproject
176
+ .spyproject
177
+
178
+ # Rope project settings
179
+ .ropeproject
180
+
181
+ # mkdocs documentation
182
+ /site
183
+
184
+ # mypy
185
+ .mypy_cache/
186
+ .dmypy.json
187
+ dmypy.json
188
+
189
+ # Pyre type checker
190
+ .pyre/
191
+
192
+ # pytype static type analyzer
193
+ .pytype/
194
+
195
+ # Cython debug symbols
196
+ cython_debug/
197
+
198
+ # PyCharm
199
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
200
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
201
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
202
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
203
+ #.idea/
204
+
205
+ ### Python Patch ###
206
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
207
+ poetry.toml
208
+
209
+ # ruff
210
+ .ruff_cache/
211
+
212
+ # LSP config files
213
+ pyrightconfig.json
214
+
215
+ ### VisualStudioCode ###
216
+ .vscode
217
+
218
+ # Local History for Visual Studio Code
219
+ .history/
220
+
221
+ # Built Visual Studio Code Extensions
222
+ *.vsix
223
+
224
+ ### VisualStudioCode Patch ###
225
+ # Ignore all local history of files
226
+ .history
227
+ .ionide
228
+
229
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,macos
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from random import randint
2
+ from time import sleep
3
+
4
+ import streamlit as st
5
+ from huggingface_hub import hf_hub_download
6
+ from llama_cpp import Llama
7
+
8
+ from preprocessing import preprocess_pipeline
9
+
10
@st.cache_resource
def _load_model():
    """Download the GGUF weights (if needed) and load them once per process.

    Streamlit re-executes this script on every user interaction. Without
    caching, the model file would be re-resolved and the model re-loaded
    into memory on each rerun. ``st.cache_resource`` keeps a single shared
    instance alive across reruns and sessions.

    Returns:
        A ``(path, llama)`` tuple: the local path of the downloaded model
        file and the ``Llama`` instance loaded from it.
    """
    path = hf_hub_download(
        repo_id="nguyen1207/Vistral-7B-MT-GGUF",
        filename="vistral-7b-mt.Q4_K_M.gguf",
        resume_download=True,
        cache_dir="models",
    )
    return path, Llama(model_path=path)


vistral_path, llm = _load_model()
18
+
19
+
20
def disable_input():
    """Flag the session as busy.

    Registered as the ``on_submit`` callback of the chat input, whose
    ``disabled`` argument reads this same flag.
    """
    st.session_state["translating"] = True
22
+
23
+
24
def translate(llm, prompt, top_p, top_k, temperature, repetition_penalty, max_length):
    """Stream the completion of *prompt* from *llm* as text chunks.

    Args:
        llm: Object exposing ``create_completion`` (a ``llama_cpp.Llama``).
        prompt: Fully formatted prompt string.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens to sample from.
        temperature: Sampling temperature.
        repetition_penalty: Multiplicative repetition penalty (1.0 means
            no penalty). Forwarded as ``repeat_penalty``; it is not an
            additive ``frequency_penalty``.
        max_length: Maximum number of tokens to generate.

    Yields:
        An empty string for each of the first three streamed chunks, then
        the ``text`` of every following chunk.
    """
    # NOTE(review): the first chunks are blanked out rather than shown;
    # presumably the model emits a fixed prefix there. Confirm the count.
    skipped_chunks = 3

    stream = llm.create_completion(
        prompt,
        stream=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repeat_penalty=repetition_penalty,
        max_tokens=max_length,
    )

    for index, response in enumerate(stream):
        if index < skipped_chunks:
            yield ""
        else:
            yield response["choices"][0]["text"]
43
+
44
+
45
# Placeholders that nothing below reads; kept so the module-level names exist.
model = None
tokenizer = None

st.set_page_config(page_title="Vietnamese to English Translation")

st.title(
    "🇻🇳 Vietnamese to 🇺🇸 English Translation but with Teencode and Slang understanding 🤯"
)

# Sampling controls; their values are forwarded to translate() below.
st.sidebar.header("Translation Parameters")
top_p = st.sidebar.slider("Top p", min_value=0.0, max_value=1.0, value=0.95)
top_k = st.sidebar.slider("Top k", min_value=1, max_value=100, value=50)
temperature = st.sidebar.slider("Temperature", min_value=0.0, max_value=2.0, value=0.3)
repetition_penalty = st.sidebar.slider(
    "Repetition Penalty", min_value=1.0, max_value=3.0, value=1.05
)
max_length = st.sidebar.slider("Max Length", min_value=10, max_value=512, value=128)

# Per-session state: the chat history and the "busy" flag that disables input.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "translating" not in st.session_state:
    st.session_state.translating = False

# Replay the conversation so far on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


if user_input := st.chat_input(
    "Vietnamese text goes here... 🇻🇳",
    disabled=st.session_state.translating,
    on_submit=disable_input,
):
    try:
        if user_input.strip():
            st.session_state.translating = True
            preprocessed_input = preprocess_pipeline(user_input)

            st.session_state.messages.append({"role": "user", "content": user_input})
            with st.chat_message("user"):
                st.markdown(user_input)

            with st.chat_message("assistant"):
                # NOTE(review): assumes the continuation lines of the original
                # triple-quoted template started at column 0 - confirm the
                # prompt has no leading spaces before "Tiếng Việt".
                prompt_template = (
                    "<s> [INST] Dịch câu sau từ tiếng Việt sang tiếng Anh:\n"
                    "\n"
                    "Tiếng Việt: {} [/INST] "
                )
                prompt = prompt_template.format(preprocessed_input)

                stream = translate(
                    llm, prompt, top_p, top_k, temperature, repetition_penalty, max_length
                )

                # write_stream renders the chunks as they arrive and returns
                # the concatenated text, so no second st.markdown() is needed.
                translation = st.write_stream(stream)

            st.session_state.messages.append(
                {"role": "assistant", "content": translation}
            )
    finally:
        # Always re-enable the input, even for blank input or when
        # translation raises; otherwise the chat box stays disabled.
        st.session_state.translating = False

    # Redraw so the input is enabled and the history shows the new messages.
    st.rerun()
preprocessing.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+ from string import punctuation
4
+
5
+
6
def remove_emoticon(text: str) -> str:
    """Strip ASCII emoticons such as ``:))``, ``=D``, ``:v`` or ``;(``.

    An emoticon is one "eyes" character (``: ; = - @``) followed by one or
    more "mouth" characters (``) ] ( > < @ v D``). The letter mouths ``v``
    and ``D`` only count when they are not followed by another word
    character (apart from further ``v``/``D``). This keeps ordinary tokens
    such as ``email@vnu.edu.vn`` or ``Covid-vaccine`` intact.

    Args:
        text: Input text.

    Returns:
        The text with every emoticon removed; surrounding whitespace is
        left untouched.
    """
    emoticon_pattern = re.compile(
        r"[:;=\-@]"                # eyes
        r"(?:[)\](><@]"            # symbol mouths
        r"|[vD](?![^\WvD]))+"      # letter mouths, not the start of a word
    )
    return emoticon_pattern.sub("", text)
10
+
11
+
12
def remove_emoji(text: str):
    """Delete emoji and other pictographic code points from *text*.

    The character class is assembled from the ranges listed below. Two of
    them (U+24C2-U+1F251 and U+10000-U+10FFFF) are very broad, so in effect
    every code point from U+24C2 upwards is removed. That includes CJK and
    other non-Latin scripts located there, not only emoji.
    """
    ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # enclosed alphanumerics up to enclosed ideographs
        "\U0001f926-\U0001f937",  # gesture / person emoji
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",          # misc. symbols
        "\u200d",                 # zero-width joiner
        "\u23cf",                 # eject symbol
        "\u23e9",                 # fast-forward symbol
        "\u231a",                 # watch
        "\ufe0f",                 # variation selector-16
        "\u3030",                 # wavy dash
    )
    matcher = re.compile("[" + "".join(ranges) + "]+", re.UNICODE)
    return matcher.sub("", text)
38
+
39
+
40
def remove_consecutive_whitespace(text: str):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    separator = " "
    return separator.join(words)
42
+
43
+
44
def remove_consecutive_punctuation(text: str):
    """Reduce each run of one repeated ASCII punctuation mark to a single mark.

    Only repeats of the same character collapse ("!!!" becomes "!"); a
    mixed sequence such as "?!" is left alone.
    """
    punctuation_class = "[" + re.escape(punctuation) + "]"
    repeated = re.compile("(" + punctuation_class + ")" + r"\1+")
    return repeated.sub(r"\1", text)
48
+
49
+
50
def normalize_unicode(text: str):
    """Return *text* in Unicode normalization form NFKC."""
    form = "NFKC"
    normalized = unicodedata.normalize(form, text)
    return normalized
52
+
53
+
54
def normalize_accents(text: str) -> str:
    """Move the tone mark of "oa", "oe" and "uy" onto the second vowel.

    Rewrites spellings that carry the tone mark on the first vowel
    ("hòa", "khỏe", "thủy") to the form that carries it on the second
    ("hoà", "khoẻ", "thuỷ").

    Matching is case-insensitive and the case of every character is
    preserved: "Hòa" becomes "Hoà" and "HÒA" becomes "HOÀ". Substituting
    the lowercase replacement for any case variant would instead turn
    "HÒA" into "Hoà".

    Only the precomposed letters listed in the table are recognised.

    Args:
        text: Input text.

    Returns:
        The text with the tone marks repositioned.
    """
    # Lowercase forms only; letter case is restored per character below.
    dict_map = {
        "òa": "oà",
        "óa": "oá",
        "ỏa": "oả",
        "õa": "oã",
        "ọa": "oạ",
        "òe": "oè",
        "óe": "oé",
        "ỏe": "oẻ",
        "õe": "oẽ",
        "ọe": "oẹ",
        "ùy": "uỳ",
        "úy": "uý",
        "ủy": "uỷ",
        "ũy": "uỹ",
        "ụy": "uỵ",
    }

    pattern = re.compile("|".join(map(re.escape, dict_map)), flags=re.IGNORECASE)

    def _move_tone(match):
        found = match.group(0)
        replacement = dict_map[found.lower()]
        # Each output letter takes the case of the input letter it replaces.
        return "".join(
            new.upper() if old.isupper() else new
            for old, new in zip(found, replacement)
        )

    return pattern.sub(_move_tone, text)
107
+
108
+
109
def preprocess_pipeline(text):
    """Run every cleaning step over *text* and return the result.

    The steps are applied in this order: emoticon removal, emoji removal,
    Unicode normalization, accent normalization, whitespace collapsing,
    and finally collapsing of repeated punctuation.
    """
    steps = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for step in steps:
        text = step(text)
    return text
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ huggingface_hub==0.20.3
2
+ streamlit==1.32.1
3
+ llama-cpp-python==0.2.84