Spaces:

nguyen1207
/

vi-en-mt-teencode-slang

Sleeping

App Files Files Community

nguyen1207 committed on Jul 30

Commit

6136947

•

1 Parent(s): ff6187d

initial commit

Browse files

Files changed (4) hide show

.gitignore +229 -0
app.py +112 -0
preprocessing.py +126 -0
requirements.txt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,229 @@

+models
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+# IPython
+profile_default/
+ipython_config.py
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+# IPython
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+# ruff
+.ruff_cache/
+# LSP config files
+pyrightconfig.json
+### VisualStudioCode ###
+.vscode
+# Local History for Visual Studio Code
+.history/
+# Built Visual Studio Code Extensions
+*.vsix
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,jupyternotebooks,macos

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from random import randint
+from time import sleep
+import streamlit as st
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from preprocessing import preprocess_pipeline
@st.cache_resource(show_spinner=False)
def load_llm():
    """Download the GGUF weights if needed and load the model once per process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction, so loading the model at module level would reload it on
    each rerun. ``st.cache_resource`` keeps a single ``Llama`` instance
    alive across reruns.

    ``show_spinner=False`` is deliberate: the default spinner would be
    emitted before ``st.set_page_config`` runs further down the script.

    Returns:
        The loaded ``Llama`` instance.
    """
    # NOTE(review): a cached resource is shared by every session; confirm
    # that concurrent generation on one Llama instance is acceptable, or
    # guard calls with a lock if several users are expected at once.
    model_path = hf_hub_download(
        repo_id="nguyen1207/Vistral-7B-MT-GGUF",
        filename="vistral-7b-mt.Q4_K_M.gguf",
        resume_download=True,
        cache_dir="models",
    )
    return Llama(model_path=model_path)


llm = load_llm()
def disable_input():
    """Mark the session as busy translating.

    Registered as the ``on_submit`` callback of the chat input, whose
    ``disabled`` argument reads this same session-state flag.
    """
    st.session_state["translating"] = True
def translate(llm, prompt, top_p, top_k, temperature, repetition_penalty, max_length):
    """Stream the completion of *prompt* from *llm* as text fragments.

    Args:
        llm: Object exposing ``create_completion`` (a ``llama_cpp.Llama``).
        prompt: Fully formatted prompt string.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens to sample from.
        temperature: Sampling temperature.
        repetition_penalty: Multiplicative repetition penalty (1.0 means no
            penalty). Forwarded as ``repeat_penalty``; it was previously sent
            as ``frequency_penalty``, which is a different, additive penalty.
        max_length: Maximum number of tokens to generate.

    Yields:
        One string per streamed chunk. The first three chunks are replaced
        by empty strings.
    """
    stream = llm.create_completion(
        prompt,
        stream=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repeat_penalty=repetition_penalty,
        max_tokens=max_length,
    )
    for index, chunk in enumerate(stream):
        # NOTE(review): the first three chunks are deliberately blanked,
        # presumably to hide leading tokens the fine-tuned model emits
        # before the actual translation -- confirm against model output.
        yield "" if index < 3 else chunk["choices"][0]["text"]
# Prompt format sent to the model; "{}" receives the preprocessed input.
PROMPT_TEMPLATE = (
    "<s> [INST] Dịch câu sau từ tiếng Việt sang tiếng Anh:\n"
    "Tiếng Việt: {} [/INST] "
)

st.set_page_config(page_title="Vietnamese to English Translation")
st.title(
    "🇻🇳 Vietnamese to 🇺🇸 English Translation but with Teencode and Slang understanding 🤯"
)

# Sidebar: sampling parameters forwarded to translate().
st.sidebar.header("Translation Parameters")
top_p = st.sidebar.slider("Top p", min_value=0.0, max_value=1.0, value=0.95)
top_k = st.sidebar.slider("Top k", min_value=1, max_value=100, value=50)
temperature = st.sidebar.slider("Temperature", min_value=0.0, max_value=2.0, value=0.3)
repetition_penalty = st.sidebar.slider(
    "Repetition Penalty", min_value=1.0, max_value=3.0, value=1.05
)
max_length = st.sidebar.slider("Max Length", min_value=10, max_value=512, value=128)

# First run of a session: create the chat history and the busy flag.
if "messages" not in st.session_state:
    st.session_state.messages = []
    st.session_state.translating = False

# Replay the conversation so far; Streamlit redraws the page on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if user_input := st.chat_input(
    "Vietnamese text goes here... 🇻🇳",
    disabled=st.session_state.translating,
    on_submit=disable_input,
):
    try:
        # Whitespace-only submissions are ignored, but still fall through to
        # the ``finally`` below so the input box is re-enabled.
        if user_input.strip():
            st.session_state.messages.append({"role": "user", "content": user_input})
            with st.chat_message("user"):
                st.markdown(user_input)

            prompt = PROMPT_TEMPLATE.format(preprocess_pipeline(user_input))
            with st.chat_message("assistant"):
                # write_stream renders the fragments as they arrive and
                # returns the full text, so no second st.markdown is needed.
                translation = st.write_stream(
                    translate(
                        llm,
                        prompt,
                        top_p,
                        top_k,
                        temperature,
                        repetition_penalty,
                        max_length,
                    )
                )
            st.session_state.messages.append(
                {"role": "assistant", "content": translation}
            )
    finally:
        # on_submit set this flag; clear it on every path (including errors),
        # otherwise the chat input stays disabled for the rest of the session.
        st.session_state.translating = False
    # Rerun so the chat input is drawn again in its enabled state.
    st.rerun()

preprocessing.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import re
+import unicodedata
+from string import punctuation
# "Eyes" (: ; = - @) followed by one or more "mouths". The letter mouths
# v / D only count when no word character follows them, so mentions, e-mail
# addresses and hyphenated words ("@Dung", "a@vnu.edu.vn", "e-visa") are
# left intact instead of losing letters.
_EMOTICON_PATTERN = re.compile(r"[:;=\-@](?:[)\](><@]|[vD]+(?!\w))+")


def remove_emoticon(text: str) -> str:
    """Strip ASCII emoticons such as ``:))``, ``=))``, ``:v`` and ``:D``.

    Args:
        text: Input string.

    Returns:
        *text* with every emoticon match removed. Surrounding whitespace is
        left untouched.
    """
    return _EMOTICON_PATTERN.sub("", text)
# Characters treated as emoji / pictographic symbols. The earlier pattern
# contained the range U+24C2..U+1F251, which covers every BMP character above
# U+24C2 (CJK, Hangul, full-width forms, ...). Only its two endpoints are
# kept here: U+24C2 explicitly and U+1F251 via the supplementary-plane range.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # supplementary planes: emoji, flags, skin tones
    "\u2500-\u2BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
    "\u24C2"  # circled M
    "\u231A"  # watch
    "\u23CF"  # eject symbol
    "\u23E9"  # fast-forward symbol
    "\u3030"  # wavy dash
    "\u200D"  # zero-width joiner (glues emoji sequences)
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text: str) -> str:
    """Strip emoji and pictographic symbols from *text*.

    Args:
        text: Input string.

    Returns:
        *text* with every run of characters matched by ``_EMOJI_PATTERN``
        removed. Letters of any script in the Basic Multilingual Plane are
        preserved.
    """
    return _EMOJI_PATTERN.sub("", text)
def remove_consecutive_whitespace(text: str):
    """Collapse each whitespace run to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)
def remove_consecutive_punctuation(text: str):
    """Reduce a run of one repeated punctuation mark to a single mark.

    Only characters in ``string.punctuation`` are affected, and only when the
    same character repeats ("!!!" becomes "!"; "?!" is left alone).
    """
    punctuation_class = re.escape(punctuation)
    return re.sub(rf"([{punctuation_class}])\1+", r"\1", text)
def normalize_unicode(text: str):
    """Return *text* converted to Unicode normalization form NFKC."""
    nfkc_text = unicodedata.normalize("NFKC", text)
    return nfkc_text
# Maps a two-vowel sequence with the tone mark on the first vowel to the
# same sequence with the tone mark on the second vowel (e.g. "òa" -> "oà"),
# for the lower, Capitalised and UPPER spelling of each pair.
_TONE_PLACEMENT_MAP = {
    "òa": "oà",
    "Òa": "Oà",
    "ÒA": "OÀ",
    "óa": "oá",
    "Óa": "Oá",
    "ÓA": "OÁ",
    "ỏa": "oả",
    "Ỏa": "Oả",
    "ỎA": "OẢ",
    "õa": "oã",
    "Õa": "Oã",
    "ÕA": "OÃ",
    "ọa": "oạ",
    "Ọa": "Oạ",
    "ỌA": "OẠ",
    "òe": "oè",
    "Òe": "Oè",
    "ÒE": "OÈ",
    "óe": "oé",
    "Óe": "Oé",
    "ÓE": "OÉ",
    "ỏe": "oẻ",
    "Ỏe": "Oẻ",
    "ỎE": "OẺ",
    "õe": "oẽ",
    "Õe": "Oẽ",
    "ÕE": "OẼ",
    "ọe": "oẹ",
    "Ọe": "Oẹ",
    "ỌE": "OẸ",
    "ùy": "uỳ",
    "Ùy": "Uỳ",
    "ÙY": "UỲ",
    "úy": "uý",
    "Úy": "Uý",
    "ÚY": "UÝ",
    "ủy": "uỷ",
    "Ủy": "Uỷ",
    "ỦY": "UỶ",
    "ũy": "uỹ",
    "Ũy": "Uỹ",
    "ŨY": "UỸ",
    "ụy": "uỵ",
    "Ụy": "Uỵ",
    "ỤY": "UỴ",
}

# One case-sensitive alternation over all keys: a single pass over the text,
# and each casing is replaced by its own entry.
_TONE_PLACEMENT_PATTERN = re.compile(
    "|".join(re.escape(key) for key in _TONE_PLACEMENT_MAP)
)


def normalize_accents(text: str) -> str:
    """Move Vietnamese tone marks in oa / oe / uy onto the second vowel.

    Matching is case-sensitive so that letter case is preserved. The earlier
    implementation applied ``re.IGNORECASE`` to every key, which made the
    lowercase entry match all casings and rewrote "HÒA" as "Hoà".

    Only precomposed characters are matched, and mixed lower-then-upper
    pairs (e.g. "òA") have no entry and are left unchanged.

    Args:
        text: Input string.

    Returns:
        *text* with each mapped vowel pair replaced.
    """
    return _TONE_PLACEMENT_PATTERN.sub(
        lambda match: _TONE_PLACEMENT_MAP[match.group(0)], text
    )
def preprocess_pipeline(text):
    """Run every cleaning step on *text*, in a fixed order, and return the result.

    Order: emoticon removal, emoji removal, Unicode normalization, accent
    normalization, whitespace collapsing, then punctuation collapsing.
    """
    steps = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for step in steps:
        text = step(text)
    return text

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+huggingface_hub==0.20.3
+streamlit==1.32.1
+llama-cpp-python==0.2.84