Spaces:

willsh1997
/

llm_multilingual_demo

Running on Zero

App Files Files Community

willsh1997 committed on May 12

Commit

7a44d8e

1 Parent(s): cc77a80

:sparkles: initial commit

Browse files

Files changed (5) hide show

.github/workflows/push_to_hub.yml +20 -0
README.md +14 -0
flores_200_keys.csv +204 -0
llm_translate_gradio.py +127 -0
requirements.txt +85 -0

.github/workflows/push_to_hub.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push https://willsh1997:$HF_TOKEN@huggingface.co/spaces/willsh1997/llm_multilingual_demo main

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Llm Multilingual Demo
+emoji: 📊
+colorFrom: pink
+colorTo: gray
+sdk: gradio
+sdk_version: 5.16.1
+app_file: llm_translate_gradio.py
+pinned: true
+license: apache-2.0
+short_description: MVP demo of multilingual LLM performance eval space
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

flores_200_keys.csv ADDED Viewed

	@@ -0,0 +1,204 @@

+Acehnese (Arabic script),ace_Arab
+Acehnese (Latin script),ace_Latn
+Mesopotamian Arabic,acm_Arab
+Ta’izzi-Adeni Arabic,acq_Arab
+Tunisian Arabic,aeb_Arab
+Afrikaans,afr_Latn
+South Levantine Arabic,ajp_Arab
+Akan,aka_Latn
+Amharic,amh_Ethi
+North Levantine Arabic,apc_Arab
+Modern Standard Arabic,arb_Arab
+Modern Standard Arabic (Romanized),arb_Latn
+Najdi Arabic,ars_Arab
+Moroccan Arabic,ary_Arab
+Egyptian Arabic,arz_Arab
+Assamese,asm_Beng
+Asturian,ast_Latn
+Awadhi,awa_Deva
+Central Aymara,ayr_Latn
+South Azerbaijani,azb_Arab
+North Azerbaijani,azj_Latn
+Bashkir,bak_Cyrl
+Bambara,bam_Latn
+Balinese,ban_Latn
+Belarusian,bel_Cyrl
+Bemba,bem_Latn
+Bengali,ben_Beng
+Bhojpuri,bho_Deva
+Banjar (Arabic script),bjn_Arab
+Banjar (Latin script),bjn_Latn
+Standard Tibetan,bod_Tibt
+Bosnian,bos_Latn
+Buginese,bug_Latn
+Bulgarian,bul_Cyrl
+Catalan,cat_Latn
+Cebuano,ceb_Latn
+Czech,ces_Latn
+Chokwe,cjk_Latn
+Central Kurdish,ckb_Arab
+Crimean Tatar,crh_Latn
+Welsh,cym_Latn
+Danish,dan_Latn
+German,deu_Latn
+Southwestern Dinka,dik_Latn
+Dyula,dyu_Latn
+Dzongkha,dzo_Tibt
+Greek,ell_Grek
+English,eng_Latn
+Esperanto,epo_Latn
+Estonian,est_Latn
+Basque,eus_Latn
+Ewe,ewe_Latn
+Faroese,fao_Latn
+Fijian,fij_Latn
+Finnish,fin_Latn
+Fon,fon_Latn
+French,fra_Latn
+Friulian,fur_Latn
+Nigerian Fulfulde,fuv_Latn
+Scottish Gaelic,gla_Latn
+Irish,gle_Latn
+Galician,glg_Latn
+Guarani,grn_Latn
+Gujarati,guj_Gujr
+Haitian Creole,hat_Latn
+Hausa,hau_Latn
+Hebrew,heb_Hebr
+Hindi,hin_Deva
+Chhattisgarhi,hne_Deva
+Croatian,hrv_Latn
+Hungarian,hun_Latn
+Armenian,hye_Armn
+Igbo,ibo_Latn
+Ilocano,ilo_Latn
+Indonesian,ind_Latn
+Icelandic,isl_Latn
+Italian,ita_Latn
+Javanese,jav_Latn
+Japanese,jpn_Jpan
+Kabyle,kab_Latn
+Jingpho,kac_Latn
+Kamba,kam_Latn
+Kannada,kan_Knda
+Kashmiri (Arabic script),kas_Arab
+Kashmiri (Devanagari script),kas_Deva
+Georgian,kat_Geor
+Central Kanuri (Arabic script),knc_Arab
+Central Kanuri (Latin script),knc_Latn
+Kazakh,kaz_Cyrl
+Kabiyè,kbp_Latn
+Kabuverdianu,kea_Latn
+Khmer,khm_Khmr
+Kikuyu,kik_Latn
+Kinyarwanda,kin_Latn
+Kyrgyz,kir_Cyrl
+Kimbundu,kmb_Latn
+Northern Kurdish,kmr_Latn
+Kikongo,kon_Latn
+Korean,kor_Hang
+Lao,lao_Laoo
+Ligurian,lij_Latn
+Limburgish,lim_Latn
+Lingala,lin_Latn
+Lithuanian,lit_Latn
+Lombard,lmo_Latn
+Latgalian,ltg_Latn
+Luxembourgish,ltz_Latn
+Luba-Kasai,lua_Latn
+Ganda,lug_Latn
+Luo,luo_Latn
+Mizo,lus_Latn
+Standard Latvian,lvs_Latn
+Magahi,mag_Deva
+Maithili,mai_Deva
+Malayalam,mal_Mlym
+Marathi,mar_Deva
+Minangkabau (Arabic script),min_Arab
+Minangkabau (Latin script),min_Latn
+Macedonian,mkd_Cyrl
+Plateau Malagasy,plt_Latn
+Maltese,mlt_Latn
+Meitei (Bengali script),mni_Beng
+Halh Mongolian,khk_Cyrl
+Mossi,mos_Latn
+Maori,mri_Latn
+Burmese,mya_Mymr
+Dutch,nld_Latn
+Norwegian Nynorsk,nno_Latn
+Norwegian Bokmål,nob_Latn
+Nepali,npi_Deva
+Northern Sotho,nso_Latn
+Nuer,nus_Latn
+Nyanja,nya_Latn
+Occitan,oci_Latn
+West Central Oromo,gaz_Latn
+Odia,ory_Orya
+Pangasinan,pag_Latn
+Eastern Panjabi,pan_Guru
+Papiamento,pap_Latn
+Western Persian,pes_Arab
+Polish,pol_Latn
+Portuguese,por_Latn
+Dari,prs_Arab
+Southern Pashto,pbt_Arab
+Ayacucho Quechua,quy_Latn
+Romanian,ron_Latn
+Rundi,run_Latn
+Russian,rus_Cyrl
+Sango,sag_Latn
+Sanskrit,san_Deva
+Santali,sat_Olck
+Sicilian,scn_Latn
+Shan,shn_Mymr
+Sinhala,sin_Sinh
+Slovak,slk_Latn
+Slovenian,slv_Latn
+Samoan,smo_Latn
+Shona,sna_Latn
+Sindhi,snd_Arab
+Somali,som_Latn
+Southern Sotho,sot_Latn
+Spanish,spa_Latn
+Tosk Albanian,als_Latn
+Sardinian,srd_Latn
+Serbian,srp_Cyrl
+Swati,ssw_Latn
+Sundanese,sun_Latn
+Swedish,swe_Latn
+Swahili,swh_Latn
+Silesian,szl_Latn
+Tamil,tam_Taml
+Tatar,tat_Cyrl
+Telugu,tel_Telu
+Tajik,tgk_Cyrl
+Tagalog,tgl_Latn
+Thai,tha_Thai
+Tigrinya,tir_Ethi
+Tamasheq (Latin script),taq_Latn
+Tamasheq (Tifinagh script),taq_Tfng
+Tok Pisin,tpi_Latn
+Tswana,tsn_Latn
+Tsonga,tso_Latn
+Turkmen,tuk_Latn
+Tumbuka,tum_Latn
+Turkish,tur_Latn
+Twi,twi_Latn
+Central Atlas Tamazight,tzm_Tfng
+Uyghur,uig_Arab
+Ukrainian,ukr_Cyrl
+Umbundu,umb_Latn
+Urdu,urd_Arab
+Northern Uzbek,uzn_Latn
+Venetian,vec_Latn
+Vietnamese,vie_Latn
+Waray,war_Latn
+Wolof,wol_Latn
+Xhosa,xho_Latn
+Eastern Yiddish,ydd_Hebr
+Yoruba,yor_Latn
+Yue Chinese,yue_Hant
+Chinese (Simplified),zho_Hans
+Chinese (Traditional),zho_Hant
+Standard Malay,zsm_Latn
+Zulu,zul_Latn

llm_translate_gradio.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import spaces
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+import torch
+from transformers import pipeline
+import pandas as pd
+import gradio as gr
+#NLLB translation setup
+tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+@spaces.GPU
+def translate_to_lang(input_str, target_lang):
+    """
+    Function to translate arbitrary language input to one of 202 languages.
+    inputs:
+    - input_str [str]: Input arbitrary language str
+    - target_lang [str]: FLORES 200 str indicating the target language to translate to
+    outputs:
+    - output_str [str]: output in translated language
+    """
+    assert target_lang in tokenizer.additional_special_tokens, "not a valid FLORES 200 language!"
+    inputs = tokenizer(input_str, return_tensors="pt")
+    translated_tokens = model.generate(
+        **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
+    )
+    output_str = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    return output_str
+lang_keys = pd.read_csv('flores_200_keys.csv', header=None)
+#FLORES normal name key setup
+flores_dict = {}
+for i in range(len(lang_keys)):
+    flores_dict[lang_keys.loc[i][0]]=lang_keys.loc[i][1]
+#Llama 3.2 3B Instruct setup
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
+# Text-generation pipeline used by llama_QA; loaded in bfloat16 with
+# device placement delegated to device_map="auto".
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+@spaces.GPU
+def llama_QA(input_question):
+    """
+    stupid func for asking llama a question and then getting an answer
+    inputs:
+    - input_question [str]: question for llama to answer
+    outputs:
+    - response [str]: llama's response
+    """
+    messages = [
+    {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions in the language they are asked in."},
+    {"role": "user", "content": input_question},
+    ]
+    outputs = pipe(
+        messages,
+        max_new_tokens=512
+    )
+    response = outputs[0]["generated_text"][-1]['content']
+    return response
+# QA translation roundtrip
+@spaces.GPU
+def llama_multilang_roundtrip(input_question, lang):
+    """
+    func which translates input q to another language, asks llama that q in that lang, then translates that response back to english
+    inputs:
+    - input_question [str]: question to ask and be translated
+    - lang [str]: FLORES 200 target lang for roundtrip
+    outputs:
+    - response [str]: response in english, translated from llama response
+    """
+    noneng_input = translate_to_lang(input_question, lang)
+    init_response = llama_QA(noneng_input)
+    response = translate_to_lang(init_response, 'eng_Latn')
+    return response
+@spaces.GPU
+def gradio_func(input_question, left_lang, right_lang):
+    """
+    silly wrapper function for gradio that turns all inputs into a single func. runs both the LHS and RHS of teh 'app' in order to let gradio work correctly.
+    """
+    left_output = llama_multilang_roundtrip(input_question, flores_dict[left_lang])
+    right_output = llama_multilang_roundtrip(input_question, flores_dict[right_lang])
+    return left_output, right_output
+# Create the Gradio interface
+def create_interface():
+    # Get available languages from the flores_dict
+    language_choices = list(flores_dict.keys())
+    with gr.Blocks() as demo:
+        gr.Markdown("Ask Llama the same question in different languages!")
+        with gr.Row():
+            question_input = gr.Textbox(label="Enter your question", interactive=True)
+        with gr.Row():
+            left_lang = gr.Dropdown(choices=language_choices, label="Language #1")
+            right_lang = gr.Dropdown(choices=language_choices, label="Language #2")
+        with gr.Row():
+            submit_btn = gr.Button("Translate")
+        with gr.Row():
+            left_output = gr.Textbox(label="Language #1 answer", interactive=False)
+            right_output = gr.Textbox(label="Language #2 answer", interactive=False)
+        submit_btn.click(
+            fn=gradio_func,
+            inputs=[question_input, left_lang, right_lang],
+            outputs=[left_output, right_output]
+        )
+    return demo
+# Launch the app
+# The interface is built and served at module top level, i.e. whenever this file is executed.
+demo = create_interface()
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,85 @@

+accelerate==1.4.0
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.8.0
+asttokens==3.0.0
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastapi==0.115.8
+ffmpy==0.5.0
+filelock==3.17.0
+fsspec==2025.2.0
+gradio==5.16.1
+gradio_client==1.7.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.28.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.32.0
+jedi==0.19.2
+Jinja2==3.1.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+nest-asyncio==1.6.0
+networkx==3.4.2
+numpy==2.2.3
+orjson==3.10.15
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.1.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.50
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pydantic==2.10.6
+pydantic_core==2.27.2
+pydub==0.25.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2025.1
+PyYAML==6.0.2
+pyzmq==26.2.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+ruff==0.9.6
+safehttpx==0.1.6
+safetensors==0.5.2
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+stack-data==0.6.3
+starlette==0.45.3
+sympy==1.13.1
+tokenizers==0.21.0
+tomlkit==0.13.2
+torch==2.4.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.49.0
+typer==0.15.1
+typing_extensions==4.12.2
+tzdata==2025.1
+urllib3==2.3.0
+uvicorn==0.34.0
+wcwidth==0.2.13
+websockets==14.2