willsh1997 commited on
Commit
7a44d8e
·
1 Parent(s): cc77a80

:sparkles: initial commit

Browse files
.github/workflows/push_to_hub.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Mirrors this GitHub repository to the matching Hugging Face Space.
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # Full history and LFS objects are fetched so the push to the Space
          # carries the complete branch.
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          # Token is supplied from the repository secret named HF_TOKEN.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # The shell expands $HF_TOKEN from the step environment at run time.
        run: git push https://willsh1997:$HF_TOKEN@huggingface.co/spaces/willsh1997/llm_multilingual_demo main
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Llm Multilingual Demo
3
+ emoji: 📊
4
+ colorFrom: pink
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 5.16.1
8
+ app_file: llm_translate_gradio.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: MVP demo of multilingual LLM performance eval space
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
flores_200_keys.csv ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Acehnese (Arabic script),ace_Arab
2
+ Acehnese (Latin script),ace_Latn
3
+ Mesopotamian Arabic,acm_Arab
4
+ Ta’izzi-Adeni Arabic,acq_Arab
5
+ Tunisian Arabic,aeb_Arab
6
+ Afrikaans,afr_Latn
7
+ South Levantine Arabic,ajp_Arab
8
+ Akan,aka_Latn
9
+ Amharic,amh_Ethi
10
+ North Levantine Arabic,apc_Arab
11
+ Modern Standard Arabic,arb_Arab
12
+ Modern Standard Arabic (Romanized),arb_Latn
13
+ Najdi Arabic,ars_Arab
14
+ Moroccan Arabic,ary_Arab
15
+ Egyptian Arabic,arz_Arab
16
+ Assamese,asm_Beng
17
+ Asturian,ast_Latn
18
+ Awadhi,awa_Deva
19
+ Central Aymara,ayr_Latn
20
+ South Azerbaijani,azb_Arab
21
+ North Azerbaijani,azj_Latn
22
+ Bashkir,bak_Cyrl
23
+ Bambara,bam_Latn
24
+ Balinese,ban_Latn
25
+ Belarusian,bel_Cyrl
26
+ Bemba,bem_Latn
27
+ Bengali,ben_Beng
28
+ Bhojpuri,bho_Deva
29
+ Banjar (Arabic script),bjn_Arab
30
+ Banjar (Latin script),bjn_Latn
31
+ Standard Tibetan,bod_Tibt
32
+ Bosnian,bos_Latn
33
+ Buginese,bug_Latn
34
+ Bulgarian,bul_Cyrl
35
+ Catalan,cat_Latn
36
+ Cebuano,ceb_Latn
37
+ Czech,ces_Latn
38
+ Chokwe,cjk_Latn
39
+ Central Kurdish,ckb_Arab
40
+ Crimean Tatar,crh_Latn
41
+ Welsh,cym_Latn
42
+ Danish,dan_Latn
43
+ German,deu_Latn
44
+ Southwestern Dinka,dik_Latn
45
+ Dyula,dyu_Latn
46
+ Dzongkha,dzo_Tibt
47
+ Greek,ell_Grek
48
+ English,eng_Latn
49
+ Esperanto,epo_Latn
50
+ Estonian,est_Latn
51
+ Basque,eus_Latn
52
+ Ewe,ewe_Latn
53
+ Faroese,fao_Latn
54
+ Fijian,fij_Latn
55
+ Finnish,fin_Latn
56
+ Fon,fon_Latn
57
+ French,fra_Latn
58
+ Friulian,fur_Latn
59
+ Nigerian Fulfulde,fuv_Latn
60
+ Scottish Gaelic,gla_Latn
61
+ Irish,gle_Latn
62
+ Galician,glg_Latn
63
+ Guarani,grn_Latn
64
+ Gujarati,guj_Gujr
65
+ Haitian Creole,hat_Latn
66
+ Hausa,hau_Latn
67
+ Hebrew,heb_Hebr
68
+ Hindi,hin_Deva
69
+ Chhattisgarhi,hne_Deva
70
+ Croatian,hrv_Latn
71
+ Hungarian,hun_Latn
72
+ Armenian,hye_Armn
73
+ Igbo,ibo_Latn
74
+ Ilocano,ilo_Latn
75
+ Indonesian,ind_Latn
76
+ Icelandic,isl_Latn
77
+ Italian,ita_Latn
78
+ Javanese,jav_Latn
79
+ Japanese,jpn_Jpan
80
+ Kabyle,kab_Latn
81
+ Jingpho,kac_Latn
82
+ Kamba,kam_Latn
83
+ Kannada,kan_Knda
84
+ Kashmiri (Arabic script),kas_Arab
85
+ Kashmiri (Devanagari script),kas_Deva
86
+ Georgian,kat_Geor
87
+ Central Kanuri (Arabic script),knc_Arab
88
+ Central Kanuri (Latin script),knc_Latn
89
+ Kazakh,kaz_Cyrl
90
+ Kabiyè,kbp_Latn
91
+ Kabuverdianu,kea_Latn
92
+ Khmer,khm_Khmr
93
+ Kikuyu,kik_Latn
94
+ Kinyarwanda,kin_Latn
95
+ Kyrgyz,kir_Cyrl
96
+ Kimbundu,kmb_Latn
97
+ Northern Kurdish,kmr_Latn
98
+ Kikongo,kon_Latn
99
+ Korean,kor_Hang
100
+ Lao,lao_Laoo
101
+ Ligurian,lij_Latn
102
+ Limburgish,lim_Latn
103
+ Lingala,lin_Latn
104
+ Lithuanian,lit_Latn
105
+ Lombard,lmo_Latn
106
+ Latgalian,ltg_Latn
107
+ Luxembourgish,ltz_Latn
108
+ Luba-Kasai,lua_Latn
109
+ Ganda,lug_Latn
110
+ Luo,luo_Latn
111
+ Mizo,lus_Latn
112
+ Standard Latvian,lvs_Latn
113
+ Magahi,mag_Deva
114
+ Maithili,mai_Deva
115
+ Malayalam,mal_Mlym
116
+ Marathi,mar_Deva
117
+ Minangkabau (Arabic script),min_Arab
118
+ Minangkabau (Latin script),min_Latn
119
+ Macedonian,mkd_Cyrl
120
+ Plateau Malagasy,plt_Latn
121
+ Maltese,mlt_Latn
122
+ Meitei (Bengali script),mni_Beng
123
+ Halh Mongolian,khk_Cyrl
124
+ Mossi,mos_Latn
125
+ Maori,mri_Latn
126
+ Burmese,mya_Mymr
127
+ Dutch,nld_Latn
128
+ Norwegian Nynorsk,nno_Latn
129
+ Norwegian Bokmål,nob_Latn
130
+ Nepali,npi_Deva
131
+ Northern Sotho,nso_Latn
132
+ Nuer,nus_Latn
133
+ Nyanja,nya_Latn
134
+ Occitan,oci_Latn
135
+ West Central Oromo,gaz_Latn
136
+ Odia,ory_Orya
137
+ Pangasinan,pag_Latn
138
+ Eastern Panjabi,pan_Guru
139
+ Papiamento,pap_Latn
140
+ Western Persian,pes_Arab
141
+ Polish,pol_Latn
142
+ Portuguese,por_Latn
143
+ Dari,prs_Arab
144
+ Southern Pashto,pbt_Arab
145
+ Ayacucho Quechua,quy_Latn
146
+ Romanian,ron_Latn
147
+ Rundi,run_Latn
148
+ Russian,rus_Cyrl
149
+ Sango,sag_Latn
150
+ Sanskrit,san_Deva
151
+ Santali,sat_Olck
152
+ Sicilian,scn_Latn
153
+ Shan,shn_Mymr
154
+ Sinhala,sin_Sinh
155
+ Slovak,slk_Latn
156
+ Slovenian,slv_Latn
157
+ Samoan,smo_Latn
158
+ Shona,sna_Latn
159
+ Sindhi,snd_Arab
160
+ Somali,som_Latn
161
+ Southern Sotho,sot_Latn
162
+ Spanish,spa_Latn
163
+ Tosk Albanian,als_Latn
164
+ Sardinian,srd_Latn
165
+ Serbian,srp_Cyrl
166
+ Swati,ssw_Latn
167
+ Sundanese,sun_Latn
168
+ Swedish,swe_Latn
169
+ Swahili,swh_Latn
170
+ Silesian,szl_Latn
171
+ Tamil,tam_Taml
172
+ Tatar,tat_Cyrl
173
+ Telugu,tel_Telu
174
+ Tajik,tgk_Cyrl
175
+ Tagalog,tgl_Latn
176
+ Thai,tha_Thai
177
+ Tigrinya,tir_Ethi
178
+ Tamasheq (Latin script),taq_Latn
179
+ Tamasheq (Tifinagh script),taq_Tfng
180
+ Tok Pisin,tpi_Latn
181
+ Tswana,tsn_Latn
182
+ Tsonga,tso_Latn
183
+ Turkmen,tuk_Latn
184
+ Tumbuka,tum_Latn
185
+ Turkish,tur_Latn
186
+ Twi,twi_Latn
187
+ Central Atlas Tamazight,tzm_Tfng
188
+ Uyghur,uig_Arab
189
+ Ukrainian,ukr_Cyrl
190
+ Umbundu,umb_Latn
191
+ Urdu,urd_Arab
192
+ Northern Uzbek,uzn_Latn
193
+ Venetian,vec_Latn
194
+ Vietnamese,vie_Latn
195
+ Waray,war_Latn
196
+ Wolof,wol_Latn
197
+ Xhosa,xho_Latn
198
+ Eastern Yiddish,ydd_Hebr
199
+ Yoruba,yor_Latn
200
+ Yue Chinese,yue_Hant
201
+ Chinese (Simplified),zho_Hans
202
+ Chinese (Traditional),zho_Hant
203
+ Standard Malay,zsm_Latn
204
+ Zulu,zul_Latn
llm_translate_gradio.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ import torch
4
+ from transformers import pipeline
5
+ import pandas as pd
6
+ import gradio as gr
7
+
8
# NLLB translation setup: the tokenizer and the seq2seq model are loaded from
# the same checkpoint, once, at import time.
NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_CHECKPOINT)
12
+
13
@spaces.GPU
def translate_to_lang(input_str, target_lang, max_new_tokens=512):
    """
    Translate text into one of the FLORES 200 languages with the NLLB model.

    inputs:
        - input_str [str]: text to translate
        - target_lang [str]: FLORES 200 str indicating the target language to translate to
        - max_new_tokens [int]: upper bound on the number of generated tokens (default 512)

    outputs:
        - output_str [str]: output in translated language

    raises:
        - AssertionError: if target_lang is not one of the tokenizer's additional special tokens

    NOTE(review): the input is tagged with whatever tokenizer.src_lang is
    currently set to; this function does not detect the source language.
    """
    assert target_lang in tokenizer.additional_special_tokens, "not a valid FLORES 200 language!"

    # truncation=True keeps over-long inputs within the tokenizer's configured
    # maximum length instead of feeding the model more positions than it supports.
    # The encoded tensors are moved to wherever the model's weights live.
    inputs = tokenizer(input_str, return_tensors="pt", truncation=True).to(model.device)

    # Without an explicit limit generate() falls back to the checkpoint's default
    # length, which can silently cut long answers short.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang),
        max_new_tokens=max_new_tokens,
    )
    # A single string was passed in, so the batch holds exactly one translation.
    output_str = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return output_str
33
+
34
# FLORES normal name key setup: the CSV has no header row; column 0 holds the
# human-readable language name and column 1 the FLORES 200 code.
lang_keys = pd.read_csv('flores_200_keys.csv', header=None)
flores_dict = dict(zip(lang_keys[0], lang_keys[1]))
39
+
40
# Llama 3.2 3B Instruct setup: a text-generation pipeline built once at import
# time, in bfloat16, with automatic device placement.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    task="text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
48
+
49
@spaces.GPU
def llama_QA(input_question):
    """
    Ask the Llama chat pipeline a single question and return its reply.

    inputs:
        - input_question [str]: question for llama to answer

    outputs:
        - response [str]: llama's response
    """
    system_turn = {"role": "system", "content": "You are a helpful chatbot assistant. Answer all questions in the language they are asked in."}
    user_turn = {"role": "user", "content": input_question}

    generations = pipe([system_turn, user_turn], max_new_tokens=512)

    # "generated_text" holds the whole conversation; the last entry is the
    # message the pipeline appended.
    final_message = generations[0]["generated_text"][-1]
    return final_message['content']
70
+
71
+
72
# QA translation roundtrip
@spaces.GPU
def llama_multilang_roundtrip(input_question, lang):
    """
    Translate a question into another language, ask llama in that language,
    then translate llama's answer back to English.

    inputs:
        - input_question [str]: question to ask and be translated
        - lang [str]: FLORES 200 target lang for roundtrip

    outputs:
        - response [str]: response in english, translated from llama response
    """
    noneng_input = translate_to_lang(input_question, lang)
    init_response = llama_QA(noneng_input)

    # The NLLB tokenizer tags its input with tokenizer.src_lang. For the return
    # leg the text is llama's answer, which is expected to be in `lang` (the
    # system prompt asks llama to answer in the language of the question), so
    # the source language is switched for this call and restored afterwards.
    # NOTE(review): this mutates the shared module-level tokenizer; confirm
    # requests are not served concurrently in the same process.
    original_src_lang = tokenizer.src_lang
    tokenizer.src_lang = lang
    try:
        response = translate_to_lang(init_response, 'eng_Latn')
    finally:
        tokenizer.src_lang = original_src_lang
    return response
89
+
90
@spaces.GPU
def gradio_func(input_question, left_lang, right_lang):
    """
    Single entry point for the Gradio button: runs the roundtrip for the
    left-hand language and then for the right-hand language.

    inputs:
        - input_question [str]: question to ask
        - left_lang [str]: language name (a key of flores_dict) for the left column
        - right_lang [str]: language name (a key of flores_dict) for the right column

    outputs:
        - (left_output, right_output): the two answers, in that order
    """
    answers = tuple(
        llama_multilang_roundtrip(input_question, flores_dict[lang_name])
        for lang_name in (left_lang, right_lang)
    )
    return answers
98
+
99
# Create the Gradio interface
def create_interface():
    """
    Build the Blocks UI: a question box, two language dropdowns, a submit
    button and two read-only answer boxes wired to gradio_func.

    outputs:
        - demo [gr.Blocks]: the assembled (not yet launched) interface
    """
    # Dropdown entries are the human-readable names used as flores_dict keys.
    language_choices = list(flores_dict)

    with gr.Blocks() as demo:
        gr.Markdown("Ask Llama the same question in different languages!")

        with gr.Row():
            question_box = gr.Textbox(label="Enter your question", interactive=True)

        with gr.Row():
            first_lang_picker = gr.Dropdown(choices=language_choices, label="Language #1")
            second_lang_picker = gr.Dropdown(choices=language_choices, label="Language #2")

        with gr.Row():
            run_button = gr.Button("Translate")

        with gr.Row():
            first_answer_box = gr.Textbox(label="Language #1 answer", interactive=False)
            second_answer_box = gr.Textbox(label="Language #2 answer", interactive=False)

        # Input and output order matches gradio_func's parameters and return tuple.
        run_button.click(
            fn=gradio_func,
            inputs=[question_box, first_lang_picker, second_lang_picker],
            outputs=[first_answer_box, second_answer_box],
        )

    return demo
124
+
125
# Launch the app
# The interface is built once at import time and bound to the module-level name `demo`.
demo = create_interface()
# Start serving the interface with Gradio's default launch settings.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.4.0
2
+ aiofiles==23.2.1
3
+ annotated-types==0.7.0
4
+ anyio==4.8.0
5
+ asttokens==3.0.0
6
+ certifi==2025.1.31
7
+ charset-normalizer==3.4.1
8
+ click==8.1.8
9
+ comm==0.2.2
10
+ debugpy==1.8.12
11
+ decorator==5.1.1
12
+ exceptiongroup==1.2.2
13
+ executing==2.2.0
14
+ fastapi==0.115.8
15
+ ffmpy==0.5.0
16
+ filelock==3.17.0
17
+ fsspec==2025.2.0
18
+ gradio==5.16.1
19
+ gradio_client==1.7.0
20
+ h11==0.14.0
21
+ httpcore==1.0.7
22
+ httpx==0.28.1
23
+ huggingface-hub==0.28.1
24
+ idna==3.10
25
+ ipykernel==6.29.5
26
+ ipython==8.32.0
27
+ jedi==0.19.2
28
+ Jinja2==3.1.5
29
+ jupyter_client==8.6.3
30
+ jupyter_core==5.7.2
31
+ markdown-it-py==3.0.0
32
+ MarkupSafe==2.1.5
33
+ matplotlib-inline==0.1.7
34
+ mdurl==0.1.2
35
+ mpmath==1.3.0
36
+ nest-asyncio==1.6.0
37
+ networkx==3.4.2
38
+ numpy==2.2.3
39
+ orjson==3.10.15
40
+ packaging==24.2
41
+ pandas==2.2.3
42
+ parso==0.8.4
43
+ pexpect==4.9.0
44
+ pillow==11.1.0
45
+ platformdirs==4.3.6
46
+ prompt_toolkit==3.0.50
47
+ psutil==7.0.0
48
+ ptyprocess==0.7.0
49
+ pure_eval==0.2.3
50
+ pydantic==2.10.6
51
+ pydantic_core==2.27.2
52
+ pydub==0.25.1
53
+ Pygments==2.19.1
54
+ python-dateutil==2.9.0.post0
55
+ python-multipart==0.0.20
56
+ pytz==2025.1
57
+ PyYAML==6.0.2
58
+ pyzmq==26.2.1
59
+ regex==2024.11.6
60
+ requests==2.32.3
61
+ rich==13.9.4
62
+ ruff==0.9.6
63
+ safehttpx==0.1.6
64
+ safetensors==0.5.2
65
+ semantic-version==2.10.0
66
+ shellingham==1.5.4
67
+ six==1.17.0
68
+ sniffio==1.3.1
69
+ stack-data==0.6.3
70
+ starlette==0.45.3
71
+ sympy==1.13.1
72
+ tokenizers==0.21.0
73
+ tomlkit==0.13.2
74
+ torch==2.4.0
75
+ tornado==6.4.2
76
+ tqdm==4.67.1
77
+ traitlets==5.14.3
78
+ transformers==4.49.0
79
+ typer==0.15.1
80
+ typing_extensions==4.12.2
81
+ tzdata==2025.1
82
+ urllib3==2.3.0
83
+ uvicorn==0.34.0
84
+ wcwidth==0.2.13
85
+ websockets==14.2