Spaces:

naacl-anonymous
/

selective_pre_translation

Runtime error

App Files Files Community

Anonymous committed on Oct 1, 2024

Commit

707f578

1 Parent(s): 7c008e6

changes

Browse files

Files changed (3) hide show

app.py +1 -1
requirements.txt +3 -1
tasks/summarization.py +20 -20

app.py CHANGED Viewed

@@ -41,7 +41,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
         language = gr.Dropdown(label="Source Language", choices=languages, value="English")
-        model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English"], value='English')
     config_recommendation = gr.Button("Recommend Configuration")
     with gr.Row():
         config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,

     with gr.Row():
         task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
         language = gr.Dropdown(label="Source Language", choices=languages, value="English")
+        model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English-Centric"], value='Multilingual')
     config_recommendation = gr.Button("Recommend Configuration")
     with gr.Row():
         config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,

requirements.txt CHANGED Viewed

@@ -3,4 +3,6 @@ numpy
 datasets
 easygoogletranslate
 evaluate
-langchain

 datasets
 easygoogletranslate
 evaluate
+langchain
+tqdm
+iso639

tasks/summarization.py CHANGED Viewed

@@ -1,10 +1,9 @@
 from typing import List, Dict, Optional, Union
 import numpy as np
 from datasets import Dataset, load_dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
 LANGUAGE_TO_SUFFIX = {
     "chinese_simplified": "zh-CN",
     "french": "fr",
@@ -24,12 +23,13 @@ LANGUAGE_TO_SUFFIX = {
     "persian": "fa",
     "azerbaijani": "az",
     "korean": "ko",
 }
 def choose_few_shot_examples(
         train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
 ) -> List[Dict[str, Union[str, int]]]:
     selected_examples = []
     example_idxs = []
@@ -56,15 +56,15 @@ def choose_few_shot_examples(
 def _translate_instruction(basic_instruction: str, target_language: str) -> str:
     translator = EasyGoogleTranslate(
         source_language="en",
-        target_language=LANGUAGE_TO_SUFFIX[target_language],
         timeout=50,
     )
     return translator.translate(basic_instruction)
 def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
-    translator = EasyGoogleTranslate(source_language=LANGUAGE_TO_SUFFIX[src_language],
-                                     target_language=LANGUAGE_TO_SUFFIX[target_language],
                                      timeout=30)
     try:
         return {'text': translator.translate(example['text']), 'summary': ''}
@@ -85,20 +85,20 @@ def create_instruction(lang: str, expected_output: str):
     )
-def load_xlsum_data(lang, split, limit = 5):
     """Loads the xlsum dataset"""
     dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
     return dataset.select(range(limit))
 def construct_prompt(
-    instruction: str,
-    test_example: dict,
-    zero_shot: bool,
-    dataset: str,
-    num_examples: int,
-    lang: str,
-    config: Dict[str, str],
 ):
     if not instruction:
         print(lang)
@@ -110,14 +110,14 @@ def construct_prompt(
     zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
-    test_data = load_xlsum_data(lang=lang, split="test", limit=100)
-    print(test_data)
-    print(num_examples)
-    print(lang)
     ic_examples = []
     if not zero_shot:
         ic_examples = choose_few_shot_examples(
             train_dataset=test_data,
             few_shot_size=num_examples,
@@ -139,7 +139,7 @@ def construct_prompt(
     )
     print("lang", lang)
-    print(config["input"] , lang)
     if config["input"] != lang:
         test_example = _translate_example(
             example=test_example, src_language=lang, target_language=config["input"]

 from typing import List, Dict, Optional, Union
 import numpy as np
 from datasets import Dataset, load_dataset
 from easygoogletranslate import EasyGoogleTranslate
 from langchain.prompts import PromptTemplate, FewShotPromptTemplate
+from iso639 import to_iso639_1
 LANGUAGE_TO_SUFFIX = {
     "chinese_simplified": "zh-CN",
     "french": "fr",
     "persian": "fa",
     "azerbaijani": "az",
     "korean": "ko",
+    "hebrew": "he",
 }
 def choose_few_shot_examples(
         train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
 ) -> List[Dict[str, Union[str, int]]]:
     selected_examples = []
     example_idxs = []
 def _translate_instruction(basic_instruction: str, target_language: str) -> str:
     translator = EasyGoogleTranslate(
         source_language="en",
+        target_language=to_iso639_1(target_language),
         timeout=50,
     )
     return translator.translate(basic_instruction)
 def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
+    translator = EasyGoogleTranslate(source_language=to_iso639_1(str(src_language).capitalize()),
+                                     target_language=to_iso639_1(str(target_language).capitalize()),
                                      timeout=30)
     try:
         return {'text': translator.translate(example['text']), 'summary': ''}
     )
+def load_xlsum_data(lang, split, limit=5):
     """Loads the xlsum dataset"""
     dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
     return dataset.select(range(limit))
 def construct_prompt(
+        instruction: str,
+        test_example: dict,
+        zero_shot: bool,
+        dataset: str,
+        num_examples: int,
+        lang: str,
+        config: Dict[str, str],
 ):
     if not instruction:
         print(lang)
     zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
+    if not zero_shot:
+        try:
+            test_data = load_xlsum_data(lang=lang, split="test", limit=100)
+        except Exception as e:
+            raise KeyError(f"{lang} is not supported in XlSum dataset, choose supported language in few-shot")
     ic_examples = []
     if not zero_shot:
         ic_examples = choose_few_shot_examples(
             train_dataset=test_data,
             few_shot_size=num_examples,
     )
     print("lang", lang)
+    print(config["input"], lang)
     if config["input"] != lang:
         test_example = _translate_example(
             example=test_example, src_language=lang, target_language=config["input"]