Anonymous committed on
Commit
707f578
·
1 Parent(s): 7c008e6
Files changed (3) hide show
  1. app.py +1 -1
  2. requirements.txt +3 -1
  3. tasks/summarization.py +20 -20
app.py CHANGED
@@ -41,7 +41,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
41
  with gr.Row():
42
  task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
43
  language = gr.Dropdown(label="Source Language", choices=languages, value="English")
44
- model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English"], value='English')
45
  config_recommendation = gr.Button("Recommend Configuration")
46
  with gr.Row():
47
  config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,
 
41
  with gr.Row():
42
  task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
43
  language = gr.Dropdown(label="Source Language", choices=languages, value="English")
44
+ model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English-Centric"], value='Multilingual')
45
  config_recommendation = gr.Button("Recommend Configuration")
46
  with gr.Row():
47
  config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,
requirements.txt CHANGED
@@ -3,4 +3,6 @@ numpy
3
  datasets
4
  easygoogletranslate
5
  evaluate
6
- langchain
 
 
 
3
  datasets
4
  easygoogletranslate
5
  evaluate
6
+ langchain
7
+ tqdm
8
+ iso639
tasks/summarization.py CHANGED
@@ -1,10 +1,9 @@
1
  from typing import List, Dict, Optional, Union
2
-
3
  import numpy as np
4
  from datasets import Dataset, load_dataset
5
  from easygoogletranslate import EasyGoogleTranslate
6
  from langchain.prompts import PromptTemplate, FewShotPromptTemplate
7
-
8
  LANGUAGE_TO_SUFFIX = {
9
  "chinese_simplified": "zh-CN",
10
  "french": "fr",
@@ -24,12 +23,13 @@ LANGUAGE_TO_SUFFIX = {
24
  "persian": "fa",
25
  "azerbaijani": "az",
26
  "korean": "ko",
 
27
  }
28
 
 
29
  def choose_few_shot_examples(
30
  train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
31
  ) -> List[Dict[str, Union[str, int]]]:
32
-
33
  selected_examples = []
34
 
35
  example_idxs = []
@@ -56,15 +56,15 @@ def choose_few_shot_examples(
56
  def _translate_instruction(basic_instruction: str, target_language: str) -> str:
57
  translator = EasyGoogleTranslate(
58
  source_language="en",
59
- target_language=LANGUAGE_TO_SUFFIX[target_language],
60
  timeout=50,
61
  )
62
  return translator.translate(basic_instruction)
63
 
64
 
65
  def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
66
- translator = EasyGoogleTranslate(source_language=LANGUAGE_TO_SUFFIX[src_language],
67
- target_language=LANGUAGE_TO_SUFFIX[target_language],
68
  timeout=30)
69
  try:
70
  return {'text': translator.translate(example['text']), 'summary': ''}
@@ -85,20 +85,20 @@ def create_instruction(lang: str, expected_output: str):
85
  )
86
 
87
 
88
- def load_xlsum_data(lang, split, limit = 5):
89
  """Loads the xlsum dataset"""
90
  dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
91
  return dataset.select(range(limit))
92
 
93
 
94
  def construct_prompt(
95
- instruction: str,
96
- test_example: dict,
97
- zero_shot: bool,
98
- dataset: str,
99
- num_examples: int,
100
- lang: str,
101
- config: Dict[str, str],
102
  ):
103
  if not instruction:
104
  print(lang)
@@ -110,14 +110,14 @@ def construct_prompt(
110
 
111
  zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
112
 
113
- test_data = load_xlsum_data(lang=lang, split="test", limit=100)
 
 
 
 
114
 
115
- print(test_data)
116
- print(num_examples)
117
- print(lang)
118
  ic_examples = []
119
  if not zero_shot:
120
-
121
  ic_examples = choose_few_shot_examples(
122
  train_dataset=test_data,
123
  few_shot_size=num_examples,
@@ -139,7 +139,7 @@ def construct_prompt(
139
  )
140
 
141
  print("lang", lang)
142
- print(config["input"] , lang)
143
  if config["input"] != lang:
144
  test_example = _translate_example(
145
  example=test_example, src_language=lang, target_language=config["input"]
 
1
  from typing import List, Dict, Optional, Union
 
2
  import numpy as np
3
  from datasets import Dataset, load_dataset
4
  from easygoogletranslate import EasyGoogleTranslate
5
  from langchain.prompts import PromptTemplate, FewShotPromptTemplate
6
+ from iso639 import to_iso639_1
7
  LANGUAGE_TO_SUFFIX = {
8
  "chinese_simplified": "zh-CN",
9
  "french": "fr",
 
23
  "persian": "fa",
24
  "azerbaijani": "az",
25
  "korean": "ko",
26
+ "hebrew": "he",
27
  }
28
 
29
+
30
  def choose_few_shot_examples(
31
  train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
32
  ) -> List[Dict[str, Union[str, int]]]:
 
33
  selected_examples = []
34
 
35
  example_idxs = []
 
56
  def _translate_instruction(basic_instruction: str, target_language: str) -> str:
57
  translator = EasyGoogleTranslate(
58
  source_language="en",
59
+ target_language=to_iso639_1(target_language),
60
  timeout=50,
61
  )
62
  return translator.translate(basic_instruction)
63
 
64
 
65
  def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
66
+ translator = EasyGoogleTranslate(source_language=to_iso639_1(str(src_language).capitalize()),
67
+ target_language=to_iso639_1(str(target_language).capitalize()),
68
  timeout=30)
69
  try:
70
  return {'text': translator.translate(example['text']), 'summary': ''}
 
85
  )
86
 
87
 
88
+ def load_xlsum_data(lang, split, limit=5):
89
  """Loads the xlsum dataset"""
90
  dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
91
  return dataset.select(range(limit))
92
 
93
 
94
  def construct_prompt(
95
+ instruction: str,
96
+ test_example: dict,
97
+ zero_shot: bool,
98
+ dataset: str,
99
+ num_examples: int,
100
+ lang: str,
101
+ config: Dict[str, str],
102
  ):
103
  if not instruction:
104
  print(lang)
 
110
 
111
  zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
112
 
113
+ if not zero_shot:
114
+ try:
115
+ test_data = load_xlsum_data(lang=lang, split="test", limit=100)
116
+ except Exception as e:
117
+ raise KeyError(f"{lang} is not supported in XlSum dataset, choose supported language in few-shot")
118
 
 
 
 
119
  ic_examples = []
120
  if not zero_shot:
 
121
  ic_examples = choose_few_shot_examples(
122
  train_dataset=test_data,
123
  few_shot_size=num_examples,
 
139
  )
140
 
141
  print("lang", lang)
142
+ print(config["input"], lang)
143
  if config["input"] != lang:
144
  test_example = _translate_example(
145
  example=test_example, src_language=lang, target_language=config["input"]