portalniy-dev committed on
Commit
72c1ae2
·
verified ·
1 Parent(s): a0a643a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -4
app.py CHANGED
@@ -9,7 +9,7 @@ dataset_names = {
9
  'ag_news': None,
10
  'squad': None,
11
  'cnn_dailymail': '1.0.0', # Specify configuration for cnn_dailymail
12
- 'wiki40b': 'en' # Specify language for wiki40b
13
  }
14
 
15
  # Global variables for model and tokenizer
@@ -22,11 +22,21 @@ def load_and_prepare_datasets():
22
  for name, config in dataset_names.items():
23
  datasets.append(load_dataset(name, config))
24
 
 
 
 
 
 
 
 
 
 
 
25
  # Concatenate train datasets only for training
26
- train_dataset = concatenate_datasets([ds['train'] for ds in datasets if 'train' in ds])
27
 
28
- # Use only a subset for evaluation if needed
29
- eval_dataset = concatenate_datasets([ds['test'] for ds in datasets if 'test' in ds])
30
 
31
  return train_dataset, eval_dataset
32
 
 
9
  'ag_news': None,
10
  'squad': None,
11
  'cnn_dailymail': '1.0.0', # Specify configuration for cnn_dailymail
12
+ 'wiki40b': 'ru' # Specify language for wiki40b
13
  }
14
 
15
  # Global variables for model and tokenizer
 
22
  for name, config in dataset_names.items():
23
  datasets.append(load_dataset(name, config))
24
 
25
+ # Extract only the 'text' field from each dataset for training
26
+ train_datasets = []
27
+ eval_datasets = []
28
+
29
+ for ds in datasets:
30
+ if 'train' in ds:
31
+ train_datasets.append(ds['train'].map(lambda x: {'text': x['text']}))
32
+ if 'test' in ds:
33
+ eval_datasets.append(ds['test'].map(lambda x: {'text': x['text']}))
34
+
35
  # Concatenate train datasets only for training
36
+ train_dataset = concatenate_datasets(train_datasets)
37
 
38
+ # Concatenate eval datasets only for evaluation
39
+ eval_dataset = concatenate_datasets(eval_datasets)
40
 
41
  return train_dataset, eval_dataset
42