Pradeep Kumar committed (verified)
Commit d23a681 · 1 Parent(s): 0de7ffc

Delete tagging_data_lib.py

Files changed (1): tagging_data_lib.py +0 -426
tagging_data_lib.py DELETED
@@ -1,426 +0,0 @@
-# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Library to process data for tagging task such as NER/POS."""
-import collections
-import os
-
-from absl import logging
-import tensorflow as tf, tf_keras
-
-from official.nlp.data import classifier_data_lib
-from official.nlp.tools import tokenization
-
-# A negative label id for the padding label, which will not contribute
-# to loss/metrics in training.
-_PADDING_LABEL_ID = -1
-
-# The special unknown token, used to substitute a word which has too many
-# subwords after tokenization.
-_UNK_TOKEN = "[UNK]"
-
-
-class InputExample(object):
-  """A single training/test example for token classification."""
-
-  def __init__(self,
-               sentence_id,
-               sub_sentence_id=0,
-               words=None,
-               label_ids=None):
-    """Constructs an InputExample."""
-    self.sentence_id = sentence_id
-    self.sub_sentence_id = sub_sentence_id
-    self.words = words if words else []
-    self.label_ids = label_ids if label_ids else []
-
-  def add_word_and_label_id(self, word, label_id):
-    """Adds word and label_id pair in the example."""
-    self.words.append(word)
-    self.label_ids.append(label_id)
-
-
-def _read_one_file(file_name, label_list):
-  """Reads one file and returns a list of `InputExample` instances."""
-  lines = tf.io.gfile.GFile(file_name, "r").readlines()
-  examples = []
-  label_id_map = {label: i for i, label in enumerate(label_list)}
-  sentence_id = 0
-  example = InputExample(sentence_id=0)
-  for line in lines:
-    line = line.strip("\n")
-    if line:
-      # The format is: <token>\t<label> for train/dev set and <token> for test.
-      items = line.split("\t")
-      assert len(items) == 2 or len(items) == 1
-      token = items[0].strip()
-
-      # Assign a dummy label_id for test set
-      label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
-      example.add_word_and_label_id(token, label_id)
-    else:
-      # Empty line indicates a new sentence.
-      if example.words:
-        examples.append(example)
-        sentence_id += 1
-        example = InputExample(sentence_id=sentence_id)
-
-  if example.words:
-    examples.append(example)
-  return examples
-
-
-class PanxProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Panx data set."""
-  supported_languages = [
-      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
-      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
-      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
-      "tr", "et", "fi", "hu"
-  ]
-
-  def __init__(self,
-               process_text_fn=tokenization.convert_to_unicode,
-               only_use_en_train=True,
-               only_use_en_dev=True):
-    """See base class.
-
-    Args:
-      process_text_fn: See base class.
-      only_use_en_train: If True, only use english training data. Otherwise, use
-        training data from all languages.
-      only_use_en_dev: If True, only use english dev data. Otherwise, use dev
-        data from all languages.
-    """
-    super(PanxProcessor, self).__init__(process_text_fn)
-    self.only_use_en_train = only_use_en_train
-    self.only_use_en_dev = only_use_en_dev
-
-  def get_train_examples(self, data_dir):
-    examples = _read_one_file(
-        os.path.join(data_dir, "train-en.tsv"), self.get_labels())
-    if not self.only_use_en_train:
-      for language in self.supported_languages:
-        if language == "en":
-          continue
-        examples.extend(
-            _read_one_file(
-                os.path.join(data_dir, f"train-{language}.tsv"),
-                self.get_labels()))
-    return examples
-
-  def get_dev_examples(self, data_dir):
-    examples = _read_one_file(
-        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
-    if not self.only_use_en_dev:
-      for language in self.supported_languages:
-        if language == "en":
-          continue
-        examples.extend(
-            _read_one_file(
-                os.path.join(data_dir, f"dev-{language}.tsv"),
-                self.get_labels()))
-    return examples
-
-  def get_test_examples(self, data_dir):
-    examples_dict = {}
-    for language in self.supported_languages:
-      examples_dict[language] = _read_one_file(
-          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
-    return examples_dict
-
-  def get_labels(self):
-    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
-
-  @staticmethod
-  def get_processor_name():
-    return "panx"
-
-
-class UdposProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Udpos data set."""
-  supported_languages = [
-      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
-      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
-      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
-  ]
-
-  def __init__(self,
-               process_text_fn=tokenization.convert_to_unicode,
-               only_use_en_train=True,
-               only_use_en_dev=True):
-    """See base class.
-
-    Args:
-      process_text_fn: See base class.
-      only_use_en_train: If True, only use english training data. Otherwise, use
-        training data from all languages.
-      only_use_en_dev: If True, only use english dev data. Otherwise, use dev
-        data from all languages.
-    """
-    super(UdposProcessor, self).__init__(process_text_fn)
-    self.only_use_en_train = only_use_en_train
-    self.only_use_en_dev = only_use_en_dev
-
-  def get_train_examples(self, data_dir):
-    if self.only_use_en_train:
-      examples = _read_one_file(
-          os.path.join(data_dir, "train-en.tsv"), self.get_labels())
-    else:
-      examples = []
-      # Uses glob because some languages are missing in train.
-      for filepath in tf.io.gfile.glob(os.path.join(data_dir, "train-*.tsv")):
-        examples.extend(
-            _read_one_file(
-                filepath,
-                self.get_labels()))
-    return examples
-
-  def get_dev_examples(self, data_dir):
-    if self.only_use_en_dev:
-      examples = _read_one_file(
-          os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
-    else:
-      examples = []
-      for filepath in tf.io.gfile.glob(os.path.join(data_dir, "dev-*.tsv")):
-        examples.extend(
-            _read_one_file(
-                filepath,
-                self.get_labels()))
-    return examples
-
-  def get_test_examples(self, data_dir):
-    examples_dict = {}
-    for language in self.supported_languages:
-      examples_dict[language] = _read_one_file(
-          os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
-    return examples_dict
-
-  def get_labels(self):
-    return [
-        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
-        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
-    ]
-
-  @staticmethod
-  def get_processor_name():
-    return "udpos"
-
-
-def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
-  """Tokenizes words and breaks long example into short ones."""
-  # Needs additional [CLS] and [SEP] tokens.
-  max_length = max_length - 2
-  new_examples = []
-  new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
-  if any([x < 0 for x in example.label_ids]):
-    raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
-
-  for i, word in enumerate(example.words):
-    if text_preprocessing:
-      word = text_preprocessing(word)
-    subwords = tokenizer.tokenize(word)
-    if (not subwords or len(subwords) > max_length) and word:
-      subwords = [_UNK_TOKEN]
-
-    if len(subwords) + len(new_example.words) > max_length:
-      # Start a new example.
-      new_examples.append(new_example)
-      last_sub_sentence_id = new_example.sub_sentence_id
-      new_example = InputExample(
-          sentence_id=example.sentence_id,
-          sub_sentence_id=last_sub_sentence_id + 1)
-
-    for j, subword in enumerate(subwords):
-      # Use the real label for the first subword, and pad label for
-      # the remainings.
-      subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
-      new_example.add_word_and_label_id(subword, subword_label)
-
-  if new_example.words:
-    new_examples.append(new_example)
-
-  return new_examples
-
-
-def _convert_single_example(example, max_seq_length, tokenizer):
-  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
-  tokens = ["[CLS]"]
-  tokens.extend(example.words)
-  tokens.append("[SEP]")
-  input_ids = tokenizer.convert_tokens_to_ids(tokens)
-  label_ids = [_PADDING_LABEL_ID]
-  label_ids.extend(example.label_ids)
-  label_ids.append(_PADDING_LABEL_ID)
-
-  segment_ids = [0] * len(input_ids)
-  input_mask = [1] * len(input_ids)
-
-  # Pad up to the sequence length.
-  while len(input_ids) < max_seq_length:
-    input_ids.append(0)
-    input_mask.append(0)
-    segment_ids.append(0)
-    label_ids.append(_PADDING_LABEL_ID)
-
-  def create_int_feature(values):
-    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-
-  features = collections.OrderedDict()
-  features["input_ids"] = create_int_feature(input_ids)
-  features["input_mask"] = create_int_feature(input_mask)
-  features["segment_ids"] = create_int_feature(segment_ids)
-  features["label_ids"] = create_int_feature(label_ids)
-  features["sentence_id"] = create_int_feature([example.sentence_id])
-  features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])
-
-  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-  return tf_example
-
-
-def write_example_to_file(examples,
-                          tokenizer,
-                          max_seq_length,
-                          output_file,
-                          text_preprocessing=None):
-  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.
-
-  Note that the words inside each example will be tokenized and be applied by
-  `text_preprocessing` if available. Also, if the length of sentence (plus
-  special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
-  will be broken into multiple short examples. For example:
-
-  Example (text_preprocessing=lowercase, max_seq_length=5)
-  words:        ["What", "a", "great", "weekend"]
-  labels:       [     7,   5,       9,        10]
-  sentence_id:  0
-  preprocessed: ["what", "a", "great", "weekend"]
-  tokenized:    ["what", "a", "great", "week", "##end"]
-
-  will result in two tf.example protos:
-
-  tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
-  label_ids:   [    -1,      7,   5,       9,      -1]
-  input_mask:  [     1,      1,   1,       1,       1]
-  segment_ids: [     0,      0,   0,       0,       0]
-  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
-  sentence_id: 0
-
-  tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
-  label_ids:   [    -1,      10,      -1,      -1,      -1]
-  input_mask:  [     1,       1,       1,       0,       0]
-  segment_ids: [     0,       0,       0,       0,       0]
-  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
-  sentence_id: 0
-
-  Note the use of -1 in `label_ids` to indicate that a token should not be
-  considered for classification (e.g., trailing ## wordpieces or special
-  token). Token classification models should accordingly ignore these when
-  calculating loss, metrics, etc...
-
-  Args:
-    examples: A list of `InputExample` instances.
-    tokenizer: The tokenizer to be applied on the data.
-    max_seq_length: Maximum length of generated sequences.
-    output_file: The name of the output tfrecord file.
-    text_preprocessing: optional preprocessing run on each word prior to
-      tokenization.
-
-  Returns:
-    The total number of tf.train.Example proto written to file.
-  """
-  tf.io.gfile.makedirs(os.path.dirname(output_file))
-  writer = tf.io.TFRecordWriter(output_file)
-  num_tokenized_examples = 0
-  for (ex_index, example) in enumerate(examples):
-    if ex_index % 10000 == 0:
-      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
-                   output_file)
-
-    tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
-                                           text_preprocessing)
-    num_tokenized_examples += len(tokenized_examples)
-    for per_tokenized_example in tokenized_examples:
-      tf_example = _convert_single_example(per_tokenized_example,
-                                           max_seq_length, tokenizer)
-      writer.write(tf_example.SerializeToString())
-
-  writer.close()
-  return num_tokenized_examples
-
-
-def token_classification_meta_data(train_data_size,
-                                   max_seq_length,
-                                   num_labels,
-                                   eval_data_size=None,
-                                   test_data_size=None,
-                                   label_list=None,
-                                   processor_type=None):
-  """Creates metadata for tagging (token classification) datasets."""
-  meta_data = {
-      "train_data_size": train_data_size,
-      "max_seq_length": max_seq_length,
-      "num_labels": num_labels,
-      "task_type": "tagging",
-      "label_type": "int",
-      "label_shape": [max_seq_length],
-  }
-  if eval_data_size:
-    meta_data["eval_data_size"] = eval_data_size
-  if test_data_size:
-    meta_data["test_data_size"] = test_data_size
-  if label_list:
-    meta_data["label_list"] = label_list
-  if processor_type:
-    meta_data["processor_type"] = processor_type
-
-  return meta_data
-
-
-def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
-                                      max_seq_length, train_data_output_path,
-                                      eval_data_output_path,
-                                      test_data_output_path,
-                                      text_preprocessing):
-  """Generates tfrecord files from the raw data."""
-  common_kwargs = dict(
-      tokenizer=tokenizer,
-      max_seq_length=max_seq_length,
-      text_preprocessing=text_preprocessing)
-  train_examples = processor.get_train_examples(data_dir)
-  train_data_size = write_example_to_file(
-      train_examples, output_file=train_data_output_path, **common_kwargs)
-
-  eval_examples = processor.get_dev_examples(data_dir)
-  eval_data_size = write_example_to_file(
-      eval_examples, output_file=eval_data_output_path, **common_kwargs)
-
-  test_input_data_examples = processor.get_test_examples(data_dir)
-  test_data_size = {}
-  for language, examples in test_input_data_examples.items():
-    test_data_size[language] = write_example_to_file(
-        examples,
-        output_file=test_data_output_path.format(language),
-        **common_kwargs)
-
-  labels = processor.get_labels()
-  meta_data = token_classification_meta_data(
-      train_data_size,
-      max_seq_length,
-      len(labels),
-      eval_data_size,
-      test_data_size,
-      label_list=labels,
-      processor_type=processor.get_processor_name())
-  return meta_data