Pradeep Kumar committed on
Commit 4061739 · verified · 1 Parent(s): 329d378

Delete tagging_data_lib_test.py

Files changed (1)
  1. tagging_data_lib_test.py +0 -108
tagging_data_lib_test.py DELETED
@@ -1,108 +0,0 @@
- # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Tests for official.nlp.data.tagging_data_lib."""
- import os
- import random
-
- from absl.testing import parameterized
- import tensorflow as tf, tf_keras
-
- from official.nlp.data import tagging_data_lib
- from official.nlp.tools import tokenization
-
-
- def _create_fake_file(filename, labels, is_test):
-
-   def write_one_sentence(writer, length):
-     for _ in range(length):
-       line = "hiworld"
-       if not is_test:
-         line += "\t%s" % (labels[random.randint(0, len(labels) - 1)])
-       writer.write(line + "\n")
-
-   # Writes two sentences with length of 3 and 12 respectively.
-   with tf.io.gfile.GFile(filename, "w") as writer:
-     write_one_sentence(writer, 3)
-     writer.write("\n")
-     write_one_sentence(writer, 12)
-
-
- class TaggingDataLibTest(tf.test.TestCase, parameterized.TestCase):
-
-   def setUp(self):
-     super(TaggingDataLibTest, self).setUp()
-
-     self.processors = {
-         "panx": tagging_data_lib.PanxProcessor,
-         "udpos": tagging_data_lib.UdposProcessor,
-     }
-     self.vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
-     with tf.io.gfile.GFile(self.vocab_file, "w") as writer:
-       writer.write("\n".join(["[CLS]", "[SEP]", "hi", "##world", "[UNK]"]))
-
-   @parameterized.parameters(
-       {"task_type": "panx"},
-       {"task_type": "udpos"},
-   )
-   def test_generate_tf_record(self, task_type):
-     processor = self.processors[task_type]()
-     input_data_dir = os.path.join(self.get_temp_dir(), task_type)
-     tf.io.gfile.mkdir(input_data_dir)
-     # Write fake train file.
-     _create_fake_file(
-         os.path.join(input_data_dir, "train-en.tsv"),
-         processor.get_labels(),
-         is_test=False)
-
-     # Write fake dev file.
-     _create_fake_file(
-         os.path.join(input_data_dir, "dev-en.tsv"),
-         processor.get_labels(),
-         is_test=False)
-
-     # Write fake test files.
-     for lang in processor.supported_languages:
-       _create_fake_file(
-           os.path.join(input_data_dir, "test-%s.tsv" % lang),
-           processor.get_labels(),
-           is_test=True)
-
-     output_path = os.path.join(self.get_temp_dir(), task_type, "output")
-     tokenizer = tokenization.FullTokenizer(
-         vocab_file=self.vocab_file, do_lower_case=True)
-     metadata = tagging_data_lib.generate_tf_record_from_data_file(
-         processor,
-         input_data_dir,
-         tokenizer,
-         max_seq_length=8,
-         train_data_output_path=os.path.join(output_path, "train.tfrecord"),
-         eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
-         test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
-         text_preprocessing=tokenization.convert_to_unicode)
-
-     self.assertEqual(metadata["train_data_size"], 5)
-     files = tf.io.gfile.glob(output_path + "/*")
-     expected_files = []
-     expected_files.append(os.path.join(output_path, "train.tfrecord"))
-     expected_files.append(os.path.join(output_path, "eval.tfrecord"))
-     for lang in processor.supported_languages:
-       expected_files.append(
-           os.path.join(output_path, "test_%s.tfrecord" % lang))
-
-     self.assertCountEqual(files, expected_files)
-
-
- if __name__ == "__main__":
-   tf.test.main()