Spaces:

NLPV
/

ISCO-code-predictor-api

Sleeping

App Files Files

NLPV committed on Aug 15, 2024

Commit

858cd2d

verified ·

1 Parent(s): 7040da3

Delete sentence_retrieval_lib.py

Browse files

Files changed (1) hide show

sentence_retrieval_lib.py +0 -166

sentence_retrieval_lib.py DELETED Viewed

@@ -1,166 +0,0 @@
-# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT library to process data for cross lingual sentence retrieval task."""
-import os
-from absl import logging
-from official.nlp.data import classifier_data_lib
-from official.nlp.tools import tokenization
-class BuccProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Xtreme BUCC data set."""
-  supported_languages = ["de", "fr", "ru", "zh"]
-  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
-    """Stores the text preprocessing function and the language list."""
-    super().__init__(process_text_fn)
-    self.languages = BuccProcessor.supported_languages
-  def get_dev_examples(self, data_dir, file_pattern):
-    """Reads the dev file; `file_pattern` is formatted with "dev"."""
-    dev_path = os.path.join(data_dir, file_pattern.format("dev"))
-    rows = self._read_tsv(dev_path)
-    # Dev examples are tagged with the "sample" guid prefix.
-    return self._create_examples(rows, "sample")
-  def get_test_examples(self, data_dir, file_pattern):
-    """Reads the test file; `file_pattern` is formatted with "test"."""
-    test_path = os.path.join(data_dir, file_pattern.format("test"))
-    rows = self._read_tsv(test_path)
-    return self._create_examples(rows, "test")
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "BUCC"
-  def _create_examples(self, lines, set_type):
-    """Creates one example per input row.
-    Column 0 of each row has the form "<prefix>-<number>"; the number becomes
-    the example id. Column 1 holds the sentence text.
-    """
-    # Keyword order keeps the id parsed before the text is processed.
-    return [
-        classifier_data_lib.InputExample(
-            guid=f"{set_type}-{row_index}",
-            example_id=int(row[0].split("-")[1]),
-            text_a=self.process_text_fn(row[1]))
-        for row_index, row in enumerate(lines)
-    ]
-class TatoebaProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Xtreme Tatoeba data set."""
-  supported_languages = [
-      "af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
-      "he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
-      "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
-  ]
-  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
-    """Stores the text preprocessing function and the language list."""
-    super().__init__(process_text_fn)
-    self.languages = TatoebaProcessor.supported_languages
-  def get_test_examples(self, data_dir, file_path):
-    """Reads the test file located at `file_path` inside `data_dir`."""
-    test_path = os.path.join(data_dir, file_path)
-    rows = self._read_tsv(test_path)
-    return self._create_examples(rows, "test")
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "TATOEBA"
-  def _create_examples(self, lines, set_type):
-    """Creates one example per input row.
-    Column 0 of each row holds the sentence text; the row index is used as
-    the example id.
-    """
-    return [
-        classifier_data_lib.InputExample(
-            guid=f"{set_type}-{row_index}",
-            text_a=self.process_text_fn(row[0]),
-            example_id=row_index)
-        for row_index, row in enumerate(lines)
-    ]
-def generate_sentence_retrevial_tf_record(processor,
-                                          data_dir,
-                                          tokenizer,
-                                          eval_data_output_path=None,
-                                          test_data_output_path=None,
-                                          max_seq_length=128):
-  """Generates the tf records for retrieval tasks.
-  For every language `lang` in `processor.languages`, both the "lang-en.lang"
-  and the "lang-en.en" inputs are read and written out as tf record files
-  named "<lang>-en-<side>.<split>.tfrecords".
-  Args:
-    processor: Input processor object to be used for generating data. Subclass
-      of `DataProcessor` whose `get_processor_name()` returns "BUCC" or
-      "TATOEBA".
-    data_dir: Directory that contains the eval/test data files to process.
-    tokenizer: The tokenizer to be applied on the data.
-    eval_data_output_path: Directory in which the processed tf records for
-      evaluation will be saved. Skipped when empty.
-    test_data_output_path: Directory in which the processed tf records for
-      testing will be saved. Skipped when empty.
-    max_seq_length: Maximum sequence length of the to be generated
-      eval/test data.
-  Returns:
-    A dictionary containing input meta data: the processor type, the maximum
-    sequence length, and the number of eval/test examples keyed by
-    "<lang>-en.<side>".
-  Raises:
-    ValueError: If the processor name is neither "BUCC" nor "TATOEBA".
-  """
-  assert eval_data_output_path or test_data_output_path, (
-      "Either eval_data_output_path or test_data_output_path must be set.")
-  processor_name = processor.get_processor_name()
-  # The BUCC pattern keeps one "{}" placeholder after the first format call;
-  # the processor fills it in with the split name ("dev" or "test").
-  path_patterns = {
-      "BUCC": "{}-en.{{}}.{}",
-      "TATOEBA": "{}-en.{}",
-  }
-  if processor_name not in path_patterns:
-    # Fail early instead of hitting an unbound `path_pattern` inside the loop.
-    raise ValueError(
-        "Unsupported processor %r; expected one of %s." %
-        (processor_name, sorted(path_patterns)))
-  path_pattern = path_patterns[processor_name]
-  meta_data = {
-      "processor_type": processor_name,
-      "max_seq_length": max_seq_length,
-      "number_eval_data": {},
-      "number_test_data": {},
-  }
-  logging.info("Start to process %s task data", processor_name)
-  for lang_a in processor.languages:
-    for lang_b in [lang_a, "en"]:
-      input_file = path_pattern.format(lang_a, lang_b)
-      pair_key = f"{lang_a}-en.{lang_b}"
-      if eval_data_output_path:
-        eval_input_data_examples = processor.get_dev_examples(
-            data_dir, input_file)
-        num_eval_data = len(eval_input_data_examples)
-        logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
-                     lang_a, lang_b)
-        output_file = os.path.join(
-            eval_data_output_path,
-            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
-        classifier_data_lib.file_based_convert_examples_to_features(
-            eval_input_data_examples, None, max_seq_length, tokenizer,
-            output_file, None)
-        meta_data["number_eval_data"][pair_key] = num_eval_data
-      if test_data_output_path:
-        test_input_data_examples = processor.get_test_examples(
-            data_dir, input_file)
-        num_test_data = len(test_input_data_examples)
-        logging.info("Processing %d test examples of %s-en.%s", num_test_data,
-                     lang_a, lang_b)
-        output_file = os.path.join(
-            test_data_output_path,
-            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
-        classifier_data_lib.file_based_convert_examples_to_features(
-            test_input_data_examples, None, max_seq_length, tokenizer,
-            output_file, None)
-        meta_data["number_test_data"][pair_key] = num_test_data
-  return meta_data