# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT library to process data for cross lingual sentence retrieval task.""" | |
import os | |
from absl import logging | |
from official.nlp.data import classifier_data_lib | |
from official.nlp.tools import tokenization | |
class BuccProcessor(classifier_data_lib.DataProcessor):
  """Processor for the XTREME BUCC data set."""
  supported_languages = ["de", "fr", "ru", "zh"]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(BuccProcessor, self).__init__(process_text_fn)
    self.languages = BuccProcessor.supported_languages

  def get_dev_examples(self, data_dir, file_pattern):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_pattern.format("dev"))),
        "sample")

  def get_test_examples(self, data_dir, file_pattern):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_pattern.format("test"))),
        "test")

  @staticmethod
  def get_processor_name():
    """See base class."""
    return "BUCC"
  def _create_examples(self, lines, set_type):
    """Creates examples for the dev and test sets."""
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      # BUCC ids look like "<lang>-<number>"; keep only the numeric part.
      example_id = int(line[0].split("-")[1])
      text_a = self.process_text_fn(line[1])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, example_id=example_id))
    return examples
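

# Example (hypothetical data): each BUCC TSV row carries an id column and a
# text column, e.g. ["de-000001", "Ein Beispielsatz."]; such a row becomes
# InputExample(guid="sample-0", text_a="Ein Beispielsatz.", example_id=1).

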
class TatoebaProcessor(classifier_data_lib.DataProcessor):
  """Processor for the XTREME Tatoeba data set."""
  supported_languages = [
      "af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
      "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
  ]

  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
    super(TatoebaProcessor, self).__init__(process_text_fn)
    self.languages = TatoebaProcessor.supported_languages

  def get_test_examples(self, data_dir, file_path):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, file_path)), "test")

  @staticmethod
  def get_processor_name():
    """See base class."""
    return "TATOEBA"
  def _create_examples(self, lines, set_type):
    """Creates examples for the test set."""
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, i)
      text_a = self.process_text_fn(line[0])
      examples.append(
          classifier_data_lib.InputExample(
              guid=guid, text_a=text_a, example_id=i))
    return examples
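

# Usage sketch (the data directory and file name below are hypothetical;
# XTREME Tatoeba files are named "<lang>-en.<lang>" / "<lang>-en.en"):
#
#   processor = TatoebaProcessor()
#   examples = processor.get_test_examples("/path/to/tatoeba", "de-en.de")

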
def generate_sentence_retrevial_tf_record(processor,
                                          data_dir,
                                          tokenizer,
                                          eval_data_output_path=None,
                                          test_data_output_path=None,
                                          max_seq_length=128):
  """Generates the tf records for retrieval tasks.

  Args:
    processor: Input processor object to be used for generating data. Subclass
      of `DataProcessor`.
    data_dir: Directory that contains the eval/test data to process. Data
      files should follow the naming scheme the processor expects:
      "<lang>-en.<split>.<lang>" for BUCC and "<lang>-en.<lang>" for Tatoeba.
    tokenizer: The tokenizer to be applied on the data.
    eval_data_output_path: Directory to which the processed tf records for
      evaluation will be saved.
    test_data_output_path: Directory to which the processed tf records for
      testing will be saved.
    max_seq_length: Maximum sequence length of the generated training/eval
      data.

  Returns:
    A dictionary containing input meta data.
  """
  assert eval_data_output_path or test_data_output_path
  if processor.get_processor_name() == "BUCC":
    path_pattern = "{}-en.{{}}.{}"
  elif processor.get_processor_name() == "TATOEBA":
    path_pattern = "{}-en.{}"
  else:
    raise ValueError(
        "Unsupported processor: %s" % processor.get_processor_name())

  meta_data = {
      "processor_type": processor.get_processor_name(),
      "max_seq_length": max_seq_length,
      "number_eval_data": {},
      "number_test_data": {},
  }
  logging.info("Starting to process %s task data",
               processor.get_processor_name())
  for lang_a in processor.languages:
    # Each corpus pairs lang_a with English; process both sides of the pair.
    for lang_b in [lang_a, "en"]:
      if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(
            data_dir, path_pattern.format(lang_a, lang_b))
        num_eval_data = len(eval_input_data_examples)
        logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            eval_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
        classifier_data_lib.file_based_convert_examples_to_features(
            eval_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data
      if test_data_output_path:
        test_input_data_examples = processor.get_test_examples(
            data_dir, path_pattern.format(lang_a, lang_b))
        num_test_data = len(test_input_data_examples)
        logging.info("Processing %d test examples of %s-en.%s", num_test_data,
                     lang_a, lang_b)
        output_file = os.path.join(
            test_data_output_path,
            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
        classifier_data_lib.file_based_convert_examples_to_features(
            test_input_data_examples, None, max_seq_length, tokenizer,
            output_file, None)
        meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data
  return meta_data
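

# A minimal usage sketch (all paths below are hypothetical; `FullTokenizer`
# is the standard BERT tokenizer from `official.nlp.tools.tokenization`):
#
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file="/path/to/vocab.txt", do_lower_case=False)
#   meta_data = generate_sentence_retrevial_tf_record(
#       BuccProcessor(),
#       data_dir="/path/to/bucc",
#       tokenizer=tokenizer,
#       eval_data_output_path="/tmp/bucc_eval",
#       test_data_output_path="/tmp/bucc_test",
#       max_seq_length=128)
#
# meta_data["number_eval_data"] then maps keys such as "de-en.de" and
# "de-en.en" to the number of examples written for that file.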