# coding=utf-8
# Copyright 2020 HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

import datasets

_DESCRIPTION = """\
Preprocessed dataset for the IWSLT'15 English-Vietnamese machine translation task.
"""

_CITATION = """\
@inproceedings{Luong-Manning:iwslt15,
    Address = {Da Nang, Vietnam},
    Author = {Luong, Minh-Thang and Manning, Christopher D.},
    Booktitle = {International Workshop on Spoken Language Translation},
    Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
    Year = {2015}}
"""

_DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"
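# The URL template expands to one plain-text file per (split, language) pair, e.g.:
#     _DATA_URL.format("train", "en")
#     -> "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en"
#     _DATA_URL.format("tst2013", "vi")
#     -> "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2013.vi"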

# Tuple that describes a single pair of files with matching translations.
# language_to_file is the map from language (2-letter string, e.g. 'en')
# to the file path in the extracted directory.
TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
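# Illustrative instance (this helper type is not instantiated anywhere in this
# script; the shape is shown for reference only):
#     TranslateData(
#         url=_DATA_URL.format("train", "en"),
#         language_to_file={"en": "train.en", "vi": "train.vi"},
#     )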


class MT_Eng_ViConfig(datasets.BuilderConfig):
    """BuilderConfig for MT_Eng_Vietnamese."""

    def __init__(self, language_pair=(None, None), **kwargs):
        """BuilderConfig for MT_Eng_Vi.

        Args:
            language_pair: pair of languages that will be used for translation. Should
                contain 2-letter coded strings. First will be used as source and second
                as target in supervised mode. For example: ("vi", "en").
            **kwargs: keyword arguments forwarded to super.
        """
        description = "Translation dataset from %s to %s" % (language_pair[0], language_pair[1])
        super(MT_Eng_ViConfig, self).__init__(
            description=description,
            version=datasets.Version("1.0.0"),
            **kwargs,
        )
        self.language_pair = language_pair


class MTEngVietnamese(datasets.GeneratorBasedBuilder):
    """English-Vietnamese machine translation dataset from IWSLT 2015."""

    BUILDER_CONFIGS = [
        MT_Eng_ViConfig(
            name="iwslt2015-vi-en",
            language_pair=("vi", "en"),
        ),
        MT_Eng_ViConfig(
            name="iwslt2015-en-vi",
            language_pair=("en", "vi"),
        ),
    ]
    BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
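    # Setting BUILDER_CONFIG_CLASS also lets callers build an ad-hoc config by
    # forwarding config kwargs through ``load_dataset`` (illustrative sketch;
    # assumes this script is saved locally as ``mt_eng_vietnamese.py``):
    #     load_dataset("mt_eng_vietnamese.py", name="custom", language_pair=("en", "vi"))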

    def _info(self):
        source, target = self.config.language_pair
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"translation": datasets.features.Translation(languages=self.config.language_pair)}
            ),
            supervised_keys=(source, target),
            homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
            citation=_CITATION,
        )
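    # Under this schema every generated example is a single nested dict, e.g.
    # (values illustrative):
    #     {"translation": {"en": "Hello.", "vi": "Xin chào."}}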

    def _split_generators(self, dl_manager):
        source, target = self.config.language_pair
        files = {}
        for split in ("train", "dev", "test"):
            if split == "train":
                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
            elif split == "dev":
                # IWSLT'15 uses tst2012 as the validation set.
                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
            elif split == "test":
                # tst2013 serves as the held-out test set.
                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
            files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
        ]

    def _generate_examples(self, source_file, target_file):
        """This function returns the examples in the raw (text) form."""
        with open(source_file, encoding="utf-8") as f:
            source_sentences = f.read().split("\n")
        with open(target_file, encoding="utf-8") as f:
            target_sentences = f.read().split("\n")
        source, target = self.config.language_pair
        for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
            result = {"translation": {source: l1, target: l2}}
            # Make sure that both translations are non-empty.
            if l1 and l2:
                yield idx, result
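

# Usage sketch (assumes this script is saved locally as ``mt_eng_vietnamese.py``
# and that the ``datasets`` library is installed):
#
#     from datasets import load_dataset
#
#     dataset = load_dataset("mt_eng_vietnamese.py", "iwslt2015-en-vi")
#     print(dataset["train"][0])
#     # {"translation": {"en": "...", "vi": "..."}}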