# transformers/models/rag/configuration_rag.py
# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings

RAG_CONFIG_DOC = r"""
    [`RagConfig`] stores the configuration of a *RagModel*. Configuration objects inherit from [`PretrainedConfig`] and
    can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        title_sep (`str`, *optional*, defaults to `" / "`):
            Separator inserted between the title and the text of the retrieved document when calling [`RagRetriever`].
        doc_sep (`str`, *optional*, defaults to `" // "`):
            Separator inserted between the text of the retrieved document and the original input when calling
            [`RagRetriever`].
        n_docs (`int`, *optional*, defaults to 5):
            Number of documents to retrieve.
        max_combined_length (`int`, *optional*, defaults to 300):
            Max length of contextualized input returned by [`~RagRetriever.__call__`].
        retrieval_vector_size (`int`, *optional*, defaults to 768):
            Dimensionality of the document embeddings indexed by [`RagRetriever`].
        retrieval_batch_size (`int`, *optional*, defaults to 8):
            Retrieval batch size, defined as the number of queries issued concurrently to the faiss index
            encapsulated by [`RagRetriever`].
        dataset (`str`, *optional*, defaults to `"wiki_dpr"`):
            A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
            using `datasets.list_datasets()`).
        dataset_split (`str`, *optional*, defaults to `"train"`):
            Which split of the `dataset` to load.
        index_name (`str`, *optional*, defaults to `"compressed"`):
            The index name of the index associated with the `dataset`. One can choose between `"legacy"`, `"exact"` and
            `"compressed"`.
        index_path (`str`, *optional*):
            The path to the serialized faiss index on disk.
        passages_path (`str`, *optional*):
            A path to text passages compatible with the faiss index. Required if using
            [`~models.rag.retrieval_rag.LegacyIndex`].
        use_dummy_dataset (`bool`, *optional*, defaults to `False`):
            Whether to load a "dummy" variant of the dataset specified by `dataset`.
        label_smoothing (`float`, *optional*, defaults to 0.0):
            Only relevant if `return_loss` is set to `True`. Controls the `epsilon` parameter value for label smoothing
            in the loss calculation. If set to 0, no label smoothing is performed.
        do_marginalize (`bool`, *optional*, defaults to `False`):
            If `True`, the logits are marginalized over all documents by making use of
            `torch.nn.functional.log_softmax`.
        reduce_loss (`bool`, *optional*, defaults to `False`):
            Whether or not to reduce the NLL loss using the `torch.Tensor.sum` operation.
        do_deduplication (`bool`, *optional*, defaults to `True`):
            Whether or not to deduplicate the generations from different context documents for a given input. Has to be
            set to `False` if used while training with a distributed backend.
        exclude_bos_score (`bool`, *optional*, defaults to `False`):
            Whether or not to disregard the BOS token when computing the loss.
        output_retrieved (`bool`, *optional*, defaults to `False`):
            If set to `True`, `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask` are returned. See returned tensors for more detail.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        forced_eos_token_id (`int`, *optional*):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.
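
    Example (a minimal usage sketch; it assumes the public `facebook/rag-token-nq` checkpoint and simply overrides
    two of the retrieval defaults described above):

    ```python
    >>> from transformers import RagConfig

    >>> # Load the configuration of a pretrained RAG model and tweak its retrieval settings.
    >>> config = RagConfig.from_pretrained("facebook/rag-token-nq")
    >>> config.n_docs = 10
    >>> config.max_combined_length = 400
    ```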
""" | |


@add_start_docstrings(RAG_CONFIG_DOC)
class RagConfig(PretrainedConfig):
    model_type = "rag"
    is_composition = True

    def __init__(
        self,
        vocab_size=None,
        is_encoder_decoder=True,
        prefix=None,
        bos_token_id=None,
        pad_token_id=None,
        eos_token_id=None,
        decoder_start_token_id=None,
        title_sep=" / ",
        doc_sep=" // ",
        n_docs=5,
        max_combined_length=300,
        retrieval_vector_size=768,
        retrieval_batch_size=8,
        dataset="wiki_dpr",
        dataset_split="train",
        index_name="compressed",
        index_path=None,
        passages_path=None,
        use_dummy_dataset=False,
        reduce_loss=False,
        label_smoothing=0.0,
        do_deduplication=True,
        exclude_bos_score=False,
        do_marginalize=False,
        output_retrieved=False,
        use_cache=True,
        forced_eos_token_id=None,
        dataset_revision=None,
        **kwargs,
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            prefix=prefix,
            vocab_size=vocab_size,
            **kwargs,
        )
        assert (
            "question_encoder" in kwargs and "generator" in kwargs
        ), "Config has to be initialized with question_encoder and generator config"

        # RAG is a composite model: pop the two sub-configs out of the remaining kwargs and
        # rebuild them as full config objects from their respective `model_type`.
        question_encoder_config = kwargs.pop("question_encoder")
        question_encoder_model_type = question_encoder_config.pop("model_type")
        decoder_config = kwargs.pop("generator")
        decoder_model_type = decoder_config.pop("model_type")

        from ..auto.configuration_auto import AutoConfig

        self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
        self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)

        self.reduce_loss = reduce_loss
        self.label_smoothing = label_smoothing
        self.exclude_bos_score = exclude_bos_score
        self.do_marginalize = do_marginalize

        self.title_sep = title_sep
        self.doc_sep = doc_sep
        self.n_docs = n_docs
        self.max_combined_length = max_combined_length

        self.dataset = dataset
        self.dataset_split = dataset_split
        self.index_name = index_name

        self.retrieval_vector_size = retrieval_vector_size
        self.retrieval_batch_size = retrieval_batch_size
        self.passages_path = passages_path
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
        self.dataset_revision = dataset_revision

        self.output_retrieved = output_retrieved
        self.do_deduplication = do_deduplication
        self.use_cache = use_cache

        if self.forced_eos_token_id is None:
            self.forced_eos_token_id = getattr(self.generator, "forced_eos_token_id", None)

    @classmethod
    def from_question_encoder_generator_configs(
        cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`RagConfig`] (or a derived class) from a pre-trained question encoder model configuration and a
        generator model configuration.

        Returns:
            [`RagConfig`]: An instance of a configuration object
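
        Examples (a minimal sketch; it builds default-initialized `DPRConfig` and `BartConfig` objects locally rather
        than loading pretrained ones):

        ```python
        >>> from transformers import BartConfig, DPRConfig, RagConfig

        >>> # Compose a RAG configuration from a DPR question encoder config and a BART generator config.
        >>> question_encoder_config = DPRConfig()
        >>> generator_config = BartConfig()
        >>> config = RagConfig.from_question_encoder_generator_configs(question_encoder_config, generator_config)
        >>> config.generator.model_type
        'bart'
        ```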
""" | |
return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs) | |