# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
from collections import Counter
from dataclasses import dataclass

import datasets

import evaluate


logger = evaluate.logging.get_logger(__name__)


_DESCRIPTION = """
Returns the fraction of duplicate strings in the input, and optionally the duplicated strings and their counts.
"""

_KWARGS_DESCRIPTION = """
Args:
    `data`: a list of `str` to be checked for duplicates.

Returns:
    `duplicate_fraction` (`float`): the fraction of strings that are duplicated.
    `duplicates_dict` (`dict`, optional): a dictionary mapping each duplicated string to the number of times it is repeated. Only returned if `list_duplicates=True`.

Examples:
    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results = duplicates.compute(data=data)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337}

    >>> data = ["hello sun", "hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results = duplicates.compute(data=data, list_duplicates=True)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
"""
# TODO: Add BibTeX citation
_CITATION = ""


def get_hash(example):
    """Get the md5 hash of a string."""
    return hashlib.md5(example.strip().encode("utf-8")).hexdigest()


@dataclass
class TextDuplicatesConfig(evaluate.info.Config):
    name: str = "default"

    list_duplicates: bool = False


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextDuplicates(evaluate.Measurement):
    """This measurement returns the duplicate strings contained in the input(s)."""

    CONFIG_CLASS = TextDuplicatesConfig
    ALLOWED_CONFIG_NAMES = ["default"]

    def _info(self, config):
        return evaluate.MeasurementInfo(
            # This is the description that will appear on the modules page.
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            config=config,
            # This defines the format of the module's input: a single string column.
            features=datasets.Features(
                {
                    "data": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, data):
        """Compute the fraction of duplicated strings and, if requested, the duplicates and their counts."""
        # Hash each (stripped) string so exact duplicates collapse to a single entry.
        n_dedup = len({get_hash(d) for d in data})
        duplicate_fraction = 1 - (n_dedup / len(data))

        if self.config.list_duplicates:
            logger.warning("This functionality can be memory-intensive for large datasets!")
            counts = Counter(data)
            duplicates = {k: v for k, v in counts.items() if v > 1}
            return {"duplicate_fraction": duplicate_fraction, "duplicates_dict": duplicates}

        return {"duplicate_fraction": duplicate_fraction}
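

if __name__ == "__main__":
    # Minimal usage sketch, not part of the measurement itself. It assumes the
    # `evaluate` library is installed and that this module can be loaded via
    # `evaluate.load("text_duplicates")`, as in the docstring examples above.
    demo_data = ["hello sun", "hello moon", "hello sun"]
    duplicates = evaluate.load("text_duplicates")
    print(duplicates.compute(data=demo_data))
    # Expected (per the examples above): {'duplicate_fraction': 0.33333333333333337}
    print(duplicates.compute(data=demo_data, list_duplicates=True))
    # Expected: {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}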