File size: 3,548 Bytes
ac8143c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4292831
b93343f
4292831
 
 
 
 
ac8143c
 
 
 
ef96a6b
ac8143c
 
 
 
 
 
 
 
ac0977e
ac8143c
 
 
 
 
 
 
 
 
 
 
 
ac0977e
ac8143c
 
 
 
4292831
 
ac8143c
 
 
 
4292831
b93343f
 
 
 
 
 
 
 
ac8143c
e31a688
ac8143c
 
b93343f
 
 
 
e31a688
ac8143c
 
 
 
 
b93343f
ac8143c
4292831
 
 
 
 
ac8143c
 
b93343f
ac8143c
b93343f
ac8143c
 
 
 
ac0977e
ac8143c
4292831
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
from collections import Counter
from dataclasses import dataclass

import datasets

import evaluate


# Module-level logger obtained from evaluate's logging helper, named after this module.
logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """
Returns the duplicate fraction of duplicate strings in the input.
"""

_KWARGS_DESCRIPTION = """
Args:
    `data`: a list of `str` to be checked for duplicates.

Returns:
    `duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
    `duplicates_dict` (`dict`) (optional) : a dictionary containing tuples with the duplicate strings and the number of times they are repeated.

Examples:
    >>> data = ["hello sun","hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results = duplicates.compute(data=data)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337}

    >>> data = ["hello sun","hello moon", "hello sun"]
    >>> duplicates = evaluate.load("text_duplicates")
    >>> results =  duplicates.compute(data=data, list_duplicates=True)
    >>> print(results)
    {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
"""

# TODO: Add BibTeX citation
_CITATION = ""


def get_hash(example):
    """Return the hexadecimal MD5 digest of `example` after stripping surrounding whitespace."""
    normalized = example.strip()
    hasher = hashlib.md5()
    # Hash the UTF-8 bytes so the digest is independent of the platform's default encoding.
    hasher.update(normalized.encode("utf-8"))
    return hasher.hexdigest()


@dataclass
class TextDuplicatesConfig(evaluate.info.Config):
    """Configuration options for the `TextDuplicates` measurement."""

    # Configuration name; `TextDuplicates.ALLOWED_CONFIG_NAMES` lists only "default".
    name: str = "default"

    # When True, `TextDuplicates._compute` additionally returns a `duplicates_dict`
    # with each repeated string and its number of occurrences.
    list_duplicates: bool = False


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextDuplicates(evaluate.Measurement):
    """This measurement returns the fraction of duplicate strings contained in the input,
    and optionally the duplicated strings themselves with their counts."""

    CONFIG_CLASS = TextDuplicatesConfig
    ALLOWED_CONFIG_NAMES = ["default"]

    def _info(self, config):
        """Build the `MeasurementInfo` for this module: descriptions, citation, config and the
        single string-valued input feature `data`."""
        return evaluate.MeasurementInfo(
            # This is the description that will appear on the modules page.
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            config=config,
            # This defines the format of each input example
            features=datasets.Features(
                {
                    "data": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, data):
        """Compute the duplicate fraction of `data` and, if configured, list the duplicates.

        Args:
            data: a sized collection of strings.

        Returns:
            A dict with key `"duplicate_fraction"` holding
            `1 - (number of unique strings / total number of strings)`, where uniqueness is
            decided on the MD5 hash of each whitespace-stripped string. Empty input yields a
            fraction of `0.0`. When `self.config.list_duplicates` is truthy, the dict also has
            key `"duplicates_dict"`, mapping each string that occurs more than once to its
            number of occurrences.
        """
        total = len(data)
        if total == 0:
            # No strings means no duplicates; also avoids dividing by zero below.
            duplicate_fraction = 0.0
        else:
            n_dedup = len({get_hash(d) for d in data})
            duplicate_fraction = 1 - (n_dedup / total)

        results = {"duplicate_fraction": duplicate_fraction}

        if self.config.list_duplicates:
            logger.warning("This functionality can be memory-intensive for large datasets!")
            # NOTE: occurrences are counted on the raw strings, whereas the fraction above
            # compares whitespace-stripped strings, so entries that differ only in surrounding
            # whitespace lower the unique count but are not reported here.
            occurrences = Counter(data)
            results["duplicates_dict"] = {k: v for k, v in occurrences.items() if v > 1}

        return results