File size: 9,842 Bytes
2d876d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""Module contains the score calculation algorithems."""
from functools import partial
from typing import Dict, List, Union, cast

from pfzy.types import SCORE_INDICES

SCORE_MIN = float("-inf")
SCORE_MAX = float("inf")
SCORE_GAP_LEADING = -0.005
SCORE_GAP_TRAILING = -0.005
SCORE_GAP_INNER = -0.01
SCORE_MATCH_CONSECUTIVE = 1.0


def _char_range_with(
    char_start: str, char_stop: str, value, hash_table: Dict[str, Union[int, float]]
) -> Dict[str, Union[int, float]]:
    """Generate index mapping for `bonus` calculation.

    Args:
        char_start: Starting char of the range.
        char_stop: Ending char of the range.
        value: Value to give to the range of char.
        hash_table: Base dictionary to add the mapping.

    Returns:
        A dictionary containing the given range with provided index.

    Examples:
        >>> _char_range_with("a", "d", 1, {})
        {'a': 1, 'b': 1, 'c': 1, 'd': 1}
    """
    hash_table = hash_table.copy()
    hash_table.update(
        (chr(uni_char), value)
        for uni_char in range(ord(char_start), ord(char_stop) + 1)
    )
    return hash_table


lower_with = partial(_char_range_with, "a", "z")
upper_with = partial(_char_range_with, "A", "Z")
digit_with = partial(_char_range_with, "0", "9")

SCORE_MATCH_SLASH = 0.9
SCORE_MATCH_WORD = 0.8
SCORE_MATCH_CAPITAL = 0.7
SCORE_MATCH_DOT = 0.6
BONUS_MAP = {
    "/": SCORE_MATCH_SLASH,
    "-": SCORE_MATCH_WORD,
    "_": SCORE_MATCH_WORD,
    " ": SCORE_MATCH_WORD,
    ".": SCORE_MATCH_DOT,
}
BONUS_STATES = [{}, BONUS_MAP, lower_with(SCORE_MATCH_CAPITAL, BONUS_MAP)]
BONUS_INDEX = cast(Dict[str, int], digit_with(1, lower_with(1, upper_with(2, {}))))


def _bonus(haystack: str) -> List[float]:
    """Calculate bonus score for the given haystack.

    The bonus are applied to each char based on the previous char.

    When previous char is within the `BONUS_MAP` then additional bonus
    are applied to the current char due to it might be the start of a new
    word.

    When encountered a mix case character, if the current char is capitalised then
    if the previous char is normal case or within `BONUS_MAP`, additional bounus are applied.

    Args:
        haystack: String to calculate bonus.

    Returns:
        A list of float matching the length of the given haystack
        with each index representing the bonus score to apply.

    Examples:
        >>> _bonus("asdf")
        [0.9, 0, 0, 0]
        >>> _bonus("asdf asdf")
        [0.9, 0, 0, 0, 0, 0.8, 0, 0, 0]
        >>> _bonus("asdf aSdf")
        [0.9, 0, 0, 0, 0, 0.8, 0.7, 0, 0]
        >>> _bonus("asdf/aSdf")
        [0.9, 0, 0, 0, 0, 0.9, 0.7, 0, 0]
    """
    prev_char = "/"
    bonus = []
    for char in haystack:
        bonus.append(BONUS_STATES[BONUS_INDEX.get(char, 0)].get(prev_char, 0))
        prev_char = char
    return bonus


def _score(needle: str, haystack: str) -> SCORE_INDICES:
    """Use fzy logic to calculate score for `needle` within the given `haystack`.

    2 2D array to track the score.
    1. The running score (`running_score`) which represents the best score for the current position.
    2. The result score (`result_score`) which tracks to overall best score that could be for the current positon.

    With every consequtive match, additional bonuse score are given and for every non matching char, a negative
    gap score is applied.

    After the score is calculated, the final matching score will be stored at the last position of the `result_score`.

    Backtrack the result by comparing the 2 2D array to find the corresponding indices.

    Args:
        needle: Substring to find in haystack.
        haystack: String to be searched and scored.

    Returns:
        A tuple of matching score with a list of matching indices.
    """
    needle_len, haystack_len = len(needle), len(haystack)
    bonus_score = _bonus(haystack)

    # smart case
    if needle.islower():
        haystack = haystack.lower()

    # return all values if no query
    if needle_len == 0 or needle_len == haystack_len:
        return SCORE_MAX, list(range(needle_len))

    # best score for the position
    running_score: List[List[float]] = [
        [0 for _ in range(haystack_len)] for _ in range(needle_len)
    ]

    # overall best score at each position
    result_score: List[List[float]] = [
        [0 for _ in range(haystack_len)] for _ in range(needle_len)
    ]

    for i in range(needle_len):
        prev_score = SCORE_MIN

        # gap between matching char
        # more gaps, less score
        gap_score = SCORE_GAP_TRAILING if i == needle_len - 1 else SCORE_GAP_INNER

        for j in range(haystack_len):
            if needle[i] == haystack[j]:
                score = SCORE_MIN
                if i == 0:
                    score = j * SCORE_GAP_LEADING + bonus_score[j]
                elif j != 0:
                    score = max(
                        result_score[i - 1][j - 1] + bonus_score[j],
                        # consecutive match if value is higher
                        running_score[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE,
                    )
                running_score[i][j] = score
                result_score[i][j] = prev_score = max(score, prev_score + gap_score)
            else:
                running_score[i][j] = SCORE_MIN
                # increment the best score with gap_score since no match
                result_score[i][j] = prev_score = prev_score + gap_score

    # backtrace to find the all indices of optimal matching
    # starting from the end to pick the first possible match we encounter
    i, j = needle_len - 1, haystack_len - 1
    # use to determine if the current match is consequtive match
    match_required = False
    indices = [0 for _ in range(needle_len)]

    while i >= 0:
        while j >= 0:
            # if the prev score is determined to be consecutive match
            # then the current position must be a match
            # e.g. haystack, needle = "auibywcabc", "abc"
            # using match_required: [7, 8, 9]
            # without match_required: [0, 8, 9]
            if (
                match_required or running_score[i][j] == result_score[i][j]
            ) and running_score[i][j] != SCORE_MIN:
                match_required = (
                    i > 0
                    and j > 0
                    and result_score[i][j]
                    == running_score[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE
                )
                indices[i] = j
                j -= 1
                break
            else:
                j -= 1
        i -= 1

    return result_score[needle_len - 1][haystack_len - 1], indices


def _subsequence(needle: str, haystack: str) -> bool:
    """Check if needle is subsequence of haystack.

    Args:
        needle: Substring to find in haystack.
        haystack: String to be searched and scored.

    Returns:
        Boolean indicating if `needle` is subsequence of `haystack`.

    Examples:
        >>> _subsequence("as", "bbwi")
        False
        >>> _subsequence("as", "bbaiws")
        True
        >>> _subsequence("sa", "bbaiws")
        False
    """
    needle, haystack = needle.lower(), haystack.lower()
    if not needle:
        return True
    offset = 0
    for char in needle:
        offset = haystack.find(char, offset) + 1
        if offset <= 0:
            return False
    return True


def fzy_scorer(needle: str, haystack: str) -> SCORE_INDICES:
    """Use fzy matching algorithem to match needle against haystack.

    Note:
        The `fzf` unordered search is not supported for performance concern.
        When the provided `needle` is not a subsequence of `haystack` at all,
        then `(-inf, None)` is returned.

    See Also:
        https://github.com/jhawthorn/fzy/blob/master/src/match.c

    Args:
        needle: Substring to find in haystack.
        haystack: String to be searched and scored against.

    Returns:
        A tuple of matching score with a list of matching indices.

    Examples:
        >>> fzy_scorer("ab", "acb")
        (0.89, [0, 2])
        >>> fzy_scorer("ab", "acbabc")
        (0.98, [3, 4])
        >>> fzy_scorer("ab", "wc")
        (-inf, None)
    """
    if _subsequence(needle, haystack):
        return _score(needle, haystack)
    else:
        return SCORE_MIN, None


def substr_scorer(needle: str, haystack: str) -> SCORE_INDICES:
    """Match needle against haystack using :meth:`str.find`.

    Note:
        Scores may be negative but the higher the score, the higher
        the match rank. `-inf` score means no match found.

    See Also:
        https://github.com/aslpavel/sweep.py/blob/3f4a179b708059c12b9e5d76d1eb3c70bf2caadc/sweep.py#L837

    Args:
        needle: Substring to find in haystack.
        haystack: String to be searched and scored against.

    Returns:
        A tuple of matching score with a list of matching indices.

    Example:
        >>> substr_scorer("ab", "awsab")
        (-1.3, [3, 4])
        >>> substr_scorer("ab", "abc")
        (0.5, [0, 1])
        >>> substr_scorer("ab", "iop")
        (-inf, None)
        >>> substr_scorer("ab", "asdafswabc")
        (-1.6388888888888888, [7, 8])
        >>> substr_scorer(" ", "asdf")
        (0, [])
    """
    indices = []
    offset = 0
    needle, haystack = needle.lower(), haystack.lower()

    for needle in needle.split(" "):
        if not needle:
            continue
        offset = haystack.find(needle, offset)
        if offset < 0:
            return SCORE_MIN, None
        needle_len = len(needle)
        indices.extend(range(offset, offset + needle_len))
        offset += needle_len

    if not indices:
        return 0, indices

    return (
        -(indices[-1] + 1 - indices[0]) + 2 / (indices[0] + 1) + 1 / (indices[-1] + 1),
        indices,
    )