Spaces:
Sleeping
Sleeping
"""Module contains the score calculation algorithems.""" | |
from functools import partial | |
from typing import Dict, List, Union, cast | |
from pfzy.types import SCORE_INDICES | |
SCORE_MIN = float("-inf") | |
SCORE_MAX = float("inf") | |
SCORE_GAP_LEADING = -0.005 | |
SCORE_GAP_TRAILING = -0.005 | |
SCORE_GAP_INNER = -0.01 | |
SCORE_MATCH_CONSECUTIVE = 1.0 | |
def _char_range_with( | |
char_start: str, char_stop: str, value, hash_table: Dict[str, Union[int, float]] | |
) -> Dict[str, Union[int, float]]: | |
"""Generate index mapping for `bonus` calculation. | |
Args: | |
char_start: Starting char of the range. | |
char_stop: Ending char of the range. | |
value: Value to give to the range of char. | |
hash_table: Base dictionary to add the mapping. | |
Returns: | |
A dictionary containing the given range with provided index. | |
Examples: | |
>>> _char_range_with("a", "d", 1, {}) | |
{'a': 1, 'b': 1, 'c': 1, 'd': 1} | |
""" | |
hash_table = hash_table.copy() | |
hash_table.update( | |
(chr(uni_char), value) | |
for uni_char in range(ord(char_start), ord(char_stop) + 1) | |
) | |
return hash_table | |
lower_with = partial(_char_range_with, "a", "z") | |
upper_with = partial(_char_range_with, "A", "Z") | |
digit_with = partial(_char_range_with, "0", "9") | |
SCORE_MATCH_SLASH = 0.9 | |
SCORE_MATCH_WORD = 0.8 | |
SCORE_MATCH_CAPITAL = 0.7 | |
SCORE_MATCH_DOT = 0.6 | |
BONUS_MAP = { | |
"/": SCORE_MATCH_SLASH, | |
"-": SCORE_MATCH_WORD, | |
"_": SCORE_MATCH_WORD, | |
" ": SCORE_MATCH_WORD, | |
".": SCORE_MATCH_DOT, | |
} | |
BONUS_STATES = [{}, BONUS_MAP, lower_with(SCORE_MATCH_CAPITAL, BONUS_MAP)] | |
BONUS_INDEX = cast(Dict[str, int], digit_with(1, lower_with(1, upper_with(2, {})))) | |
def _bonus(haystack: str) -> List[float]: | |
"""Calculate bonus score for the given haystack. | |
The bonus are applied to each char based on the previous char. | |
When previous char is within the `BONUS_MAP` then additional bonus | |
are applied to the current char due to it might be the start of a new | |
word. | |
When encountered a mix case character, if the current char is capitalised then | |
if the previous char is normal case or within `BONUS_MAP`, additional bounus are applied. | |
Args: | |
haystack: String to calculate bonus. | |
Returns: | |
A list of float matching the length of the given haystack | |
with each index representing the bonus score to apply. | |
Examples: | |
>>> _bonus("asdf") | |
[0.9, 0, 0, 0] | |
>>> _bonus("asdf asdf") | |
[0.9, 0, 0, 0, 0, 0.8, 0, 0, 0] | |
>>> _bonus("asdf aSdf") | |
[0.9, 0, 0, 0, 0, 0.8, 0.7, 0, 0] | |
>>> _bonus("asdf/aSdf") | |
[0.9, 0, 0, 0, 0, 0.9, 0.7, 0, 0] | |
""" | |
prev_char = "/" | |
bonus = [] | |
for char in haystack: | |
bonus.append(BONUS_STATES[BONUS_INDEX.get(char, 0)].get(prev_char, 0)) | |
prev_char = char | |
return bonus | |
def _score(needle: str, haystack: str) -> SCORE_INDICES: | |
"""Use fzy logic to calculate score for `needle` within the given `haystack`. | |
2 2D array to track the score. | |
1. The running score (`running_score`) which represents the best score for the current position. | |
2. The result score (`result_score`) which tracks to overall best score that could be for the current positon. | |
With every consequtive match, additional bonuse score are given and for every non matching char, a negative | |
gap score is applied. | |
After the score is calculated, the final matching score will be stored at the last position of the `result_score`. | |
Backtrack the result by comparing the 2 2D array to find the corresponding indices. | |
Args: | |
needle: Substring to find in haystack. | |
haystack: String to be searched and scored. | |
Returns: | |
A tuple of matching score with a list of matching indices. | |
""" | |
needle_len, haystack_len = len(needle), len(haystack) | |
bonus_score = _bonus(haystack) | |
# smart case | |
if needle.islower(): | |
haystack = haystack.lower() | |
# return all values if no query | |
if needle_len == 0 or needle_len == haystack_len: | |
return SCORE_MAX, list(range(needle_len)) | |
# best score for the position | |
running_score: List[List[float]] = [ | |
[0 for _ in range(haystack_len)] for _ in range(needle_len) | |
] | |
# overall best score at each position | |
result_score: List[List[float]] = [ | |
[0 for _ in range(haystack_len)] for _ in range(needle_len) | |
] | |
for i in range(needle_len): | |
prev_score = SCORE_MIN | |
# gap between matching char | |
# more gaps, less score | |
gap_score = SCORE_GAP_TRAILING if i == needle_len - 1 else SCORE_GAP_INNER | |
for j in range(haystack_len): | |
if needle[i] == haystack[j]: | |
score = SCORE_MIN | |
if i == 0: | |
score = j * SCORE_GAP_LEADING + bonus_score[j] | |
elif j != 0: | |
score = max( | |
result_score[i - 1][j - 1] + bonus_score[j], | |
# consecutive match if value is higher | |
running_score[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE, | |
) | |
running_score[i][j] = score | |
result_score[i][j] = prev_score = max(score, prev_score + gap_score) | |
else: | |
running_score[i][j] = SCORE_MIN | |
# increment the best score with gap_score since no match | |
result_score[i][j] = prev_score = prev_score + gap_score | |
# backtrace to find the all indices of optimal matching | |
# starting from the end to pick the first possible match we encounter | |
i, j = needle_len - 1, haystack_len - 1 | |
# use to determine if the current match is consequtive match | |
match_required = False | |
indices = [0 for _ in range(needle_len)] | |
while i >= 0: | |
while j >= 0: | |
# if the prev score is determined to be consecutive match | |
# then the current position must be a match | |
# e.g. haystack, needle = "auibywcabc", "abc" | |
# using match_required: [7, 8, 9] | |
# without match_required: [0, 8, 9] | |
if ( | |
match_required or running_score[i][j] == result_score[i][j] | |
) and running_score[i][j] != SCORE_MIN: | |
match_required = ( | |
i > 0 | |
and j > 0 | |
and result_score[i][j] | |
== running_score[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE | |
) | |
indices[i] = j | |
j -= 1 | |
break | |
else: | |
j -= 1 | |
i -= 1 | |
return result_score[needle_len - 1][haystack_len - 1], indices | |
def _subsequence(needle: str, haystack: str) -> bool: | |
"""Check if needle is subsequence of haystack. | |
Args: | |
needle: Substring to find in haystack. | |
haystack: String to be searched and scored. | |
Returns: | |
Boolean indicating if `needle` is subsequence of `haystack`. | |
Examples: | |
>>> _subsequence("as", "bbwi") | |
False | |
>>> _subsequence("as", "bbaiws") | |
True | |
>>> _subsequence("sa", "bbaiws") | |
False | |
""" | |
needle, haystack = needle.lower(), haystack.lower() | |
if not needle: | |
return True | |
offset = 0 | |
for char in needle: | |
offset = haystack.find(char, offset) + 1 | |
if offset <= 0: | |
return False | |
return True | |
def fzy_scorer(needle: str, haystack: str) -> SCORE_INDICES: | |
"""Use fzy matching algorithem to match needle against haystack. | |
Note: | |
The `fzf` unordered search is not supported for performance concern. | |
When the provided `needle` is not a subsequence of `haystack` at all, | |
then `(-inf, None)` is returned. | |
See Also: | |
https://github.com/jhawthorn/fzy/blob/master/src/match.c | |
Args: | |
needle: Substring to find in haystack. | |
haystack: String to be searched and scored against. | |
Returns: | |
A tuple of matching score with a list of matching indices. | |
Examples: | |
>>> fzy_scorer("ab", "acb") | |
(0.89, [0, 2]) | |
>>> fzy_scorer("ab", "acbabc") | |
(0.98, [3, 4]) | |
>>> fzy_scorer("ab", "wc") | |
(-inf, None) | |
""" | |
if _subsequence(needle, haystack): | |
return _score(needle, haystack) | |
else: | |
return SCORE_MIN, None | |
def substr_scorer(needle: str, haystack: str) -> SCORE_INDICES: | |
"""Match needle against haystack using :meth:`str.find`. | |
Note: | |
Scores may be negative but the higher the score, the higher | |
the match rank. `-inf` score means no match found. | |
See Also: | |
https://github.com/aslpavel/sweep.py/blob/3f4a179b708059c12b9e5d76d1eb3c70bf2caadc/sweep.py#L837 | |
Args: | |
needle: Substring to find in haystack. | |
haystack: String to be searched and scored against. | |
Returns: | |
A tuple of matching score with a list of matching indices. | |
Example: | |
>>> substr_scorer("ab", "awsab") | |
(-1.3, [3, 4]) | |
>>> substr_scorer("ab", "abc") | |
(0.5, [0, 1]) | |
>>> substr_scorer("ab", "iop") | |
(-inf, None) | |
>>> substr_scorer("ab", "asdafswabc") | |
(-1.6388888888888888, [7, 8]) | |
>>> substr_scorer(" ", "asdf") | |
(0, []) | |
""" | |
indices = [] | |
offset = 0 | |
needle, haystack = needle.lower(), haystack.lower() | |
for needle in needle.split(" "): | |
if not needle: | |
continue | |
offset = haystack.find(needle, offset) | |
if offset < 0: | |
return SCORE_MIN, None | |
needle_len = len(needle) | |
indices.extend(range(offset, offset + needle_len)) | |
offset += needle_len | |
if not indices: | |
return 0, indices | |
return ( | |
-(indices[-1] + 1 - indices[0]) + 2 / (indices[0] + 1) + 1 / (indices[-1] + 1), | |
indices, | |
) | |