File size: 2,425 Bytes
85b7206
 
0700cb3
85b7206
 
 
0700cb3
 
 
85b7206
 
 
 
 
 
 
0700cb3
 
85b7206
 
 
 
 
0700cb3
85b7206
0700cb3
85b7206
 
 
 
 
0700cb3
85b7206
0700cb3
 
 
 
 
 
 
 
 
 
85b7206
 
0700cb3
85b7206
0700cb3
85b7206
0700cb3
85b7206
 
 
0700cb3
 
 
 
 
85b7206
 
0700cb3
 
 
 
85b7206
 
0700cb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import numpy as np


# ref from https://gitlab.com/-/snippets/1948157
# For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python

# Pure/numpy python
def edit_distance_python2(a: str, b: str) -> int:
    """Compute the Levenshtein distance between two sequences in pure Python.

    Only two rows of the dynamic-programming table are kept at any time, so
    memory use is proportional to the length of the shorter input.

    Args:
        a (str): First sequence.
        b (str): Second sequence.

    Returns:
        int: Minimum number of single-token insertions, deletions and
            substitutions needed to turn one sequence into the other.
    """
    # The distance is symmetric, so force |a| >= |b| to keep the rows short.
    # Swap in place rather than delegating to another implementation so the
    # result is always a plain int, whatever the argument order.
    if len(a) < len(b):
        a, b = b, a
    if len(b) == 0:  # Distance to an empty sequence is just the other length.
        return len(a)

    # Row 0 of the table: turning the empty prefix of `a` into b[:j] costs j.
    previous_row = list(range(len(b) + 1))
    for i, a_token in enumerate(a, start=1):
        # First column: turning a[:i] into the empty prefix costs i deletions.
        current_row = [i]
        for j, b_token in enumerate(b, start=1):
            insertion = current_row[j - 1] + 1
            deletion = previous_row[j] + 1
            substitution = previous_row[j - 1] + (0 if a_token == b_token else 1)
            current_row.append(min(insertion, deletion, substitution))
        # Move to the next row.
        previous_row = current_row
    return previous_row[-1]


# ref from https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
def edit_distance_python(seq1: str, seq2: str) -> np.ndarray:
    """Compute the Levenshtein distance using a full NumPy cost table.

    A ``(len(seq1) + 1) x (len(seq2) + 1)`` table is filled so that cell
    ``[r, c]`` holds the edit distance between ``seq1[:r]`` and ``seq2[:c]``.

    Args:
        seq1 (str): First sequence.
        seq2 (str): Second sequence.

    Returns:
        np.ndarray: The bottom-right cell of the table, i.e. the distance
            between the two full sequences. The table is created with
            ``np.zeros`` and its default dtype, so the value is a NumPy
            floating-point scalar (e.g. ``3.0``), not a Python ``int``.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    table = np.zeros((rows, cols))

    # Borders: distance from/to the empty prefix equals the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, token1 in enumerate(seq1, start=1):
        for c, token2 in enumerate(seq2, start=1):
            # Matching tokens carry the diagonal value over at no extra cost.
            substitution_cost = 0 if token1 == token2 else 1
            table[r, c] = min(
                table[r - 1, c] + 1,
                table[r - 1, c - 1] + substitution_cost,
                table[r, c - 1] + 1,
            )
    return table[-1, -1]