File size: 2,236 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
//-*- c++ -*-
#pragma once

// string distance measures
// Code by Ulrich Germann
#include<iostream>


#include <unicode/stringpiece.h>
#include <unicode/translit.h>
#include <unicode/utypes.h>
#include <unicode/unistr.h>
#include <unicode/uchar.h>
#include <unicode/utf8.h>
#include <vector>

#include "moses/TranslationModel/UG/mm/tpt_typedefs.h"


namespace stringdist
{
  // String distance between two UTF-16 (UChar) buffers, each passed as a
  // pointer plus an explicit length. Declaration only; the definition is
  // not in this header.
  // NOTE(review): presumably the Levenshtein edit distance between
  // a[0..lenA) and b[0..lenB); the float return type suggests the edit
  // costs may be non-integral -- confirm against the implementation.
  float
  levenshtein(UChar const* a, size_t const lenA,
	      UChar const* b, size_t const lenB);

  // Takes the string by non-const reference and returns an ICU UErrorCode.
  // Declaration only; the definition is not in this header.
  // NOTE(review): presumably removes accents from trg in place and returns
  // the ICU status of that operation -- confirm against the implementation.
  UErrorCode strip_accents(UnicodeString & trg);

  // Takes two UTF-16 (UChar) buffers with explicit lengths plus a
  // caller-owned matrix M (vector of float rows) passed by non-const
  // reference. Declaration only; the definition is not in this header.
  // NOTE(review): presumably fills M with the edit-distance table for a vs.
  // b and returns the resulting distance; whether M is resized here or must
  // be pre-sized by the caller is not visible from this header -- confirm.
  float
  fillAlignmentMatrix(UChar const* a, size_t const lenA,
		      UChar const* b, size_t const lenB,
		      std::vector<std::vector<float> > & M);

  // Pairs two strings (held as ICU UnicodeString) with a list of diff
  // segments between them. Only the interface is declared in this header;
  // all member functions are defined elsewhere.
  class StringDiff
  {
  public:
    // Classification assigned to a single diff segment.
    // The enumerator order is part of the interface; do not reorder.
    enum MATCHTYPE
      {
        same,        // a and b are identical
        cap,         // a and b differ only in capitalization
        flip,        // two-letter flip
        permutation, // a and b have same letters but in different order
        accent,      // a and b are the same basic letters, ignoring accents
        // NOTE(review): this enumerator used to carry the comment
        // "a is empty", a copy of the one on `insertion`. Presumably it
        // marks a duplicated (repeated) letter -- confirm against the
        // implementation.
        duplication,
        insertion,   // a is empty
        deletion,    // b is empty
        mismatch,    // none of the above
        noinit       // not initialized
      };

    // One diff segment: a span in a, a span in b, its classification and
    // a distance value.
    struct Segment
    {
      // Label strings, defined out of line.
      // NOTE(review): presumably indexed by MATCHTYPE -- confirm.
      static char const* elabel[];
      // Span boundaries within a and within b. Whether the end values are
      // inclusive or one-past-the-end is not visible from this header.
      int start_a, end_a;
      int start_b, end_b;
      MATCHTYPE match; // classification of this segment
      float      dist; // distance value associated with this segment
      Segment();
      // as/ae and bs/be are the start/end positions in a and b; the two
      // strings themselves are passed as well.
      // NOTE(review): presumably they are inspected to determine `match`
      // and `dist` -- confirm against the implementation.
      Segment(size_t const as, size_t const ae,
              size_t const bs, size_t const be,
              UnicodeString const& a,
              UnicodeString const& b);
      // Returns a C-string label for this segment.
      // NOTE(review): presumably elabel[match] -- confirm.
      char const* label() const;
    };
  private:
    UnicodeString a,b;             // the two strings being compared
    std::vector<Segment> difflist; // diff segments between a and b
    // NOTE(review): presumably one counter per MATCHTYPE, and what
    // getFeatures() returns -- confirm against the implementation.
    std::vector<int> diffcnt;
  public:
    // Setters accept std::string and return a reference to a UnicodeString.
    // NOTE(review): presumably the input is UTF-8 and the returned
    // reference is the stored member -- confirm.
    UnicodeString const& set_a(std::string const& a);
    UnicodeString const& set_b(std::string const& b);
    UnicodeString const& get_a() const;
    UnicodeString const& get_b() const;
    StringDiff(std::string const& a, std::string const& b);
    StringDiff();
    size_t size();
    size_t align(bool force=false); // returns the levenshtein distance
    // Writes a representation of the diff to `out`.
    void showDiff(std::ostream& out);
    float levenshtein();
    // Read-only access to a Segment by index.
    // NOTE(review): presumably indexes difflist; whether i is
    // bounds-checked is not visible from this header.
    Segment const& operator[](uint32_t i) const;
    // Member counterpart of the free function stringdist::fillAlignmentMatrix.
    // NOTE(review): presumably operates on the stored a and b -- confirm.
    void fillAlignmentMatrix(std::vector<std::vector<float> > & M) const;
    // Spelled std::vector explicitly: a header must not depend on a
    // using-directive leaking in from some other include.
    std::vector<int> const& getFeatures() const;
  };
}