NMTKD
/
translation
/tools
/mosesdecoder
/moses
/TranslationModel
/UG
/generic
/stringdist
/ug_stringdist.cc
// string distance measures | |
// Code by Ulrich Germann | |
namespace stringdist | |
{ | |
UErrorCode strip_accents(UnicodeString & trg) | |
{ | |
UErrorCode status = U_ZERO_ERROR; | |
static Transliterator *stripper | |
= Transliterator::createInstance("NFD; [:M:] Remove; NFC", | |
UTRANS_FORWARD, status); | |
stripper->transliterate(trg); | |
return status; | |
} | |
char const* | |
StringDiff:: | |
Segment:: | |
elabel[] = { "same", "cap", "flip", "permutation", | |
"accent", "duplication", | |
"insertion", "deletion", | |
"mismatch", "noinit" }; | |
StringDiff:: | |
StringDiff() | |
{} | |
StringDiff:: | |
StringDiff(string const& a, string const& b) | |
{ | |
set_a(a); | |
set_b(b); | |
align(); | |
} | |
StringDiff:: | |
Segment:: | |
Segment() | |
: start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0) | |
{} | |
UnicodeString const& | |
StringDiff:: | |
set_a(string const& a) | |
{ | |
this->a = a.c_str(); | |
return this->a; | |
} | |
UnicodeString const& | |
StringDiff:: | |
set_b(string const& b) | |
{ | |
this->b = b.c_str(); | |
return this->b; | |
} | |
UnicodeString const& | |
StringDiff:: | |
get_a() const | |
{ | |
return this->a; | |
} | |
UnicodeString const& | |
StringDiff:: | |
get_b() const | |
{ | |
return this->b; | |
} | |
size_t | |
StringDiff:: | |
size() | |
{ | |
return this->difflist.size(); | |
} | |
// float | |
// StringDiff:: | |
// levelshtein(bool force) | |
// { | |
// align(force); | |
// float ret = 0; | |
// for (size_t i = 0; i < difflist.size(); +++i) | |
// { | |
// Segment const& s = difflist[i]; | |
// if (s.match == same) continue; | |
// else if (s.match == insertion) ret += s.end_b - s.start_b; | |
// else if (s.match == deletion) ret += s.end_a - s.start_a; | |
// } | |
// } | |
void | |
StringDiff:: | |
fillAlignmentMatrix(vector<vector<float> > & M) const | |
{ | |
assert(a.length() && b.length()); | |
M.assign(a.length(),vector<float>(b.length(),0)); | |
int i = 0,j; | |
while (i < b.length() && b[i] != a[0]) ++i; | |
while (i < b.length()) M[0][i++] = 1; | |
i = 0; | |
while (i < a.length() && a[i] != b[0]) ++i; | |
while (i < a.length()) M[i++][0] = 1; | |
for (i = 1; i < a.length(); ++i) | |
{ | |
for (j = 1; j < b.length(); ++j) | |
{ | |
float & s = M[i][j]; | |
s = max(M[i-1][j],M[i][j-1]); | |
if (a[i] == b[j]) | |
s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0)); | |
} | |
} | |
string abuf,bbuf; | |
a.toUTF8String(abuf); | |
b.toUTF8String(bbuf); | |
cout << " " << bbuf[0]; | |
for (int x = 1; x < b.length(); ++x) | |
cout << " " << bbuf[x]; | |
cout << endl; | |
for (int x = 0; x < a.length(); ++x) | |
{ | |
cout << abuf[x] << " "; | |
for (int y = 0; y < b.length(); ++y) | |
cout << int(M[x][y]) << " "; | |
cout << endl; | |
} | |
} | |
float | |
fillAlignmentMatrix(UChar const* a, size_t const lenA, | |
UChar const* b, size_t const lenB, | |
vector<vector<float> > & M) | |
{ | |
M.assign(lenA,vector<float>(lenB,0)); | |
assert(lenA); assert(lenB); | |
size_t i = 0; | |
while (i < lenB && b[i] != a[0]) ++i; | |
while (i < lenB) M[0][i++] = 1; | |
i = 0; | |
while (i < lenA && a[i] != b[0]) ++i; | |
while (i < lenA) M[i++][0] = 1; | |
for (i = 1; i < lenA; ++i) | |
{ | |
for (size_t j = 1; j < lenB; ++j) | |
{ | |
float & s = M[i][j]; | |
s = max(M[i-1][j], M[i][j-1]); | |
if (a[i] == b[j]) | |
s = max(s, M[i-1][j-1] + 1); | |
} | |
} | |
return M.back().back(); | |
} | |
float | |
levenshtein(UChar const* a, size_t const lenA, | |
UChar const* b, size_t const lenB) | |
{ | |
vector<vector<float> > M; | |
fillAlignmentMatrix(a,lenA,b,lenB,M); | |
size_t ret = 0; | |
for (size_t i = 0; i < M.size(); ++i) | |
{ | |
for (size_t j = 0; j < M[i].size(); ++j) | |
cout << M[i][j] << " "; | |
cout << endl; | |
} | |
cout << string(25,'-') << endl; | |
int i = M.size() -1; | |
int j = M.back().size() -1; | |
int I=i, J=j; | |
for (;i >= 0 || j >= 0; --i, --j) | |
{ | |
I=i, J=j; | |
if (j>=0) while (i > 0 && M[i-1][j] == M[i][j]) --i; | |
if (i>=0) while (j > 0 && M[i][j-1] == M[i][j]) --j; | |
size_t ilen = I >= 0 ? I - i : 0; | |
size_t jlen = J >= 0 ? J - j : 0; | |
ret += max(ilen,jlen); | |
cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl; | |
I=i, J=j; | |
} | |
size_t ilen = I >= 0 ? I - i : 0; | |
size_t jlen = J >= 0 ? J - j : 0; | |
ret += max(ilen,jlen); | |
cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl; | |
return ret; | |
} | |
StringDiff:: | |
Segment:: | |
Segment(size_t const as, size_t const ae, | |
size_t const bs, size_t const be, | |
UnicodeString const& a, | |
UnicodeString const& b) | |
{ | |
dist = 0; | |
start_a = as; end_a = ae; | |
start_b = bs; end_b = be; | |
if (as == ae) | |
match = bs == be ? same : insertion; | |
else if (bs == be) | |
match = deletion; | |
else if (be-bs != ae-as) | |
{ | |
match = mismatch; | |
dist = stringdist::levenshtein(a.getBuffer() + as, ae - as, | |
b.getBuffer() + bs, be - bs); | |
} | |
else | |
{ | |
match = same; | |
size_t stop = ae-as; | |
for (size_t i = 0; i < stop && match == same; ++i) | |
if (a[as+i] != b[bs+i]) match = mismatch; | |
if (match == mismatch) | |
{ | |
if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs]) | |
match = flip; | |
else | |
{ | |
vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae); | |
vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be); | |
sort(x.begin(),x.end()); | |
sort(y.begin(),y.end()); | |
if (x == y) match = permutation; | |
else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as, | |
b.getBuffer() + bs, be - bs); | |
} | |
} | |
} | |
if (match == insertion) | |
{ | |
dist = be-bs; | |
} | |
else if (match == deletion) | |
{ | |
dist = ae-as; | |
} | |
else if (match == flip) dist = 1; | |
else if (match == permutation) dist = ae-as-1; | |
if (match == mismatch) | |
{ | |
UnicodeString ax(a,as,ae-as); | |
UnicodeString bx(b,bs,be-bs); | |
if (ax.toLower() == bx.toLower()) | |
match = cap; | |
else | |
{ | |
strip_accents(ax); | |
strip_accents(bx); | |
if (ax == bx) match = accent; | |
} | |
} | |
} | |
size_t | |
StringDiff:: | |
align(bool force) | |
{ | |
if (force) difflist.clear(); | |
if (difflist.size()) return 0; | |
vector<vector<float> > M; | |
fillAlignmentMatrix(M); | |
// now backtrack | |
int i = a.length() - 1; | |
int j = b.length() - 1; | |
vector<int> A(a.length(), -1); | |
vector<int> B(b.length(), -1); | |
while (i + j) | |
{ | |
while (i && M[i-1][j] == M[i][j]) --i; | |
while (j && M[i][j-1] == M[i][j]) --j; | |
if (a[i] == b[j]) { A[i] = j; B[j] = i; } | |
if (i) --i; | |
if (j) --j; | |
} | |
i = a.length() - 1; | |
j = b.length() - 1; | |
vector<int> A2(a.length(), -1); | |
vector<int> B2(b.length(), -1); | |
while (i + j) | |
{ | |
while (j && M[i][j-1] == M[i][j]) --j; | |
while (i && M[i-1][j] == M[i][j]) --i; | |
if (a[i] == b[j]) { A2[i] = j; B2[j] = i; } | |
if (i) --i; | |
if (j) --j; | |
} | |
for (size_t k = 0; k < A.size(); ++k) | |
A[k] = min(A[k],A2[k]); | |
for (size_t k = 0; k < B.size(); ++k) | |
B[k] = min(B[k],B2[k]); | |
if (a[i] == b[j]) { A[i] = j; B[j] = i; } | |
i = 0; | |
j = 0; | |
size_t I, J; | |
while (i < a.length() and j < b.length()) | |
{ | |
if (A[i] < 0) | |
{ | |
I = i + 1; | |
while (I < A.size() and A[I] < 0) ++I; | |
if (i) | |
{ for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); } | |
else if (I < A.size()) | |
{ for (j = J = A[I]; j && B[j-1] < 0; --j); } | |
else J = B.size(); | |
difflist.push_back(Segment(i,I,j,J,a,b)); | |
i = I; j = J; | |
} | |
else if (B[j] < 0) | |
{ | |
for (J = j + 1; J < B.size() && B[J] < 0; ++J); | |
difflist.push_back(Segment(i,i,j,J,a,b)); | |
j = J; | |
} | |
else | |
{ | |
I = i; | |
J = j; | |
while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0) | |
{ ++I; ++J; } | |
difflist.push_back(Segment(i,I,j,J,a,b)); | |
i = I; j = J; | |
} | |
} | |
if (i < a.length() || j < b.length()) | |
difflist.push_back(Segment(i,a.length(),j,b.length(),a,b)); | |
diffcnt.assign(noinit,0); | |
for (size_t i = 0; i < difflist.size(); ++i) | |
{ | |
Segment & s = difflist[i]; | |
if (s.match == insertion and | |
((s.start_a and a[s.start_a - 1] == b[s.start_b]) or | |
(s.end_a < a.length() and a[s.end_a] == b[s.start_b]))) | |
{ | |
bool sameletter = true; | |
for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i) | |
sameletter = b[i] == b[i-1]; | |
if (sameletter) s.match = duplication; | |
} | |
else if (s.match == deletion and | |
((s.start_b and b[s.start_b - 1] == a[s.start_a]) or | |
(s.end_b < b.length() and b[s.end_b] == a[s.start_a]))) | |
{ | |
bool sameletter = true; | |
for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i) | |
sameletter = a[i] == a[i-1]; | |
if (sameletter) s.match= duplication; | |
} | |
++diffcnt[s.match]; | |
} | |
return 0; | |
} | |
void | |
StringDiff:: | |
showDiff(std::ostream& out) | |
{ | |
if (difflist.size() == 0) align(); | |
vector<size_t> fromEnd(difflist.size(),0); | |
for (int d = difflist.size()-1; d-- > 0;) | |
{ | |
fromEnd[d] = a.length() - difflist[d].end_a; | |
// cout << d << " " << fromEnd[d] << " " | |
// << difflist[d].start_a << "-" | |
// << difflist[d].end_a << endl; | |
} | |
for (size_t d = 0; d < difflist.size(); ++d) | |
{ | |
Segment const& s = difflist[d]; | |
UnicodeString aseg,bseg; | |
a.extract(s.start_a, s.end_a - s.start_a, aseg); | |
b.extract(s.start_b, s.end_b - s.start_b, bseg); | |
string abuf,bbuf; | |
aseg.toUTF8String(abuf); | |
bseg.toUTF8String(bbuf); | |
out << abuf << " "; | |
out << bbuf << " "; | |
out << s.label() << " " | |
<< s.dist << " " | |
<< fromEnd[d] | |
<< endl; | |
} | |
} | |
char const* | |
StringDiff:: | |
Segment:: | |
label() const | |
{ | |
return elabel[this->match]; | |
} | |
StringDiff::Segment const& | |
StringDiff:: | |
operator[](uint32_t const i) const | |
{ | |
return difflist.at(i); | |
} | |
vector<int> const& | |
StringDiff:: | |
getFeatures() const | |
{ | |
return diffcnt; | |
} | |
} | |