|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "tercalc.h" |
|
using namespace std; |
|
using namespace TERCPPNS_Tools; |
|
namespace TERCPPNS_TERCpp |
|
{ |
|
|
|
terCalc::terCalc() |
|
{ |
|
TAILLE_PERMUT_MAX = 10; |
|
NBR_PERMUT_MAX = 10; |
|
infinite = 99999.0; |
|
shift_cost = 1.0; |
|
insert_cost = 1.0; |
|
delete_cost = 1.0; |
|
substitute_cost = 1.0; |
|
match_cost = 0.0; |
|
NBR_SEGS_EVALUATED = 0; |
|
NBR_PERMUTS_CONSID = 0; |
|
NBR_BS_APPELS = 0; |
|
TAILLE_BEAM = 10; |
|
DIST_MAX_PERMUT = 25; |
|
PRINT_DEBUG = false; |
|
hypSpans.clear(); |
|
refSpans.clear(); |
|
CALL_TER_ALIGN=0; |
|
CALL_CALC_PERMUT=0; |
|
CALL_FIND_BSHIFT=0; |
|
MAX_LENGTH_SENTENCE=10; |
|
S = new vector < vector < double > >(MAX_LENGTH_SENTENCE, std::vector<double>(MAX_LENGTH_SENTENCE,0.0)); |
|
P = new vector < vector < char > >(MAX_LENGTH_SENTENCE, std::vector<char>(MAX_LENGTH_SENTENCE,' ')); |
|
} |
|
|
|
terCalc::~terCalc() |
|
{ |
|
delete(S); |
|
delete(P); |
|
} |
|
|
|
|
|
terAlignment terCalc::WERCalculation ( vector< string >& hyp , vector< string >& ref ) |
|
{ |
|
|
|
return minimizeDistanceEdition ( hyp, ref, hypSpans ); |
|
|
|
} |
|
|
|
terAlignment terCalc::TER ( vector< int >& hyp, vector< int >& ref ) |
|
{ |
|
stringstream s; |
|
s.str ( "" ); |
|
string stringRef ( "" ); |
|
string stringHyp ( "" ); |
|
for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) { |
|
if ( l_it == ref.begin() ) { |
|
s << ( *l_it ); |
|
} else { |
|
s << " " << ( *l_it ); |
|
} |
|
} |
|
stringRef = s.str(); |
|
s.str ( "" ); |
|
for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) { |
|
if ( l_itHyp == hyp.begin() ) { |
|
s << ( *l_itHyp ); |
|
} else { |
|
s << " " << ( *l_itHyp ); |
|
} |
|
} |
|
stringHyp = s.str(); |
|
s.str ( "" ); |
|
vector<string> l_vref=stringToVector ( stringRef , " " ); |
|
vector<string> l_vhyp=stringToVector ( stringHyp , " " ); |
|
return TER ( l_vhyp , l_vref); |
|
} |
|
|
|
|
|
hashMapInfos terCalc::createConcordMots ( vector< string >& hyp, vector< string >& ref ) |
|
{ |
|
hashMap tempHash; |
|
hashMapInfos retour; |
|
for ( int i = 0; i < ( int ) hyp.size(); i++ ) { |
|
tempHash.addHasher ( hyp.at ( i ), "" ); |
|
} |
|
bool cor[ref.size() ]; |
|
for ( int i = 0; i < ( int ) ref.size(); i++ ) { |
|
if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) { |
|
cor[i] = true; |
|
} else { |
|
cor[i] = false; |
|
} |
|
} |
|
for ( int start = 0; start < ( int ) ref.size(); start++ ) { |
|
if ( cor[start] ) { |
|
for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) ); end++ ) { |
|
vector<string> ajouter = subVector ( ref, start, end + 1 ); |
|
string ajouterString = vectorToString ( ajouter ); |
|
vector<int> values = retour.getValue ( ajouterString ); |
|
values.push_back ( start ); |
|
if ( values.size() > 1 ) { |
|
retour.setValue ( ajouterString, values ); |
|
} else { |
|
retour.addValue ( ajouterString, values ); |
|
} |
|
} |
|
} |
|
} |
|
return retour; |
|
} |
|
|
|
bool terCalc::trouverIntersection ( vecInt& refSpan, vecInt& hypSpan ) |
|
{ |
|
if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) { |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
|
|
terAlignment terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans ) |
|
{ |
|
double current_best = infinite; |
|
double last_best = infinite; |
|
int first_good = 0; |
|
int current_first_good = 0; |
|
int last_good = -1; |
|
int cur_last_good = 0; |
|
int last_peak = 0; |
|
int cur_last_peak = 0; |
|
int i=0; |
|
int j=0; |
|
int ref_size=0 ; |
|
ref_size=( int ) ref.size(); |
|
int hyp_size=0; |
|
hyp_size=( int ) hyp.size(); |
|
double cost, icost, dcost; |
|
double score; |
|
delete(S); |
|
delete(P); |
|
S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0)); |
|
P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0')); |
|
|
|
|
|
|
|
NBR_BS_APPELS++; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
S->at(0).at(0) = 0.0; |
|
for ( j = 0; j <= hyp_size; j++ ) { |
|
last_best = current_best; |
|
current_best = infinite; |
|
first_good = current_first_good; |
|
current_first_good = -1; |
|
last_good = cur_last_good; |
|
cur_last_good = -1; |
|
last_peak = cur_last_peak; |
|
cur_last_peak = 0; |
|
for ( i = first_good; i <= ref_size; i++ ) { |
|
if ( i > last_good ) { |
|
break; |
|
} |
|
if ( S->at(i).at(j) < 0 ) { |
|
continue; |
|
} |
|
score = S->at(i).at(j); |
|
if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) { |
|
continue; |
|
} |
|
if ( current_first_good == -1 ) { |
|
current_first_good = i ; |
|
} |
|
if ( ( i < ref_size ) && ( j < hyp_size ) ) { |
|
if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) { |
|
if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) { |
|
cost = match_cost + score; |
|
if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) { |
|
S->at(i+1).at(j+1) = cost; |
|
P->at(i+1).at(j+1) = 'A'; |
|
} |
|
if ( cost < current_best ) { |
|
current_best = cost; |
|
} |
|
if ( current_best == cost ) { |
|
cur_last_peak = i + 1; |
|
} |
|
} else { |
|
cost = substitute_cost + score; |
|
if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) { |
|
S->at(i+1).at(j+1) = cost; |
|
P->at(i+1).at(j+1) = 'S'; |
|
if ( cost < current_best ) { |
|
current_best = cost; |
|
} |
|
if ( current_best == cost ) { |
|
cur_last_peak = i + 1 ; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
cur_last_good = i + 1; |
|
if ( j < hyp_size ) { |
|
icost = score + insert_cost; |
|
if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) { |
|
S->at(i).at(j+1) = icost; |
|
P->at(i).at(j+1) = 'I'; |
|
if ( ( cur_last_peak < i ) && ( current_best == icost ) ) { |
|
cur_last_peak = i; |
|
} |
|
} |
|
} |
|
if ( i < ref_size ) { |
|
dcost = score + delete_cost; |
|
if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) { |
|
S->at(i+1).at(j) = dcost; |
|
P->at(i+1).at(j) = 'D'; |
|
if ( i >= last_good ) { |
|
last_good = i + 1 ; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
int tracelength = 0; |
|
i = ref.size(); |
|
j = hyp.size(); |
|
while ( ( i > 0 ) || ( j > 0 ) ) { |
|
tracelength++; |
|
if ( P->at(i).at(j) == 'A' ) { |
|
i--; |
|
j--; |
|
} else if ( P->at(i).at(j) == 'S' ) { |
|
i--; |
|
j--; |
|
} else if ( P->at(i).at(j) == 'D' ) { |
|
i--; |
|
} else if ( P->at(i).at(j) == 'I' ) { |
|
j--; |
|
} else { |
|
cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P->at(i).at(j) << endl; |
|
exit ( -1 ); |
|
} |
|
} |
|
vector<char> path ( tracelength ); |
|
i = ref.size(); |
|
j = hyp.size(); |
|
while ( ( i > 0 ) || ( j > 0 ) ) { |
|
path[--tracelength] = P->at(i).at(j); |
|
if ( P->at(i).at(j) == 'A' ) { |
|
i--; |
|
j--; |
|
} else if ( P->at(i).at(j) == 'S' ) { |
|
i--; |
|
j--; |
|
} else if ( P->at(i).at(j) == 'D' ) { |
|
i--; |
|
} else if ( P->at(i).at(j) == 'I' ) { |
|
j--; |
|
} |
|
} |
|
terAlignment to_return; |
|
to_return.numWords = ref_size; |
|
to_return.alignment = path; |
|
to_return.numEdits = S->at(ref_size).at(hyp_size); |
|
to_return.hyp = hyp; |
|
to_return.ref = ref; |
|
to_return.averageWords = ref_size; |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl; |
|
} |
|
return to_return; |
|
|
|
} |
|
void terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans, terAlignment* to_return ) |
|
{ |
|
double current_best = infinite; |
|
double last_best = infinite; |
|
int first_good = 0; |
|
int current_first_good = 0; |
|
int last_good = -1; |
|
int cur_last_good = 0; |
|
int last_peak = 0; |
|
int cur_last_peak = 0; |
|
int i=0; |
|
int j=0; |
|
int ref_size=0 ; |
|
ref_size=( int ) ref.size(); |
|
int hyp_size=0; |
|
hyp_size=( int ) hyp.size(); |
|
double cost, icost, dcost; |
|
double score; |
|
delete(S); |
|
delete(P); |
|
S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0)); |
|
P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0')); |
|
|
|
NBR_BS_APPELS++; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
S->at(0).at(0) = 0.0; |
|
for ( j = 0; j <= hyp_size; j++ ) { |
|
last_best = current_best; |
|
current_best = infinite; |
|
first_good = current_first_good; |
|
current_first_good = -1; |
|
last_good = cur_last_good; |
|
cur_last_good = -1; |
|
last_peak = cur_last_peak; |
|
cur_last_peak = 0; |
|
for ( i = first_good; i <= ref_size; i++ ) { |
|
if ( i > last_good ) { |
|
break; |
|
} |
|
if (S->at(i).at(j) < 0 ) { |
|
continue; |
|
} |
|
score = S->at(i).at(j); |
|
if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) { |
|
continue; |
|
} |
|
if ( current_first_good == -1 ) { |
|
current_first_good = i ; |
|
} |
|
if ( ( i < ref_size ) && ( j < hyp_size ) ) { |
|
if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) { |
|
if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) { |
|
cost = match_cost + score; |
|
if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) { |
|
S->at(i+1).at(j+1) = cost; |
|
P->at(i+1).at(j+1) = 'A'; |
|
} |
|
if ( cost < current_best ) { |
|
current_best = cost; |
|
} |
|
if ( current_best == cost ) { |
|
cur_last_peak = i + 1; |
|
} |
|
} else { |
|
cost = substitute_cost + score; |
|
if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) { |
|
S->at(i+1).at(j+1) = cost; |
|
P->at(i+1).at(j+1) = 'S'; |
|
if ( cost < current_best ) { |
|
current_best = cost; |
|
} |
|
if ( current_best == cost ) { |
|
cur_last_peak = i + 1 ; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
cur_last_good = i + 1; |
|
if ( j < hyp_size ) { |
|
icost = score + insert_cost; |
|
if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) { |
|
S->at(i).at(j+1) = icost; |
|
P->at(i).at(j+1) = 'I'; |
|
if ( ( cur_last_peak < i ) && ( current_best == icost ) ) { |
|
cur_last_peak = i; |
|
} |
|
} |
|
} |
|
if ( i < ref_size ) { |
|
dcost = score + delete_cost; |
|
if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) { |
|
S->at(i+1).at(j) = dcost; |
|
P->at(i+1).at(j) = 'D'; |
|
if ( i >= last_good ) { |
|
last_good = i + 1 ; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
int tracelength = 0; |
|
i = ref_size;; |
|
j = hyp_size; |
|
while ( ( i > 0 ) || ( j > 0 ) ) { |
|
tracelength++; |
|
if (P->at(i).at(j) == 'A' ) { |
|
i--; |
|
j--; |
|
} else if (P->at(i).at(j) == 'S' ) { |
|
i--; |
|
j--; |
|
} else if (P->at(i).at(j) == 'D' ) { |
|
i--; |
|
} else if (P->at(i).at(j) == 'I' ) { |
|
j--; |
|
} else { |
|
cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " <<P->at(i).at(j) << endl; |
|
exit ( -1 ); |
|
} |
|
} |
|
vector<char> path ( tracelength ); |
|
i = ref_size; |
|
j = hyp_size; |
|
while ( ( i > 0 ) || ( j > 0 ) ) { |
|
path[--tracelength] =P->at(i).at(j); |
|
if (P->at(i).at(j) == 'A' ) { |
|
i--; |
|
j--; |
|
} else if (P->at(i).at(j) == 'S' ) { |
|
i--; |
|
j--; |
|
} else if (P->at(i).at(j) == 'D' ) { |
|
i--; |
|
} else if (P->at(i).at(j) == 'I' ) { |
|
j--; |
|
} |
|
} |
|
|
|
to_return->numWords = ref_size; |
|
to_return->alignment = path; |
|
to_return->numEdits = S->at(ref_size).at(hyp_size); |
|
to_return->hyp = hyp; |
|
to_return->ref = ref; |
|
to_return->averageWords = ref_size; |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return->toString() << endl << "END DEBUG" << endl; |
|
} |
|
|
|
|
|
} |
|
|
|
|
|
terAlignment terCalc::TER ( vector<string>& hyp, vector<string>& ref ) |
|
{ |
|
hashMapInfos rloc = createConcordMots ( hyp, ref ); |
|
terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans ); |
|
vector<string> cur = hyp; |
|
cur_align.hyp = hyp; |
|
cur_align.ref = ref; |
|
cur_align.aftershift = hyp; |
|
double edits = 0; |
|
|
|
|
|
vector<terShift> * allshifts=new vector<terShift>(0); |
|
bestShiftStruct * returns=new bestShiftStruct(); |
|
|
|
|
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl; |
|
} |
|
while ( true ) { |
|
|
|
returns=findBestShift ( cur, hyp, ref, rloc, cur_align ); |
|
|
|
if ( returns->getEmpty()) { |
|
break; |
|
} |
|
terShift bestShift = (*(returns->m_best_shift)); |
|
cur_align = (*(returns->m_best_align)); |
|
edits += bestShift.cost; |
|
bestShift.alignment = cur_align.alignment; |
|
bestShift.aftershift = cur_align.aftershift; |
|
allshifts->push_back ( bestShift ); |
|
cur = cur_align.aftershift; |
|
delete(returns); |
|
} |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::TER : Final to return :" << endl << cur_align.toString() << endl << "END DEBUG" << endl; |
|
} |
|
terAlignment to_return; |
|
to_return = cur_align; |
|
to_return.allshifts = (*(allshifts)); |
|
to_return.numEdits += edits; |
|
NBR_SEGS_EVALUATED++; |
|
return to_return; |
|
} |
|
bestShiftStruct * terCalc::findBestShift ( vector<string>& cur, vector<string>& hyp, vector<string>& ref, hashMapInfos& rloc, terAlignment& med_align ) |
|
{ |
|
CALL_FIND_BSHIFT++; |
|
|
|
|
|
bool anygain = false; |
|
vector <bool> * herr = new vector<bool>(( int ) hyp.size() + 1 ); |
|
vector <bool> * rerr = new vector<bool>( ( int ) ref.size() + 1 ); |
|
vector <int> * ralign = new vector<int>( ( int ) ref.size() + 1 ); |
|
int l_i,i,j,s; |
|
for (i = 0 ; i< ( int ) hyp.size() + 1 ; i++) { |
|
herr->at(i)=false; |
|
} |
|
for (i = 0 ; i< ( int ) ref.size() + 1 ; i++) { |
|
rerr->at(i)=false; |
|
ralign->at(i)=-1; |
|
} |
|
calculateTerAlignment ( med_align, herr, rerr, ralign ); |
|
vector<vecTerShift> * poss_shifts = new vector< vector<terShift> >(0) ; |
|
terAlignment * cur_best_align = new terAlignment(); |
|
terShift * cur_best_shift = new terShift(); |
|
double cur_best_shift_cost = 0.0; |
|
vector<string> shiftarr; |
|
vector<vecInt> curHypSpans; |
|
terShift * curshift = new terShift(); |
|
alignmentStruct shiftReturns; |
|
terAlignment * curalign = new terAlignment() ; |
|
|
|
|
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl; |
|
cerr << "indices: "; |
|
for (l_i=0; l_i < ( int ) ref.size() ; l_i++) { |
|
cerr << l_i << "\t"; |
|
} |
|
cerr << endl; |
|
cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl; |
|
cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl; |
|
cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl; |
|
cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl; |
|
cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl; |
|
cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign ); |
|
double curerr = med_align.numEdits; |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "Possible Shifts:" << endl; |
|
for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) { |
|
for ( j = 0; j < ( int ) ( poss_shifts->at ( i ) ).size(); j++ ) { |
|
cerr << " [" << i << "] " << ( ( poss_shifts->at ( i ) ).at ( j ) ).toString() << endl; |
|
} |
|
} |
|
cerr << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
|
|
cur_best_align->set(med_align); |
|
for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) { |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "Considering shift of length " << i << " (" << ( poss_shifts->at ( i ) ).size() << ")" << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
|
|
double curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits ); |
|
double maxfix = ( 2 * ( 1 + i ) ); |
|
if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { |
|
break; |
|
} else { |
|
for ( s = 0; s < ( int ) ( poss_shifts->at ( i ) ).size(); s++ ) { |
|
curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits ); |
|
if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { |
|
break; |
|
} else { |
|
curshift->set(( poss_shifts->at ( i ) ).at ( s )); |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "cur : "<< join(" ",cur) << endl; |
|
cerr << "shift size : "<< i << endl; |
|
cerr << "shift number : "<< s << endl; |
|
cerr << "size of shift size : "<< ( int ) ( poss_shifts->at ( i ) ).size() << endl; |
|
cerr << "curshift : "<< curshift->toString() << endl; |
|
|
|
} |
|
|
|
shiftReturns.set(permuter ( cur, curshift )); |
|
shiftarr = shiftReturns.nwords; |
|
curHypSpans = shiftReturns.aftershift; |
|
|
|
if ( PRINT_DEBUG ) { |
|
cerr << "shiftarr : "<< join(" ",shiftarr) << endl; |
|
cerr << "curHypSpans size : "<< (int)curHypSpans.size() << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
|
|
minimizeDistanceEdition ( shiftarr, ref, curHypSpans, curalign ); |
|
|
|
|
|
curalign->hyp = hyp; |
|
curalign->ref = ref; |
|
curalign->aftershift = shiftarr; |
|
|
|
|
|
double gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost ); |
|
|
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "Gain for " << curshift->toString() << " is " << gain << ". (result: [" << curalign->join ( " ", shiftarr ) << "]" << endl; |
|
cerr << "Details of gains : gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost )"<<endl; |
|
cerr << "Details of gains : gain = ("<<cur_best_align->numEdits << "+" << cur_best_shift_cost << ") - (" << curalign->numEdits << "+" << curshift->cost << ")"<<endl; |
|
cerr << "" << curalign->toString() << "\n" << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
|
|
if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) { |
|
anygain = true; |
|
cur_best_shift->set(curshift); |
|
cur_best_shift_cost = curshift->cost; |
|
cur_best_align->set(curalign); |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "Tmp Choosing shift: " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
bestShiftStruct * to_return=new bestShiftStruct(); |
|
if ( anygain ) { |
|
to_return->setEmpty(false); |
|
if ( PRINT_DEBUG ) { |
|
cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; |
|
cerr << "Final shift chosen : " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
to_return->m_best_shift->set(cur_best_shift); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
to_return->m_best_align->set(cur_best_align); |
|
|
|
} else { |
|
to_return->setEmpty(true); |
|
} |
|
|
|
delete(poss_shifts); |
|
delete(cur_best_align); |
|
delete(cur_best_shift); |
|
delete(curshift); |
|
delete(curalign) ; |
|
return to_return; |
|
} |
|
|
|
void terCalc::calculateTerAlignment ( terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign ) |
|
{ |
|
int hpos = -1; |
|
int rpos = -1; |
|
CALL_TER_ALIGN++; |
|
|
|
if ( PRINT_DEBUG ) { |
|
|
|
cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) { |
|
char sym = align.alignment.at(i); |
|
if ( sym == 'A' ) { |
|
hpos++; |
|
rpos++; |
|
herr->at(hpos) = false; |
|
rerr->at(rpos) = false; |
|
ralign->at(rpos) = hpos; |
|
} else if ( sym == 'S' ) { |
|
hpos++; |
|
rpos++; |
|
herr->at(hpos) = true; |
|
rerr->at(rpos) = true; |
|
ralign->at(rpos) = hpos; |
|
} else if ( sym == 'I' ) { |
|
hpos++; |
|
herr->at(hpos) = true; |
|
} else if ( sym == 'D' ) { |
|
rpos++; |
|
rerr->at(rpos) = true; |
|
ralign->at(rpos) = hpos+1; |
|
} else { |
|
cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl; |
|
exit ( -1 ); |
|
} |
|
} |
|
} |
|
|
|
vector<vecTerShift> * terCalc::calculerPermutations ( vector< string >& hyp, vector< string >& ref, hashMapInfos& rloc, TERCPPNS_TERCpp::terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign ) |
|
{ |
|
vector<vecTerShift> * allshifts = new vector<vecTerShift>(0); |
|
|
|
CALL_CALC_PERMUT++; |
|
|
|
if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) { |
|
return allshifts; |
|
} |
|
allshifts = new vector<vecTerShift>( TAILLE_PERMUT_MAX + 1 ); |
|
int start=0; |
|
int end=0; |
|
bool ok = false; |
|
vector<int> mtiVec(0); |
|
vector<int>::iterator mti; |
|
int moveto=0; |
|
vector<string> cand(0); |
|
bool any_herr = false; |
|
bool any_rerr = false; |
|
int i=0; |
|
int l_nbr_permuts=0; |
|
|
|
vector<int> movetoitVec(0); |
|
string subVectorHypString=""; |
|
terShift * topush; |
|
for ( start = 0; start < ( int ) hyp.size(); start++ ) { |
|
subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) ); |
|
if ( ! rloc.trouve ( subVectorHypString ) ) { |
|
continue; |
|
} |
|
|
|
ok = false; |
|
mtiVec = rloc.getValue ( subVectorHypString ); |
|
mti = mtiVec.begin(); |
|
while ( mti != mtiVec.end() && ( ! ok ) ) { |
|
moveto = ( *mti ); |
|
mti++; |
|
if ( ( start != ralign->at(moveto) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) - 1 ) <= DIST_MAX_PERMUT ) ) { |
|
ok = true; |
|
} |
|
} |
|
if ( ! ok ) { |
|
continue; |
|
} |
|
ok = true; |
|
for ( end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) { |
|
|
|
cand = subVector ( hyp, start, end + 1 ); |
|
ok = false; |
|
if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) { |
|
continue; |
|
} |
|
|
|
any_herr = false; |
|
|
|
for ( i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) { |
|
if ( herr->at(start+i) ) { |
|
any_herr = true; |
|
} |
|
} |
|
if ( any_herr == false ) { |
|
ok = true; |
|
continue; |
|
} |
|
|
|
movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) ); |
|
|
|
vector<int>::iterator movetoit; |
|
movetoit = movetoitVec.begin(); |
|
while ( movetoit != movetoitVec.end() ) { |
|
moveto = ( *movetoit ); |
|
movetoit++; |
|
if ( ! ( ( ralign->at(moveto) != start ) && ( ( ralign->at(moveto) < start ) || ( ralign->at(moveto) > end ) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) ) <= DIST_MAX_PERMUT ) ) ) { |
|
continue; |
|
} |
|
ok = true; |
|
|
|
|
|
|
|
|
|
|
|
any_rerr = false; |
|
for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) { |
|
if ( rerr->at(moveto+i) ) { |
|
any_rerr = true; |
|
} |
|
} |
|
if ( ! any_rerr ) { |
|
continue; |
|
} |
|
for ( int roff = -1; roff <= ( end - start ); roff++ ) { |
|
topush = new terShift(); |
|
bool topushNull = true; |
|
if ( ( roff == -1 ) && ( moveto == 0 ) ) { |
|
if ( PRINT_DEBUG ) { |
|
|
|
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl; |
|
} |
|
|
|
|
|
topush->start=start; |
|
topush->end=end; |
|
topush->moveto=-1; |
|
topush->newloc=-1; |
|
topushNull = false; |
|
} else if ( ( start != ralign->at(moveto+roff) ) && ( ( roff == 0 ) || ( ralign->at(moveto+roff) != ralign->at(moveto) ) ) ) { |
|
int newloc = ralign->at(moveto+roff); |
|
if ( PRINT_DEBUG ) { |
|
|
|
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl; |
|
} |
|
|
|
|
|
topush->start=start; |
|
topush->end=end; |
|
topush->moveto=moveto + roff; |
|
topush->newloc=newloc; |
|
topushNull = false; |
|
} |
|
if ( !topushNull ) { |
|
topush->shifted = cand; |
|
topush->cost = shift_cost; |
|
l_nbr_permuts++; |
|
if ( PRINT_DEBUG ) { |
|
|
|
cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl; |
|
cerr << "start : " << start << endl; |
|
cerr << "end : " << end << endl; |
|
cerr << "end - start : " << end - start << endl; |
|
cerr << "nbr Permutations added: " << l_nbr_permuts << endl; |
|
cerr << "END DEBUG " << endl; |
|
} |
|
if (l_nbr_permuts < NBR_PERMUT_MAX + 1) { |
|
( allshifts->at ( end - start ) ).push_back ( (*(topush)) ); |
|
} |
|
|
|
|
|
|
|
|
|
} |
|
delete(topush); |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
return allshifts; |
|
} |
|
|
|
|
|
alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift& s ) |
|
{ |
|
return permuter ( words, s.start, s.end, s.newloc ); |
|
} |
|
alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift* s ) |
|
{ |
|
return permuter ( words, s->start, s->end, s->newloc ); |
|
} |
|
|
|
|
|
alignmentStruct terCalc::permuter ( vector< string >& words, int start, int end, int newloc ) |
|
{ |
|
int c = 0; |
|
vector<string> nwords ( words ); |
|
vector<vecInt> spans ( ( int ) hypSpans.size() ); |
|
alignmentStruct to_return; |
|
if ( PRINT_DEBUG ) { |
|
|
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ; |
|
} else { |
|
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ; |
|
} |
|
cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl; |
|
} |
|
if (newloc >= ( int ) words.size()) { |
|
if ( PRINT_DEBUG ) { |
|
cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl; |
|
} |
|
newloc = ( int ) words.size()-1; |
|
} |
|
|
|
|
|
|
|
if ( newloc == -1 ) { |
|
for ( int i = start; i <= end; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = 0; i <= start - 1; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
} else { |
|
if ( newloc < start ) { |
|
|
|
for ( int i = 0; i < newloc; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = start; i <= end; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = newloc ; i < start ; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = end + 1; i < ( int ) words.size(); i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
} else { |
|
if ( newloc > end ) { |
|
for ( int i = 0; i <= start - 1; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = end + 1; i <= newloc; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = start; i <= end; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
} else { |
|
|
|
for ( int i = 0; i <= start - 1; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = start; i <= end; i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) { |
|
nwords.at ( c++ ) = words.at ( i ); |
|
if ( ( int ) hypSpans.size() > 0 ) { |
|
spans.at ( c - 1 ) = hypSpans.at ( i ); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
NBR_PERMUTS_CONSID++; |
|
|
|
if ( PRINT_DEBUG ) { |
|
cerr << "nwords" << join(" ",nwords) << endl; |
|
|
|
} |
|
|
|
to_return.nwords = nwords; |
|
to_return.aftershift = spans; |
|
return to_return; |
|
} |
|
void terCalc::setDebugMode ( bool b ) |
|
{ |
|
PRINT_DEBUG = b; |
|
} |
|
|
|
} |
|
|