|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cstdlib> |
|
#include <map> |
|
#include <set> |
|
#include <cmath> |
|
#include <fstream> |
|
#include <iostream> |
|
#include <vector> |
|
#include <sstream> |
|
|
|
|
|
using namespace std; |
|
|
|
|
|
double initTransitionProb; |
|
double LAMBDA; |
|
|
|
double addLogProbs(double A , double B) |
|
{ |
|
|
|
if (A == B) |
|
return (A + log10(2.0)); |
|
|
|
if (A > B) { |
|
if (A - B > 6) |
|
return A; |
|
else |
|
return (A + log10(1+pow(10,(B-A)))); |
|
} |
|
|
|
else { |
|
if (B - A > 6) |
|
return B; |
|
else |
|
return (B + log10(1+pow(10,(A-B)))); |
|
} |
|
|
|
} |
|
|
|
|
|
class NodeStructure |
|
{ |
|
|
|
public: |
|
|
|
NodeStructure() {}; |
|
NodeStructure(vector <string> & s , vector <string> & t); |
|
double getPosterior() { |
|
return PPR; |
|
} |
|
void computeFwdBckProbs(map <string , double> & gammas, map <string, double> & alignmentCounts); |
|
void computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams); |
|
void print(); |
|
|
|
vector <string> source; |
|
vector <string> target; |
|
~NodeStructure() {}; |
|
|
|
private: |
|
|
|
double NTR; |
|
double PPR; |
|
double ALPHA; |
|
double BETA; |
|
|
|
void computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts); |
|
double computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents); |
|
double FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents); |
|
double BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & chidren); |
|
double computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children); |
|
void getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges); |
|
void getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges); |
|
double getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge); |
|
void updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta); |
|
void computePosteriorProb(); |
|
double scaleGamma(double g); |
|
void getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3); |
|
|
|
}; |
|
|
|
void NodeStructure :: print() |
|
{ |
|
|
|
for (int i = 0; i < source.size(); i++) |
|
cout<<source[i]; |
|
|
|
cout<<"\t"; |
|
|
|
for (int i = 0; i < target.size(); i++) |
|
cout<<target[i]; |
|
|
|
cout<<"\t"<<pow(10,PPR)<<endl; |
|
|
|
} |
|
|
|
NodeStructure :: NodeStructure(vector <string> & s , vector <string> & t) |
|
{ |
|
source = s; |
|
target = t; |
|
} |
|
|
|
|
|
void NodeStructure :: getEdge (pair <int , int> & v1 , pair <int , int> & v2 , pair <int , int> & v3) |
|
{ |
|
if (v2.first - v1.first == 0) |
|
v3.first = -1; |
|
else |
|
v3.first = v2.first; |
|
|
|
if (v2.second - v1.second == 0) |
|
v3.second = -1; |
|
else |
|
v3.second = v2.second; |
|
} |
|
|
|
void NodeStructure :: computeGammaForEdges(map < pair <int , int> , double > & parents, map < pair <int , int> , double > & children , map <string, double> & transitionProbs , map <string, double> & alignmentCounts) |
|
{ |
|
|
|
vector < pair < int , int> > incomingEdges; |
|
map < pair <int , int> , double > :: iterator cIter; |
|
map < pair <int , int> , double > :: iterator pIter; |
|
pair <int , int> ST = make_pair (-1,-1); |
|
pair <int , int> edge; |
|
|
|
children.erase(ST); |
|
double tProb; |
|
double alpha; |
|
double beta; |
|
|
|
for (cIter = children.begin(); cIter != children.end(); cIter++) { |
|
ST = cIter->first; |
|
|
|
getIncomingEdges (ST , incomingEdges); |
|
beta = cIter->second; |
|
|
|
for (int i = 0; i< incomingEdges.size(); i++) { |
|
pIter = parents.find(incomingEdges[i]); |
|
|
|
alpha = pIter->second; |
|
getEdge (incomingEdges[i] , ST , edge); |
|
|
|
updateAlignmentCount(transitionProbs, alignmentCounts , edge , alpha , beta); |
|
} |
|
} |
|
|
|
} |
|
|
|
void NodeStructure :: computeNonTransliterationProb (map <string , double> & sourceUnigrams , map <string , double> & targetUnigrams) |
|
{ |
|
|
|
NTR = 0.0; |
|
|
|
for (int i = 0; i < source.size(); i++) { |
|
NTR += sourceUnigrams[source[i]]; |
|
} |
|
|
|
for (int i = 0; i < target.size(); i++) { |
|
|
|
NTR += targetUnigrams[target[i]]; |
|
} |
|
|
|
} |
|
|
|
double NodeStructure :: scaleGamma(double g) |
|
{ |
|
double translit = log10 (1 - pow (10, PPR)); |
|
return g + translit; |
|
} |
|
void NodeStructure :: computePosteriorProb() |
|
{ |
|
double LAMBDA2 = log10(1 - pow(10, LAMBDA)); |
|
double transliterate = LAMBDA2 + ALPHA; |
|
double translate = LAMBDA + NTR; |
|
double trans = transliterate - translate; |
|
|
|
|
|
|
|
double prob = 1/(1+ pow(10 , trans)); |
|
PPR = log10(prob); |
|
|
|
|
|
} |
|
|
|
void NodeStructure :: computeFwdBckProbs(map <string , double> & gammas , map <string, double> & alignmentCounts) |
|
{ |
|
pair <int , int> START = make_pair (source.size()-1 , target.size()-1); |
|
pair <int , int> END = make_pair (-1 , -1); |
|
|
|
map < pair <int , int> , double > parents; |
|
parents[make_pair(-1,-1)] = 0.0; |
|
map < pair <int , int> , double > children; |
|
children[make_pair(source.size()-1,target.size()-1)] = 0.0; |
|
|
|
ALPHA = computeFwdProbs(START , gammas, parents); |
|
BETA = computeBckProbs(END , gammas, children); |
|
|
|
computePosteriorProb(); |
|
|
|
computeGammaForEdges(parents , children , gammas , alignmentCounts); |
|
|
|
} |
|
|
|
void NodeStructure :: getIncomingEdges (pair <int , int> & ST , vector < pair < int , int> > & incomingEdges) |
|
{ |
|
incomingEdges.clear(); |
|
|
|
if (ST.first == -1) { |
|
incomingEdges.push_back(make_pair(ST.first , ST.second-1)); |
|
} else if (ST.second == -1) { |
|
incomingEdges.push_back(make_pair(ST.first-1 , ST.second)); |
|
} else { |
|
incomingEdges.push_back(make_pair(ST.first , ST.second-1)); |
|
incomingEdges.push_back(make_pair(ST.first-1 , ST.second)); |
|
incomingEdges.push_back(make_pair(ST.first-1 , ST.second-1)); |
|
} |
|
|
|
} |
|
|
|
void NodeStructure :: getOutgoingEdges (pair <int , int> & ST , vector < pair < int , int> > & outgoingEdges) |
|
{ |
|
|
|
if (ST.first == source.size()-1) { |
|
outgoingEdges.push_back(make_pair(ST.first , ST.second+1)); |
|
} else if (ST.second == target.size()-1) { |
|
outgoingEdges.push_back(make_pair(ST.first+1 , ST.second)); |
|
} else { |
|
outgoingEdges.push_back(make_pair(ST.first , ST.second+1)); |
|
outgoingEdges.push_back(make_pair(ST.first+1 , ST.second)); |
|
outgoingEdges.push_back(make_pair(ST.first+1 , ST.second+1)); |
|
} |
|
|
|
} |
|
|
|
void NodeStructure :: updateAlignmentCount(map <string, double> & transitionProbs, map <string, double> & alignmentCounts , pair <int,int> & edge , double alpha , double beta) |
|
{ |
|
|
|
double tProb; |
|
double tgamma; |
|
double gamma; |
|
map <string , double> :: iterator aCounts; |
|
string query; |
|
|
|
if (edge.first == -1) |
|
query = "NULL"; |
|
else |
|
query = source[edge.first]; |
|
|
|
query += "-"; |
|
|
|
if (edge.second == -1) |
|
query += "NULL"; |
|
else |
|
query += target[edge.second]; |
|
|
|
|
|
if (transitionProbs.size() == 0) |
|
tProb = initTransitionProb; |
|
else |
|
tProb = transitionProbs[query]; |
|
|
|
|
|
tgamma = alpha + tProb + beta - ALPHA; |
|
gamma = scaleGamma(tgamma); |
|
|
|
|
|
|
|
aCounts = alignmentCounts.find(query); |
|
|
|
if (aCounts == alignmentCounts.end()) { |
|
alignmentCounts[query] = gamma; |
|
} else { |
|
double temp = aCounts->second; |
|
aCounts->second = addLogProbs(temp , gamma); |
|
} |
|
|
|
} |
|
|
|
double NodeStructure :: getTransitionProb(map <string, double> & transitionProbs , pair <int,int> & edge) |
|
{ |
|
|
|
if (transitionProbs.size() == 0) |
|
return initTransitionProb; |
|
|
|
string query; |
|
|
|
if (edge.first == -1) |
|
query = "NULL"; |
|
else |
|
query = source[edge.first]; |
|
|
|
query += "-"; |
|
|
|
if (edge.second == -1) |
|
query += "NULL"; |
|
else |
|
query += target[edge.second]; |
|
|
|
|
|
return transitionProbs[query]; |
|
} |
|
|
|
double NodeStructure :: FwdProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & parents) |
|
{ |
|
|
|
double thisAlpha; |
|
double alpha = -2000; |
|
vector < pair < int , int> > incomingEdges; |
|
pair <int , int> edge; |
|
|
|
|
|
getIncomingEdges (TS , incomingEdges); |
|
|
|
for (int k = 0; k < incomingEdges.size(); k++) { |
|
thisAlpha = parents[incomingEdges[k]]; |
|
getEdge (incomingEdges[k], TS , edge); |
|
thisAlpha += getTransitionProb(gammas , edge); |
|
double temp = alpha; |
|
alpha = addLogProbs(temp , thisAlpha); |
|
|
|
} |
|
|
|
return alpha; |
|
} |
|
|
|
double NodeStructure :: computeFwdProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & parents) |
|
{ |
|
|
|
pair <int , int> TS; |
|
double alpha; |
|
|
|
for (int i = 0; i < source.size(); i++) { |
|
TS = make_pair (i , -1); |
|
alpha = FwdProb (TS, gammas, parents); |
|
parents[TS] = alpha; |
|
} |
|
|
|
for (int i = 0; i < target.size(); i++) { |
|
TS = make_pair (-1 , i); |
|
alpha = FwdProb (TS, gammas, parents); |
|
parents[TS] = alpha; |
|
} |
|
|
|
for (int i = 0; i < source.size(); i++) { |
|
for (int j = 0; j < target.size(); j++) { |
|
TS = make_pair (i , j); |
|
alpha = FwdProb (TS, gammas, parents); |
|
parents[TS] = alpha; |
|
} |
|
} |
|
|
|
return parents[ST]; |
|
} |
|
|
|
double NodeStructure :: BckProb (pair <int , int> & TS, map <string , double> & gammas, map < pair <int , int> , double > & children) |
|
{ |
|
|
|
double thisBeta; |
|
double beta = -2000; |
|
vector < pair < int , int> > outgoingEdges; |
|
pair <int , int> edge; |
|
|
|
getOutgoingEdges (TS , outgoingEdges); |
|
|
|
for (int k = 0; k < outgoingEdges.size(); k++) { |
|
thisBeta = children[outgoingEdges[k]]; |
|
getEdge (TS , outgoingEdges[k], edge); |
|
thisBeta += getTransitionProb(gammas , edge); |
|
double temp = beta; |
|
beta = addLogProbs(temp , thisBeta); |
|
|
|
} |
|
|
|
return beta; |
|
} |
|
|
|
|
|
double NodeStructure :: computeBckProbs(pair <int , int> & ST, map <string , double> & gammas, map < pair <int , int> , double > & children) |
|
{ |
|
|
|
pair <int , int> TS; |
|
double beta; |
|
|
|
for (int i = source.size()-2; i >= -1; i--) { |
|
TS = make_pair (i , target.size()-1); |
|
beta = BckProb (TS, gammas, children); |
|
children[TS] = beta; |
|
} |
|
|
|
for (int i = target.size()-2; i >=-1; i--) { |
|
TS = make_pair (source.size()-1 , i); |
|
beta = BckProb (TS, gammas, children); |
|
children[TS] = beta; |
|
} |
|
|
|
for (int i = source.size()-2 ; i >= -1 ; i--) { |
|
for (int j = target.size()-2 ; j >= -1; j--) { |
|
TS = make_pair (i , j); |
|
beta = BckProb (TS, gammas, children); |
|
children[TS] = beta; |
|
} |
|
} |
|
|
|
return children[ST]; |
|
} |
|
|
|
|
|
|
|
void loadInput(const char * fileName, vector <string> & input) |
|
{ |
|
|
|
|
|
|
|
ifstream sr (fileName); |
|
string line; |
|
|
|
if(sr.is_open()) { |
|
while(getline(sr , line )) { |
|
input.push_back(line); |
|
} |
|
|
|
sr.close(); |
|
} else { |
|
cout<<"Unable to read "<<fileName<<endl; |
|
exit(1); |
|
} |
|
|
|
} |
|
|
|
void printGammas(map <string, double> & alignmentCounts) |
|
{ |
|
map <string , double> :: iterator aCounts; |
|
|
|
for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { |
|
cout<<aCounts->first<<" "<<aCounts->second<<endl; |
|
} |
|
} |
|
|
|
|
|
void getWords(string s, vector <string> & currInput) |
|
{ |
|
|
|
|
|
|
|
istringstream iss(s); |
|
currInput.clear(); |
|
do { |
|
string sub; |
|
iss >> sub; |
|
currInput.push_back(sub); |
|
|
|
} while (iss); |
|
|
|
currInput.pop_back(); |
|
} |
|
|
|
double getInitTransitionProb(int sourceToken, int targetToken) |
|
{ |
|
double prod = sourceToken * targetToken; |
|
return log10(1/prod); |
|
} |
|
|
|
void runIteration(map <int , NodeStructure> & graph , map <string , double> & gammas , int size) |
|
{ |
|
|
|
map <string, double> alignmentCounts; |
|
map <int , NodeStructure> :: iterator i; |
|
map <string , double> :: iterator aCounts; |
|
double sum = -2000.0; |
|
double tPPR = -2000.0; |
|
|
|
for (i = graph.begin(); i != graph.end(); i++) { |
|
|
|
i->second.computeFwdBckProbs(gammas , alignmentCounts); |
|
double temp = tPPR; |
|
|
|
tPPR = addLogProbs(graph[i->first].getPosterior() , temp); |
|
|
|
} |
|
|
|
for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { |
|
double temp = sum; |
|
sum = addLogProbs(aCounts->second, temp); |
|
} |
|
|
|
|
|
for (aCounts = alignmentCounts.begin(); aCounts != alignmentCounts.end(); aCounts++) { |
|
aCounts->second = aCounts->second - sum; |
|
} |
|
|
|
gammas.clear(); |
|
gammas = alignmentCounts; |
|
|
|
LAMBDA = tPPR - log10(size); |
|
} |
|
|
|
|
|
void setNTRProbabilities(map <int , NodeStructure> & graph , map <string , double> & sourceTypes , map <string , double > & targetTypes, double sourceTokens, double targetTokens) |
|
{ |
|
|
|
map <string , double> :: iterator i; |
|
map <int , NodeStructure> :: iterator j; |
|
|
|
|
|
for (i = sourceTypes.begin(); i!= sourceTypes.end(); i++) { |
|
i->second = log10(i->second/sourceTokens); |
|
} |
|
|
|
for (i = targetTypes.begin(); i!= targetTypes.end(); i++) { |
|
i->second = log10(i->second/targetTokens); |
|
} |
|
|
|
|
|
for (j = graph.begin(); j != graph.end(); j++) { |
|
j->second.computeNonTransliterationProb(sourceTypes , targetTypes); |
|
} |
|
|
|
} |
|
|
|
void printPosterior(map <int , NodeStructure> & graph) |
|
{ |
|
|
|
map <int , NodeStructure> :: iterator i; |
|
|
|
for (i = graph.begin(); i != graph.end(); i++) |
|
graph[i->first].print(); |
|
} |
|
|
|
|
|
int main(int argc, char * argv[]) |
|
{ |
|
|
|
vector <string> input; |
|
vector <string> source; |
|
vector <string> target; |
|
map <string , double> sourceTypes; |
|
map <string , double> targetTypes; |
|
set < vector <string> > tgt; |
|
set < vector <string> > src; |
|
double sourceTokens = 0; |
|
double targetTokens = 0; |
|
map <int , NodeStructure> graph; |
|
map <string , double> gammas; |
|
|
|
loadInput(argv[1],input); |
|
|
|
cerr<<"Constructing Graph "<<endl; |
|
|
|
for(int i=0; i<input.size(); i+=2) { |
|
|
|
|
|
|
|
|
|
|
|
getWords(input[i],source); |
|
getWords(input[i+1],target); |
|
|
|
if (src.find(source) == src.end()) { |
|
for (int j = 0; j< source.size(); j++) |
|
sourceTypes[source[j]]++; |
|
src.insert(source); |
|
sourceTokens += source.size(); |
|
} |
|
|
|
if (tgt.find(target) == tgt.end()) { |
|
for (int j = 0; j< target.size(); j++) |
|
targetTypes[target[j]]++; |
|
|
|
tgt.insert(target); |
|
targetTokens += target.size(); |
|
} |
|
|
|
NodeStructure obj (source,target); |
|
graph[i] = obj; |
|
|
|
} |
|
|
|
setNTRProbabilities(graph, sourceTypes, targetTypes, sourceTokens, targetTokens); |
|
initTransitionProb = getInitTransitionProb(sourceTypes.size()+1, targetTypes.size()+1); |
|
|
|
LAMBDA = log10(0.5); |
|
|
|
|
|
for (int i = 0; i< 10; i++) { |
|
|
|
cerr<<"Computing Probs : iteration "<<i+1<<endl; |
|
runIteration(graph , gammas , input.size()/2); |
|
|
|
} |
|
|
|
printPosterior(graph); |
|
cerr<<"Finished..."<<endl; |
|
|
|
return 0; |
|
} |
|
|
|
|