// mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp
/*
 Moses - statistical machine translation system
 Copyright (C) 2005-2015 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "ExpectedBleuOptimizer.h"

#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
namespace ExpectedBleuTraining
{
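// Accumulates the gradient of expected sentence-level BLEU (xBLEU) over the
// n-best list of one training instance:
//
//   xBLEU = sum_i p_i * sBleu_i,   p_i = score_i / sum_j score_j
//
// where the overall model scores are assumed to arrive already untransformed
// (i.e. exponentiated) from the caller. The partial derivative of xBLEU with
// respect to the scaling factor of sparse feature 'name' is then
//
//   sum_i p_i * N_i(name) * (sBleu_i - xBLEU)
//
// with N_i(name) the value of that feature in hypothesis i; this is what the
// loop below adds to m_gradient. Typical use, as suggested by this interface:
// call InitSGD()/InitRPROP() once, AddTrainingInstance() for each input
// sentence's n-best list, then UpdateSGD()/UpdateRPROP() at the end of a batch.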
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
                                                const std::vector<float>& sBleu,
                                                const std::vector<double>& overallScoreUntransformed,
                                                const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
                                                bool maintainUpdateSet)
{
  // compute xBLEU
  double sumUntransformedScores = 0.0;
  for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
       overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
  {
    sumUntransformedScores += *overallScoreUntransformedIt;
  }

  double xBleu = 0.0;
  assert(nBestSizeCount == overallScoreUntransformed.size());
  std::vector<double> p;
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    if (sumUntransformedScores != 0) {
      p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
    } else {
      p.push_back( 0 );
    }
    xBleu += p.back() * sBleu[ i ];
  }
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    double D = sBleu[ i ] - xBleu;
    for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
         sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
    {
      const size_t name = sparseScoreIt->first;
      float N = sparseScoreIt->second;
      if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
      {
        m_err << "Error: encountered subnormal value: p[i] * N * D = " << p[i] * N * D
              << " with p[i] = " << p[i] << " N = " << N << " D = " << D << '\n';
        m_err.flush();
        exit(1);
      } else {
        m_gradient[name] += p[i] * N * D;
        if ( maintainUpdateSet )
        {
          m_updateSet.insert(name);
        }
      }
    }
  }
  m_xBleu += xBleu;
}
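// Prepares the SGD state. Snapshotting the initial scaling factors into
// m_previousSparseScalingFactor mirrors InitRPROP() below; within this file,
// plain SGD never reads the snapshot back.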
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  // the destination must be sized before copying, and memcpy counts bytes,
  // not elements
  m_previousSparseScalingFactor.resize(nFeatures);
  memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
  m_gradient.resize(nFeatures);
}
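// Applies one SGD step per feature and returns the batch-averaged xBLEU.
// With useUpdateSet, only features touched since the last update (tracked in
// m_updateSet by AddTrainingInstance) are updated, which is cheaper when the
// sparse feature space is large.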
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
                                       size_t batchSize,
                                       bool useUpdateSet)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors
  if (useUpdateSet) {
    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
    {
      size_t name = *it;
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }
    m_updateSet.clear();
  } else {
    for (size_t name=0; name<sparseScalingFactor.size(); ++name)
    {
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }
  }

  m_xBleu = 0;
  // reset to zero rather than clear(): indexing an emptied vector in the next
  // call to AddTrainingInstance() would be out of bounds
  m_gradient.assign(m_gradient.size(), 0.0f);

  return xBleu;
}
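// The regularized branch rescales the accumulated gradient by 1/m_xBleu
// (consistent with ascending log xBLEU) and subtracts the derivative of an
// L2 penalty m_regularizationParameter * lambda^2, i.e. 2 * r * lambda; the
// unregularized branch simply averages the gradient over the batch.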
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
                                                         std::vector<float>& sparseScalingFactor,
                                                         size_t batchSize)
{
  // regularization
  if ( m_regularizationParameter != 0 )
  {
    m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
  } else {
    // normalize by dividing by batchSize
    m_gradient[name] /= batchSize;
  }

  // the actual update
  sparseScalingFactor[name] += m_learningRate * m_gradient[name];

  // discard scaling factors below a threshold
  if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
  {
    sparseScalingFactor[name] = 0;
  }
}
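// Allocates the per-feature state RPROP needs: the previous scaling factors
// (for backtracking after a sign change), the previous gradient (for the sign
// comparison), and an individual step size per feature, starting at
// m_initialStepSize.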
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor.resize(nFeatures);
  // memcpy counts bytes, not elements
  memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures * sizeof(float));
  m_previousGradient.resize(nFeatures);
  m_gradient.resize(nFeatures);
  m_stepSize.resize(nFeatures, m_initialStepSize);
}
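// Resilient backpropagation (RPROP): only the sign of the gradient is used.
// Each feature keeps its own step size, which grows by m_increaseRate while
// the gradient sign persists, shrinks by m_decreaseRate on a sign flip, and
// is clamped to [m_minStepSize, m_maxStepSize]. On a sign flip the previous
// value of the scaling factor is restored (backtracking).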
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
                                         const size_t batchSize)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors
  for (size_t name=0; name<sparseScalingFactor.size(); ++name)
  {
    // m_gradient[name] holds the sum of gradients over the batch. All we need
    // is the sign, so there is no need to normalize by dividing by batchSize.

    // regularization
    if ( m_regularizationParameter != 0 )
    {
      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
    }

    // step size
    int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
    if (sign > 0) {
      m_stepSize[name] *= m_increaseRate;
    } else if (sign < 0) {
      m_stepSize[name] *= m_decreaseRate;
    }
    if (m_stepSize[name] < m_minStepSize) {
      m_stepSize[name] = m_minStepSize;
    }
    if (m_stepSize[name] > m_maxStepSize) {
      m_stepSize[name] = m_maxStepSize;
    }

    // the actual update
    m_previousGradient[name] = m_gradient[name];
    if (sign >= 0) {
      if (m_gradient[name] > 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] += m_stepSize[name];
      } else if (m_gradient[name] < 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] -= m_stepSize[name];
      }
    } else {
      // sign flip: backtrack to the previous value
      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
      // m_previousGradient[name] = 0;
    }

    // discard scaling factors below a threshold
    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
    {
      sparseScalingFactor[name] = 0;
    }
  }

  m_xBleu = 0;
  // reset to zero rather than clear(): see UpdateSGD()
  m_gradient.assign(m_gradient.size(), 0.0f);

  return xBleu;
}
}  // namespace ExpectedBleuTraining