import gzip
import os
import re
import sys

import numpy as np

from bleu import BleuScorer
from collections import OrderedDict

sys.path.append(os.path.dirname(__file__) + "/../python")
import moses.dictree as binpt


class DataFormatException(Exception):
    pass


class Hypothesis:
    def __init__(self, text, fv, segments=False):
        self.alignment = []
        self.tokens = []
        if not segments:
            self.text = text
        else:
            # Phrase segmentation is marked inline with |start-end| tokens;
            # store (src_start, src_end+1, tgt_end) triples for each phrase.
            align_re = re.compile(r"\|(\d+)-(\d+)\|")
            for token in text.split():
                match = align_re.match(token)
                if match:
                    self.alignment.append(
                        (int(match.group(1)), 1 + int(match.group(2)), len(self.tokens)))
                else:
                    self.tokens.append(token)
            self.text = " ".join(self.tokens)
            if not self.alignment:
                raise DataFormatException("Expected segmentation information not found in nbest")

        self.fv = np.array(fv)
        self.score = 0

    def __str__(self):
        return "{text=%s fv=%s score=%5.4f}" % (self.text, str(self.fv), self.score)


class NBestList:
    def __init__(self, id):
        self.id = id
        self.hyps = []


_feature_index = {}


def set_feature_start(name, index):
    indexes = _feature_index.get(name, [index, 0])
    indexes[0] = index
    _feature_index[name] = indexes


def set_feature_end(name, index):
    indexes = _feature_index.get(name, [0, index])
    indexes[1] = index
    _feature_index[name] = indexes


def get_feature_index(name):
    return _feature_index.get(name, [0, 0])
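
# A small sketch of how the index is filled in while get_nbests() (below)
# walks a score string (feature names and values here are hypothetical):
# parsing "d: 0.0 1.0 lm: -5.0" assigns "d" the feature-vector slots [0, 2)
# and "lm" the slots [2, 3), so afterwards:
#
#   >>> get_feature_index("d")
#   [0, 2]
#   >>> get_feature_index("lm")
#   [2, 3]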


def get_nbests(nbest_file, segments=False):
    """Iterate through nbest lists"""
    if nbest_file.endswith("gz"):
        fh = gzip.GzipFile(nbest_file)
    else:
        fh = open(nbest_file)
    lineno = 0
    nbest = None
    for line in fh:
        lineno += 1
        fields = line.split(" ||| ")
        if len(fields) != 4:
            raise DataFormatException("nbest(%d): %s" % (lineno, line))
        (id, text, scores, total) = fields
        if nbest and nbest.id != id:
            yield nbest
            nbest = None
        if not nbest:
            nbest = NBestList(id)
        fv = []
        score_name = None
        for score in scores.split():
            if score.endswith(":"):
                score = score[:-1]
                if score_name:
                    set_feature_end(score_name, len(fv))
                score_name = score
                set_feature_start(score_name, len(fv))
            else:
                fv.append(float(score))
        if score_name:
            set_feature_end(score_name, len(fv))
        # The text field carries a trailing space; strip it.
        hyp = Hypothesis(text[:-1], fv, segments)
        nbest.hyps.append(hyp)
    if nbest:
        yield nbest
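
# For reference, each nbest line is expected to carry four " ||| "-separated
# fields: sentence id, hypothesis text, named feature scores, and total
# score. An illustrative (not real) line:
#
#   0 ||| the cat sat ||| d: 0.0 lm: -5.2 tm: -1.0 -2.3 ||| -3.4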


def get_scores(score_data_file):
    """Iterate through the score data, returning a set of scores for each sentence"""
    scorer = BleuScorer()
    fh = open(score_data_file)
    score_vectors = None
    for line in fh:
        if line.startswith("SCORES_TXT_BEGIN"):
            score_vectors = []
        elif line.startswith("SCORES_TXT_END"):
            scores = [scorer.score(score_vector) for score_vector in score_vectors]
            yield scores
        else:
            score_vectors.append([float(i) for i in line[:-1].split()])


def get_scored_nbests(nbest_file, score_data_file, input_file, segments=False):
    score_gen = get_scores(score_data_file)
    input_gen = None
    if input_file:
        input_gen = open(input_file)
    try:
        for nbest in get_nbests(nbest_file, segments=segments):
            scores = next(score_gen)
            if len(scores) != len(nbest.hyps):
                raise DataFormatException("Length of nbest %s does not match score list (%d != %d)" %
                                          (nbest.id, len(nbest.hyps), len(scores)))
            input_line = None
            if input_gen:
                input_line = next(input_gen)[:-1]
            for hyp, score in zip(nbest.hyps, scores):
                hyp.score = score
                hyp.input_line = input_line
            yield nbest
    except StopIteration:
        raise DataFormatException("Score file shorter than nbest list file")


class PhraseCache:
    """An LRU cache for ttable lookups"""
    def __init__(self, max_size):
        self.max_size = max_size
        self.pairs_to_scores = OrderedDict()

    def get(self, source, target):
        key = (source, target)
        scores = self.pairs_to_scores.get(key, None)
        if scores is not None:
            # Re-insert the entry so it becomes the most recently used.
            del self.pairs_to_scores[key]
            self.pairs_to_scores[key] = scores
        return scores

    def add(self, source, target, scores):
        key = (source, target)
        self.pairs_to_scores[key] = scores
        while len(self.pairs_to_scores) > self.max_size:
            # Evict the least recently used entry.
            self.pairs_to_scores.popitem(last=False)
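
# A minimal usage sketch of the LRU behaviour (the phrase pairs and scores
# are hypothetical):
#
#   >>> cache = PhraseCache(2)
#   >>> cache.add("le chat", "the cat", [0.5])
#   >>> cache.add("le", "the", [0.9])
#   >>> cache.get("le chat", "the cat")  # refreshes this entry
#   [0.5]
#   >>> cache.add("chat", "cat", [0.7])  # evicts ("le", "the")
#   >>> print(cache.get("le", "the"))
#   None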


class CachedPhraseTable:
    def __init__(self, ttable_file, nscores=5, cache_size=20000):
        wa = False
        if binpt.PhraseDictionaryTree.canLoad(ttable_file, True):
            # The table includes word alignment information.
            wa = True
        self.ttable = binpt.PhraseDictionaryTree(ttable_file, nscores=nscores, wa=wa, tableLimit=0)
        self.cache = PhraseCache(cache_size)
        self.nscores = nscores

    def get_scores(self, phrase):
        source = " ".join(phrase[0])
        target_tuple = tuple(phrase[1])
        target = " ".join(target_tuple)
        scores = self.cache.get(source, target)
        if scores is None:
            # Cache miss: query the binary table, defaulting to zero scores.
            # The final score column is dropped.
            scores = [0] * (self.nscores - 1)
            entries = self.ttable.query(source, converter=None)
            for entry in entries:
                if entry.rhs == target_tuple:
                    scores = entry.scores[:-1]
                    break
            self.cache.add(source, target, scores)
        return scores


class MosesPhraseScorer:
    def __init__(self, ttable_files, cache_size=20000):
        self.ttables = []
        for ttable_file in ttable_files:
            self.ttables.append(CachedPhraseTable(ttable_file, cache_size=cache_size))

    def add_scores(self, hyp):
        """Add the phrase scores to a hypothesis"""
        # Reconstruct the (source phrase, target phrase) pairs from the
        # hypothesis alignment.
        phrases = []
        source_tokens = hyp.input_line.split()
        tgt_st = 0
        if not hyp.alignment:
            raise DataFormatException("Alignments missing from: " + str(hyp))
        for src_st, src_end, tgt_end in hyp.alignment:
            phrases.append((source_tokens[src_st:src_end], hyp.tokens[tgt_st:tgt_end]))
            tgt_st = tgt_end

        # Look up the phrase pair scores in each ttable.
        phrase_scores = []
        for ttable in self.ttables:
            phrase_scores.append([])
            for phrase in phrases:
                phrase_scores[-1].append(ttable.get_scores(phrase))

        # Floor the scores at exp(-100) so they stay safe for any later
        # log-space computation.
        floor = np.exp(-100)
        phrase_scores = np.clip(np.array(phrase_scores), floor, np.inf)
        hyp.phrase_scores = phrase_scores
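
# A worked example of the phrase reconstruction above (all values are
# hypothetical): with hyp.input_line "le chat dort", hyp.tokens
# ['the', 'cat', 'sleeps'] and hyp.alignment [(0, 2, 2), (2, 3, 3)], the
# extracted phrase pairs are
#
#   [(['le', 'chat'], ['the', 'cat']), (['dort'], ['sleeps'])]
#
# and hyp.phrase_scores has shape (n_ttables, n_phrases, nscores - 1).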