sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /phrase-extract /XmlTree.cpp

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

14.3 kB

	/***********************************************************************
	Moses - factored phrase-based language decoder
	Copyright (C) 2006 University of Edinburgh

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	***********************************************************************/

	#include <cassert>
	#include <vector>
	#include <string>
	#include <set>
	#include <iostream>
	#include <cstdlib>
	#include <sstream>

	#include "SyntaxNodeCollection.h"
	#include "XmlException.h"

	using namespace std;

	namespace MosesTraining
	{

	inline std::vector<std::string> Tokenize(const std::string& str,
	const std::string& delimiters = " \t")
	{
	std::vector<std::string> tokens;
	// Skip delimiters at beginning.
	std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
	// Find first "non-delimiter".
	std::string::size_type pos = str.find_first_of(delimiters, lastPos);

	while (std::string::npos != pos \|\| std::string::npos != lastPos) {
	// Found a token, add it to the vector.
	tokens.push_back(str.substr(lastPos, pos - lastPos));
	// Skip delimiters. Note the "not_of"
	lastPos = str.find_first_not_of(delimiters, pos);
	// Find next "non-delimiter"
	pos = str.find_first_of(delimiters, lastPos);
	}

	return tokens;
	}

	std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
	{
	std::string res = str;
	res.erase(str.find_last_not_of(dropChars)+1);
	return res.erase(0, res.find_first_not_of(dropChars));
	}

	string ParseXmlTagAttribute(const string& tag,const string& attributeName)
	{
	/TODO deal with unescaping \"/
	string tagOpen = attributeName + "=\"";
	size_t contentsStart = tag.find(tagOpen);
	if (contentsStart == string::npos) return "";
	contentsStart += tagOpen.size();
	size_t contentsEnd = tag.find_first_of('"',contentsStart+1);
	if (contentsEnd == string::npos) {
	cerr << "Malformed XML attribute: "<< tag;
	return "";
	}
	size_t possibleEnd;
	while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) {
	contentsEnd = possibleEnd;
	}
	return tag.substr(contentsStart,contentsEnd-contentsStart);
	}

	// s should be a sequence of name=attribute pairs separated by whitespace.
	// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
	void ParseXmlTagAttributes(const std::string &s,
	std::map<std::string, std::string> &attributes)
	{
	std::size_t begin = 0;
	while (true) {
	std::size_t pos = s.find('=', begin);
	if (pos == std::string::npos) {
	return;
	}
	std::string name = Trim(s.substr(begin, pos-begin));
	begin = s.find('"', pos+1);
	if (begin == std::string::npos) {
	throw XmlException("invalid tag content");
	}
	pos = s.find('"', begin+1);
	if (pos == std::string::npos) {
	throw XmlException("invalid tag content");
	}
	while (s[pos-1] == '\\') {
	pos = s.find('"', pos+1);
	if (pos == std::string::npos) {
	throw XmlException("invalid tag content");
	}
	}
	if (name != "label" && name != "span") {
	attributes[name] = s.substr(begin+1, pos-begin-1);
	}
	begin = pos+1;
	}
	}

	/**
	* Remove "<" and ">" from XML tag
	*
	* \param str xml token to be stripped
	*/
	string TrimXml(const string& str)
	{
	// too short to be xml token -> do nothing
	if (str.size() < 2) return str;

	// strip first and last character
	if (str[0] == '<' && str[str.size() - 1] == '>') {
	return str.substr(1, str.size() - 2);
	}
	// not an xml token -> do nothing
	else {
	return str;
	}
	}

	/**
	* Check if the token is an XML tag, i.e. starts with "<"
	*
	* \param tag token to be checked
	*/
	bool isXmlTag(const string& tag)
	{
	return tag[0] == '<';
	}

	/**
	* Unescape XML special characters.
	*/
	string unescape(const string& str)
	{
	string s;
	s.reserve(str.size());
	string::size_type n;
	string::size_type start = 0;
	while ((n = str.find('&', start)) != string::npos) {
	s += str.substr(start, n-start);
	string::size_type end = str.find(';', n);
	assert(n != string::npos);
	string name = str.substr(n+1, end-n-1);
	if (name == "lt") {
	s += string("<");
	} else if (name == "gt") {
	s += string(">");
	} else if (name == "#91") {
	s += string("[");
	} else if (name == "#93") {
	s += string("]");
	} else if (name == "bra") {
	s += string("[");
	} else if (name == "ket") {
	s += string("]");
	} else if (name == "bar" \|\| name == "#124") {
	s += string("\|");
	} else if (name == "amp") {
	s += string("&");
	} else if (name == "apos") {
	s += string("'");
	} else if (name == "quot") {
	s += string("\"");
	} else {
	// Currently only handles the following five XML escape sequences:
	// < <
	// > >
	// & &
	// ' '
	// " "
	// Numeric character references (like ö) are not supported.
	std::ostringstream msg;
	msg << "unsupported XML escape sequence: &" << name << ";";
	throw XmlException(msg.str());
	}
	if (end == str.size()-1) {
	return s;
	}
	start = end + 1;
	}
	s += str.substr(start);
	return s;
	}

	/**
	* Split up the input character string into tokens made up of
	* either XML tags or text.
	* example: this <b> is a </b> test .
	* => (this ), (<b>), ( is a ), (</b>), ( test .)
	*
	* \param str input string
	*/
	vector<string> TokenizeXml(const string& str)
	{
	string lbrack = "<";
	string rbrack = ">";
	vector<string> tokens; // vector of tokens to be returned
	string::size_type cpos = 0; // current position in string
	string::size_type lpos = 0; // left start of xml tag
	string::size_type rpos = 0; // right end of xml tag

	// walk thorugh the string (loop vver cpos)
	while (cpos != str.size()) {
	// find the next opening "<" of an xml tag
	lpos = str.find_first_of(lbrack, cpos);
	if (lpos != string::npos) {
	// find the end of the xml tag
	rpos = str.find_first_of(rbrack, lpos);
	// sanity check: there has to be closing ">"
	if (rpos == string::npos) {
	cerr << "ERROR: malformed XML: " << str << endl;
	return tokens;
	}
	} else { // no more tags found
	// add the rest as token
	tokens.push_back(str.substr(cpos));
	break;
	}

	// add stuff before xml tag as token, if there is any
	if (lpos - cpos > 0)
	tokens.push_back(str.substr(cpos, lpos - cpos));

	// add xml tag as token
	tokens.push_back(str.substr(lpos, rpos-lpos+1));
	cpos = rpos + 1;
	}
	return tokens;
	}

	/**
	* Process a sentence with XML-style annotation of syntactic nodes.
	*
	* \param line[in,out] in: sentence, out: sentence without the XML
	* \param nodeCollection[out] the collection of SyntaxNode objects for this
	* sentence
	* \param labelCollection[out] label values are inserted into this set
	* \param topLabelCollection[out] top labels (key) and their counts (value)
	* are inserted into this map
	* \param unescapeSpecialChars flag indicating whether XML special characters
	* should be unescaped
	*/
	bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
	set< string > &labelCollection,
	map< string, int > &topLabelCollection,
	bool unescapeSpecialChars )
	{
	//parse XML markup in translation line

	// no xml tag? we're done.
	if (line.find_first_of('<') == string::npos) {
	return true;
	}

	// break up input into a vector of xml tags and text
	// example: (this), (<b>), (is a), (</b>), (test .)
	vector<string> xmlTokens = TokenizeXml(line);

	// we need to store opened tags, until they are closed
	// tags are stored as tripled (tagname, startpos, contents)
	typedef pair< string, pair< size_t, string > > OpenedTag;
	vector< OpenedTag > tagStack; // stack that contains active opened tags

	string cleanLine; // return string (text without xml)
	size_t wordPos = 0; // position in sentence (in terms of number of words)

	// loop through the tokens
	for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
	// not a xml tag, but regular text (may contain many words)
	if(!isXmlTag(xmlTokens[xmlTokenPos])) {
	// add a space at boundary, if necessary
	if (cleanLine.size()>0 &&
	cleanLine[cleanLine.size() - 1] != ' ' &&
	xmlTokens[xmlTokenPos][0] != ' ') {
	cleanLine += " ";
	}
	// add words to output
	if (unescapeSpecialChars) {
	cleanLine += unescape(xmlTokens[xmlTokenPos]);
	} else {
	cleanLine += xmlTokens[xmlTokenPos];
	}
	wordPos = Tokenize(cleanLine).size(); // count all the words
	}

	// process xml tag
	else {
	// * get essential information about tag *

	// strip extra boundary spaces and "<" and ">"
	string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
	// cerr << "XML TAG IS: " << tag << std::endl;

	if (tag.size() == 0) {
	cerr << "ERROR: empty tag name: " << line << endl;
	return false;
	}

	// check if unary (e.g., "<wall/>")
	bool isUnary = ( tag[tag.size() - 1] == '/' );

	// check if opening tag (e.g. "<a>", not "</a>")g
	bool isClosed = ( tag[0] == '/' );
	bool isOpen = !isClosed;

	if (isClosed && isUnary) {
	cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
	return false;
	}

	if (isClosed)
	tag = tag.substr(1); // remove "/" at the beginning
	if (isUnary)
	tag = tag.substr(0,tag.size()-1); // remove "/" at the end

	// find the tag name and contents
	string::size_type endOfName = tag.find_first_of(' ');
	string tagName = tag;
	string tagContent = "";
	if (endOfName != string::npos) {
	tagName = tag.substr(0,endOfName);
	tagContent = tag.substr(endOfName+1);
	}

	// * process new tag *

	if (isOpen \|\| isUnary) {
	// put the tag on the tag stack
	OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
	tagStack.push_back( openedTag );
	// cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
	}

	// * process completed tag *

	if (isClosed \|\| isUnary) {
	// pop last opened tag from stack;
	if (tagStack.size() == 0) {
	cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
	return false;
	}
	OpenedTag openedTag = tagStack.back();
	tagStack.pop_back();

	// tag names have to match
	if (openedTag.first != tagName) {
	cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
	return false;
	}

	// assemble remaining information about tag
	size_t startPos = openedTag.second.first;
	string tagContent = openedTag.second.second;
	size_t endPos = wordPos;

	// span attribute overwrites position
	string span = ParseXmlTagAttribute(tagContent,"span");
	if (! span.empty()) {
	vector<string> ij = Tokenize(span, "-");
	if (ij.size() != 1 && ij.size() != 2) {
	cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
	return false;
	}
	startPos = atoi(ij[0].c_str());
	if (ij.size() == 1) endPos = startPos + 1;
	else endPos = atoi(ij[1].c_str()) + 1;
	}

	// cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;

	if (startPos > endPos) {
	cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
	return false;
	} else if (startPos == endPos) {
	cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
	continue;
	}

	string label = ParseXmlTagAttribute(tagContent,"label");
	labelCollection.insert( label );

	// report what we have processed so far
	if (0) {
	cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
	cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
	cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
	}
	SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
	ParseXmlTagAttributes(tagContent, node->attributes);
	}
	}
	}
	// we are done. check if there are tags that are still open
	if (tagStack.size() > 0) {
	cerr << "ERROR: some opened tags were never closed: " << line << endl;
	return false;
	}

	// collect top labels
	const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
	for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
	SyntaxNode n = node;
	const string &label = n->label;
	if (topLabelCollection.find( label ) == topLabelCollection.end())
	topLabelCollection[ label ] = 0;
	topLabelCollection[ label ]++;
	}

	// return de-xml'ed sentence in line
	line = cleanLine;
	return true;
	}

	}