|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cassert> |
|
#include <vector> |
|
#include <string> |
|
#include <set> |
|
#include <iostream> |
|
#include <cstdlib> |
|
#include <sstream> |
|
|
|
#include "SyntaxNodeCollection.h" |
|
#include "XmlException.h" |
|
|
|
using namespace std; |
|
|
|
namespace MosesTraining |
|
{ |
|
|
|
/**
 * Split a string into tokens separated by any of the given delimiter characters.
 *
 * Runs of consecutive delimiters count as a single separator, so the result
 * never contains an empty token.
 *
 * @param str         text to split
 * @param delimiters  set of separator characters (default: space and tab)
 * @return the tokens in order of appearance; empty when str holds no
 *         non-delimiter character
 */
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> result;

  // Start of the first token: skip any leading delimiters.
  std::string::size_type tokenBegin = str.find_first_not_of(delimiters, 0);

  while (tokenBegin != std::string::npos) {
    // Position of the delimiter that ends this token, or npos if the token
    // runs to the end of the string.
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenBegin);

    // substr clamps the length, so an npos end simply takes the remainder.
    result.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));

    if (tokenEnd == std::string::npos) {
      break;
    }
    // Skip the delimiter run to reach the next token (npos if none is left).
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }

  return result;
}
|
|
|
/**
 * Return a copy of str with leading and trailing characters from dropChars removed.
 *
 * @param str        text to trim
 * @param dropChars  set of characters to strip from both ends
 *                   (default: space, tab, newline, carriage return)
 * @return the trimmed copy; empty when str consists only of dropChars
 */
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    // Nothing but droppable characters (or an empty string).
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
|
|
|
/**
 * Extract the value of one attribute from the text of an XML tag.
 *
 * The attribute is located by searching for `attributeName="`; the value runs
 * up to the next double quote that is not preceded by a backslash. Escaped
 * quotes are kept in the returned value exactly as written (backslash included).
 *
 * @param tag            tag text to search, e.g. `label="NP" span="0-1"`
 * @param attributeName  name of the attribute to look up
 * @return the attribute value; an empty string when the attribute is absent,
 *         when its value is empty, or when the closing quote is missing (in
 *         which case a diagnostic is also written to stderr)
 */
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  const std::string tagOpen = attributeName + "=\"";
  std::size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();

  // Search from the first character of the value itself, so that an empty
  // value (attr="") finds its own closing quote rather than skipping past it
  // into the following attribute.
  std::size_t contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: "<< tag;
    return "";
  }

  // A quote preceded by a backslash is escaped: keep looking for the real end.
  // contentsEnd-1 is always valid because tagOpen (at least `="`) precedes it.
  std::size_t possibleEnd;
  while (tag.at(contentsEnd-1) == '\\' &&
         (possibleEnd = tag.find('"', contentsEnd+1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }

  return tag.substr(contentsStart, contentsEnd-contentsStart);
}
|
|
|
|
|
|
|
void ParseXmlTagAttributes(const std::string &s, |
|
std::map<std::string, std::string> &attributes) |
|
{ |
|
std::size_t begin = 0; |
|
while (true) { |
|
std::size_t pos = s.find('=', begin); |
|
if (pos == std::string::npos) { |
|
return; |
|
} |
|
std::string name = Trim(s.substr(begin, pos-begin)); |
|
begin = s.find('"', pos+1); |
|
if (begin == std::string::npos) { |
|
throw XmlException("invalid tag content"); |
|
} |
|
pos = s.find('"', begin+1); |
|
if (pos == std::string::npos) { |
|
throw XmlException("invalid tag content"); |
|
} |
|
while (s[pos-1] == '\\') { |
|
pos = s.find('"', pos+1); |
|
if (pos == std::string::npos) { |
|
throw XmlException("invalid tag content"); |
|
} |
|
} |
|
if (name != "label" && name != "span") { |
|
attributes[name] = s.substr(begin+1, pos-begin-1); |
|
} |
|
begin = pos+1; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Remove the enclosing angle brackets from an XML tag.
 *
 * @param str  candidate tag text, e.g. `<np label="x">`
 * @return str without its first and last character when it starts with '<'
 *         and ends with '>'; otherwise str unchanged
 */
std::string TrimXml(const std::string& str)
{
  const std::string::size_type len = str.size();

  // Needs at least "<>" to have both brackets.
  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';

  return bracketed ? str.substr(1, len - 2) : str;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tell whether a token produced by the XML tokenizer is markup.
 *
 * @param tag  token to inspect
 * @return true when the token's first character is '<'; false otherwise,
 *         including for an empty token
 */
bool isXmlTag(const std::string& tag)
{
  if (tag.empty()) {
    return false;
  }
  return tag[0] == '<';
}
|
|
|
|
|
|
|
|
|
string unescape(const string& str) |
|
{ |
|
string s; |
|
s.reserve(str.size()); |
|
string::size_type n; |
|
string::size_type start = 0; |
|
while ((n = str.find('&', start)) != string::npos) { |
|
s += str.substr(start, n-start); |
|
string::size_type end = str.find(';', n); |
|
assert(n != string::npos); |
|
string name = str.substr(n+1, end-n-1); |
|
if (name == "lt") { |
|
s += string("<"); |
|
} else if (name == "gt") { |
|
s += string(">"); |
|
} else if (name == "#91") { |
|
s += string("["); |
|
} else if (name == "#93") { |
|
s += string("]"); |
|
} else if (name == "bra") { |
|
s += string("["); |
|
} else if (name == "ket") { |
|
s += string("]"); |
|
} else if (name == "bar" || name == "#124") { |
|
s += string("|"); |
|
} else if (name == "amp") { |
|
s += string("&"); |
|
} else if (name == "apos") { |
|
s += string("'"); |
|
} else if (name == "quot") { |
|
s += string("\""); |
|
} else { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::ostringstream msg; |
|
msg << "unsupported XML escape sequence: &" << name << ";"; |
|
throw XmlException(msg.str()); |
|
} |
|
if (end == str.size()-1) { |
|
return s; |
|
} |
|
start = end + 1; |
|
} |
|
s += str.substr(start); |
|
return s; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Split a line into alternating pieces of plain text and XML tags.
 *
 * A tag is everything from a '<' up to and including the next '>'. Text
 * between tags is returned verbatim (whitespace included); empty text pieces
 * are not emitted.
 *
 * @param str  line to split
 * @return the pieces in order of appearance. When a '<' has no matching '>',
 *         an error is written to stderr and only the pieces collected before
 *         that point are returned.
 */
std::vector<std::string> TokenizeXml(const std::string& str)
{
  std::vector<std::string> pieces;
  const std::string::size_type length = str.size();
  std::string::size_type cursor = 0;

  while (cursor != length) {
    const std::string::size_type open = str.find('<', cursor);
    if (open == std::string::npos) {
      // No more tags: the rest of the line is one text piece.
      pieces.push_back(str.substr(cursor));
      break;
    }

    const std::string::size_type close = str.find('>', open);
    if (close == std::string::npos) {
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return pieces;
    }

    // Text sitting between the previous position and this tag, if any.
    if (open > cursor) {
      pieces.push_back(str.substr(cursor, open - cursor));
    }

    // The tag itself, brackets included.
    pieces.push_back(str.substr(open, close - open + 1));
    cursor = close + 1;
  }

  return pieces;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection, |
|
set< string > &labelCollection, |
|
map< string, int > &topLabelCollection, |
|
bool unescapeSpecialChars ) |
|
{ |
|
|
|
|
|
|
|
if (line.find_first_of('<') == string::npos) { |
|
return true; |
|
} |
|
|
|
|
|
|
|
vector<string> xmlTokens = TokenizeXml(line); |
|
|
|
|
|
|
|
typedef pair< string, pair< size_t, string > > OpenedTag; |
|
vector< OpenedTag > tagStack; |
|
|
|
string cleanLine; |
|
size_t wordPos = 0; |
|
|
|
|
|
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { |
|
|
|
if(!isXmlTag(xmlTokens[xmlTokenPos])) { |
|
|
|
if (cleanLine.size()>0 && |
|
cleanLine[cleanLine.size() - 1] != ' ' && |
|
xmlTokens[xmlTokenPos][0] != ' ') { |
|
cleanLine += " "; |
|
} |
|
|
|
if (unescapeSpecialChars) { |
|
cleanLine += unescape(xmlTokens[xmlTokenPos]); |
|
} else { |
|
cleanLine += xmlTokens[xmlTokenPos]; |
|
} |
|
wordPos = Tokenize(cleanLine).size(); |
|
} |
|
|
|
|
|
else { |
|
|
|
|
|
|
|
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); |
|
|
|
|
|
if (tag.size() == 0) { |
|
cerr << "ERROR: empty tag name: " << line << endl; |
|
return false; |
|
} |
|
|
|
|
|
bool isUnary = ( tag[tag.size() - 1] == '/' ); |
|
|
|
|
|
bool isClosed = ( tag[0] == '/' ); |
|
bool isOpen = !isClosed; |
|
|
|
if (isClosed && isUnary) { |
|
cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl; |
|
return false; |
|
} |
|
|
|
if (isClosed) |
|
tag = tag.substr(1); |
|
if (isUnary) |
|
tag = tag.substr(0,tag.size()-1); |
|
|
|
|
|
string::size_type endOfName = tag.find_first_of(' '); |
|
string tagName = tag; |
|
string tagContent = ""; |
|
if (endOfName != string::npos) { |
|
tagName = tag.substr(0,endOfName); |
|
tagContent = tag.substr(endOfName+1); |
|
} |
|
|
|
|
|
|
|
if (isOpen || isUnary) { |
|
|
|
OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); |
|
tagStack.push_back( openedTag ); |
|
|
|
} |
|
|
|
|
|
|
|
if (isClosed || isUnary) { |
|
|
|
if (tagStack.size() == 0) { |
|
cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl; |
|
return false; |
|
} |
|
OpenedTag openedTag = tagStack.back(); |
|
tagStack.pop_back(); |
|
|
|
|
|
if (openedTag.first != tagName) { |
|
cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl; |
|
return false; |
|
} |
|
|
|
|
|
size_t startPos = openedTag.second.first; |
|
string tagContent = openedTag.second.second; |
|
size_t endPos = wordPos; |
|
|
|
|
|
string span = ParseXmlTagAttribute(tagContent,"span"); |
|
if (! span.empty()) { |
|
vector<string> ij = Tokenize(span, "-"); |
|
if (ij.size() != 1 && ij.size() != 2) { |
|
cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl; |
|
return false; |
|
} |
|
startPos = atoi(ij[0].c_str()); |
|
if (ij.size() == 1) endPos = startPos + 1; |
|
else endPos = atoi(ij[1].c_str()) + 1; |
|
} |
|
|
|
|
|
|
|
if (startPos > endPos) { |
|
cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl; |
|
return false; |
|
} else if (startPos == endPos) { |
|
cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl; |
|
continue; |
|
} |
|
|
|
string label = ParseXmlTagAttribute(tagContent,"label"); |
|
labelCollection.insert( label ); |
|
|
|
|
|
if (0) { |
|
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; |
|
cerr << "XML TAG LABEL IS: '" << label << "'" << endl; |
|
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; |
|
} |
|
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); |
|
ParseXmlTagAttributes(tagContent, node->attributes); |
|
} |
|
} |
|
} |
|
|
|
if (tagStack.size() > 0) { |
|
cerr << "ERROR: some opened tags were never closed: " << line << endl; |
|
return false; |
|
} |
|
|
|
|
|
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); |
|
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { |
|
SyntaxNode *n = *node; |
|
const string &label = n->label; |
|
if (topLabelCollection.find( label ) == topLabelCollection.end()) |
|
topLabelCollection[ label ] = 0; |
|
topLabelCollection[ label ]++; |
|
} |
|
|
|
|
|
line = cleanLine; |
|
return true; |
|
} |
|
|
|
} |
|
|