File size: 4,381 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
 * Sentence.cpp
 *
 *  Created on: 14 Dec 2015
 *      Author: hieu
 */

#include "Sentence.h"
#include "../System.h"

using namespace std;

namespace Moses2
{
namespace SCFG
{
/**
 * Build an SCFG input sentence from one line of text.
 *
 * When the xml_policy input option is set, the line is handed to
 * CreateFromStringXML(); otherwise it is tokenized and turned into words
 * directly.
 *
 * @param pool           memory pool the sentence is allocated from
 * @param vocab          factor collection used when creating the words
 * @param system         supplies the input options
 * @param str            raw input line
 * @param translationId  not used by this implementation
 * @return the newly created sentence
 */
Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab,
                                     const System &system, const std::string &str, long translationId)
{
  // Input with XML markup has its own construction path.
  if (system.options.input.xml_policy) {
    return CreateFromStringXML(pool, vocab, system, str);
  }

  // Plain text: tokenize, then reserve two slots more than the token count.
  // NOTE(review): the two extra slots and the trailing `true` presumably
  // account for sentence boundary markers - confirm in PhraseImplTemplate.
  std::vector<std::string> words = Tokenize(str);
  size_t numSlots = words.size() + 2;

  Sentence *sentence = new (pool.Allocate<SCFG::Sentence>()) Sentence(pool, numSlots);
  sentence->PhraseImplTemplate<SCFG::Word>::CreateFromString(vocab, system, words, true);

  return sentence;
}

/**
 * Build an SCFG input sentence from a line that may carry inline XML markup.
 *
 * The line is wrapped in a synthetic <xml> root element, parsed with pugixml
 * and flattened by XMLParse() into a token list plus one XMLOption per
 * element. "ne" elements set the placeholder factor of the single word they
 * cover; every other element is registered through AddXMLOption().
 *
 * @param pool    memory pool the sentence and the XML options are allocated from
 * @param vocab   factor collection used for the words and placeholder factors
 * @param system  supplies the input options (placeholder factor)
 * @param str     raw input line
 * @return the newly created sentence
 *
 * Raises through UTIL_THROW_IF2 when the markup cannot be parsed, when an
 * "ne" element is present but no placeholder factor is configured, or when
 * an "ne" element does not cover exactly one word.
 */
Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
                                        const System &system, const std::string &str)
{
  Sentence *ret;

  vector<XMLOption*> xmlOptions;
  pugi::xml_document doc;

  // A single root element lets plain text and sibling tags parse as one document.
  string str2 = "<xml>" + str + "</xml>";
  pugi::xml_parse_result result = doc.load(str2.c_str(),
                                  pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);

  // On a parse error pugixml keeps only the part of the tree it managed to
  // build, so carrying on would silently translate a truncated sentence.
  UTIL_THROW_IF2(!result,
                 "Unable to parse XML in input: " << result.description() << ": " << str);

  pugi::xml_node topNode = doc.child("xml");

  std::vector<std::string> toks;
  XMLParse(pool, system, 0, topNode, toks, xmlOptions);

  // debug
  /*
  cerr << "xmloptions:" << endl;
  for (size_t i = 0; i < xmlOptions.size(); ++i) {
    cerr << xmlOptions[i]->Debug(system) << endl;
  }
  */

  // create words
  // NOTE(review): the two extra slots and the trailing `true` presumably
  // account for sentence boundary markers - confirm in PhraseImplTemplate.
  size_t size = toks.size() + 2;
  ret = new (pool.Allocate<Sentence>()) Sentence(pool, size);
  ret->PhraseImplTemplate<SCFG::Word>::CreateFromString(vocab, system, toks, true);

  // xml
  for(size_t i=0; i<xmlOptions.size(); i++) {
    const XMLOption *xmlOption = xmlOptions[i];
    if (strcmp(xmlOption->GetNodeName(), "ne") == 0) {
      // placeholder: attach the entity as an extra factor on the covered word
      FactorType placeholderFactor = system.options.input.placeholder_factor;
      UTIL_THROW_IF2(placeholderFactor == NOT_FOUND,
                     "Placeholder XML in input. Must have argument -placeholder-factor [NUM]");
      UTIL_THROW_IF2(xmlOption->phraseSize != 1,
                     "Placeholder must only cover 1 word");

      const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false);
      // startPos indexes into toks; the sentence position is one further on.
      (*ret)[xmlOption->startPos + 1][placeholderFactor] = factor;
    } else {
      // default - forced translation. Add to class variable
      ret->AddXMLOption(system, xmlOption);
    }
  }

  //cerr << "ret=" << ret->Debug(system) << endl;
  return ret;
}

/**
 * Walk the children of parentNode depth-first, collecting tokens and XML options.
 *
 * A child with a non-empty value has that value tokenized and appended to
 * toks. A child with a non-empty name yields an XMLOption that records the
 * token position where it starts, its "translation", "entity" and "prob"
 * attributes when present, and the number of tokens appended while its
 * subtree was processed.
 *
 * @param pool        memory pool the XMLOption objects are allocated from
 * @param system      forwarded unchanged to the recursive calls
 * @param depth       nesting level, incremented on each recursive call
 * @param parentNode  node whose children are processed
 * @param toks        output: tokens in document order
 * @param xmlOptions  output: options, each added before those of its descendants
 */
void Sentence::XMLParse(
  MemPool &pool,
  const System &system,
  size_t depth,
  const pugi::xml_node &parentNode,
  std::vector<std::string> &toks,
  vector<XMLOption*> &xmlOptions)
{
  for (pugi::xml_node node = parentNode.first_child(); node; node = node.next_sibling()) {
    // Index of the first token this node (and its subtree) will contribute.
    int startPos = toks.size();

    // Any value carried by the node goes straight into the token list.
    string text = node.value();
    if (!text.empty()) {
      std::vector<std::string> pieces = Tokenize(text);
      toks.insert(toks.end(), pieces.begin(), pieces.end());
    }

    // Only named nodes produce an XML option.
    string tag = node.name();
    if (tag.empty()) {
      continue;
    }

    XMLOption *option = new (pool.Allocate<XMLOption>()) XMLOption(pool, tag, startPos);

    pugi::xml_attribute translationAttr = node.attribute("translation");
    if (!translationAttr.empty()) {
      option->SetTranslation(pool, translationAttr.as_string());
    }

    pugi::xml_attribute entityAttr = node.attribute("entity");
    if (!entityAttr.empty()) {
      option->SetEntity(pool, entityAttr.as_string());
    }

    pugi::xml_attribute probAttr = node.attribute("prob");
    if (!probAttr.empty()) {
      option->prob = probAttr.as_float();
    }

    // Register the option before descending, so it precedes its descendants.
    xmlOptions.push_back(option);

    // Descend into the subtree; nested elements append their tokens to toks.
    XMLParse(pool, system, depth + 1, node, toks, xmlOptions);

    // The option spans everything appended since startPos was taken.
    option->phraseSize = toks.size() - startPos;
  }
}

}
} /* namespace Moses2 */