File size: 2,129 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include "TreeFragmentTokenizer.h"

#include <cctype>

namespace Moses
{
namespace Syntax
{
namespace F2S
{

// Member-wise constructor for a single token.
//   t - token kind, stored in `type`
//   v - token text, stored in `value` (a StringPiece, copied as given)
//   p - position value, stored in `pos`; the tokenizer in this file passes
//       the character offset at which the token starts
TreeFragmentToken::TreeFragmentToken(TreeFragmentTokenType t,
                                     StringPiece v, std::size_t p)
  : type(t), value(v), pos(p)
{
}

// Default constructor: builds a tokenizer whose current token is EOS.
//
// operator== treats any two tokenizers whose current token is EOS as equal,
// so a default-constructed object can be compared against a tokenizer that
// has run off the end of its input.
//
// All members are explicitly initialised so that copying or incrementing a
// default-constructed tokenizer never reads an indeterminate iterator or
// position: with iter_ == end_ an increment simply produces EOS again.
// The -1 position converts to the largest std::size_t value and acts as a
// "no position" marker for the initial EOS token.
TreeFragmentTokenizer::TreeFragmentTokenizer()
  : str_()
  , value_(TreeFragmentToken_EOS, "", -1)
  , iter_()
  , end_()
  , pos_(0)
{
}

// Builds a tokenizer over `s` and immediately reads the first token, so the
// object is dereferenceable (or already at EOS) as soon as it is constructed.
//   s - the text to tokenize; only a StringPiece of it is kept, together
//       with a begin/end iterator pair and a running offset starting at 0.
TreeFragmentTokenizer::TreeFragmentTokenizer(const StringPiece &s)
  : str_(s),
    value_(TreeFragmentToken_EOS, "", static_cast<std::size_t>(-1)),
    iter_(s.begin()),
    end_(s.end()),
    pos_(0)
{
  // Replace the placeholder EOS token with the first real token, if any.
  operator++();
}

// Pre-increment: advances to the next token and returns *this.
//
// Spaces and tabs before the token are skipped.  The new current token is:
//   - EOS  (empty text) if the input is exhausted,
//   - LSB  for a single '[',
//   - RSB  for a single ']',
//   - WORD for a maximal run of characters containing no space, tab, '['
//          or ']'.
// The token's position is the offset of its first character (for EOS, the
// offset at which the input ended).
TreeFragmentTokenizer &TreeFragmentTokenizer::operator++()
{
  // Skip blanks in front of the next token.
  for (; iter_ != end_; ++iter_, ++pos_) {
    const char blank = *iter_;
    if (blank != ' ' && blank != '\t') {
      break;
    }
  }

  // Nothing left to read.
  if (iter_ == end_) {
    value_ = TreeFragmentToken(TreeFragmentToken_EOS, "", pos_);
    return *this;
  }

  // Every non-EOS token consumes at least its first character.
  const std::size_t tokenStart = pos_;
  const char first = *iter_;
  ++iter_;
  ++pos_;

  switch (first) {
  case '[':
    value_ = TreeFragmentToken(TreeFragmentToken_LSB, "[", tokenStart);
    break;
  case ']':
    value_ = TreeFragmentToken(TreeFragmentToken_RSB, "]", tokenStart);
    break;
  default:
    // Extend the word up to the next blank, bracket or end of input.
    while (iter_ != end_) {
      const char c = *iter_;
      if (c == ' ' || c == '\t' || c == '[' || c == ']') {
        break;
      }
      ++iter_;
      ++pos_;
    }
    value_ = TreeFragmentToken(TreeFragmentToken_WORD,
                               str_.substr(tokenStart, pos_ - tokenStart),
                               tokenStart);
    break;
  }

  return *this;
}

// Post-increment: advances to the next token and returns a copy of the
// tokenizer as it was before advancing.
TreeFragmentTokenizer TreeFragmentTokenizer::operator++(int)
{
  TreeFragmentTokenizer snapshot(*this);
  operator++();
  return snapshot;
}

// Equality of two tokenizers.
//
// If either side's current token is EOS, the two are equal exactly when
// both are at EOS; the underlying iterators are not consulted in that case.
// Otherwise they are equal when their input iterators are equal.
bool operator==(const TreeFragmentTokenizer &lhs,
                const TreeFragmentTokenizer &rhs)
{
  const bool lhsAtEnd = (lhs.value_.type == TreeFragmentToken_EOS);
  const bool rhsAtEnd = (rhs.value_.type == TreeFragmentToken_EOS);

  if (lhsAtEnd || rhsAtEnd) {
    return lhsAtEnd && rhsAtEnd;
  }
  return lhs.iter_ == rhs.iter_;
}

// Inequality of two tokenizers: the exact negation of operator==.
bool operator!=(const TreeFragmentTokenizer &lhs,
                const TreeFragmentTokenizer &rhs)
{
  const bool same = (lhs == rhs);
  return !same;
}

}  // namespace F2S
}  // namespace Syntax
}  // namespace Moses