File size: 2,521 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
#include "ug_im_bitext.h"

namespace sapt
{
  using namespace tpt;
  // Build an extended copy of this bitext that additionally contains the
  // given sentence pairs and their word alignments.
  //
  // Parameters:
  //   s1, s2 - the new sentences for the two sides of the bitext, one
  //            string per sentence; they are handed to snt_adder together
  //            with V1/myT1/myI1 and V2/myT2/myI2 respectively.
  //   aln    - one alignment string per sentence pair, consisting of
  //            whitespace-separated "row-col" pairs of unsigned integers.
  //   All three vectors must have the same size (checked by assert).
  //
  // Returns: a new imBitext, copy-constructed from *this while holding
  //   m_lock, whose Tx/T1/T2/I1/I2 refer to the extended my* members.
  //
  // Throws: whatever UTIL_THROW_IF2 raises if the separator between the
  //   two numbers of an alignment point is not '-'. Both worker threads
  //   are joined before the exception leaves this function.
  //
  // NOTE(review): parsing of an alignment string stops silently at the
  // first token that cannot be read as "<uint> <char> <uint>"; trailing
  // garbage is therefore ignored rather than reported.
  template<>
  SPTR<imBitext<L2R_Token<SimpleWordId> > >
  imBitext<L2R_Token<SimpleWordId> >::
  add(std::vector<std::string> const& s1,
      std::vector<std::string> const& s2,
      std::vector<std::string> const& aln) const
  {
    typedef L2R_Token<SimpleWordId> TKN;
    assert(s1.size() == s2.size() && s1.size() == aln.size());

#ifndef NDEBUG
    // remember where the new material starts, for the sanity check below
    size_t first_new_snt = this->T1 ? this->T1->size() : 0;
#endif

    SPTR<imBitext<TKN> > ret;
    {
      // hold the lock only while copying *this
      boost::unique_lock<boost::shared_mutex> guard(m_lock);
      ret.reset(new imBitext<TKN>(*this));
    }

    // we add the sentences in separate threads (so it's faster)
    boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
    // thread1.join(); // for debugging
    boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));

    // Meanwhile, encode the alignments in this thread. The loop may throw
    // (malformed alignment, allocation failure); the workers were handed
    // s1/s2 and members of *ret, so they must be joined before the stack
    // is unwound. Destroying a joinable boost::thread either terminates
    // the program or detaches the thread, depending on the Boost version.
    try
      {
        BOOST_FOREACH(std::string const& a, aln)
          {
            std::istringstream ibuf(a);
            std::ostringstream obuf;
            uint32_t row,col; char c;
            while (ibuf >> row >> c >> col)
              {
                UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
                               << "Error in alignment information:\n" << a);
                binwrite(obuf,row);
                binwrite(obuf,col);
              }
            // important: keep the result of obuf.str() in a named string;
            // iterators or pointers into the temporary returned by
            // obuf.str() would dangle as soon as the statement ends.
            std::string const encoded = obuf.str();
            std::vector<char> v(encoded.begin(), encoded.end());
            ret->myTx = append(ret->myTx, v);
          }
      }
    catch (...)
      {
        thread1.join();
        thread2.join();
        throw;
      }

    thread1.join();
    thread2.join();

    // publish the extended tracks and indices through the public members
    ret->Tx = ret->myTx;
    ret->T1 = ret->myT1;
    ret->T2 = ret->myT2;
    ret->I1 = ret->myI1;
    ret->I2 = ret->myI2;

#ifndef NDEBUG
    // sanity check: every alignment point of every newly added sentence
    // pair must lie within the lengths of the two sentences
    for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
      {
        size_t slen1  = ret->T1->sntLen(i);
        size_t slen2  = ret->T2->sntLen(i);
        char const* p = ret->Tx->sntStart(i);
        char const* q = ret->Tx->sntEnd(i);
        size_t k;
        while (p < q)
          {
            p = binread(p,k);
            assert(p);
            assert(p < q); // a row index must be followed by a column index
            assert(k < slen1);
            p = binread(p,k);
            assert(p);
            assert(k < slen2);
          }
      }
#endif
    return ret;
  }
  
}