# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy
import json
import math
import re
import collections

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.parameter import Parameter


def gelu(x):
    return (
        0.5
        * x
        * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    )


def swish(x):
    return x * torch.sigmoid(x)


class LayerNorm(nn.Module):
    "Construct a layernorm module in the OpenAI style (epsilon inside the square root)."

    def __init__(self, n_state, e=1e-5):
        super(LayerNorm, self).__init__()
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))
        self.e = e

    """
    Input:
        x: n_state-dim
    Output:
        o: n_state-dim
    """

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.e)
        return self.g * x + self.b


"""
Convolution
    nx is the last input dim
    nf is the last output dim
"""


class Conv1D(nn.Module):
    def __init__(self, nf, nx):
        super(Conv1D, self).__init__()
        self.nf = nf
        w = torch.empty(nx, nf)
        nn.init.normal_(w, std=0.02)
        self.w = Parameter(w)
        self.b = Parameter(torch.zeros(nf))

    """
    Input:
        x: batch x len x nx
    Output:
        x: batch x len x nf
    """

    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
        x = x.view(*size_out)
        return x


class PositionalEmbedding(nn.Module):
    def __init__(self, opt, demb):
        super(PositionalEmbedding, self).__init__()
        self.demb = demb
        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
        self.pos_discount = float(opt["TRANSFORMER_POS_DISCOUNT"])
        self.register_buffer("inv_freq", inv_freq)

    """
    Input:
        pos_seq: len
    Output:
        pos_emb: len x demb
    """

    def forward(self, pos_seq):
        sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
        pos_emb = (
            torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
            / self.pos_discount
        )
        return pos_emb


"""
Splitter
"""


class Splitter(nn.Module):
    def __init__(self, nx):
        super(Splitter, self).__init__()
        self.nx = nx
        self.augmenter = Conv1D(nx * 3, nx)

    """
    Input:
        x: batch x len x nx
    Output:
        query, key, value: batch x len x nx
    """

    def forward(self, x):
        x = self.augmenter(x)
        # x: batch x len x (3 x nx)
        query, key, value = x.split(self.nx, dim=2)
        # query, key, value: batch x len x nx
        return query, key, value
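
# Illustrative sketch (not part of the original module): shows the tensor
# shapes Conv1D and Splitter expect and produce. The batch/length/width
# values below are arbitrary assumptions chosen for the demo.
def _demo_conv1d_and_splitter():
    x = torch.randn(2, 5, 16)  # batch x len x nx
    proj = Conv1D(nf=32, nx=16)
    print(proj(x).shape)  # torch.Size([2, 5, 32])

    splitter = Splitter(16)
    query, key, value = splitter(x)
    print(query.shape, key.shape, value.shape)  # each torch.Size([2, 5, 16])
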
"""
Multi-head Attention
"""


class Attention(nn.Module):
    """
    nx: input dimension
    """

    def __init__(self, nx, opt):
        super(Attention, self).__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        n_head = int(opt["TRANSFORMER_HEAD"])
        resid_pdrop = opt["TRANSFORMER_RESIDUAL_DROPOUT"]
        attn_pdrop = opt["TRANSFORMER_ATTENTION_DROPOUT"]
        use_cuda = opt["cuda"]
        assert n_state % n_head == 0
        # pre-computed lower-triangular mask, used when one_dir_visible is set
        self.maxlen = 2048  # beyond this length, the mask is rebuilt on the fly in _attn
        self.mask = (
            Variable(
                torch.tril(torch.ones(self.maxlen, self.maxlen)).view(
                    1, 1, self.maxlen, self.maxlen
                ),
                requires_grad=False,
            ).cuda()
            if use_cuda
            else Variable(
                torch.tril(torch.ones(self.maxlen, self.maxlen)).view(
                    1, 1, self.maxlen, self.maxlen
                ),
                requires_grad=False,
            )
        )
        self.n_head = n_head
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)
        self.use_cuda = use_cuda

    """
    Input:
        q: batch x n_head x len x dim
        k: batch x n_head x dim x kv_len
        v: batch x n_head x kv_len x dim
        x_mask: batch x kv_len  # key and value's mask (if not None, used for
            encoder's self-attention and decoder's src-tgt attention)
        one_dir_visible: only sees previous history (used for decoder's self-attention)
        return_attn_weight: if true, also return the attention weights
    Output:
        a: batch x n_head x len x dim
        attn_weight (if return_attn_weight): batch x n_head x len x kv_len
            (masked pre-softmax scores)
    """

    def _attn(self, q, k, v, x_mask, one_dir_visible, return_attn_weight):
        w = torch.matmul(q, k)
        # w: batch x n_head x len x kv_len
        w = w / math.sqrt(v.size(-1))
        mask = None
        if one_dir_visible:  # mask "seeing the future"
            if w.size(-2) <= self.maxlen and w.size(-1) <= self.maxlen:
                mask = (
                    self.mask[:, :, : w.size(-2), : w.size(-1)].cuda()
                    if self.use_cuda
                    else self.mask[:, :, : w.size(-2), : w.size(-1)]
                )
            else:
                mask = (
                    Variable(
                        torch.tril(torch.ones(w.size(-2), w.size(-1))).view(
                            1, 1, w.size(-2), w.size(-1)
                        ),
                        requires_grad=False,
                    ).cuda()
                    if self.use_cuda
                    else Variable(
                        torch.tril(torch.ones(w.size(-2), w.size(-1))).view(
                            1, 1, w.size(-2), w.size(-1)
                        ),
                        requires_grad=False,
                    )
                )
        if x_mask is not None:
            mask = x_mask.unsqueeze(1).unsqueeze(1).expand_as(w).float()
            # mask: batch x n_head x len x kv_len
        if mask is not None:
            w = w * mask + -1e9 * (1 - mask)
        w_prob = nn.Softmax(dim=-1)(w)
        w_prob = self.attn_dropout(w_prob)
        if return_attn_weight:
            return torch.matmul(w_prob, v), w
        else:
            return torch.matmul(w_prob, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    """
    Input:
        x: batch x len x dim
    Output:
        not k: batch x n_head x len x (dim/n_head)
        k: batch x n_head x (dim/n_head) x len
    """

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    """
    Input:
        query: batch x len x n_state
        key, value: batch x kv_len x n_state
        x_mask: batch x kv_len  # key and value's mask (if not None, used for
            encoder's self-attention and decoder's src-tgt attention)
        one_dir_visible: only sees previous history (used for decoder's self-attention)
        return_attn_weight: if true, also return the attention weights
    Output:
        a: batch x len x n_state
        attn_weight (if return_attn_weight): batch x len x kv_len
    """

    def forward(
        self, query, key, value, x_mask, one_dir_visible=False, return_attn_weight=False
    ):
        query = self.split_heads(query)
        # query: batch x n_head x len x (n_state/n_head)
        key = self.split_heads(key, k=True)
        # key: batch x n_head x (n_state/n_head) x kv_len
        value = self.split_heads(value)
        # value: batch x n_head x kv_len x (n_state/n_head)
        out = self._attn(query, key, value, x_mask, one_dir_visible, return_attn_weight)
        if return_attn_weight:
            a, attn_weight = out
            # a: batch x n_head x len x (n_state/n_head)
            # attn_weight: batch x n_head x len x kv_len
            attn_weight = attn_weight.permute(0, 2, 3, 1).contiguous()
            # attn_weight: batch x len x kv_len x n_head
            attn_weight = torch.sum(attn_weight, dim=3)
            # attn_weight: batch x len x kv_len
        else:
            a = out
            # a: batch x n_head x len x (n_state/n_head)
        a = self.merge_heads(a)
        # a: batch x len x n_state
        a = self.c_proj(a)
        # a: batch x len x n_state
        a = self.resid_dropout(a)
        # a: batch x len x n_state
        if return_attn_weight:
            return a, attn_weight
        else:
            return a
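
# Illustrative sketch (not part of the original module): wires Splitter and
# Attention together for one masked self-attention call. The opt values are
# minimal assumptions made up for the demo; real configurations come from
# the caller.
def _demo_attention():
    opt = {
        "TRANSFORMER_HEAD": 4,
        "TRANSFORMER_RESIDUAL_DROPOUT": 0.1,
        "TRANSFORMER_ATTENTION_DROPOUT": 0.1,
        "cuda": False,
    }
    x = torch.randn(2, 7, 16)  # batch x len x nx
    query, key, value = Splitter(16)(x)
    attn = Attention(16, opt)
    x_mask = torch.ones(2, 7)  # 1 means there's a real (non-PAD) token
    a, attn_weight = attn(query, key, value, x_mask, return_attn_weight=True)
    print(a.shape, attn_weight.shape)  # [2, 7, 16] and [2, 7, 7]
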
"""
Two-layer network
"""


class MLP(nn.Module):
    """
    Input:
        n_state: intermediate dim
    """

    def __init__(self, n_state, opt):  # in MLP: n_state=3072 (4 * n_embd)
        super(MLP, self).__init__()
        nx = int(opt["transformer_embed_dim"])
        resid_pdrop = opt["TRANSFORMER_RESIDUAL_DROPOUT"]
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.dropout = nn.Dropout(resid_pdrop)

    """
    Input:
        x: batch x len x nx
    Output:
        batch x len x nx
    """

    def forward(self, x):
        h = F.relu(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


"""
One encoder block of transformer
"""


class EncoderBlock(nn.Module):
    def __init__(self, opt):
        super(EncoderBlock, self).__init__()
        nx = int(opt["transformer_embed_dim"])
        self.one_dir_visible = False
        if "transformer_encoder_one_dir_visible" in opt:
            self.one_dir_visible = opt["transformer_encoder_one_dir_visible"]
        self.splitter = Splitter(nx)
        self.attn = Attention(nx, opt)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(4 * nx, opt)
        self.ln_2 = LayerNorm(nx)

    """
    Input:
        x: batch x len x n_state
        x_mask: batch x len (1 means there's something)
    Output:
        h: batch x len x n_state
    """

    def forward(self, x, x_mask):
        query, key, value = self.splitter(x)
        if self.one_dir_visible:
            # in this case, use triangle masking, as it's one-directional
            a = self.attn(query, key, value, None, one_dir_visible=True)
        else:
            # in this case, use x_mask for attention masking
            a = self.attn(query, key, value, x_mask, one_dir_visible=False)
        n = self.ln_1(x + a)  # residual
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h


"""
One decoder block of transformer
"""


class DecoderBlock(nn.Module):
    def __init__(self, opt):
        super(DecoderBlock, self).__init__()
        nx = int(opt["transformer_embed_dim"])
        self.decoder_splitter = Splitter(nx)
        self.self_attn = Attention(nx, opt)
        self.cross_attn = Attention(nx, opt)
        self.ln_1 = LayerNorm(nx)
        self.ln_2 = LayerNorm(nx)
        self.mlp = MLP(4 * nx, opt)
        self.ln_3 = LayerNorm(nx)

    """
    Input:
        x_mask: batch x len, mask for encoder's input
        y: batch x len x n_state (decoder part)
        enc_key: batch x encoder_len x n_state
        enc_value: batch x encoder_len x n_state
        lang_model: whether it's for language model training (no encoder part is used)
    Output:
        h: batch x len x n_state
    """

    def forward(self, x_mask, y, enc_key, enc_value, lang_model=False):
        query, key, value = self.decoder_splitter(y)
        # query, key, value: batch x len x n_state

        # self-attention
        a = self.self_attn(query, key, value, None, one_dir_visible=True)
        # a: batch x len x n_state

        n = self.ln_1(y + a)  # residual

        # seq2seq
        if not lang_model:
            # src-tgt attention
            o = self.cross_attn(n, enc_key, enc_value, x_mask)
            p = self.ln_2(n + o)  # residual
            # p: batch x len x n_state
        else:  # language model
            p = n

        m = self.mlp(p)
        h = self.ln_3(p + m)
        return h


"""
Embedder
"""


class Embedder(nn.Module):
    """
    Input:
        vocab: size of vocabulary
    """

    def __init__(self, opt, embed=None):
        super(Embedder, self).__init__()
        n_state = int(opt["transformer_embed_dim"])  # n_state
        embed_dropout_rate = opt["TRANSFORMER_EMBED_DROPOUT"]
        if embed is None:
            self.embed = nn.Embedding(opt["vocab_size"], n_state)
            nn.init.normal_(self.embed.weight, std=0.02)
        else:
            self.embed = embed
        self.drop = nn.Dropout(embed_dropout_rate)
        self.pos_emb = PositionalEmbedding(opt, n_state)
        self.use_cuda = opt["cuda"]

    """
    Input:
        x: batch x len (word_id)
    Output:
        h: batch x len x n_state
    """

    def forward(self, x):
        x_emb = self.embed(x)
        batch_size = x.shape[0]
        x_len = x.shape[1]
        x_pos = self.pos_emb(
            torch.arange(x_len).type(
                torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
            )
        )  # len x n_state
        x_pos = (
            Variable(
                x_pos.unsqueeze(0).repeat(batch_size, 1, 1), requires_grad=False
            ).cuda()
            if self.use_cuda
            else Variable(
                x_pos.unsqueeze(0).repeat(batch_size, 1, 1), requires_grad=False
            )
        )
        x_input = x_emb + x_pos
        h = self.drop(x_input)
        return h
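
# Illustrative sketch (not part of the original module): runs a single
# EncoderBlock on a padded batch. All opt values below are assumptions made
# up for the demo.
def _demo_encoder_block():
    opt = {
        "transformer_embed_dim": 16,
        "TRANSFORMER_HEAD": 4,
        "TRANSFORMER_RESIDUAL_DROPOUT": 0.1,
        "TRANSFORMER_ATTENTION_DROPOUT": 0.1,
        "cuda": False,
    }
    block = EncoderBlock(opt)
    x = torch.randn(2, 7, 16)  # batch x len x n_state
    x_mask = torch.ones(2, 7)  # 1 = real token, 0 = PAD
    x_mask[1, 5:] = 0  # pretend the second sequence is shorter
    h = block(x, x_mask)
    print(h.shape)  # torch.Size([2, 7, 16])
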
"""
Transformer encoder
"""


class TransformerEncoder(nn.Module):
    """
    Input:
        embed: (if not None) pre-computed vocab embeddings
    """

    def __init__(self, opt, embed=None):
        super(TransformerEncoder, self).__init__()
        vocab = int(opt["vocab_size"])
        n_state = int(opt["transformer_embed_dim"])
        n_layer = int(opt["TRANSFORMER_LAYER"])
        if "vae_z_scale_factor" in opt:
            self.vae_z_scale_factor = float(opt["vae_z_scale_factor"])
        self.embedder = Embedder(opt, embed)
        block = EncoderBlock(opt)
        self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)])
        self.use_cuda = opt["cuda"]

    """
    Input:
        x: batch x len (word_id)
        z (optional): batch x len x n_state (for VAE)
    Output:
        h: batch x len x n_state
    """

    def forward(self, x, z=None):
        x_mask = ~x.eq(0)  # 1 for tokens that are not PAD (PAD id = 0)
        x_mask = x_mask.type(
            torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        )
        h = self.embedder(x)
        if z is not None:
            z *= self.vae_z_scale_factor
            h += z
        for block in self.blocks:
            h = block(h, x_mask)
        return h


"""
Transformer decoder
"""


class TransformerDecoder(nn.Module):
    """
    Input:
        embed: (if not None) pre-computed vocab embeddings
    """

    def __init__(self, opt, embed=None):
        super(TransformerDecoder, self).__init__()
        self.opt = opt
        vocab_size = int(opt["vocab_size"])
        n_state = int(opt["transformer_embed_dim"])  # n_state
        n_layer = int(opt["TRANSFORMER_LAYER"])
        self.embedder = Embedder(opt, embed)
        self.encoder_splitter = Splitter(n_state)
        block = DecoderBlock(opt)
        self.blocks = nn.ModuleList([copy.deepcopy(block) for _ in range(n_layer)])
        if embed is None:
            self.linear = Conv1D(vocab_size, n_state)
        else:
            self.linear = nn.Linear(n_state, vocab_size, bias=False)
            if (
                "FINETUNE_RETRAIN_SOFTMAX" not in opt
            ):  # if FINETUNE_RETRAIN_SOFTMAX, linear needs to be separately trained
                self.linear.weight = embed.weight  # share weight
        self.use_cuda = opt["cuda"]

    """
    Input:
        x: batch x encoder_len (word_id)
        x_out: batch x encoder_len x n_state
        y: batch x len (word_id) (decoder part)
        lang_model: whether it's for language model training (no encoder part is used)
    Output:
        prob: batch x len x vocab_size (probabilities after softmax)
    """

    def forward(self, x, x_out, y, lang_model=False):
        # seq2seq
        if not lang_model:
            _, enc_key, enc_value = self.encoder_splitter(x_out)
            # enc_key: batch x encoder_len x n_state
            # enc_value: batch x encoder_len x n_state
            x_mask = ~x.eq(0)  # 1 for tokens that are not PAD (PAD id = 0)
            x_mask = x_mask.type(
                torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
            )
        else:
            enc_key = None
            enc_value = None
            x_mask = None
        h = self.embedder(y)
        for block in self.blocks:
            h = block(x_mask, h, enc_key, enc_value, lang_model)
        prob = F.softmax(self.linear(h), dim=-1)
        return prob
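
# Illustrative sketch (not part of the original module): a full seq2seq
# forward pass with TransformerEncoder and TransformerDecoder. Every opt
# value below is a small assumption chosen for the demo; PAD is assumed to
# be id 0, as in the masks built in forward() above.
def _demo_seq2seq_forward():
    opt = {
        "vocab_size": 100,
        "transformer_embed_dim": 16,
        "TRANSFORMER_LAYER": 2,
        "TRANSFORMER_HEAD": 4,
        "TRANSFORMER_RESIDUAL_DROPOUT": 0.1,
        "TRANSFORMER_ATTENTION_DROPOUT": 0.1,
        "TRANSFORMER_EMBED_DROPOUT": 0.1,
        "TRANSFORMER_POS_DISCOUNT": 1.0,
        "cuda": False,
    }
    encoder = TransformerEncoder(opt)
    decoder = TransformerDecoder(opt)
    x = torch.randint(1, 100, (2, 9))  # encoder word ids (0 = PAD)
    y = torch.randint(1, 100, (2, 6))  # decoder word ids
    x_out = encoder(x)  # batch x encoder_len x n_state
    prob = decoder(x, x_out, y)  # batch x len x vocab_size
    print(x_out.shape, prob.shape)
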
class TransformerBeam:
    """
    Input:
        encoder: TransformerEncoder class
        decoder: TransformerDecoder class
        begin_id: word id of the sentence-begin token
        vocab: list of words
    """

    def __init__(self, opt, encoder, decoder, begin_id, vocab):
        self.encoder = encoder
        self.decoder = decoder
        self.opt = opt
        self.max_sent_len = int(opt["max_sent_len"])
        self.begin_id = begin_id
        self.vocab = vocab
        self.beam_width = int(opt["beam_width"])
        self.use_cuda = opt["cuda"]

    # each candidate is (index in last_nodes, log_prob, word_id);
    # merge two sorted candidate lists, keeping at most beam_width items
    def merge_candidates(self, cand_A, cand_B):
        C = []
        pA, lA, pB, lB = 0, len(cand_A), 0, len(cand_B)
        lC = 0
        while (pA < lA or pB < lB) and (lC < self.beam_width):
            if pA < lA and (pB >= lB or cand_A[pA][1] > cand_B[pB][1]):
                C.append(cand_A[pA])
                pA += 1
            else:
                C.append(cand_B[pB])
                pB += 1
            lC += 1
        return C

    """
    Input:
        x: batch x encoder_len (word_ids), encoder's input
        k: top-k sampling
    Output:
        sents: list with batch items, each one with up to beam_width
            (sentence, log_prob) pairs, each sentence with up to max_sent_len words
    """

    def topk(self, x, k):
        batch_size = x.shape[0]
        x_len = x.shape[1]
        x_out = self.encoder(x)
        # x_out: batch x encoder_len x n_state

        # sent_ids is the words for each of the batch_size sentences
        sent_ids = []
        for i in range(batch_size):
            sent_ids.append([self.begin_id])

        topk = 1
        MIN_GEN_LENGTH = 45
        if "MIN_GEN_LENGTH" in self.opt:
            MIN_GEN_LENGTH = int(self.opt["MIN_GEN_LENGTH"])
        for l in range(self.max_sent_len):
            y = (
                Variable(torch.LongTensor(sent_ids)).cuda()
                if self.use_cuda
                else Variable(torch.LongTensor(sent_ids))
            )  # batch_size x l
            decoder_outputs = self.decoder(x, x_out, y)
            probs = decoder_outputs[
                :, -1, :
            ]  # batch_size x vocab_size (only take the last output)
            for i in range(batch_size):
                topk_probs, _ = torch.topk(probs[i], k)
                threshold = float(topk_probs[-1])
                probs[i][probs[i] < threshold] = 0.0
            samples = torch.multinomial(
                probs, 2
            )  # sample 2 since the first one may be the end token
            for i in range(batch_size):
                if l < MIN_GEN_LENGTH and self.vocab[int(samples[i, 0])] == "":
                    sent_ids[i].append(int(samples[i, 1]))
                else:
                    sent_ids[i].append(int(samples[i, 0]))

        sents = []
        for i in range(batch_size):
            utt = []
            for j in range(len(sent_ids[i])):
                w = self.vocab[sent_ids[i][j]]
                if w == "":
                    continue
                if w == "":
                    break
                utt.append(w)
            sents.append([(utt, 0)])

        return sents

    """
    Input:
        x: batch x encoder_len (word_ids), encoder's input
    Output:
        sents: list with batch items, each one with up to beam_width
            (sentence, log_prob) pairs, each sentence with up to max_sent_len words
    """

    def beam_search(self, x):
        batch_size = x.shape[0]
        x_len = x.shape[1]
        x_out = self.encoder(x)
        # x_out: batch x encoder_len x n_state

        sents = []
        topk = 1
        history_nodes = [{}]
        end_nodes = {}
        for idx in range(batch_size):
            start_node = BeamSearchNode([self.begin_id], 0, 1)
            history_nodes[0][idx] = [start_node]
            end_nodes[idx] = []

        for l in range(self.max_sent_len):
            last_nodes = history_nodes[-1]
            if sum([len(nodes) for i, nodes in last_nodes.items()]) == 0:
                # no nodes left
                break
            ys = []
            x_outs = []
            xs = []
            for idx in range(batch_size):
                ys.extend([node.word_ids for node in last_nodes[idx]])
                x_outs.extend(
                    [x_out[idx, :, :].unsqueeze(0) for node in last_nodes[idx]]
                )
                xs.extend([x[idx, :].unsqueeze(0) for node in last_nodes[idx]])
            ys = (
                Variable(torch.LongTensor(ys)).cuda()
                if self.use_cuda
                else Variable(torch.LongTensor(ys))
            )  # N x l
            x_outs = torch.cat(x_outs, dim=0)
            # x_outs: N x x_len x n_state
            xs = torch.cat(xs, dim=0)
            # xs: N x x_len
            probs = self.decoder(xs, x_outs, ys)
            log_probs = torch.log(
                probs[:, -1, :] + 1e-15
            )  # N x vocab_size (only take the last output)

            history_nodes.append({})
            p = 0
            for idx in range(batch_size):
                history_nodes[-1][idx] = []
                N = len(last_nodes[idx])
                if N == 0:
                    continue
                log_prob = log_probs[p : p + N]
                p += N
                # log_prob: N x extended_vocab_size

                # generate
                candidates = []
                for k in range(N):
                    logprobs, ids = torch.topk(log_prob[k], self.beam_width)
                    candidates = self.merge_candidates(
                        candidates, [(k, lp, wid) for lp, wid in zip(logprobs, ids)]
                    )
                candidates = candidates[: self.beam_width]

                extended_nodes_in_last_nodes = set()
                for k in range(len(candidates)):
                    h, logp, next_word_id = candidates[
                        k
                    ]  # h means "the h-th node in last_nodes"
                    logp = float(logp)
                    next_word_id = int(next_word_id)
                    prev_node = last_nodes[idx][h]
                    next_wordids = prev_node.word_ids + [next_word_id]
                    next_word = self.vocab[next_word_id]
                    next_node = BeamSearchNode(
                        next_wordids, prev_node.log_prob + logp, prev_node.length + 1
                    )
                    if not next_node.duplicate:
                        # no duplicate trigram generated
                        extended_nodes_in_last_nodes.add(h)
                        if next_word == "" or l == self.max_sent_len - 1:
                            end_nodes[idx].append((next_node.eval(), next_node))
                        else:
                            history_nodes[-1][idx].append(next_node)

                special_words = ["", "", "", "", "", ""]
                for k in range(N):
                    if k not in extended_nodes_in_last_nodes:
                        node = last_nodes[idx][k]
                        effective_word_count = sum(
                            [
                                1
                                for wid in node.word_ids
                                if self.vocab[wid] not in special_words
                            ]
                        )
                        if effective_word_count >= 5:
                            end_nodes[idx].append((node.eval(), node))

        MIN_GEN_LENGTH = 45
        if "MIN_GEN_LENGTH" in self.opt:
            MIN_GEN_LENGTH = int(self.opt["MIN_GEN_LENGTH"])
        for idx in range(batch_size):
            t = len([w for w in end_nodes[idx] if w[1].length > MIN_GEN_LENGTH])
            if t > 0:
                end_nodes[idx] = [
                    w for w in end_nodes[idx] if w[1].length > MIN_GEN_LENGTH
                ]
            end_nodes[idx].sort(key=lambda tup: tup[0], reverse=True)
            candidates = []
            for score, node in end_nodes[idx][:topk]:
                utt = [self.vocab[wid] for wid in node.word_ids]
                utt = [w for w in utt if w not in ["", ""]]
                candidates.append((utt, score))
            if len(candidates) == 0:
                candidates.append(("", 0))
            sents.append(candidates)
        return sents
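
# Illustrative sketch (not part of the original module): drives TransformerBeam
# on top of an already-built encoder/decoder pair (e.g. as in the seq2seq sketch
# above). The vocab list, begin_id and opt values are placeholders; the special
# start/end token strings are whatever the surrounding pipeline defines.
def _demo_generation(encoder, decoder, vocab, begin_id):
    opt = {"max_sent_len": 20, "beam_width": 4, "cuda": False}
    beam = TransformerBeam(opt, encoder, decoder, begin_id, vocab)
    x = torch.randint(1, len(vocab), (2, 9))  # encoder word ids (0 = PAD)
    sampled = beam.topk(x, k=5)  # top-k sampling decode
    beamed = beam.beam_search(x)  # beam-search decode
    print(sampled[0][0])  # (tokens, score) for the first batch item
    print(beamed[0][0])
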
class BeamSearchNode(object):
    def __init__(self, word_ids, log_prob, length):
        self.word_ids = word_ids
        self.log_prob = log_prob
        self.length = length

        # flag repeated trigrams, used for trigram blocking during beam search
        trigram_set = set()
        self.duplicate = False
        for i in range(2, len(word_ids)):
            trigram = (
                str(word_ids[i - 2])
                + " "
                + str(word_ids[i - 1])
                + " "
                + str(word_ids[i])
            )
            if trigram in trigram_set:
                self.duplicate = True
                break
            trigram_set.add(trigram)

    def eval(self):
        # length-normalised log-probability (the begin token is not counted)
        return self.log_prob / float(self.length - 1.0 + 1e-6)

    def __lt__(self, other):
        return self.length < other.length
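
# Illustrative sketch (not part of the original module): shows the repeated-
# trigram flag and the length-normalised score BeamSearchNode exposes, on
# made-up word-id sequences.
def _demo_beam_search_node():
    clean = BeamSearchNode([1, 2, 3, 4, 5], log_prob=-2.0, length=5)
    looped = BeamSearchNode([1, 2, 3, 1, 2, 3], log_prob=-2.0, length=6)
    print(clean.duplicate, looped.duplicate)  # False True
    print(round(clean.eval(), 4))  # -2.0 / (5 - 1) = -0.5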