# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Dmitry Chichkov <[email protected]> (TypedMaxentFeatureEncoding)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A classifier model based on maximum entropy modeling framework.  This

framework considers all of the probability distributions that are

empirically consistent with the training data; and chooses the

distribution with the highest entropy.  A probability distribution is

"empirically consistent" with a set of training data if its estimated

frequency with which a class and a feature vector value co-occur is

equal to the actual frequency in the data.



Terminology: 'feature'

======================

The term *feature* is usually used to refer to some property of an

unlabeled token.  For example, when performing word sense

disambiguation, we might define a ``'prevword'`` feature whose value is

the word preceding the target word.  However, in the context of

maxent modeling, the term *feature* is typically used to refer to a

property of a "labeled" token.  In order to prevent confusion, we

will introduce two distinct terms to disambiguate these two different

concepts:



  - An "input-feature" is a property of an unlabeled token.

  - A "joint-feature" is a property of a labeled token.



In the rest of the ``nltk.classify`` module, the term "features" is

used to refer to what we will call "input-features" in this module.



In literature that describes and discusses maximum entropy models,

input-features are typically called "contexts", and joint-features

are simply referred to as "features".



Converting Input-Features to Joint-Features

-------------------------------------------

In maximum entropy models, joint-features are required to have numeric

values.  Typically, each input-feature ``input_feat`` is mapped to a

set of joint-features of the form:



|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val

|                              {      and label == some_label

|                              {

|                              { 0 otherwise



For all values of ``feat_val`` and ``some_label``.  This mapping is

performed by classes that implement the ``MaxentFeatureEncodingI``

interface.

"""
try:
    import numpy
except ImportError:
    pass

import os
import tempfile
from collections import defaultdict

from nltk.classify.api import ClassifierI
from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file
from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.data import gzip_open_unicode
from nltk.probability import DictionaryProbDist
from nltk.util import OrderedDict

__docformat__ = "epytext en"

######################################################################
# { Classifier Model
######################################################################


class MaxentClassifier(ClassifierI):
    """

    A maximum entropy classifier (also known as a "conditional

    exponential classifier").  This classifier is parameterized by a

    set of "weights", which are used to combine the joint-features

    that are generated from a featureset by an "encoding".  In

    particular, the encoding maps each ``(featureset, label)`` pair to

    a vector.  The probability of each label is then computed using

    the following equation::



                                dotprod(weights, encode(fs,label))

      prob(fs|label) = ---------------------------------------------------

                       sum(dotprod(weights, encode(fs,l)) for l in labels)



    Where ``dotprod`` is the dot product::



      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))

    """

    def __init__(self, encoding, weights, logarithmic=True):
        """

        Construct a new maxent classifier model.  Typically, new

        classifier models are created using the ``train()`` method.



        :type encoding: MaxentFeatureEncodingI

        :param encoding: An encoding that is used to convert the

            featuresets that are given to the ``classify`` method into

            joint-feature vectors, which are used by the maxent

            classifier model.



        :type weights: list of float

        :param weights:  The feature weight vector for this classifier.



        :type logarithmic: bool

        :param logarithmic: If false, then use non-logarithmic weights.

        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """

        Set the feature weight vector for this classifier.

        :param new_weights: The new feature weight vector.

        :type new_weights: list of float

        """
        self._weights = new_weights
        assert self._encoding.length() == len(new_weights)

    def weights(self):
        """

        :return: The feature weight vector for this classifier.

        :rtype: list of float

        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)

            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total

            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)
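
    # Usage sketch (assumes a trained ``classifier``; illustrative only):
    # ``prob_classify`` returns a ``DictionaryProbDist`` over the known
    # labels, and ``classify`` simply picks its most probable sample.
    #
    #     >>> pdist = classifier.prob_classify({"prevword": "bank"})
    #     >>> classifier.classify({"prevword": "bank"}) == pdist.max()
    #     True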

    def explain(self, featureset, columns=4):
        """

        Print a table showing the effect of each of the features in

        the given feature set, and how they combine to determine the

        probabilities of each label for that featureset.

        """
        descr_width = 50
        TEMPLATE = "  %-" + str(descr_width - 2) + "s%s%8.3f"

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print(
            "  Feature".ljust(descr_width)
            + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
        )
        print("  " + "-" * (descr_width - 2 + 8 * len(labels)))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(
                key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
            )
            for (f_id, f_val) in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(" and label is ")[0]  # hack
                descr += " (%s)" % f_val  # hack
                if len(descr) > 47:
                    descr = descr[:44] + "..."
                print(TEMPLATE % (descr, i * 8 * " ", score))
                sums[label] += score
        print("  " + "-" * (descr_width - 1 + 8 * len(labels)))
        print(
            "  TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
        )
        print(
            "  PROBS:".ljust(descr_width)
            + "".join("%8.3f" % pdist.prob(l) for l in labels)
        )

    def most_informative_features(self, n=10):
        """

        Generates the ranked list of informative features from most to least.

        """
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            self._most_informative_features = sorted(
                list(range(len(self._weights))),
                key=lambda fid: abs(self._weights[fid]),
                reverse=True,
            )
            return self._most_informative_features[:n]

    def show_most_informative_features(self, n=10, show="all"):
        """

        :param show: all, neg, or pos (for negative-only or positive-only)

        :type show: str

        :param n: The no. of top features

        :type n: int

        """
        # Use None to get the full list of ranked features.
        fids = self.most_informative_features(None)
        if show == "pos":
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == "neg":
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}")

    def __repr__(self):
        return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
            len(self._encoding.labels()),
            self._encoding.length(),
        )

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]

    @classmethod
    def train(
        cls,
        train_toks,
        algorithm=None,
        trace=3,
        encoding=None,
        labels=None,
        gaussian_prior_sigma=0,
        **cutoffs,
    ):
        """

        Train a new maxent classifier based on the given corpus of

        training samples.  This classifier will have its weights

        chosen to maximize entropy while remaining empirically

        consistent with the training corpus.



        :rtype: MaxentClassifier

        :return: The new maxent classifier



        :type train_toks: list

        :param train_toks: Training data, represented as a list of

            pairs, the first member of which is a featureset,

            and the second of which is a classification label.



        :type algorithm: str

        :param algorithm: A case-insensitive string, specifying which

            algorithm should be used to train the classifier.  The

            following algorithms are currently available.



            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),

              Improved Iterative Scaling (``'IIS'``)

            - External Libraries (requiring megam):

              LM-BFGS algorithm, with training performed by Megam (``'megam'``)



            The default algorithm is ``'IIS'``.



        :type trace: int

        :param trace: The level of diagnostic tracing output to produce.

            Higher values produce more verbose output.

        :type encoding: MaxentFeatureEncodingI

        :param encoding: A feature encoding, used to convert featuresets

            into feature vectors.  If none is specified, then a

            ``BinaryMaxentFeatureEncoding`` will be built based on the

            features that are attested in the training corpus.

        :type labels: list(str)

        :param labels: The set of possible labels.  If none is given, then

            the set of all labels attested in the training data will be

            used instead.

        :param gaussian_prior_sigma: The sigma value for a gaussian

            prior on model weights.  Currently, this is supported by

            ``megam``. For other algorithms, its value is ignored.

        :param cutoffs: Arguments specifying various conditions under

            which the training should be halted.  (Some of the cutoff

            conditions are not supported by some algorithms.)



            - ``max_iter=v``: Terminate after ``v`` iterations.

            - ``min_ll=v``: Terminate after the negative average

              log-likelihood drops under ``v``.

            - ``min_lldelta=v``: Terminate if a single iteration improves

              log likelihood by less than ``v``.

        """
        if algorithm is None:
            algorithm = "iis"
        for key in cutoffs:
            if key not in (
                "max_iter",
                "min_ll",
                "min_lldelta",
                "max_acc",
                "min_accdelta",
                "count_cutoff",
                "norm",
                "explicit",
                "bernoulli",
            ):
                raise TypeError("Unexpected keyword arg %r" % key)
        algorithm = algorithm.lower()
        if algorithm == "iis":
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "gis":
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == "megam":
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
            )
        elif algorithm == "tadm":
            kwargs = cutoffs
            kwargs["trace"] = trace
            kwargs["encoding"] = encoding
            kwargs["labels"] = labels
            kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
            return TadmMaxentClassifier.train(train_toks, **kwargs)
        else:
            raise ValueError("Unknown algorithm %s" % algorithm)
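
    # Training sketch (illustrative toy corpus, not executed at import time;
    # ``'iis'`` is the default algorithm when none is given):
    #
    #     >>> train_toks = [({"a": 1}, "x"), ({"b": 1}, "y")]
    #     >>> clf = MaxentClassifier.train(train_toks, trace=0, max_iter=2)
    #     >>> sorted(clf.labels())
    #     ['x', 'y']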


#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier


######################################################################
# { Feature Encodings
######################################################################


class MaxentFeatureEncodingI:
    """

    A mapping that converts a set of input-feature values to a vector

    of joint-feature values, given a label.  This conversion is

    necessary to translate featuresets into a format that can be used

    by maximum entropy models.



    The set of joint-features used by a given encoding is fixed, and

    each index in the generated joint-feature vectors corresponds to a

    single joint-feature.  The length of the generated joint-feature

    vectors is therefore constant (for a given encoding).



    Because the joint-feature vectors generated by

    ``MaxentFeatureEncodingI`` are typically very sparse, they are

    represented as a list of ``(index, value)`` tuples, specifying the

    value of each non-zero joint-feature.



    Feature encodings are generally created using the ``train()``

    method, which generates an appropriate encoding based on the

    input-feature values and labels that are present in a given

    corpus.

    """

    def encode(self, featureset, label):
        """

        Given a (featureset, label) pair, return the corresponding

        vector of joint-feature values.  This vector is represented as

        a list of ``(index, value)`` tuples, specifying the value of

        each non-zero joint-feature.



        :type featureset: dict

        :rtype: list(tuple(int, int))

        """
        raise NotImplementedError()

    def length(self):
        """

        :return: The size of the fixed-length joint-feature vectors

            that are generated by this encoding.

        :rtype: int

        """
        raise NotImplementedError()

    def labels(self):
        """

        :return: A list of the \"known labels\" -- i.e., all labels

            ``l`` such that ``self.encode(fs,l)`` can be a nonzero

            joint-feature vector for some value of ``fs``.

        :rtype: list

        """
        raise NotImplementedError()

    def describe(self, fid):
        """

        :return: A string describing the value of the joint-feature

            whose index in the generated feature vectors is ``fid``.

        :rtype: str

        """
        raise NotImplementedError()

    def train(cls, train_toks):
        """

        Construct and return new feature encoding, based on a given

        training corpus ``train_toks``.



        :type train_toks: list(tuple(dict, str))

        :param train_toks: Training data, represented as a list of

            pairs, the first member of which is a feature dictionary,

            and the second of which is a classification label.

        """
        raise NotImplementedError()


class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """

    A feature encoding that calls a user-supplied function to map a

    given featureset/label pair to a sparse joint-feature vector.

    """

    def __init__(self, func, length, labels):
        """

        Construct a new feature encoding based on the given function.



        :type func: (callable)

        :param func: A function that takes two arguments, a featureset

             and a label, and returns the sparse joint feature vector

             that encodes them::



                 func(featureset, label) -> feature_vector



             This sparse joint feature vector (``feature_vector``) is a

             list of ``(index,value)`` tuples.



        :type length: int

        :param length: The size of the fixed-length joint-feature

            vectors that are generated by this encoding.



        :type labels: list

        :param labels: A list of the \"known labels\" for this

            encoding -- i.e., all labels ``l`` such that

            ``self.encode(fs,l)`` can be a nonzero joint-feature vector

            for some value of ``fs``.

        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return "no description available"


class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """

    A feature encoding that generates vectors containing a binary

    joint-features of the form:



    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)

    |                      {

    |                      { 0 otherwise



    Where ``fname`` is the name of an input-feature, ``fval`` is a value

    for that input-feature, and ``label`` is a label.



    Typically, these features are constructed based on a training

    corpus, using the ``train()`` method.  This method will create one

    feature for each combination of ``fname``, ``fval``, and ``label``

    that occurs at least once in the training corpus.



    The ``unseen_features`` parameter can be used to add "unseen-value

    features", which are used whenever an input feature has a value

    that was not encountered in the training corpus.  These features

    have the form:



    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])

    |                      {      and l == label

    |                      {

    |                      { 0 otherwise



    Where ``is_unseen(fname, fval)`` is true if the encoding does not

    contain any joint features that are true when ``fs[fname]==fval``.



    The ``alwayson_features`` parameter can be used to add "always-on

    features", which have the form::



    |  joint_feat(fs, l) = { 1 if (l == label)

    |                      {

    |                      { 0 otherwise



    These always-on features allow the maxent model to directly model

    the prior probabilities of each label.

    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """

        :param labels: A list of the \"known labels\" for this encoding.



        :param mapping: A dictionary mapping from ``(fname,fval,label)``

            tuples to corresponding joint-feature indexes.  These

            indexes must be the set of integers from 0...len(mapping).

            If ``mapping[fname,fval,label]=id``, then

            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;

            otherwise, it is 0.



        :param unseen_features: If true, then include unseen value

           features in the generated joint-feature vectors.



        :param alwayson_features: If true, then include always-on

           features in the generated joint-feature vectors.

        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding
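
    # Encoding sketch (toy mapping, illustrative only): the sparse result
    # lists the (feature id, value) pairs for the mapping entries that match.
    #
    #     >>> enc = BinaryMaxentFeatureEncoding(
    #     ...     ["x", "y"], {("a", 1, "x"): 0, ("b", 1, "y"): 1})
    #     >>> enc.encode({"a": 1}, "x")
    #     [(0, 1)]
    #     >>> enc.encode({"a": 1}, "y")
    #     []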

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """

        Construct and return new feature encoding, based on a given

        training corpus ``train_toks``.  See the class description

        ``BinaryMaxentFeatureEncoding`` for a description of the

        joint-features that will be included in this encoding.



        :type train_toks: list(tuple(dict, str))

        :param train_toks: Training data, represented as a list of

            pairs, the first member of which is a feature dictionary,

            and the second of which is a classification label.



        :type count_cutoff: int

        :param count_cutoff: A cutoff value that is used to discard

            rare joint-features.  If a joint-feature's value is 1

            fewer than ``count_cutoff`` times in the training corpus,

            then that joint-feature is not included in the generated

            encoding.



        :type labels: list

        :param labels: A list of labels that should be used by the

            classifier.  If not specified, then the set of labels

            attested in ``train_toks`` will be used.



        :param options: Extra parameters for the constructor, such as

            ``unseen_features`` and ``alwayson_features``.

        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
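
    # Training sketch (toy corpus): one joint-feature per attested
    # (fname, fval, label) combination.
    #
    #     >>> toks = [({"a": 1}, "x"), ({"a": 1}, "y"), ({"b": 2}, "x")]
    #     >>> enc = BinaryMaxentFeatureEncoding.train(toks)
    #     >>> enc.length()
    #     3
    #     >>> sorted(enc.labels())
    #     ['x', 'y']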


class GISEncoding(BinaryMaxentFeatureEncoding):
    """

    A binary feature encoding which adds one new joint-feature to the

    joint-features defined by ``BinaryMaxentFeatureEncoding``: a

    correction feature, whose value is chosen to ensure that the

    sparse vector always sums to a constant non-negative number.  This

    new feature is used to ensure two preconditions for the GIS

    training algorithm:



      - At least one feature vector index must be nonzero for every

        token.

      - The feature vector must sum to a constant non-negative number

        for every token.

    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """

        :param C: The correction constant.  The value of the correction

            feature is based on this value.  In particular, its value is

            ``C - sum([v for (f,v) in encoding])``.

        :seealso: ``BinaryMaxentFeatureEncoding.__init__``

        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            C = len({fname for (fname, fval, label) in mapping}) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors

        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature.
        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError("Correction feature is not high enough!")
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding
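
    # Correction-feature sketch (same toy mapping as the
    # ``BinaryMaxentFeatureEncoding`` example above): every encoded vector
    # sums to the constant ``C``.
    #
    #     >>> enc = GISEncoding(["x", "y"], {("a", 1, "x"): 0, ("b", 1, "y"): 1})
    #     >>> sum(v for (f, v) in enc.encode({"a": 1}, "x")) == enc.C
    #     True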

    def length(self):
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return "Correction feature (%s)" % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)


class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        return self._labels

    def describe(self, fid):
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)


class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """

    A feature encoding that generates vectors containing integer,

    float and binary joint-features of the form:



    Binary (for string and boolean features):



    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)

    |                      {

    |                      { 0 otherwise



    Value (for integer and float features):



    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))

    |                      {         and (l == label)

    |                      {

    |                      { not encoded otherwise



    Where ``fname`` is the name of an input-feature, ``fval`` is a value

    for that input-feature, and ``label`` is a label.



    Typically, these features are constructed based on a training

    corpus, using the ``train()`` method.



    For string and boolean features [type(fval) not in (int, float)]

    this method will create one feature for each combination of

    ``fname``, ``fval``, and ``label`` that occurs at least once in the

    training corpus.



    For integer and float features [type(fval) in (int, float)] this

    method will create one feature for each combination of ``fname``

    and ``label`` that occurs at least once in the training corpus.



    For binary features the ``unseen_features`` parameter can be used

    to add "unseen-value features", which are used whenever an input

    feature has a value that was not encountered in the training

    corpus.  These features have the form:



    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])

    |                      {      and l == label

    |                      {

    |                      { 0 otherwise



    Where ``is_unseen(fname, fval)`` is true if the encoding does not

    contain any joint features that are true when ``fs[fname]==fval``.



    The ``alwayson_features`` parameter can be used to add "always-on

    features", which have the form:



    |  joint_feat(fs, l) = { 1 if (l == label)

    |                      {

    |                      { 0 otherwise



    These always-on features allow the maxent model to directly model

    the prior probabilities of each label.

    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """

        :param labels: A list of the \"known labels\" for this encoding.



        :param mapping: A dictionary mapping from ``(fname,fval,label)``

            tuples to corresponding joint-feature indexes.  These

            indexes must be the set of integers from 0...len(mapping).

            If ``mapping[fname,fval,label]=id``, then

            ``self.encode({..., fname:fval, ...``, label)[id]} is 1;

            otherwise, it is 0.



        :param unseen_features: If true, then include unseen value

           features in the generated joint-feature vectors.



        :param alwayson_features: If true, then include always-on

           features in the generated joint-feature vectors.

        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            if isinstance(fval, (int, float)):
                # Known feature name & value:
                if (fname, type(fval), label) in self._mapping:
                    encoding.append((self._mapping[fname, type(fval), label], fval))
            else:
                # Known feature name & value:
                if (fname, fval, label) in self._mapping:
                    encoding.append((self._mapping[fname, fval, label], 1))

                # Otherwise, we might want to fire an "unseen-value feature".
                elif self._unseen:
                    # Have we seen this fname/fval combination with any label?
                    for label2 in self._labels:
                        if (fname, fval, label2) in self._mapping:
                            break  # we've seen this fname/fval combo
                    # We haven't -- fire the unseen-value feature
                    else:
                        if fname in self._unseen:
                            encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """

        Construct and return new feature encoding, based on a given

        training corpus ``train_toks``.  See the class description

        ``TypedMaxentFeatureEncoding`` for a description of the

        joint-features that will be included in this encoding.



        Note: recognized feature values types are (int, float), over

        types are interpreted as regular binary features.



        :type train_toks: list(tuple(dict, str))

        :param train_toks: Training data, represented as a list of

            pairs, the first member of which is a feature dictionary,

            and the second of which is a classification label.



        :type count_cutoff: int

        :param count_cutoff: A cutoff value that is used to discard

            rare joint-features.  If a joint-feature's value is 1

            fewer than ``count_cutoff`` times in the training corpus,

            then that joint-feature is not included in the generated

            encoding.



        :type labels: list

        :param labels: A list of labels that should be used by the

            classifier.  If not specified, then the set of labels

            attested in ``train_toks`` will be used.



        :param options: Extra parameters for the constructor, such as

            ``unseen_features`` and ``alwayson_features``.

        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():
                if type(fval) in (int, float):
                    fval = type(fval)
                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
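
# Typed-encoding sketch (toy corpus): an ``int``/``float`` input-feature
# contributes its value directly, keyed by (fname, type, label), while other
# features behave as in ``BinaryMaxentFeatureEncoding``.
#
#     >>> toks = [({"len": 3.0, "pos": "NN"}, "x")]
#     >>> enc = TypedMaxentFeatureEncoding.train(toks)
#     >>> sorted(enc.encode({"len": 3.0, "pos": "NN"}, "x"))
#     [(0, 3.0), (1, 1)]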


######################################################################
# { Classifier Trainer: Generalized Iterative Scaling
######################################################################


def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """

    Train a new ``ConditionalExponentialClassifier``, using the given

    training samples, using the Generalized Iterative Scaling

    algorithm.  This ``ConditionalExponentialClassifier`` will encode

    the model that maximizes entropy from all the models that are

    empirically consistent with ``train_toks``.



    :see: ``train_maxent_classifier()`` for parameter descriptions.

    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, "C"):
        raise TypeError(
            "The GIS algorithm requires an encoding that "
            "defines C (e.g., GISEncoding)."
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf  # (numpy.NINF was removed in NumPy 2.0)
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier


def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), "d")

    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount


def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), "d")

    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob * fval

    return fcount


######################################################################
# { Classifier Trainer: Improved Iterative Scaling
######################################################################


def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """

    Train a new ``ConditionalExponentialClassifier``, using the given

    training samples, using the Improved Iterative Scaling algorithm.

    This ``ConditionalExponentialClassifier`` will encode the model

    that maximizes entropy from all the models that are empirically

    consistent with ``train_toks``.



    :see: ``train_maxent_classifier()`` for parameter descriptions.

    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nfident.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nfident is
    # nfarray multiplied by an identity matrix.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf  # (numpy.NINF was removed in NumPy 2.0)
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier

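# A minimal end-to-end sketch of the IIS trainer (the toy name-gender
# featuresets below are hypothetical, chosen only for illustration):
#
#   >>> train_toks = [({"last_letter": "a"}, "female"),
#   ...               ({"last_letter": "k"}, "male")]
#   >>> clf = train_maxent_classifier_with_iis(train_toks, trace=0, max_iter=10)
#   >>> clf.classify({"last_letter": "a"})   # expected: 'female'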

def calculate_nfmap(train_toks, encoding):
    """

    Construct a map that can be used to compress ``nf`` (which is

    typically sparse).



    *nf(feature_vector)* is the sum of the feature values for

    *feature_vector*.



    This represents the number of features that are active for a

    given labeled text.  This method finds all values of *nf(t)*

    that are attested for at least one token in the given list of

    training tokens; and constructs a dictionary mapping these

    attested values to a continuous range *0...N*.  For example,

    if the only values of *nf()* that were attested were 3, 5, and

    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.



    :return: A map that can be used to compress ``nf`` to a dense

        vector.

    :rtype: dict(int -> int)

    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
    return {nf: i for (i, nf) in enumerate(nfset)}

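# Sketch of what the compression buys: if the attested nf values are 3, 5,
# and 7, arrays indexed by nf can have length 3 instead of length 8
# (indices 0..7).  This mirrors how nfarray is built in the IIS trainer:
#
#   >>> nfmap = {3: 0, 5: 1, 7: 2}      # as returned by calculate_nfmap
#   >>> nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
#   >>> nfarray                          # decompresses dense indices back to nf
#   array([3., 5., 7.])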

def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    r"""

    Calculate the update values for the classifier weights for

    this iteration of IIS.  These update weights are the value of

    ``delta`` that solves the equation::



      ffreq_empirical[i]

             =

      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *

                 feature_vector(fs,l)[i] *

                 exp(delta[i] * nf(feature_vector(fs,l))))



    Where:

        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``

        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``

        - *nf(vector)* = ``sum([val for (id,val) in vector])``



    This method uses Newton's method to solve this equation for

    *delta[i]*.  In particular, it starts with a guess of

    ``delta[i]`` = 1; and iteratively updates ``delta`` with:



    | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])



    until convergence, where *sum1* and *sum2* are defined as:



    |    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)

    |    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))

    |    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .

    |                        feature_vector(fs,l)[i] .

    |                        exp(delta[i] . nf(feature_vector(fs,l))))



    Note that *sum1* and *sum2* depend on ``delta``; so they need

    to be re-computed each iteration.



    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are

    used to generate a dense encoding for *nf(ltext)*.  This

    allows ``_deltas`` to calculate *sum1* and *sum2* using

    matrices, which yields a significant performance improvement.



    :param train_toks: The set of training tokens.

    :type train_toks: list(tuple(dict, str))

    :param classifier: The current classifier.

    :type classifier: ClassifierI

    :param ffreq_empirical: An array containing the empirical

        frequency for each feature.  The *i*\ th element of this

        array is the empirical frequency for feature *i*.

    :type ffreq_empirical: sequence of float

    :param unattested: An array that is 1 for features that are

        not attested in the training data; and 0 for features that

        are attested.  In other words, ``unattested[i]==0`` iff

        ``ffreq_empirical[i]==0``.

    :type unattested: sequence of int

    :param nfmap: A map that can be used to compress ``nf`` to a dense

        vector.

    :type nfmap: dict(int -> int)

    :param nfarray: An array that can be used to uncompress ``nf``

        from a dense vector.

    :type nfarray: array(float)

    :param nftranspose: The transpose of ``nfarray``

    :type nftranspose: array(float)

    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), "d")

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), "d")

    for tok, _ in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features
            nf = sum(val for (id, val) in feature_vector)
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Note that exponentiation is base-2,
    # matching the base-2 weights used by this classifier.  Use the
    # following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = 2**(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * 2**(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       2**(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf 2**(delta[i]nf)
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2**nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas

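# Sanity check for the Newton solve above: when every labeled text shares a
# single value of nf, the update equation emp = A * 2**(nf*delta) has a
# closed-form solution that the loop should reproduce (the scalars below are
# hypothetical, chosen only for illustration):
#
#   >>> emp, A_val, nf = 0.5, 0.25, 2.0
#   >>> numpy.log2(emp / A_val) / nf     # closed-form delta; check: 0.25 * 2**(2*0.5) == 0.5
#   0.5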

######################################################################
# { Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.
def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """

    explicit = kwargs.get("explicit", True)
    bernoulli = kwargs.get("bernoulli", True)

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option. Not sure where the best place for it is.
        count_cutoff = kwargs.get("count_cutoff", 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError("Specify encoding or labels, not both")

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
        with open(trainfile_name, "w") as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, ValueError) as e:
        raise ValueError("Error while creating megam training file: %s" % e) from e

    # Run megam on the training file.
    options = []
    options += ["-nobias", "-repeat", "10"]
    if explicit:
        options += ["-explicit"]
    if not bernoulli:
        options += ["-fvals"]
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf
        inv_variance = 1.0 / gaussian_prior_sigma**2
    else:
        inv_variance = 0
    options += ["-lambda", "%.2f" % inv_variance, "-tune"]
    if trace < 3:
        options += ["-quiet"]
    if "max_iter" in kwargs:
        options += ["-maxi", "%s" % kwargs["max_iter"]]
    if "ll_delta" in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
    if hasattr(encoding, "cost"):
        options += ["-multilabel"]  # each possible la
    options += ["multiclass", trainfile_name]
    stdout = call_megam(options)
    # print('./megam_i686.opt ', ' '.join(options))
    # Delete the training file
    try:
        os.remove(trainfile_name)
    except OSError as e:
        print(f"Warning: unable to delete {trainfile_name}: {e}")

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)

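# Usage sketch: megam is an external binary, so NLTK must be told where it
# lives before this trainer will run (see ``nltk.classify.megam``); the
# ``train_toks`` object below is hypothetical:
#
#   >>> from nltk.classify import megam
#   >>> megam.config_megam()   # or megam.config_megam("/path/to/megam")
#   >>> clf = train_maxent_classifier_with_megam(train_toks, trace=0)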

######################################################################
# { Classifier Trainer: tadm
######################################################################


class TadmMaxentClassifier(MaxentClassifier):
    @classmethod
    def train(cls, train_toks, **kwargs):
        algorithm = kwargs.get("algorithm", "tao_lmvm")
        trace = kwargs.get("trace", 3)
        encoding = kwargs.get("encoding", None)
        labels = kwargs.get("labels", None)
        sigma = kwargs.get("gaussian_prior_sigma", 0)
        count_cutoff = kwargs.get("count_cutoff", 0)
        max_iter = kwargs.get("max_iter")
        ll_delta = kwargs.get("min_lldelta")

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix="nltk-tadm-events-", suffix=".gz"
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")

        trainfile = gzip_open_unicode(trainfile_name, "w")
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        options = []
        options.extend(["-monitor"])
        options.extend(["-method", algorithm])
        if sigma:
            options.extend(["-l2", "%.6f" % sigma**2])
        if max_iter:
            options.extend(["-max_it", "%d" % max_iter])
        if ll_delta:
            options.extend(["-fatol", "%.6f" % abs(ll_delta)])
        options.extend(["-events_in", trainfile_name])
        options.extend(["-params_out", weightfile_name])
        if trace < 3:
            options.extend(["2>&1"])
        else:
            options.extend(["-summary"])

        call_tadm(options)

        with open(weightfile_name) as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)

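# Usage sketch: like megam, tadm is an external binary that must be locatable
# before training (see ``nltk.classify.tadm``); ``train_toks`` is hypothetical:
#
#   >>> from nltk.classify import tadm
#   >>> tadm.config_tadm()     # locate the tadm binary first
#   >>> clf = TadmMaxentClassifier.train(train_toks, algorithm="tao_lmvm")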

######################################################################
# { Demo
######################################################################
def demo():
    from nltk.classify.util import names_demo

    names_demo(MaxentClassifier.train)


if __name__ == "__main__":
    demo()