# Copyright 2000 by Jeffrey Chang.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""General Naive Bayes learner. | |
Naive Bayes is a supervised classification algorithm that uses Bayes | |
rule to compute the fit between a new observation and some previously | |
observed data. The observations are discrete feature vectors, with | |
the Bayes assumption that the features are independent. Although this | |
is hardly ever true, the classifier works well enough in practice. | |
Glossary: | |
- observation - A feature vector of discrete data. | |
- class - A possible classification for an observation. | |
Classes: | |
- NaiveBayes - Holds information for a naive Bayes classifier. | |
Functions: | |
- train - Train a new naive Bayes classifier. | |
- calculate - Calculate the probabilities of each class, | |
given an observation. | |
- classify - Classify an observation into a class. | |
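
Example:
A minimal usage sketch on a made-up toy dataset (the feature values and
class labels here are purely illustrative):

>>> observations = [["Red", "Sports"], ["Red", "SUV"], ["Yellow", "Sports"]]
>>> results = ["yes", "no", "yes"]
>>> model = train(observations, results)
>>> classify(model, ["Red", "Sports"])
'yes'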
""" | |

try:
    import numpy
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Install NumPy if you want to use Bio.NaiveBayes."
    )


def _contents(items):
    """Return a dictionary where the key is the item and the value is its associated probability (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes:
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``.  By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)
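    # Worked illustration (hypothetical numbers): with log P(observation|c1)
    # = -2.0 and log P(c1) = -0.5, the unscaled score for c1 is -2.5; the
    # scaling term is shared by all classes, so it never changes the ranking.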
    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError(
            f"observation in {len(observation)} dimensions,"
            f" but classifier in {nb.dimensionality}"
        )

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = numpy.log(numpy.clip(probs, 1.0e-300, 1.0e300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = numpy.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # log P(observation); log(1) when not scaling
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation
        )
    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
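    # scale=False suffices here: dividing by P(observation) shifts every
    # class's log score equally, so it cannot change which class is largest.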
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result.  If not specified, the priors will
       be estimated from the training results.
     - typecode - Optional data type, passed to ``numpy.asarray`` when
       the observations for each class are converted into arrays.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")
    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]
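    # For example (hypothetical): results = ["yes", "no", "yes"] yields
    # nb.classes == ["no", "yes"] and nb.p_prior of approximately [0.33, 0.67].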
    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with numpy, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations into NumPy arrays.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)
    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]
            # Add pseudocounts here.  This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1
            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
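

if __name__ == "__main__":
    # Run the doctest example in the module docstring as a quick self-check.
    # (This __main__ hook is a convenience sketch, not part of the original
    # module interface.)
    import doctest

    doctest.testmod()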