# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""A state-emitting MarkovModel.
Note terminology similar to Manning and Schutze is used.
Functions:
train_bw Train a markov model using the Baum-Welch algorithm.
train_visible Train a visible markov model using MLE.
find_states Find a state sequence that explains some observations.
load Load a MarkovModel.
save Save a MarkovModel.
Classes:
MarkovModel Holds the description of a markov model
"""
import numpy
try:
logaddexp = numpy.logaddexp
except AttributeError:
# Numpy versions older than 1.3 do not contain logaddexp.
# Once we require Numpy version 1.3 or later, we should revisit this
# module to see if we can simplify some of the other functions in
# this module.
import warnings
warnings.warn(
"For optimal speed, please update to Numpy version 1.3 or later (current version is %s)"
% numpy.__version__
)
def logaddexp(logx, logy):
"""Implement logaddexp method if Numpy version is older than 1.3."""
if logy - logx > 100:
return logy
elif logx - logy > 100:
return logx
minxy = min(logx, logy)
return minxy + numpy.log(numpy.exp(logx - minxy) + numpy.exp(logy - minxy))
def itemindex(values):
    """Return a dictionary mapping each value to its sequence offset.

    If a value occurs more than once, the offset of its first
    occurrence is kept (same result as the original reversed scan).
    """
    mapping = {}
    for offset, value in enumerate(values):
        # Only record the first position seen for each value.
        if value not in mapping:
            mapping[value] = offset
    return mapping
# Seed the global RNG so that _random_norm yields fresh random starting
# matrices on each run (Baum-Welch starts from random parameters by default).
numpy.random.seed()
# Floor substituted for zero probabilities so that taking logs is safe.
VERY_SMALL_NUMBER = 1e-300
# log(VERY_SMALL_NUMBER); serves as the additive identity ("log of zero")
# for log-space sums.
LOG0 = numpy.log(VERY_SMALL_NUMBER)
class MarkovModel:
    """A state-emitting Markov model: states, alphabet, probability tables."""

    def __init__(
        self, states, alphabet, p_initial=None, p_transition=None, p_emission=None
    ):
        """Store the model description.

        states and alphabet name the hidden states and the allowed
        outputs; the three optional arrays hold the initial, transition,
        and emission probabilities.
        """
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

    def __str__(self):
        """Return the model serialized in the plain-text save() format."""
        from io import StringIO

        buffer = StringIO()
        save(self, buffer)
        return buffer.getvalue()
def _readline_and_check_start(handle, start):
"""Read the first line and evaluate that begisn with the correct start (PRIVATE)."""
line = handle.readline()
if not line.startswith(start):
raise ValueError(f"I expected {start!r} but got {line!r}")
return line
def load(handle):
    """Parse a file handle into a MarkovModel object."""
    # The layout mirrors what save() writes: STATES and ALPHABET headers,
    # then the INITIAL, TRANSITION and EMISSION tables.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]
    N, M = len(states), len(alphabet)
    mm = MarkovModel(states, alphabet)

    # INITIAL: one labelled line per state, one probability each.
    mm.p_initial = numpy.zeros(N)
    _readline_and_check_start(handle, "INITIAL:")
    for i, state in enumerate(states):
        line = _readline_and_check_start(handle, f" {state}:")
        mm.p_initial[i] = float(line.split()[-1])

    # TRANSITION: an N x N table, one labelled row per source state.
    mm.p_transition = numpy.zeros((N, N))
    _readline_and_check_start(handle, "TRANSITION:")
    for i, state in enumerate(states):
        line = _readline_and_check_start(handle, f" {state}:")
        mm.p_transition[i, :] = [float(v) for v in line.split()[1:]]

    # EMISSION: an N x M table, one labelled row per state.
    mm.p_emission = numpy.zeros((N, M))
    _readline_and_check_start(handle, "EMISSION:")
    for i, state in enumerate(states):
        line = _readline_and_check_start(handle, f" {state}:")
        mm.p_emission[i, :] = [float(v) for v in line.split()[1:]]

    return mm
def save(mm, handle):
    """Write mm to handle in the plain-text format understood by load()."""
    # NOTE: state or alphabet entries containing spaces would corrupt
    # this whitespace-separated format.
    w = handle.write
    w(f"STATES: {' '.join(mm.states)}\n")
    w(f"ALPHABET: {' '.join(mm.alphabet)}\n")
    w("INITIAL:\n")
    for i, p in enumerate(mm.p_initial):
        w(f" {mm.states[i]}: {p:g}\n")
    w("TRANSITION:\n")
    for i, row in enumerate(mm.p_transition):
        w(f" {mm.states[i]}: {' '.join(str(x) for x in row)}\n")
    w("EMISSION:\n")
    for i, row in enumerate(mm.p_emission):
        w(f" {mm.states[i]}: {' '.join(str(x) for x in row)}\n")
# XXX allow them to specify starting points
def train_bw(
    states,
    alphabet,
    training_data,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
    update_fn=None,
):
    """Train a MarkovModel using the Baum-Welch algorithm.

    states is a list of strings naming each state, and alphabet is a
    list of the allowed output objects.  training_data is a list of
    observations, each itself a list of objects from alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission optionally
    supply pseudo-count matrices of the appropriate sizes; they are
    added to the corresponding parameter matrix before normalization.

    update_fn, if given, is called as update_fn(iteration,
    log_likelihood) once per training iteration.
    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")

    def _as_checked(arr, shape, msg):
        # Convert a pseudo-count matrix to an ndarray and verify shape.
        if arr is None:
            return None
        arr = numpy.asarray(arr)
        if arr.shape != shape:
            raise ValueError(msg)
        return arr

    pseudo_initial = _as_checked(
        pseudo_initial, (N,), "pseudo_initial not shape len(states)"
    )
    pseudo_transition = _as_checked(
        pseudo_transition, (N, N), "pseudo_transition not shape len(states) X len(states)"
    )
    pseudo_emission = _as_checked(
        pseudo_emission, (N, M), "pseudo_emission not shape len(states) X len(alphabet)"
    )
    # Recode each observation as alphabet indexes for faster lookups.
    indexes = itemindex(alphabet)
    training_outputs = [[indexes[x] for x in outputs] for outputs in training_data]
    # Reject empty observation sequences up front.
    if any(not outputs for outputs in training_outputs):
        raise ValueError("I got training data with outputs of length 0")
    # Do the training with baum welch.
    p_initial, p_transition, p_emission = _baum_welch(
        N,
        M,
        training_outputs,
        pseudo_initial=pseudo_initial,
        pseudo_transition=pseudo_transition,
        pseudo_emission=pseudo_emission,
        update_fn=update_fn,
    )
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
MAX_ITERATIONS = 1000
def _baum_welch(
    N,
    M,
    training_outputs,
    p_initial=None,
    p_transition=None,
    p_emission=None,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
    update_fn=None,
):
    """Implement the Baum-Welch algorithm to evaluate unknown parameters in the MarkovModel object (PRIVATE).

    N and M are the numbers of states and output symbols.
    training_outputs is a list of observation sequences, each a list of
    alphabet indexes.  Starting matrices p_* are validated and copied
    when given; otherwise training begins from random normalized
    matrices.  pseudo_* matrices, when given, are applied as log-space
    pseudo-counts during each update.  update_fn is called as
    update_fn(iteration, log_likelihood) once per iteration.

    Returns [p_initial, p_transition, p_emission] converted back to
    probability space.  Raises RuntimeError if the log likelihood has
    not converged within MAX_ITERATIONS.
    """
    # Use the caller's starting matrices (validated and copied), or fall
    # back to random normalized starting points.
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))
    if p_transition is None:
        p_transition = _random_norm((N, N))
    else:
        p_transition = _copy_and_check(p_transition, (N, N))
    if p_emission is None:
        p_emission = _random_norm((N, M))
    else:
        p_emission = _copy_and_check(p_emission, (N, M))
    # Do all the calculations in log space to avoid underflows.
    lp_initial = numpy.log(p_initial)
    lp_transition = numpy.log(p_transition)
    lp_emission = numpy.log(p_emission)
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None
    # Iterate through each sequence of output, updating the parameters
    # to the HMM.  Stop when the log likelihoods of the sequences
    # stops varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            # NOTE(review): _baum_welch_one updates lp_transition and
            # lp_emission in place, but rebinds lp_initial locally, so
            # the initial probabilities here never change from their
            # starting values - confirm whether that is intended.
            llik += _baum_welch_one(
                N,
                M,
                outputs,
                lp_initial,
                lp_transition,
                lp_emission,
                lpseudo_initial,
                lpseudo_transition,
                lpseudo_emission,
            )
        if update_fn is not None:
            update_fn(i, llik)
        # Converged when the total log likelihood moves by less than 0.1.
        if prev_llik is not None and numpy.fabs(prev_llik - llik) < 0.1:
            break
        prev_llik = llik
    else:
        # for/else: the loop ran to exhaustion without breaking.
        raise RuntimeError("HMM did not converge in %d iterations" % MAX_ITERATIONS)
    # Return everything back in normal space.
    return [numpy.exp(_) for _ in (lp_initial, lp_transition, lp_emission)]
def _baum_welch_one(
    N,
    M,
    outputs,
    lp_initial,
    lp_transition,
    lp_emission,
    lpseudo_initial,
    lpseudo_transition,
    lpseudo_emission,
):
    """Execute one step for Baum-Welch algorithm (PRIVATE).

    Do one iteration of Baum-Welch based on a sequence of output.
    Updates lp_transition and lp_emission in place and returns the log
    likelihood of outputs under the pre-update parameters.

    NOTE(review): despite the "in place" intent, lp_initial is rebound
    to a new array below rather than written into, so the caller's
    initial probabilities are not actually modified - confirm whether
    that is intended.
    """
    T = len(outputs)
    # Forward/backward log-probability matrices, each N x (T+1).
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)
    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))  # P going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                # Note the emission is indexed by the source state i
                # (state-emitting convention, matching _forward).
                lp = (
                    fmat[i][t]
                    + lp_transition[i][j]
                    + lp_emission[i][k]
                    + bmat[j][t + 1]
                )
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:, :, t] = lp_traverse - _logsum(lp_traverse)
    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i, :, t])
    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i, :])
    # UPDATE P_INITIAL.
    # (Rebinds the local name only; see NOTE(review) in the docstring.)
    lp_initial = lp_arcout_t[:, 0]
    if lpseudo_initial is not None:
        lp_initial = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial = lp_initial - _logsum(lp_initial)
    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i, j, :]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])
    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M) + LOG0  # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = logaddexp(ksum[k], lp_arc[i, j, t])
        ksum = ksum - _logsum(ksum)  # Normalize
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)  # Renormalize
        lp_emission[i, :] = ksum
    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM has changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:, T])
def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    """Implement forward algorithm (PRIVATE).

    Return an N x (T+1) matrix of log probabilities; column t holds the
    per-state log probability after t outputs, so the last column
    accumulates the total probability of the output.
    """
    fmat = numpy.zeros((N, T + 1))
    # Column 0 holds the initial state distribution.
    fmat[:, 0] = lp_initial
    for t in range(1, T + 1):
        k = outputs[t - 1]  # symbol emitted on the step into column t
        for j in range(N):
            # Log-sum over every predecessor i: probability of being in
            # i at t-1, transitioning i->j, and emitting k from i.
            total = LOG0
            for i in range(N):
                total = logaddexp(
                    total, fmat[i][t - 1] + lp_transition[i][j] + lp_emission[i][k]
                )
            fmat[j][t] = total
    return fmat
def _backward(N, T, lp_transition, lp_emission, outputs):
    """Implement backward algorithm (PRIVATE).

    Return an N x (T+1) matrix of log probabilities computed from the
    end of the sequence backwards; column T is implicitly log(1) = 0.
    """
    bmat = numpy.zeros((N, T + 1))
    for t in range(T - 1, -1, -1):
        k = outputs[t]  # symbol emitted at time t
        for i in range(N):
            # Log-sum over every successor j: transitioning i->j,
            # emitting k from i, then finishing the sequence from j.
            total = LOG0
            for j in range(N):
                total = logaddexp(
                    total, bmat[j][t + 1] + lp_transition[i][j] + lp_emission[i][k]
                )
            bmat[i][t] = total
    return bmat
def train_visible(
    states,
    alphabet,
    training_data,
    pseudo_initial=None,
    pseudo_transition=None,
    pseudo_emission=None,
):
    """Train a visible MarkovModel using maximum likelihood estimates.

    states is a list of strings naming each state, and alphabet is a
    list of the allowed output objects.  training_data is a list of
    (outputs, observed_states) pairs, where outputs is a list of
    emissions from alphabet and observed_states is the matching list of
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission optionally
    supply pseudo-count matrices of the appropriate sizes; they are
    added to the corresponding count matrix before normalization.
    """
    N, M = len(states), len(alphabet)

    def _as_checked(arr, shape, msg):
        # Convert a pseudo-count matrix to an ndarray and verify shape.
        if arr is None:
            return None
        arr = numpy.asarray(arr)
        if arr.shape != shape:
            raise ValueError(msg)
        return arr

    pseudo_initial = _as_checked(
        pseudo_initial, (N,), "pseudo_initial not shape len(states)"
    )
    pseudo_transition = _as_checked(
        pseudo_transition, (N, N), "pseudo_transition not shape len(states) X len(states)"
    )
    pseudo_emission = _as_checked(
        pseudo_emission, (N, M), "pseudo_emission not shape len(states) X len(alphabet)"
    )
    # Recode states and outputs as list indexes for faster counting.
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    training_states, training_outputs = [], []
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])
    p_initial, p_transition, p_emission = _mle(
        N,
        M,
        training_outputs,
        training_states,
        pseudo_initial,
        pseudo_transition,
        pseudo_emission,
    )
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
def _mle(
    N,
    M,
    training_outputs,
    training_states,
    pseudo_initial,
    pseudo_transition,
    pseudo_emission,
):
    """Implement Maximum likelihood estimation algorithm (PRIVATE).

    Count state and output occurrences over the training data,
    optionally seeded with pseudo-counts, and normalize the counts into
    the (p_initial, p_transition, p_emission) probability matrices.
    """
    # p_initial[i] is the probability that a state sequence starts in i.
    # BUGFIX: truth-testing a numpy array (``if pseudo_initial:``) raises
    # ValueError for arrays with more than one element (and train_visible
    # passes numpy.asarray results); compare against None instead.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition[i][j] = C(i, j) / C(i) over consecutive state pairs.
    p_transition = numpy.zeros((N, N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states) - 1):
            i, j = states[n], states[n + 1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i, :] = p_transition[i, :] / sum(p_transition[i, :])

    # p_emission[s][o] = C(s, o) / C(s).  Start from add-one (Laplace)
    # smoothing unless explicit pseudo-counts are supplied.
    # BUGFIX: the original added pseudo_emission and then unconditionally
    # reset p_emission to numpy.ones, silently discarding the
    # pseudo-counts; honor them when provided.
    if pseudo_emission is not None:
        p_emission = numpy.zeros((N, M)) + pseudo_emission
    else:
        p_emission = numpy.ones((N, M))
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i, :] = p_emission[i, :] / sum(p_emission[i, :])

    return p_initial, p_transition, p_emission
def _argmaxes(vector, allowance=None):
"""Return indices of the maximum values aong the vector (PRIVATE)."""
return [numpy.argmax(vector)]
def find_states(markov_model, output):
    """Find states in the given Markov model output.

    Returns a list of (states, score) tuples.
    """
    mm = markov_model
    N = len(mm.states)
    # _viterbi works in log space; pad each matrix with a tiny constant
    # so taking the log never encounters an exact zero.
    lp_initial = numpy.log(mm.p_initial + VERY_SMALL_NUMBER)
    lp_transition = numpy.log(mm.p_transition + VERY_SMALL_NUMBER)
    lp_emission = numpy.log(mm.p_emission + VERY_SMALL_NUMBER)
    # Map each observed output to its alphabet index.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]
    # Decode with Viterbi, then translate state indexes back to names
    # and scores back out of log space.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)
    return [
        ([mm.states[i] for i in states], numpy.exp(score))
        for states, score in results
    ]
def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    """Implement Viterbi algorithm to find most likely states for a given input (PRIVATE).

    output is a list of alphabet indexes.  Returns a list of
    (state_indexes, log_score) tuples; since _argmaxes returns a single
    index, the list contains one best path.

    NOTE(review): the emission term here is indexed by the destination
    state j (lp_emission[j, k]), while _forward/_backward index by the
    source state - confirm which convention the model intends.
    """
    T = len(output)
    # Store the backtrace in a NxT matrix.
    backtrace = []  # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)
    # Store the best scores.
    scores = numpy.zeros((N, T))
    # Column 0: initial distribution plus emission of the first symbol.
    scores[:, 0] = lp_initial + lp_emission[:, output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:, t - 1] + lp_transition[:, j] + lp_emission[j, k]
            indexes = _argmaxes(i_scores)
            scores[j, t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes
    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []  # list of (t, states, score)
    results = []  # return values.  list of (states, score)
    indexes = _argmaxes(scores[:, T - 1])  # pick the first place
    for i in indexes:
        in_process.append((T - 1, [i], scores[i][T - 1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            # Reached the start of the sequence; the path is complete.
            results.append((states, score))
        else:
            # Extend the path one step back for every recorded predecessor.
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t - 1, [i] + states, score))
    return results
def _normalize(matrix):
"""Normalize matrix object (PRIVATE)."""
if len(matrix.shape) == 1:
matrix = matrix / sum(matrix)
elif len(matrix.shape) == 2:
# Normalize by rows.
for i in range(len(matrix)):
matrix[i, :] = matrix[i, :] / sum(matrix[i, :])
else:
raise ValueError("I cannot handle matrixes of that shape")
return matrix
def _uniform_norm(shape):
    """Return a matrix of the given shape with equal, normalized entries (PRIVATE)."""
    return _normalize(numpy.ones(shape))
def _random_norm(shape):
    """Return a random matrix of the given shape, normalized (PRIVATE)."""
    return _normalize(numpy.random.random(shape))
def _copy_and_check(matrix, desired_shape):
"""Copy a matrix and check its dimension. Normalize at the end (PRIVATE)."""
# Copy the matrix.
matrix = numpy.array(matrix, copy=1)
# Check the dimensions.
if matrix.shape != desired_shape:
raise ValueError("Incorrect dimension")
# Make sure it's normalized.
if len(matrix.shape) == 1:
if numpy.fabs(sum(matrix) - 1.0) > 0.01:
raise ValueError("matrix not normalized to 1.0")
elif len(matrix.shape) == 2:
for i in range(len(matrix)):
if numpy.fabs(sum(matrix[i]) - 1.0) > 0.01:
raise ValueError("matrix %d not normalized to 1.0" % i)
else:
raise ValueError("I don't handle matrices > 2 dimensions")
return matrix
def _logsum(matrix):
    """Return the log of the summed probabilities in matrix (PRIVATE).

    Folds every element of matrix (any shape) into a single log-space
    sum via logaddexp, starting from LOG0 (the log of ~zero).
    """
    # BUGFIX: numpy.product was deprecated and removed in NumPy 2.0;
    # flatten with reshape(-1) instead of computing the element count.
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (-1,))
    else:
        vec = matrix
    # Renamed from ``sum`` to avoid shadowing the builtin.
    total = LOG0
    for num in vec:
        total = logaddexp(total, num)
    return total
def _logvecadd(logvec1, logvec2):
    """Implement a log sum for two vector objects (PRIVATE).

    Returns a new vector whose elements are the element-wise logaddexp
    of the inputs.  Raises ValueError if the lengths differ.
    """
    # Validate with an explicit raise rather than ``assert`` so the
    # check survives when Python runs with optimizations (-O).
    if len(logvec1) != len(logvec2):
        raise ValueError("vectors aren't the same length")
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = logaddexp(logvec1[i], logvec2[i])
    return sumvec
def _exp_logsum(numbers):
    """Return the exponential of a logsum (PRIVATE)."""
    return numpy.exp(_logsum(numbers))