|
import cooler |
|
import numpy as np |
|
from types import SimpleNamespace |
|
import random |
|
import sys |
|
|
|
def shuffleIFWithCount(df):
    """Randomly reassign the ('count', 'balanced') pairs among the rows of df.

    The two value columns are permuted together, so every count keeps its own
    balanced value; all other columns (e.g. the bin ids) stay where they are.
    The frame is modified in place and the same object is returned.
    """
    value_columns = ['count', 'balanced']
    # sample(frac=1) yields every row exactly once, in random order.
    permuted = df[value_columns].sample(frac=1)
    # Going through to_numpy() drops the shuffled index; otherwise pandas
    # would align on it and silently undo the permutation.
    df[value_columns] = permuted[value_columns].to_numpy()
    return df
|
|
|
def shuffleIF(df):
    """Build a decoy for one genomic-distance group by relocating its pixels.

    Parameters
    ----------
    df : DataFrame holding the pixels of a single distance group. It must
        provide 'bin1_id', 'bin2_id' and a 'distance' column whose first
        value is taken as the (shared) bin2_id - bin1_id offset.

    Returns
    -------
    A new DataFrame; the input frame is left untouched.

    * Groups with fewer than 10 pixels are not relocated; only their
      ('count', 'balanced') values are permuted via shuffleIFWithCount.
    * Otherwise random start bins are drawn from the closed interval
      [min(bin1_id), max(bin1_id)], de-duplicated, and assigned to the
      pixels. When fewer unique positions than pixels were drawn, the pixels
      are randomly subsampled to the number of positions available.

    All randomness comes from numpy's global RNG, so a preceding
    np.random.seed(...) makes the result reproducible.
    """
    if len(df) < 10:
        # Work on a copy so the caller's frame is never modified.
        return shuffleIFWithCount(df.copy())

    lowest = np.min(df['bin1_id'])
    highest = np.max(df['bin1_id'])
    distance = df['distance'].iloc[0]

    def draw_pairs(size):
        # randint's `high` is exclusive; +1 keeps the largest observed start
        # bin eligible and also covers the lowest == highest case.
        starts = np.random.randint(lowest, high=highest + 1, size=size)
        return set(zip(starts, starts + distance))

    # Oversample by 50% because duplicates are discarded.
    pair_id = draw_pairs(int(len(df) * 1.5))
    if len(pair_id) < len(df) - 50:
        # Far too many collisions: top up with one more round of draws.
        pair_id.update(draw_pairs(len(df)))

    if len(pair_id) < len(df):
        # Still short of unique positions: keep only as many pixels as fit.
        df = df.sample(len(pair_id))
    else:
        df = df.copy()

    # Sort first so the outcome does not depend on set iteration order, then
    # pick a random subset/order with numpy's RNG (not the `random` module,
    # which np.random.seed does not control).
    pairs = np.asarray(sorted(pair_id))
    pairs = pairs[np.random.permutation(len(pairs))[:len(df)]]
    df['bin1_id'] = pairs[:, 0]
    df['bin2_id'] = pairs[:, 1]
    return df
|
|
|
class bandmatrix():
    """Band-limited, diagonal-major view of one chromosome's contact map.

    ``bmatrix[r, k]`` stores the balanced value of the pixel whose first bin
    is ``r + offset`` and whose two bins are ``k`` bins apart, for
    ``k < max_distance_bins``. ``diag_mean[k]`` is the column mean of that
    array (NaN pixels ignored, absent pixels counted as 0) and is used as the
    "expected" value for observed/expected output.

    Methods taking base-pair coordinates (rows, square) need the ``bins``
    table; an instance built without it only exposes ``bmatrix`` and
    ``diag_mean``.
    """

    def __init__(self, pixels, extent, max_distance_bins=None, bins=None, info=None):
        """
        Parameters
        ----------
        pixels : table with 'bin1_id', 'bin2_id' and 'balanced' columns
        extent : (first_bin, end_bin) of the chromosome, end exclusive
        max_distance_bins : number of diagonals kept (band width, in bins)
        bins : optional table indexed by bin id with 'start', 'end', 'weight'
        info : optional mapping; 'full_sum' (preferred) or 'sum' becomes totalRC
        """
        self.extent = extent
        self.max_distance_bins = max_distance_bins
        self.offset = extent[0]
        n_bins = extent[1] - extent[0]

        self.bmatrix = np.zeros((n_bins, max_distance_bins))
        row_idx = np.asarray(pixels['bin1_id'] - self.offset)
        diag_idx = np.asarray((pixels['bin2_id'] - pixels['bin1_id']).abs())
        self.bmatrix[row_idx, diag_idx] = np.asarray(pixels['balanced'])
        # The mean is taken before NaNs are zeroed so that they are ignored
        # rather than counted as zero contacts.
        self.diag_mean = np.nanmean(self.bmatrix, axis=0)
        np.nan_to_num(self.bmatrix, copy=False)

        self.bins = bins
        self.info = info
        self.bin2bias = np.zeros(n_bins)
        if bins is None:
            # Unannotated matrix: no bp <-> bin mapping and no bias vector.
            self.bp2bin = None
            self.resol = None
        else:
            # bin start (bp) -> absolute bin id
            self.bp2bin = dict(zip(bins['start'].tolist(), bins.index.tolist()))
            first_bin = bins.iloc[0]
            self.resol = first_bin['end'] - first_bin['start']
            self.bin2bias[bins.index.to_numpy() - self.offset] = bins['weight'].to_numpy()
            self.bin2bias = np.nan_to_num(self.bin2bias)

        self.totalRC = None
        if info is not None:
            for key in ('full_sum', 'sum'):
                if key in info:
                    self.totalRC = info[key]
                    break

        # State of the row cache filled by __squareFromContinousRows.
        self.continousRows = SimpleNamespace(start_bp=np.inf, end_bp=-1, O_matrix=None,
                                             OE_matrix=None, bias=None, offset_bin=0)

    def __requireAnnotation(self):
        """Raise when bp-based access is attempted without a bins table."""
        if self.bins is None:
            raise Exception("This bandmatrix was built without bins; "
                            "bp-based access (rows/square) is unavailable")

    def __bandedRows2fullRows(self, x):
        """
        Convert rows of the banded matrix to upper-triangle (+ the necessary
        lower-triangle) rows of the full matrix

            x????      x???x000
            x@@xx      ?x@@xx00
            x#xxx  --> ?@x#xxx0
            xxxxx      ?@#xxxxx

        x has shape (b, h, w); the result has shape (b, h, h + w - 1).
        """
        b, h, w = x.shape
        output = np.zeros((b, h, h + w))
        output[:b, :h, :w] = x
        # Dropping h trailing elements and re-wrapping to rows that are one
        # element shorter moves row k right by k positions; the wrapped-in
        # elements come from the zero padding.
        output = output.reshape(b, -1)[:, :-h].reshape(b, h, -1)[:, :, :h + w]
        # Mirror the upper triangle of the leading h x h block into its
        # (still empty) strictly-lower triangle.
        low_rows, low_cols = np.tril_indices(h, -1)
        output[:, low_rows, low_cols] = output[:, low_cols, low_rows]
        return output

    def __relative_right_shift(self, x):
        """
            .........xxxxxx           xxxxxx0000000000
            ........xxxxxx.           xxxxxx.000000000
            .......xxxxxx..   --->    xxxxxx..00000000
            ......xxxxxx...           xxxxxx...0000000
            .....xxxxxx....           xxxxxx....000000

        x has shape (b, h, w). With x zero-padded to width 2w, the result
        satisfies result[k][c] = padded[k][c + h - 1 - k] and has shape
        (b, h, 2w - h).
        """
        b, h, w = x.shape
        output = np.zeros((b, h, 2 * w))
        output[:b, :h, :w] = x
        return output.reshape(b, -1)[:, :-h].reshape(b, h, -1)[:, :, h - 1:]

    def __tril_block(self, top, left, bottom, right, type='o'):
        """
        Fetch data in the lower triangular part, without the main diagonal.

        Parameters:
        top, bottom : first/last row, in bins relative to the extent
        left, right : first/last diagonal offset of the block; both must be < 0
        type : o [observe], oe [o/e], anything else [both]

        Returns an array of shape (1 or 2, bottom - top + 1, right - left + 1).
        Raises Exception when an offset is non-negative or the mirrored rows
        fall outside the stored matrix.
        """
        if left >= 0 or right >= 0:
            raise Exception("Trying to access data outside lower triangular part with tril_block")

        height = bottom - top
        # Value (r, r - m) equals the stored value (r - m, m): switch to the
        # mirrored rows and to positive diagonal offsets.
        top, bottom = top + left, bottom + right
        left, right = -right, -left

        if top < 0 or bottom > self.bmatrix.shape[0] - 1:
            raise Exception("Accessing values outside the contact map ... valid region:" +
                            str(10 * self.resol) + '~' + str((self.extent[1] - self.extent[0] - 10) * self.resol))

        O = self.bmatrix[top:bottom + 1, left:right + 1]

        if type == 'o':
            stacked = O[None]
        elif type == 'oe':
            stacked = (O / self.diag_mean[left:right + 1])[None]
        else:
            OE = O / self.diag_mean[left:right + 1]
            stacked = np.concatenate((O[None], OE[None]))

        # Skew along the row axis (hence the transposes), keep the requested
        # rows, then flip so offsets run from most negative to -1.
        out = self.__relative_right_shift(stacked.swapaxes(-1, 1)).swapaxes(-1, 1)[:, :height + 1, :]
        return out[..., ::-1]

    def rows(self, firstRow, lastRow, type='o', returnBias=False):
        """
        Fetch rows [firstRow, lastRow] of contacts.

        Parameters
        ----------
        firstRow : inclusive first row in bp
        lastRow : inclusive last row in bp
        type : o [observe], oe [o/e], b [both]; any other value is treated
            like 'b', as in square()
        returnBias : if true, also return the bias of bins
            [first row, last row + max_distance_bins)

        Returns
        -------
        Array of shape (1 or 2, n_rows, n_rows + max_distance_bins - 1), plus
        the bias slice when returnBias is set.

        Raises Exception when a coordinate lies outside the contact map or
        the matrix was built without bins.
        """
        self.__requireAnnotation()
        firstRow = firstRow // self.resol * self.resol
        lastRow = lastRow // self.resol * self.resol
        span_bp = (self.extent[1] - self.extent[0]) * self.resol
        # `>=`: span_bp itself is one bin past the last bin start.
        if firstRow < 0 or lastRow < 0 or firstRow >= span_bp or lastRow >= span_bp:
            raise Exception("Accessing values outside the contact map ... valid region: 0 ~ "
                            + str(span_bp))

        firstRowRelativeBin = self.bp2bin[firstRow] - self.offset
        lastRowRelativeBin = self.bp2bin[lastRow] - self.offset
        ORows = self.bmatrix[firstRowRelativeBin:lastRowRelativeBin + 1, :][None]

        if type == 'o':
            outRows = ORows
        elif type == 'oe':
            outRows = ORows / self.diag_mean
        else:
            outRows = np.concatenate((ORows, ORows / self.diag_mean), axis=0)

        outRows = self.__bandedRows2fullRows(outRows)

        if returnBias:
            bias = self.bin2bias[firstRowRelativeBin:lastRowRelativeBin + self.max_distance_bins]
            return outRows, bias
        return outRows

    def __squareFromContinousRows(self, xCenter, yCenter, w, type='o', meta=True):
        """
        Fetch a (2w+1)*(2w+1) square of contacts centered at (xCenter, yCenter)
        from a cached block of consecutive rows.

        Parameters
        ----------
        xCenter : xCenter in bp
        yCenter : yCenter in bp
        w : block width = 2w+1, in bins
        type : o [observe], oe [o/e], anything else [both]
        meta : if true, also return the metadata vector (x bias, y bias,
            totalRC, p2ll, y bin relative to the cached block, center rank)

        When xCenter is outside the cached row range the cache is refilled
        through rows(); 'miss' / 'hit' is printed accordingly.
        """
        if xCenter < self.continousRows.start_bp or xCenter > self.continousRows.end_bp:
            print('miss')
            rowStep = 1000
            startRow_bp = np.max([0, xCenter // (rowStep * self.resol) * (rowStep - 2 * w) * self.resol])
            endRow_bp = np.min(
                [startRow_bp + (rowStep + 2 * w) * self.resol, (self.extent[1] - self.offset - 1) * self.resol])
            mat, bias = self.rows(startRow_bp, endRow_bp, type='b', returnBias=True)

            self.continousRows.start_bp = startRow_bp
            self.continousRows.end_bp = endRow_bp
            self.continousRows.O_matrix = mat[0, :, :]
            self.continousRows.OE_matrix = mat[1, :, :]
            self.continousRows.bias = bias
        else:
            print('hit')

        xCenterRelativeBin = (xCenter - self.continousRows.start_bp) // self.resol
        yCenterRelativeBin = (yCenter - self.continousRows.start_bp) // self.resol
        xWindow = slice(xCenterRelativeBin - w, xCenterRelativeBin + w + 1)
        yWindow = slice(yCenterRelativeBin - w, yCenterRelativeBin + w + 1)

        if type == 'o':
            output = self.continousRows.O_matrix[xWindow, yWindow][None]
        elif type == 'oe':
            output = self.continousRows.OE_matrix[xWindow, yWindow][None]
        else:
            OEsquare = self.continousRows.OE_matrix[xWindow, yWindow][None]
            Osquare = self.continousRows.O_matrix[xWindow, yWindow][None]
            output = np.concatenate((Osquare, OEsquare))

        if meta:
            xBias = self.continousRows.bias[xWindow]
            yBias = self.continousRows.bias[yWindow]
            bias = np.concatenate((xBias, yBias))
            p2ll, crk = self.p2ll(output[-1, :, :], cw=3)
            # NOTE(review): here the third entry is the y bin relative to the
            # cached block, whereas square() reports the diagonal offset.
            return output, np.concatenate((bias, [self.totalRC, p2ll, yCenterRelativeBin, crk]))
        return output

    def p2ll(self, x, cw=3):
        """
        P2LL for a peak.

        Parameters:
        x : square matrix, the peak and its surroundings
        cw : lower-left corner width

        Returns (p2ll, rank): the center value divided by the mean of the
        cw x cw lower-left corner (0 when that corner sums to zero), and the
        number of entries in the 3x3 neighbourhood of the center that are
        strictly smaller than the center.
        """
        c = x.shape[0] // 2
        llcorner = x[-cw:, :cw].flatten()
        centerRank = np.sum(x[c, c] > x[c - 1:c + 2, c - 1:c + 2])
        cornerSum = sum(llcorner)
        if cornerSum == 0:
            return 0, centerRank
        return x[c, c] / (cornerSum / len(llcorner)), centerRank

    def square(self, xCenter, yCenter, w, type='o', meta=True, cache=False):
        """
        Fetch a (2w+1)*(2w+1) square of contacts centered at (xCenter, yCenter).

        Parameters
        ----------
        xCenter : xCenter in bp
        yCenter : yCenter in bp
        w : block width = 2w+1, in bins
        type : o [observe], oe [o/e], anything else [both]
        meta : if true, also return the metadata vector
            [x bias (2w+1), y bias (2w+1), totalRC, p2ll,
             y bin - x bin, center rank]
        cache : accepted for compatibility; not used by this implementation

        Returns
        -------
        Array of shape (1 or 2, 2w+1, 2w+1), plus the metadata vector when
        meta is set.
        """
        self.__requireAnnotation()
        xCenter = xCenter // self.resol * self.resol
        yCenter = yCenter // self.resol * self.resol

        xBin = self.bp2bin[xCenter] - self.offset
        yBin = self.bp2bin[yCenter] - self.offset
        diagOffset = yBin - xBin

        # In band coordinates the window covers rows x-w..x+w and diagonal
        # offsets d-2w..d+2w.
        top, bottom = xBin - w, xBin + w
        left, right = diagOffset - 2 * w, diagOffset + 2 * w

        tril_part = None
        if left < 0:
            # Negative offsets are not stored; fetch them from the mirrored
            # entries and read the stored band from offset 0 onwards.
            tril_part = self.__tril_block(top, left, bottom, -1, type)
            left = 0

        Osquare = self.bmatrix[top:bottom + 1, left:right + 1]

        if type == 'o':
            banded = Osquare[None]
        elif type == 'oe':
            banded = (Osquare / self.diag_mean[left:right + 1])[None]
        else:
            OEsquare = Osquare / self.diag_mean[left:right + 1]
            banded = np.concatenate((Osquare[None], OEsquare[None]))

        if tril_part is not None:
            banded = np.concatenate((tril_part, banded), axis=-1)
        output = self.__relative_right_shift(banded)[:, :, :2 * w + 1]

        if meta:
            xBias = self.bin2bias[xBin - w:xBin + w + 1]
            yBias = self.bin2bias[yBin - w:yBin + w + 1]
            bias = np.concatenate((xBias, yBias))
            p2ll, crk = self.p2ll(output[-1, :, :], cw=3)
            return output, np.concatenate((bias, [self.totalRC, p2ll, diagOffset, crk]))
        return output
|
|
|
class bcool(cooler.Cooler):
    """cooler.Cooler that can hand out a chromosome as a band-limited matrix."""

    def __init__(self, store):
        super().__init__(store)

    def bchr(self, chrom, max_distance=None, annotate=True, decoy=False, restrictDecoy=False):
        '''
        Get the banded matrix for a given chrom.

        Parameters
        ----------
        chrom : chromosome name passed to the cooler fetch calls
        max_distance : largest genomic distance (bp) to keep.
            * If the file's info records 'max_distance', that value defines
              the band; asking for more raises, asking for less is accepted
              but the file's value is still used.
            * Otherwise the argument is used, defaulting to 3000000.
        annotate : when true, pass the chromosome's bins table and the file
            info on to the bandmatrix; otherwise pass None for both
        decoy : when true, seed numpy's global RNG with 0 and shuffle the
            pixels within each distance group
        restrictDecoy : with decoy, only permute count/balanced values
            (shuffleIFWithCount) instead of relocating pixels (shuffleIF)

        Returns
        -------
        bandmatrix for the chromosome

        Raises
        ------
        Exception when max_distance exceeds the file's 'max_distance'.
        '''
        info = self.info  # read the metadata once instead of on every access
        resol = info['bin-size']

        if 'max_distance' in info:
            if max_distance is not None and max_distance > info['max_distance']:
                raise Exception("max distance in this bcool file is ", info['max_distance'])
            max_distance = info['max_distance']
        elif max_distance is None:
            # Only fall back to the default when the caller gave no value.
            max_distance = 3000000
        max_distance_bins = max_distance // resol

        pixels = self.matrix(balance=True, as_pixels=True).fetch(chrom)
        within_band = (pixels['bin2_id'] - pixels['bin1_id']).abs() < max_distance_bins
        pixels = pixels[within_band].reset_index(drop=True)

        if decoy:
            np.random.seed(0)
            pixels['distance'] = (pixels['bin2_id'] - pixels['bin1_id']).abs()
            shuffler = shuffleIFWithCount if restrictDecoy else shuffleIF
            pixels = pixels.groupby('distance').apply(shuffler)

        if annotate:
            bins = self.bins().fetch(chrom)
        else:
            bins = None
            info = None

        return bandmatrix(pixels, self.extent(chrom), max_distance_bins, bins, info)
|
|