File size: 7,262 Bytes
import sys
import click
import numpy as np
from sklearn.neighbors import KDTree
import pandas as pd
from tqdm import tqdm

def rhoDelta(data,resol,dc,radius): 
    
    pos = data[[1, 4]].to_numpy() // resol
    posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True)
    _l = []
    for v in NNindexes:
        _l.append(len(v))
    _l=np.asarray(_l)
    data = data[_l>5].reset_index(drop=True)
    
    if data.shape[0] != 0:
        pos = data[[1, 4]].to_numpy() // resol
        val = data[6].to_numpy()

        try:
            posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
            NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
        except ValueError as e:
            if "Found array with 0 sample(s)" in str(e):
                print("#"*88,'\n#')
                print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [t]\033[0m\n#")
                print("#"*88,'\n')
                sys.exit(1)
            else:
                raise

        rhos = []
        for i in range(len(NNindexes)):
            rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
        rhos = np.asarray(rhos)

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        deltas = rhos * 0
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                deltas[i] = _dists[i][-1] + 1
            else:
                LargerNei[i] = _indexes[i][idx[0]]
                deltas[i] = _dists[i][idx[0]]
        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    deltas[failed[i]] = _dists[i][-1] + 1
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
                    deltas[failed[i]] = _dists[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        data['rhos']=rhos
        data['deltas']=deltas
    else:
        data['rhos']=[]
        data['deltas']=[]
    return data



@click.command()
@click.option('-dc','--distance_cutoff', type=int, default=5, help='Distance cutoff for local density calculation in terms of bin. [5]')
@click.option('-t','--threshold', type=float, default=0.6, help='Loop score threshold [0.6]')
@click.option('-r','--resol', default=5000, help='resolution [5000]')
@click.option('-R','--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d','--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('-i','--candidates', type=str,required=True,help ='Loop candidates file path')
@click.option('-o','--output', type=str,required=True,help ='.bedpe file path to save loops')
def pool(distance_cutoff,candidates,resol,mindelta,threshold,output,radius,refine=True):
    """Call loops from loop candidates by clustering
    """
    print('\npolaris loop pool START :) ')

    data = pd.read_csv(candidates, sep='\t', header=None)
    
    ccs = set(data.iloc[:,0])

    if data.shape[0] == 0:
        print("#"*88,'\n#')
        print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
        print("#"*88,'\n')
        sys.exit(1)
    data = data[data[6] > threshold].reset_index(drop=True)
    data = data[data[4] - data[1] > 11*resol].reset_index(drop=True)
    if data.shape[0] == 0:
        print("#"*88,'\n#')
        print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
        print("#"*88,'\n')
        sys.exit(1)
    data[['rhos','deltas']]=0
    data=data.groupby([0]).apply(rhoDelta,resol=resol,dc=distance_cutoff,radius=radius).reset_index(drop=True)
    minrho=0
    targetData=data.reset_index(drop=True)

    loopPds=[]
    chroms=tqdm(set(targetData[0]), dynamic_ncols=True)
    for chrom in chroms:
        chroms.desc = f"[Runing clustering on {chrom}]"
        data = targetData[targetData[0]==chrom].reset_index(drop=True)

        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')

        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                pass
            else:
                LargerNei[i] = _indexes[i][idx[0]]

        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    pass
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        LargerNei = LargerNei.astype(int)
        label = LargerNei * 0 - 1
        for i in range(len(centroid)):
            label[centroid[i]] = i
        decreasingsortedIdxRhos = np.argsort(-rhos)
        for i in decreasingsortedIdxRhos:
            if label[i] == -1:
                label[i] = label[LargerNei[i]]

        val = data[6].to_numpy()
        refinedLoop = []
        label = label.flatten()
        for l in set(label):
            idx = np.argwhere(label == l).flatten()
            if len(idx) > 0:
                refinedLoop.append(idx[np.argmax(val[idx])])
        if refine:
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])

    loopPd=pd.concat(loopPds).sort_values(6,ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0,1,2,3,4,5,6]].to_csv(output,sep='\t',header=False, index=False)
    
    ccs_ = set(loopPd.iloc[:,0])
    badc = ccs.difference(ccs_)
    if len(badc) == len(ccs):
        raise ValueError("polaris loop pool FAILED :(\nPlease check input and mcool file to yield scoreFile. Or use higher '-s' value for more sparse mcool data.")
    else:
        print(f'\npolaris loop pool FINISHED :)\n{len(loopPd)} loops saved to {output}')
        if len(badc) > 0:
            print(f"But the loop score of {badc} are too sparse.\nYou may need to check the mcool data or re-run polaris loop score by increasing -s.")         
            
    
if __name__ == '__main__':
    pool()