# Polaris: polaris/loopPool.py.bak
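"""Cluster loop candidates into final loop calls ("polaris loop pool").

Summary added for readability (derived from the code below): the script reads a
tab-separated, headerless candidate file (columns used here: 0 = chromosome,
1/4 = anchor start positions, 6 = loop score; the full 0-6 layout appears to be
BEDPE-like, inferred from how the output is written) and clusters candidates
with a density-peak-style scheme. Each candidate gets a local density (rho) and
the distance to its nearest denser candidate (delta); candidates with high rho
and large delta seed clusters, and each cluster is reduced to its best-scoring
loop, written out as a headerless .bedpe file.
"""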
import sys

import click
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
from tqdm import tqdm
def rhoDelta(data, resol, dc, radius):
    """Compute local density (rho) and distance to the nearest denser candidate (delta) for one chromosome."""
    # Bin anchor positions and drop isolated candidates with too few neighbours within `radius` bins.
    pos = data[[1, 4]].to_numpy() // resol
    posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True)
    _l = np.asarray([len(v) for v in NNindexes])
    data = data[_l > 5].reset_index(drop=True)
    if data.shape[0] != 0:
        pos = data[[1, 4]].to_numpy() // resol
        val = data[6].to_numpy()  # loop scores
        try:
            posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
            NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
        except ValueError as e:
            if "Found array with 0 sample(s)" in str(e):
                print("#" * 88, '\n#')
                print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [t]\033[0m\n#")
                print("#" * 88, '\n')
                sys.exit(1)
            else:
                raise
        # Local density rho: Gaussian-weighted sum of neighbour scores within the cutoff dc.
        rhos = []
        for i in range(len(NNindexes)):
            rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
        rhos = np.asarray(rhos)

        # Delta: distance to the nearest candidate with a higher density, searched within radius _r.
        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        deltas = rhos * 0
        LargerNei = rhos * 0 - 1  # index of the nearest denser neighbour (-1 = not found yet)
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                deltas[i] = _dists[i][-1] + 1
            else:
                LargerNei[i] = _indexes[i][idx[0]]
                deltas[i] = _dists[i][idx[0]]
        # Retry with progressively larger search radii for points whose denser neighbour was not found.
        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    deltas[failed[i]] = _dists[i][-1] + 1
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
                    deltas[failed[i]] = _dists[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()
        data['rhos'] = rhos
        data['deltas'] = deltas
    else:
        data['rhos'] = []
        data['deltas'] = []
    return data


@click.command()
@click.option('-dc', '--distance_cutoff', type=int, default=5, help='Distance cutoff (in bins) for local density calculation. [5]')
@click.option('-t', '--threshold', type=float, default=0.6, help='Loop score threshold. [0.6]')
@click.option('-r', '--resol', type=int, default=5000, help='Resolution in bp. [5000]')
@click.option('-R', '--radius', type=int, default=2, help='Radius threshold (in bins) used to remove outliers. [2]')
@click.option('-d', '--mindelta', type=float, default=5, help='Minimum distance allowed between two loops. [5]')
@click.option('-i', '--candidates', type=str, required=True, help='Loop candidates file path.')
@click.option('-o', '--output', type=str, required=True, help='.bedpe file path to save loops.')
def pool(distance_cutoff, candidates, resol, mindelta, threshold, output, radius, refine=True):
"""Call loops from loop candidates by clustering
"""
    print('\npolaris loop pool START :) ')
    data = pd.read_csv(candidates, sep='\t', header=None)
    ccs = set(data.iloc[:, 0])  # chromosomes present in the candidate file
    if data.shape[0] == 0:
        print("#" * 88, '\n#')
        print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
        print("#" * 88, '\n')
        sys.exit(1)
    # Keep candidates above the score threshold whose anchors are more than 11 bins apart.
    data = data[data[6] > threshold].reset_index(drop=True)
    data = data[data[4] - data[1] > 11 * resol].reset_index(drop=True)
    if data.shape[0] == 0:
        print("#" * 88, '\n#')
        print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
        print("#" * 88, '\n')
        sys.exit(1)
    # Compute rho/delta per chromosome.
    data[['rhos', 'deltas']] = 0
    data = data.groupby([0]).apply(rhoDelta, resol=resol, dc=distance_cutoff, radius=radius).reset_index(drop=True)
    minrho = 0
    targetData = data.reset_index(drop=True)
    loopPds = []
    chroms = tqdm(set(targetData[0]), dynamic_ncols=True)
    for chrom in chroms:
        chroms.desc = f"[Running clustering on {chrom}]"
        data = targetData[targetData[0] == chrom].reset_index(drop=True)
        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        # Cluster centers: high local density and far from any denser candidate.
        centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()
        # Link every candidate to its nearest denser neighbour, enlarging the search radius if needed.
        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] != 0:
                LargerNei[i] = _indexes[i][idx[0]]
        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] != 0:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        # Seed labels at the cluster centers, then propagate each label down the density ordering.
        LargerNei = LargerNei.astype(int)
        label = LargerNei * 0 - 1
        for i in range(len(centroid)):
            label[centroid[i]] = i
        decreasingsortedIdxRhos = np.argsort(-rhos)
        for i in decreasingsortedIdxRhos:
            if label[i] == -1:
                label[i] = label[LargerNei[i]]

        # Within each cluster, keep only the candidate with the highest loop score.
        val = data[6].to_numpy()
        refinedLoop = []
        label = label.flatten()
        for l in set(label):
            idx = np.argwhere(label == l).flatten()
            if len(idx) > 0:
                refinedLoop.append(idx[np.argmax(val[idx])])
        if refine:
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])
    # Merge per-chromosome results, sort by score, and write a headerless BEDPE table.
    loopPd = pd.concat(loopPds).sort_values(6, ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0, 1, 2, 3, 4, 5, 6]].to_csv(output, sep='\t', header=False, index=False)
    ccs_ = set(loopPd.iloc[:, 0])
    badc = ccs.difference(ccs_)  # chromosomes that yielded no loops
    if len(badc) == len(ccs):
        raise ValueError("polaris loop pool FAILED :(\nPlease check the input and the mcool file used to generate the score file, or use a higher '-s' value for sparser mcool data.")
    else:
        print(f'\npolaris loop pool FINISHED :)\n{len(loopPd)} loops saved to {output}')
        if len(badc) > 0:
            print(f"But the loop scores for {badc} are too sparse.\nYou may need to check the mcool data or re-run polaris loop score with a larger -s.")


if __name__ == '__main__':
    pool()