# polaris/loop.py
import sys
import torch
import cooler
import click
import numpy as np
import pandas as pd
from importlib_resources import files
from torch import nn
from tqdm import tqdm
from torch.cuda.amp import autocast
from torch.utils.data import DataLoader
from sklearn.neighbors import KDTree
from polaris.model.polarisnet import polarisnet
from polaris.utils.util_data import centerPredCoolDataset
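
# Loop calling runs in two stages: (1) polarisnet scores candidate pixels in
# overlapping submatrices of the contact map, and (2) the scored pixels are pooled
# into loop calls with a density-peak-style clustering (local density "rho",
# distance to a denser point "delta"), keeping one representative pixel per cluster.
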
def rhoDelta(data,resol,dc,radius):
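    """Compute local density (rho) and distance to a denser point (delta).

    For one chromosome's candidate pixels, rho is a Gaussian-kernel weighted sum
    (bandwidth dc, Chebyshev metric on bin coordinates) of the loop scores of the
    neighbours within dc bins, and delta is the distance to the nearest pixel with
    a higher rho. Pixels with five or fewer neighbours (self included) within
    `radius` bins are discarded as isolated outliers.
    """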
pos = data[[1, 4]].to_numpy() // resol
posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True)
_l = []
for v in NNindexes:
_l.append(len(v))
_l=np.asarray(_l)
data = data[_l>5].reset_index(drop=True)
if data.shape[0] != 0:
pos = data[[1, 4]].to_numpy() // resol
val = data[6].to_numpy()
try:
posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
except ValueError as e:
if "Found array with 0 sample(s)" in str(e):
print("#"*88,'\n#')
                print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [-t/--threshold]\033[0m\n#")
print("#"*88,'\n')
sys.exit(1)
else:
raise
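        # rho: Gaussian-kernel (bandwidth dc) weighted sum of the loop scores
        # (column 6) of all neighbours within dc bins.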
rhos = []
for i in range(len(NNindexes)):
rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
rhos = np.asarray(rhos)
_r = 100
_indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
deltas = rhos * 0
LargerNei = rhos * 0 - 1
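        # delta: distance to the nearest pixel with a higher rho; LargerNei keeps
        # that pixel's index (or -1 if none was found within the search radius).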
for i in range(len(_indexes)):
idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
if idx.shape[0] == 0:
deltas[i] = _dists[i][-1] + 1
else:
LargerNei[i] = _indexes[i][idx[0]]
deltas[i] = _dists[i][idx[0]]
failed = np.argwhere(LargerNei == -1).flatten()
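        # Pixels whose denser neighbour was not found yet: retry with an
        # exponentially growing search radius.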
while len(failed) > 1 and _r < 100000:
_r = _r * 10
_indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
for i in range(len(_indexes)):
idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
if idx.shape[0] == 0:
deltas[failed[i]] = _dists[i][-1] + 1
else:
LargerNei[failed[i]] = _indexes[i][idx[0]]
deltas[failed[i]] = _dists[i][idx[0]]
failed = np.argwhere(LargerNei == -1).flatten()
data['rhos']=rhos
data['deltas']=deltas
else:
data['rhos']=[]
data['deltas']=[]
return data
def pool(data,dc,resol,mindelta,t,output,radius,refine=True):
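    """Cluster scored pixels into loop calls and write them to a BEDPE file.

    Candidates are filtered by the score threshold `t` and a minimum anchor
    distance of 11 bins; rho and delta are then computed per chromosome, and
    pixels with rho > 0 and delta > mindelta become cluster centres. Every other
    pixel joins the cluster of its nearest denser neighbour, and per cluster
    either the highest-scoring pixel (refine=True) or the centre itself is kept.

    Returns the number of loops, the set of chromosomes without any loop call,
    and the set of all chromosomes present in the input.
    """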
ccs = set(data.iloc[:,0])
if data.shape[0] == 0:
print("#"*88,'\n#')
print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
print("#"*88,'\n')
sys.exit(1)
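    # Keep candidates above the score threshold whose anchors are more than
    # 11 bins apart (near-diagonal pixels are excluded).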
data = data[data[6] > t].reset_index(drop=True)
data = data[data[4] - data[1] > 11*resol].reset_index(drop=True)
if data.shape[0] == 0:
print("#"*88,'\n#')
print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
print("#"*88,'\n')
sys.exit(1)
data[['rhos','deltas']]=0
data=data.groupby([0]).apply(rhoDelta,resol=resol,dc=dc,radius=radius).reset_index(drop=True)
minrho=0
targetData=data.reset_index(drop=True)
loopPds=[]
chroms=tqdm(set(targetData[0]), dynamic_ncols=True)
for chrom in chroms:
        chroms.desc = f"[Running clustering on {chrom}]"
data = targetData[targetData[0]==chrom].reset_index(drop=True)
pos = data[[1, 4]].to_numpy() // resol
posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
rhos = data['rhos'].to_numpy()
deltas = data['deltas'].to_numpy()
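        # Cluster centres: locally dense pixels that are far from any denser pixel.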
centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()
_r = 100
_indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
LargerNei = rhos * 0 - 1
for i in range(len(_indexes)):
idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
if idx.shape[0] == 0:
pass
else:
LargerNei[i] = _indexes[i][idx[0]]
failed = np.argwhere(LargerNei == -1).flatten()
while len(failed) > 1 and _r < 100000:
_r = _r * 10
_indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
for i in range(len(_indexes)):
idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
if idx.shape[0] == 0:
pass
else:
LargerNei[failed[i]] = _indexes[i][idx[0]]
failed = np.argwhere(LargerNei == -1).flatten()
LargerNei = LargerNei.astype(int)
label = LargerNei * 0 - 1
for i in range(len(centroid)):
label[centroid[i]] = i
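        # Assign labels in order of decreasing density: each unlabelled pixel
        # inherits the label of its nearest denser neighbour.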
decreasingsortedIdxRhos = np.argsort(-rhos)
for i in decreasingsortedIdxRhos:
if label[i] == -1:
label[i] = label[LargerNei[i]]
val = data[6].to_numpy()
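        # Refinement: report the highest-scoring pixel of each cluster instead of
        # the cluster centre.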
refinedLoop = []
label = label.flatten()
for l in set(label):
idx = np.argwhere(label == l).flatten()
if len(idx) > 0:
refinedLoop.append(idx[np.argmax(val[idx])])
if refine:
loopPds.append(data.loc[refinedLoop])
else:
loopPds.append(data.loc[centroid])
loopPd=pd.concat(loopPds).sort_values(6,ascending=False)
loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
loopPd[[0,1,2,3,4,5,6]].to_csv(output,sep='\t',header=False, index=False)
ccs_ = set(loopPd.iloc[:,0])
badc = ccs.difference(ccs_)
return len(loopPd),badc,ccs
@click.command()
@click.option('-b','--batchsize', type=int, default=128, help='Batch size [128]')
@click.option('-C','--cpu', type=bool, default=False, help='Use CPU [False]')
@click.option('-G','--gpu', type=str, default=None, help='Comma-separated GPU indices [auto select]')
@click.option('-c','--chrom', type=str, default=None, help='Comma-separated chroms [all chromosomes]')
@click.option('-nw','--workers', type=int, default=16, help='Number of cpu threads [16]')
@click.option('-t','--threshold', type=float, default=0.6, help='Loop Score Threshold [0.6]')
@click.option('-s','--sparsity', type=float, default=0.9, help='Allowed sparsity of submatrices [0.9]')
@click.option('-md','--max_distance', type=int, default=3000000, help='Max distance (bp) between contact pairs [3000000]')
@click.option('-r','--resol',type=int,default=5000,help ='Resolution [5000]')
@click.option('-dc','--distance_cutoff', type=int, default=5, help='Distance cutoff (in bins) for local density calculation [5]')
@click.option('-R','--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d','--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('--raw', type=bool, default=False, help='Use raw instead of balanced contact matrix [False]')
@click.option('-i','--input', type=str,required=True,help='Hi-C contact map path')
@click.option('-o','--output', type=str,required=True,help='.bedpe file path to save loops')
def pred(batchsize, cpu, gpu, chrom, threshold, sparsity, workers, max_distance, resol, distance_cutoff, radius, mindelta, input, output, raw, image=224):
    """Predict loops directly from an input contact map."""
print('\npolaris loop pred START :)')
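    # Keep only the central (image//2 x image//2) region of each prediction window;
    # windows are tiled with step = center_size, so the trimmed borders are
    # covered by neighbouring windows.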
center_size = image // 2
start_idx = (image - center_size) // 2
end_idx = (image + center_size) // 2
slice_obj_pred = (slice(None), slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
slice_obj_coord = (slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
results=[]
if cpu:
assert gpu is None, "\033[91m QAQ The CPU and GPU modes cannot be used simultaneously. Please check the command. \033[0m\n"
gpu = ['None']
device = torch.device("cpu")
print('Using CPU mode... (This may take significantly longer than using GPU mode.)')
else:
if torch.cuda.is_available():
if gpu is not None:
print("Using the specified GPU: " + gpu)
gpu=[int(i) for i in gpu.split(',')]
device = torch.device(f"cuda:{gpu[0]}")
else:
gpuIdx = torch.cuda.current_device()
device = torch.device(gpuIdx)
print("Automatically selected GPU: " + str(gpuIdx))
gpu=[gpu]
else:
device = torch.device("cpu")
gpu = ['None']
cpu = True
print('GPU is not available!')
print('Using CPU mode... (This may take significantly longer than using GPU mode.)')
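    # The input is expected to be a multi-resolution cooler (.mcool); open the
    # matrix at the requested resolution.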
coolfile = cooler.Cooler(input + '::/resolutions/' + str(resol))
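    # The bundled checkpoint stores the network weights together with the
    # hyperparameters needed to rebuild the architecture.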
modelstate = str(files('polaris').joinpath('model/sft_loop.pt'))
_modelstate = torch.load(modelstate, map_location=device.type)
parameters = _modelstate['parameters']
if chrom is None:
chrom =coolfile.chromnames
else:
chrom = chrom.split(',')
# for rmchr in ['chrMT','MT','chrM','M','Y','chrY','X','chrX','chrW','W','chrZ','Z']: # 'Y','chrY','X','chrX'
# if rmchr in chrom:
# chrom.remove(rmchr)
print(f"Analysing chroms: {chrom}")
model = polarisnet(
image_size=parameters['image_size'],
in_channels=parameters['in_channels'],
out_channels=parameters['out_channels'],
embed_dim=parameters['embed_dim'],
depths=parameters['depths'],
channels=parameters['channels'],
num_heads=parameters['num_heads'],
drop=parameters['drop'],
drop_path=parameters['drop_path'],
pos_embed=parameters['pos_embed']
).to(device)
model.load_state_dict(_modelstate['model_state_dict'])
if not cpu and len(gpu) > 1:
model = nn.DataParallel(model, device_ids=gpu)
model.eval()
print('\n********score START********')
badc=[]
chrom_ = tqdm(chrom, dynamic_ncols=True)
for _chrom in chrom_:
test_data = centerPredCoolDataset(coolfile,_chrom,max_distance_bin=max_distance//resol,w=image,step=center_size,s=sparsity,raw=raw)
        test_dataloader = DataLoader(test_data, batch_size=batchsize, shuffle=False, num_workers=workers, prefetch_factor=4, pin_memory=(not cpu))  # pin memory only when running on GPU
chrom_.desc = f"[Analyzing {_chrom} with {len(test_data)} submatrices]"
if len(test_data) == 0:
badc.append(_chrom)
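        # Score every submatrix; keep pixels whose sigmoid output exceeds the
        # threshold and convert bin indices back to base-pair coordinates.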
with torch.no_grad():
for X in test_dataloader:
bin_i,bin_j,targetX=X
bin_i = bin_i*resol
bin_j = bin_j*resol
with autocast():
pred = torch.sigmoid(model(targetX.float().to(device)))[slice_obj_pred].flatten()
loop = torch.nonzero(pred>threshold).flatten().cpu()
prob = pred[loop].cpu().numpy().flatten().tolist()
frag1 = bin_i[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()
frag2 = bin_j[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()
for i in range(len(frag1)):
# if frag1[i] < frag2[i] and frag2[i]-frag1[i] > 11*resol and frag2[i]-frag1[i] < max_distance:
if frag1[i] < frag2[i] and frag2[i]-frag1[i] < max_distance:
results.append([_chrom, frag1[i], frag1[i] + resol,
_chrom, frag2[i], frag2[i] + resol,
prob[i]])
if len(badc)==len(chrom):
raise ValueError("score FAILED :(\nThe '-s' value needs to be increased for more sparse data.")
else:
print(f'********score FINISHED********')
if len(badc)>0:
            print(f"· But the contact matrices of {badc} are too small or too sparse.\n· You may need to check the data or re-run these chromosomes separately with a higher -s.")
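    # Pool the per-pixel scores into loop calls via the density-peak clustering above.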
print(f'********pool START********')
df = pd.DataFrame(results)
loopNum,badcp,ccs = pool(df,distance_cutoff,resol,mindelta,threshold,output,radius)
if len(badcp) == len(ccs):
raise ValueError("pool FAILED :(\nPlease check input and mcool file to yield scoreFile. Or use higher '-s' value for more sparse mcool data.")
else:
print(f'********pool FINISHED********')
if len(badcp) > 0:
            print(f"· But the loop scores for {badcp} are too sparse.\n· You may need to check the mcool data or re-run polaris loop score with a higher -s.")
print(f'\npolaris loop pred FINISHED :)\n{loopNum} loops saved to {output}')
if __name__ == '__main__':
pred()