DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Files Community

DeepSEQreen_fast_build / deepscreen /data /featurizers /fcs.py

libokj

Upload 110 files

c0ec7e6 about 1 year ago

raw

history blame

2.15 kB

	from importlib import resources

	import numpy as np
	import pandas as pd
	from subword_nmt.apply_bpe import BPE
	import codecs

	# Module-level ESPF vocabularies: loaded once at import time and shared by the
	# featurizer functions in this module.

	# Protein side: BPE merge rules plus the subword -> integer index table.
	vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
	# BPE consumes the whole codes file inside its constructor, so the handle can
	# be closed as soon as the object is built instead of leaking until exit.
	with codecs.open(vocab_path) as bpe_codes_protein:
	    protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')

	sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
	sub_csv = pd.read_csv(sub_csv_path)
	# The 'index' column holds the subword strings; a subword's row position is
	# the integer id assigned to it.
	idx2word_protein = sub_csv['index'].values
	words2idx_protein = {word: idx for idx, word in enumerate(idx2word_protein)}

	# Drug side: same layout as above, using the ChEMBL vocabulary files.
	vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
	with codecs.open(vocab_path) as bpe_codes_drug:
	    drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')

	sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
	sub_csv = pd.read_csv(sub_csv_path)
	idx2word_drug = sub_csv['index'].values
	words2idx_drug = {word: idx for idx, word in enumerate(idx2word_drug)}


	def protein_to_embedding(x, max_sequence_length):
	    """Encode a protein sequence as fixed-length subword indices plus a mask.

	    The sequence is segmented with the module-level ``protein_bpe`` and each
	    resulting subword is looked up in ``words2idx_protein``.

	    Args:
	        x: Protein sequence string to encode.
	        max_sequence_length: Target length of the returned arrays (expected
	            to be non-negative). Shorter encodings are right-padded with 0,
	            longer ones are truncated.

	    Returns:
	        A tuple ``(indices, input_mask)`` of integer NumPy arrays.
	        ``input_mask`` is 1 where ``indices`` holds an encoded entry and 0
	        where it holds padding.
	    """
	    tokens = protein_bpe.process_line(x).split()
	    try:
	        # dtype=int keeps the result integral even when ``tokens`` is empty
	        # (np.asarray([]) alone would produce a float64 array).
	        indices = np.asarray([words2idx_protein[tok] for tok in tokens], dtype=int)
	    except KeyError:
	        # A subword missing from the vocabulary collapses the whole sequence
	        # to the single index 0. Only KeyError is caught so unrelated
	        # failures (e.g. KeyboardInterrupt) are no longer silently swallowed.
	        indices = np.array([0])

	    length = len(indices)

	    if length < max_sequence_length:
	        pad = max_sequence_length - length
	        padded = np.pad(indices, (0, pad), 'constant', constant_values=0)
	        input_mask = ([1] * length) + ([0] * pad)
	    else:
	        padded = indices[:max_sequence_length]
	        input_mask = [1] * max_sequence_length

	    return padded, np.asarray(input_mask, dtype=int)


	def drug_to_embedding(x, max_sequence_length):
	    """Encode a drug string as fixed-length subword indices plus a mask.

	    The input is segmented with the module-level ``drug_bpe`` and each
	    resulting subword is looked up in ``words2idx_drug``.

	    Args:
	        x: Drug string to encode (presumably SMILES, given the ChEMBL
	            vocabulary -- confirm against callers).
	        max_sequence_length: Target length of the returned arrays (expected
	            to be non-negative). Shorter encodings are right-padded with 0,
	            longer ones are truncated.

	    Returns:
	        A tuple ``(indices, input_mask)`` of integer NumPy arrays.
	        ``input_mask`` is 1 where ``indices`` holds an encoded entry and 0
	        where it holds padding.
	    """
	    tokens = drug_bpe.process_line(x).split()
	    try:
	        # dtype=int keeps the result integral even when ``tokens`` is empty
	        # (np.asarray([]) alone would produce a float64 array).
	        indices = np.asarray([words2idx_drug[tok] for tok in tokens], dtype=int)
	    except KeyError:
	        # A subword missing from the vocabulary collapses the whole sequence
	        # to the single index 0. Only KeyError is caught so unrelated
	        # failures (e.g. KeyboardInterrupt) are no longer silently swallowed.
	        indices = np.array([0])

	    length = len(indices)

	    if length < max_sequence_length:
	        pad = max_sequence_length - length
	        padded = np.pad(indices, (0, pad), 'constant', constant_values=0)
	        input_mask = ([1] * length) + ([0] * pad)
	    else:
	        padded = indices[:max_sequence_length]
	        input_mask = [1] * max_sequence_length

	    return padded, np.asarray(input_mask, dtype=int)