Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 2,152 Bytes
c0ec7e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from importlib import resources
import numpy as np
import pandas as pd
from subword_nmt.apply_bpe import BPE
import codecs
# ESPF vocabulary setup, executed once at import time.  For proteins and for
# drugs it builds a BPE encoder from a merge-codes file and a lookup table
# mapping each subword unit to its row position in the companion CSV.

# --- Protein vocabulary -----------------------------------------------------
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/protein_codes_uniprot.txt')
# subword_nmt's BPE parses the merge table inside its constructor, so the
# handle is closed as soon as the encoder exists instead of staying open for
# the life of the process.  `bpe_codes_protein` remains bound (to the closed
# file) for anything that imports the name.
with codecs.open(vocab_path) as bpe_codes_protein:
    protein_bpe = BPE(bpe_codes_protein, merges=-1, separator='')

sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_uniprot.csv')
sub_csv = pd.read_csv(sub_csv_path)
# The 'index' column lists the subword units; a unit's integer id is its row
# position in that column.
idx2word_protein = sub_csv['index'].values
words2idx_protein = {word: idx for idx, word in enumerate(idx2word_protein)}

# --- Drug vocabulary --------------------------------------------------------
# NOTE: `vocab_path`, `sub_csv_path` and `sub_csv` are deliberately rebound
# here, so after import they refer to the drug files.
vocab_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/drug_codes_chembl.txt')
with codecs.open(vocab_path) as bpe_codes_drug:
    drug_bpe = BPE(bpe_codes_drug, merges=-1, separator='')

sub_csv_path = resources.files('deepscreen').parent.joinpath('resources/vocabs/ESPF/subword_units_map_chembl.csv')
sub_csv = pd.read_csv(sub_csv_path)
idx2word_drug = sub_csv['index'].values
words2idx_drug = {word: idx for idx, word in enumerate(idx2word_drug)}
def protein_to_embedding(x, max_sequence_length):
    """Encode a protein sequence as fixed-length subword indices plus a mask.

    The sequence is segmented by ``protein_bpe`` and every resulting subword
    is looked up in ``words2idx_protein``.  The index vector is then
    right-padded with zeros, or truncated, to exactly ``max_sequence_length``
    entries.

    Args:
        x: Sequence string handed to ``protein_bpe.process_line``.
        max_sequence_length: Length of both returned arrays.

    Returns:
        A tuple ``(indices, input_mask)`` of NumPy arrays.  ``input_mask``
        holds 1 at positions filled from the encoded sequence and 0 at
        padded positions.
    """
    tokens = protein_bpe.process_line(x).split()
    try:
        # dtype=int keeps an empty token list from turning into a float64
        # array; for non-empty input it matches NumPy's default integer type.
        indices = np.asarray([words2idx_protein[tok] for tok in tokens], dtype=int)
    except KeyError:
        # A subword missing from the vocabulary makes the whole sequence fall
        # back to the single index 0.  Only KeyError is caught so that
        # KeyboardInterrupt and genuine bugs are no longer swallowed.
        indices = np.array([0])

    n_tokens = len(indices)
    if n_tokens < max_sequence_length:
        n_pad = max_sequence_length - n_tokens
        indices = np.pad(indices, (0, n_pad), 'constant', constant_values=0)
        input_mask = [1] * n_tokens + [0] * n_pad
    else:
        indices = indices[:max_sequence_length]
        input_mask = [1] * max_sequence_length
    return indices, np.asarray(input_mask)
def drug_to_embedding(x, max_sequence_length):
    """Encode a drug string as fixed-length subword indices plus a mask.

    The input is segmented by ``drug_bpe`` and every resulting subword is
    looked up in ``words2idx_drug``.  The index vector is then right-padded
    with zeros, or truncated, to exactly ``max_sequence_length`` entries.

    Args:
        x: Drug string handed to ``drug_bpe.process_line``.
        max_sequence_length: Length of both returned arrays.

    Returns:
        A tuple ``(indices, input_mask)`` of NumPy arrays.  ``input_mask``
        holds 1 at positions filled from the encoded input and 0 at padded
        positions.
    """
    tokens = drug_bpe.process_line(x).split()
    try:
        # dtype=int keeps an empty token list from turning into a float64
        # array; for non-empty input it matches NumPy's default integer type.
        indices = np.asarray([words2idx_drug[tok] for tok in tokens], dtype=int)
    except KeyError:
        # A subword missing from the vocabulary makes the whole input fall
        # back to the single index 0.  Only KeyError is caught so that
        # KeyboardInterrupt and genuine bugs are no longer swallowed.
        indices = np.array([0])

    n_tokens = len(indices)
    if n_tokens < max_sequence_length:
        n_pad = max_sequence_length - n_tokens
        indices = np.pad(indices, (0, n_pad), 'constant', constant_values=0)
        input_mask = [1] * n_tokens + [0] * n_pad
    else:
        indices = indices[:max_sequence_length]
        input_mask = [1] * max_sequence_length
    return indices, np.asarray(input_mask)
|