gzhong committed
Commit 8d4f72a · verified · 1 Parent(s): bfbdd3d

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -71,3 +71,14 @@ data.files/gMVP.MSA.tgz.part-ab filter=lfs diff=lfs merge=lfs -text
  parse.input.table/swissprot_and_human.full.seq.csv.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
  parse.input.table/swissprot_and_human.full.seq.csv.tgz.part-ab filter=lfs diff=lfs merge=lfs -text
  parse.input.table/swissprot_and_human.full.seq.csv.tgz.part-ac filter=lfs diff=lfs merge=lfs -text
+ MSA.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
+ af2.files.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
+ esm.MSA.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
+ esm.MSA.tgz.part-ab filter=lfs diff=lfs merge=lfs -text
+ esm.files.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
+ esm.files.tgz.part-ab filter=lfs diff=lfs merge=lfs -text
+ esm.files.tgz.part-ac filter=lfs diff=lfs merge=lfs -text
+ esm.files.tgz.part-ad filter=lfs diff=lfs merge=lfs -text
+ esm.files.tgz.part-ae filter=lfs diff=lfs merge=lfs -text
+ gMVP.MSA.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
+ pretrain.tgz.part-aa filter=lfs diff=lfs merge=lfs -text
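
Note on the `*.tgz.part-aa` / `part-ab` entries above: these are pieces of archives split into chunks (each `esm.files.tgz` part is exactly 10 GiB, 10737418240 bytes). A minimal Python sketch for putting one back together, assuming the parts are a plain byte split of the original archive in suffix order; the path is illustrative:

import glob
import shutil

def reassemble(prefix):
    # lexicographic sort matches the aa/ab/ac... part suffixes
    parts = sorted(glob.glob(prefix + '.part-*'))
    with open(prefix, 'wb') as out:
        for part in parts:
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, out)

reassemble('esm.files.tgz')  # then extract, e.g. tar -xzf esm.files.tgz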
ALL.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ba809ae66bc775ff9072934ade95a242fc378962eb6562fca5e926e6eadd5d2
+ size 11103682
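
Each ADDED entry of this shape is a Git LFS pointer rather than the data itself: per the spec referenced above, `oid` is the SHA-256 of the real object and `size` is its byte count. A quick sketch for checking a downloaded file against its pointer (path reuses the entry above; the helper name is illustrative):

import hashlib
import os

def matches_pointer(path, oid, size):
    # compare on-disk byte count and SHA-256 digest with the pointer fields
    if os.path.getsize(path) != size:
        return False
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() == oid

print(matches_pointer('ALL.csv.gz', '7ba809ae66bc775ff9072934ade95a242fc378962eb6562fca5e926e6eadd5d2', 11103682))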
ASPA.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5e401c824edb3ae63e05d973dafb82a40e0c0b4167f8ea8a44f255cb2e61a2f
+ size 6497828
CCR5.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32b1fd9f3c79438e5804c24fbfd63db0f102741d60837e1735d03e554005027a
+ size 5790823
CXCR4.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5be7e596768b968ec92ae5a74857fc9d047d03a1387fed181fa378edf41eb057
+ size 5826481
CYP2C9.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d89ae48ab501c9b110c8da0b89ebe40a8843b3af8a80ce26a751f9548f246984
+ size 7476136
GCK.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6af4a7f205161f42e26b645c0c7baa468be540af912a5349cc700945a10429e7
+ size 9954541
ICC.seed.0.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb36b5eadfefc849887c480554752f43a6b76c545a2ced700d260a1cfe56c592
+ size 4588973
ICC.seed.1.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:402f281b6b1f83da14b1b1ab81774c2da45a67883f6c0d9d778a804a15722bd2
+ size 4577813
ICC.seed.2.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82a8afea44f05506266ffcddce3dcbff145aad59427b606b7c4d5356fba26a80
+ size 4577738
ICC.seed.3.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b52b45a04a73cf531bc9be4ef31785dc3dd2f8b0cb649aa4b8fd3ed3ec5c99e
+ size 4588209
ICC.seed.4.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:405b089729ebdcc19710f135d06e31896cfbd4a99b0e4098926443aa218817f9
+ size 4563681
MSA.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8b28e09ba5f647d720f865fc37c21063efe8fa5f6b1fe67d4b9f1348f21c6b5
+ size 853335
MSA.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:670504f78fb5828face000895457b0bdf006e2c762f90546d4160c1741abdad7
+ size 756213964
NUDT15.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac39d81c5d54a408e4a024961cd889f2702cab1f7ed40d83262067f1314c494f
+ size 2648155
PTEN.bin.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbe7d33dbbf5630c5c161986eb052f12deb61b7ccbf678fa7ff5bc7897ba4c59
+ size 983279
PTEN.replicate.rest.1.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a5a8491533eb09736c97526c22aa83220d3df81cbc3e086d0c97cf82fecbac4
+ size 4065895
PTEN.replicate.rest.2.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e22735a538e5dc3ffbf3926514ea32a3b8ef5638f0fd274eacab37ec3e076f7
+ size 4065999
PTEN.replicate.rest.3.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6bb4a7d231337c8436e3a0b46af24e7b8e87953c75676ed9849a6d27b87157d
+ size 4065725
PTEN.replicate.rest.4.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90216071931145f6c8b4a2ec31224f7bb1edce8a43490e16a57173a6cc634433
+ size 4065705
PTEN.replicate.rest.5.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87694c8d852f9d79230f7e064b4079902b17b4e4a66347b9b359902d987d8e23
+ size 4065975
PTEN.replicate.rest.6.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c2afb3de5c57da35333d61dc11e48732916455cd4f48430c0291161702797fa
+ size 4065851
PTEN.replicate.rest.7.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:470a04794e6e88bc525ca7ca9de5f05ba0c4406cd321b3aa11a6f77beda102d2
+ size 4065635
PTEN.replicate.rest.8.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9004017f797684b8166ab337e3ba7211a0920fbc733d69ef1192d449d5ce51b
+ size 4065531
PTEN.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4eff76435a175961076847bdea25b79f2a45a548a9b1f393dc1d73a32fbca170
+ size 4244420
SNCA.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da09fe734efc81689f94e280656d497f3c3118e3389498b945bbae5daa0bd562
+ size 1932672
Stab.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b4332f6557b56d355eb012317070675941c985f217f5a52c032e4789af5dc1f
+ size 49191908
af2.files.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8c99e093f3deb83e5cf57f7d3f2003af69268e2d7020342d23d98dfe8464342
+ size 1747009312
esm.MSA.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2a14923d1aaea14ad3ccedaafc5cc89b819811cc6fd8fd58584a7572dff0d91
+ size 10737418240
esm.MSA.tgz.part-ab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a40159327f7454b4ff2ee5285e3bb7d778936bca39e41649d369aa3a3499a6aa
+ size 2929687838
esm.files.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98a5c81d3651a6cc33c0e99a4e2b206fae7ee26c3bf9a0df8ee006741004a8c6
+ size 10737418240
esm.files.tgz.part-ab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9caf98a5e3acba2a0ca3f9a3c3effe2803e82c0a3114bf5f911f77f105e9a949
+ size 10737418240
esm.files.tgz.part-ac ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa44afe7158f565e229a8ead8d8760cb8393340c6e3287e457385d9678cb31df
+ size 10737418240
esm.files.tgz.part-ad ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91d4e1b0fa3a399a6755537f64783cbe6eb0a731e2d7812f091a4919d3c39268
+ size 10737418240
esm.files.tgz.part-ae ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b975435639ee8fc7ab9b45da83a35dc2f246a943aec53a59d459b6a46ab6a85d
+ size 362286942
esm.inference.py ADDED
@@ -0,0 +1,177 @@
+ import pandas as pd
+ import numpy as np
+ import os
+ import esm
+ import torch
+ import argparse
+
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '-1'
+
+
+ def precompute_sequence(transcript_id, sequence, esm_model, batch_converter, out_dir, device_id=0):
+     if os.path.exists(os.path.join(out_dir, transcript_id + '.contacts.npy')):
+         return
+     else:
+         print('begin precompute sequence for {}'.format(transcript_id))
+     try:
+         data = [(transcript_id, sequence)]
+         _, _, toks = batch_converter(data)
+     except Exception:
+         print(f'tokenization failed for {transcript_id}')
+         return
+     toks = toks.to(f'cuda:{device_id}')
+     aa = toks.shape[1]
+     if aa <= 2250:  # only sequences longer than 2250 tokens are processed by this function
+         print(f"{transcript_id} has {toks.shape[1]} amino acids")
+         return
+     with torch.no_grad():
+         out = esm_model(toks, repr_layers=[33], return_contacts=True, need_head_weights=False)
+     representations = out["representations"][33][0].to(device='cpu').detach().numpy()
+     # output is batch x layers x heads x seqlen x seqlen
+     # attentions = out["attentions"][0].to(device="cpu").detach().numpy()
+     contacts = out['contacts'][0].to(device="cpu").detach().numpy()
+     logits = out['logits'][0].to(device="cpu").detach().numpy()
+     np.save(
+         f"{out_dir}/{transcript_id}.representations.layer.48.npy",  # NB: layer-33 features saved under a 'layer.48' filename
+         representations,
+     )
+     np.save(
+         f"{out_dir}/{transcript_id}.contacts.npy",
+         contacts,
+     )
+     np.save(
+         f"{out_dir}/{transcript_id}.logits.npy",
+         logits,
+     )
+     return
+
+
+ def precompute_sequence_multiple_gpus(transcript_id, sequence, esm_model, batch_converter, out_dir):
+     if os.path.exists(os.path.join(out_dir, transcript_id + '.contacts.npy')):
+         return
+     else:
+         print('begin precompute sequence for {}'.format(transcript_id))
+     try:
+         data = [(transcript_id, sequence)]
+         _, _, toks = batch_converter(data)
+     except Exception:
+         print(f'tokenization failed for {transcript_id}')
+         return
+     toks = toks.to('cuda:0')
+     if toks.shape[1] > 30000:
+         print(f"{transcript_id} has {toks.shape[1]} amino acids, don't proceed")
+         return
+     print(f"{transcript_id} has {toks.shape[1]} amino acids")
+     if toks.shape[1] > 5500:  # skip attention/contact outputs for very long sequences
+         need_head_weights = False
+         return_contacts = False
+     else:
+         need_head_weights = True
+         return_contacts = True
+     with torch.no_grad():
+         assert toks.ndim == 2
+         padding_mask = toks.eq(esm_model.padding_idx)  # B, T
+         x = esm_model.embed_scale * esm_model.embed_tokens(toks)
+
+         if esm_model.token_dropout:
+             x.masked_fill_((toks == esm_model.mask_idx).unsqueeze(-1), 0.0)
+             # x: B x T x C
+             mask_ratio_train = 0.15 * 0.8
+             src_lengths = (~padding_mask).sum(-1)
+             mask_ratio_observed = (toks == esm_model.mask_idx).sum(-1).to(x.dtype) / src_lengths
+             x = x * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]
+
+         if padding_mask is not None:
+             x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+
+         repr_layers = {33}
+         hidden_representations = {}
+         if 0 in repr_layers:
+             hidden_representations[0] = x
+         if need_head_weights:
+             attn_weights = []
+         # (B, T, E) => (T, B, E)
+         x = x.transpose(0, 1)
+         if not padding_mask.any():
+             padding_mask = None
+         for layer_idx, layer in enumerate(esm_model.layers):
+             x = x.to(f'cuda:{layer_idx // 9}')  # move activations to the GPU holding this layer (9 layers per device)
+             x, attn = layer(
+                 x,
+                 self_attn_padding_mask=padding_mask,
+                 need_head_weights=need_head_weights,
+             )
+             if (layer_idx + 1) in repr_layers:
+                 hidden_representations[layer_idx + 1] = x.transpose(0, 1)
+             if need_head_weights:
+                 # (H, B, T, T) => (B, H, T, T)
+                 attn_weights.append(attn.transpose(1, 0).cpu())
+         x = esm_model.emb_layer_norm_after(x)
+         x = x.transpose(0, 1)  # (T, B, E) => (B, T, E)
+
+         # last hidden representation should have layer norm applied
+         if (layer_idx + 1) in repr_layers:
+             hidden_representations[layer_idx + 1] = x
+         # lm head is on cuda:0, x is on cuda:3
+         x = esm_model.lm_head(x.to('cuda:0'))
+         out = {"logits": x, "representations": hidden_representations}
+         if need_head_weights:
+             # attentions: B x L x H x T x T
+             attentions = torch.stack(attn_weights, 1)
+             if padding_mask is not None:
+                 attention_mask = 1 - padding_mask.type_as(attentions)
+                 attention_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+                 attentions = attentions * attention_mask[:, None, None, :, :]
+             out["attentions"] = attentions
+             if return_contacts:
+                 contacts = esm_model.contact_head(toks, attentions)
+                 out["contacts"] = contacts
+     representations = out["representations"][33][0].to(device='cpu').detach().numpy()
+     # output is batch x layers x heads x seqlen x seqlen
+
+     logits = out['logits'][0].to(device="cpu").detach().numpy()
+     np.save(
+         f"{out_dir}/{transcript_id}.representations.layer.48.npy",
+         representations,
+     )
+     np.save(
+         f"{out_dir}/{transcript_id}.logits.npy",
+         logits,
+     )
+     if return_contacts:
+         contacts = out['contacts'][0].to(device="cpu").detach().numpy()
+         np.save(
+             f"{out_dir}/{transcript_id}.contacts.npy",
+             contacts,
+         )
+     return
+
+
+ def main(file=None, outdir=None):
+     model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
+     if torch.cuda.is_available():
+         # manually split the model across 4 GPUs
+         model.embed_tokens.to('cuda:0')
+         for layer_idx, layer in enumerate(model.layers):
+             layer.to(f'cuda:{layer_idx // 9}')
+         model.emb_layer_norm_after.to('cuda:3')
+         model.lm_head.to('cuda:0')
+         model.contact_head.to('cpu')
+         print("Transferred model to GPUs")
+         # model = model.to(f'cuda:{rank}')
+     if file is None:
+         return
+     files = pd.read_csv(file, index_col=0)
+     os.makedirs(outdir, exist_ok=True)
+     for transcript_id, sequence in zip(files['uniprotID'], files['sequence']):
+         precompute_sequence_multiple_gpus(transcript_id, sequence, model,
+                                           alphabet.get_batch_converter(),
+                                           outdir)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--file', type=str, default=None)
+     parser.add_argument('--outdir', type=str, default=None)
+     args = parser.parse_args()
+     main(args.file, args.outdir)
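
For reference, main() expects a CSV whose first column is the pandas index and which carries `uniprotID` and `sequence` columns; esm2_t33_650M_UR50D is sharded over four CUDA devices (nine layers per GPU), and outputs land in `--outdir` as `{id}.representations.layer.48.npy`, `{id}.logits.npy`, and, for sequences of at most 5500 tokens, `{id}.contacts.npy`. A minimal sketch of an invocation, with an illustrative path and placeholder sequence:

import pandas as pd

# one row per protein; columns must be named exactly 'uniprotID' and 'sequence'
pd.DataFrame({
    'uniprotID': ['P15056'],
    'sequence': ['MAALSGGGGGGAE'],  # illustrative placeholder, not the real sequence
}).to_csv('my.proteins.csv')  # default index becomes column 0, matching index_col=0

# then, on a machine with 4 GPUs:
#   python esm.inference.py --file my.proteins.csv --outdir esm.out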
fluorescence.tgz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1089b47959bb0540b338460de27a666dd1788372d3bb499e0b33fa4089c05448
+ size 15124238
gMVP.MSA.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9e251ed2ac02926a7f121b585eef3bb8fe351826c2ebf2410fc93f1a7dcd146
+ size 3105673966
preprocess.gene.pfam.R ADDED
@@ -0,0 +1,83 @@
+ source('/share/vault/Users/gz2294/Pipeline/uniprot.table.add.annotation.R')
+ ALL <- read.csv('ALL.csv', row.names = 1)
+
+ ALL <- uniprot.table.add.annotation.parallel(ALL, "InterPro")
+ # remove glazer
+ ALL <- ALL[ALL$data_source != "glazer",]
+
+ good.uniprotIDs <- data.frame(
+   uniprotID=c("P15056", "P21802", "P07949",
+               "P04637", "Q09428", "O00555",
+               "Q14654", "Q99250", "Q14524"))
+ good.uniprotIDs.df <- data.frame()
+ frac <- 0.8
+ for (seed in 0:4) {
+   split.dir <- paste0('ICC.seed.', seed, '/')
+   dir.create(split.dir)
+   for (i in 1:dim(good.uniprotIDs)[1]) {
+     gene.itan <- ALL[ALL$uniprotID==good.uniprotIDs$uniprotID[i],]
+     # collect the InterPro domain IDs annotated on this gene's variants
+     pfams <- unique(unlist(strsplit(gene.itan$InterPro,";")))
+     # pick frac of variants as training
+     for (pfam in pfams) {
+       set.seed(seed)
+       GO.itan <- ALL[grep(pfam, ALL$InterPro),]
+       GO.itan.training <- GO.itan[!GO.itan$uniprotID %in% gene.itan$uniprotID,]
+       if (dim(GO.itan.training)[1] > 0) {
+         GO.itan.training$split <- 'train'
+       }
+       # select only the data in domain as train/test
+       gene.itan.domain <- gene.itan[grep(pfam, gene.itan$InterPro),]
+       # randomly select testing and validation;
+       # select equal amounts of gof and lof for testing
+       gof.training <- sample(which(gene.itan.domain$score==1), size = floor(sum(gene.itan.domain$score==1)*frac))
+       lof.training <- sample(which(gene.itan.domain$score==-1), size = floor(sum(gene.itan.domain$score==-1)*frac))
+       # select equal amounts of gof and lof for validation
+       if (length(gof.training) > 0 & length(lof.training) > 0) {
+         gene.itan.domain.training <- gene.itan.domain[c(gof.training, lof.training),]
+         gene.itan.domain.training$split <- 'train'
+         gof.val <- sample(which(gene.itan.domain.training$score==1), size = floor(sum(gene.itan.domain$score==1)*(1-frac)))
+         lof.val <- sample(which(gene.itan.domain.training$score==-1), size = floor(sum(gene.itan.domain$score==-1)*(1-frac)))
+         gene.itan.domain.training$split[c(gof.val, lof.val)] <- 'val'
+
+         GO.itan.testing <- gene.itan.domain[-c(gof.training, lof.training),]
+         if (dim(GO.itan.testing)[1] > 0) {
+           # first save the gene itself
+           dir.create(paste0(split.dir, good.uniprotIDs$uniprotID[i], '.', pfam, ".", "self"))
+           write.csv(gene.itan.domain.training[sample(dim(gene.itan.domain.training)[1]),], paste0(split.dir, good.uniprotIDs$uniprotID[i], ".", pfam, ".self", "/training.csv"))
+           write.csv(GO.itan.testing, paste0(split.dir, good.uniprotIDs$uniprotID[i], ".", pfam, ".self", "/testing.csv"))
+           good.uniprotIDs.df <- rbind(good.uniprotIDs.df,
+                                       data.frame(dataID=paste0(good.uniprotIDs$uniprotID[i], ".", pfam, ".self"),
+                                                  uniprotID=paste0(good.uniprotIDs$uniprotID[i]),
+                                                  pfam=pfam,
+                                                  gof.training=sum(gene.itan.domain.training$score==1),
+                                                  lof.training=sum(gene.itan.domain.training$score==-1),
+                                                  gof.testing=sum(GO.itan.testing$score==1),
+                                                  lof.testing=sum(GO.itan.testing$score==-1),
+                                                  seed=seed))
+           # next concatenate and shuffle
+           GO.itan.training <- dplyr::bind_rows(gene.itan.domain.training, GO.itan.training)
+           GO.itan.training <- GO.itan.training[sample(dim(GO.itan.training)[1]),]
+           GO.itan.testing <- GO.itan.testing[sample(dim(GO.itan.testing)[1]),]
+           # save the training files
+           dir.create(paste0(split.dir, good.uniprotIDs$uniprotID[i], '.', pfam, ".", pfam))
+           write.csv(GO.itan.training, paste0(split.dir, good.uniprotIDs$uniprotID[i], ".", pfam, ".", pfam, "/training.csv"))
+           write.csv(GO.itan.testing, paste0(split.dir, good.uniprotIDs$uniprotID[i], ".", pfam, ".", pfam, "/testing.csv"))
+
+           good.uniprotIDs.df <- rbind(good.uniprotIDs.df,
+                                       data.frame(dataID=paste0(good.uniprotIDs$uniprotID[i], ".", pfam, ".", pfam),
+                                                  uniprotID=paste0(good.uniprotIDs$uniprotID[i]),
+                                                  pfam=pfam,
+                                                  gof.training=sum(GO.itan.training$score==1),
+                                                  lof.training=sum(GO.itan.training$score==-1),
+                                                  gof.testing=sum(GO.itan.testing$score==1),
+                                                  lof.testing=sum(GO.itan.testing$score==-1),
+                                                  seed=seed))
+         }
+       }
+     }
+   }
+ }
+ entry.list <- read.delim('/share/vault/Users/gz2294/Data/Protein/InterPro/entry.list')
+ good.uniprotIDs.df$name <- entry.list$ENTRY_NAME[match(good.uniprotIDs.df$pfam, entry.list$ENTRY_AC)]
+ write.csv(good.uniprotIDs.df, file = "good.uniprotIDs.InterPros.csv")
preprocess.separate.gene.R ADDED
@@ -0,0 +1,63 @@
+ source('../analysis/prepare.biochem.R')
+ ALL <- read.csv('../analysis/figs/ALL.csv', row.names = 1)
+ ALL$score.label <- NULL
+ gof.lof.df <- data.frame(uniprotIDs=as.character(unique(unlist(strsplit(ALL$uniprotID, split = ";")))), gof=0, lof=0)
+ for (i in 1:dim(gof.lof.df)[1]) {
+   gene <- ALL[grep(gof.lof.df$uniprotIDs[i], ALL$uniprotID),]
+   gof.lof.df$gof[i] <- sum(gene$score==1)
+   gof.lof.df$lof[i] <- sum(gene$score==-1)
+ }
+
+ good.uniprotIDs <- gof.lof.df[gof.lof.df$gof >= 15 & gof.lof.df$lof >= 15, ]
+
+ # split at random
+ ratio <- 0.8
+ good.uniprotIDs$gof.training <- NA
+ good.uniprotIDs$gof.testing <- NA
+ good.uniprotIDs$lof.training <- NA
+ good.uniprotIDs$lof.testing <- NA
+ for (seed in 0:4) {
+   split.dir <- paste0('ICC.seed.', seed, '/')
+   dir.create(split.dir)
+   for (i in 1:dim(good.uniprotIDs)[1]) {
+     gene <- ALL[grep(good.uniprotIDs$uniprotIDs[i], ALL$uniprotID),]
+     gene.gof <- gene[gene$score == 1,]
+     gene.lof <- gene[gene$score == -1,]
+     set.seed(seed)
+     # pick ratio of variants as training
+     if (floor(dim(gene.gof)[1] * ratio) > 0 &
+         floor(dim(gene.lof)[1] * ratio) > 0) {
+       gene.gof.training <- sample(dim(gene.gof)[1], floor(dim(gene.gof)[1] * ratio))
+       gene.lof.training <- sample(dim(gene.lof)[1], floor(dim(gene.lof)[1] * ratio))
+       gene.training <- rbind(gene.gof[gene.gof.training,], gene.lof[gene.lof.training,])
+       gene.testing <- rbind(gene.gof[-gene.gof.training,], gene.lof[-gene.lof.training,])
+
+       gene.training <- gene.training[sample(dim(gene.training)[1]),]
+       gene.testing <- gene.testing[sample(dim(gene.testing)[1]),]
+
+       dir.create(paste0(split.dir, good.uniprotIDs$uniprotIDs[i]))
+
+       uid <- good.uniprotIDs$uniprotIDs[i]
+       if (uid == 'Q14524') {
+         uid <- 'Q14524.clean'
+       }
+
+       if (!file.exists(paste0(split.dir, uid, "/training.csv")) | uid=='Q99250') {
+         print(uid)
+         write.csv(gene.training, paste0(split.dir, uid, "/training.csv"))
+       }
+       if (!file.exists(paste0(split.dir, uid, "/testing.csv")) | uid=='Q99250') {
+         print(uid)
+         write.csv(gene.testing, paste0(split.dir, uid, "/testing.csv"))
+       }
+
+       good.uniprotIDs$gof.training[i] <- sum(gene.training$score==1)
+       good.uniprotIDs$lof.training[i] <- sum(gene.training$score==-1)
+
+       good.uniprotIDs$gof.testing[i] <- sum(gene.testing$score==1)
+       good.uniprotIDs$lof.testing[i] <- sum(gene.testing$score==-1)
+
+     }
+   }
+ }
+ write.csv(good.uniprotIDs, file = "sup.data.1.csv")
preprocess.separate.gene.itan.R ADDED
@@ -0,0 +1,258 @@
+ # sum the variant number of uniprotIDs in the dataset
+ library(ggplot2)
+ source('~/Pipeline/uniprot.table.add.annotation.R')
+ ALL <- read.csv('ALL.csv', row.names = 1)
+ ALL$score[ALL$score == 0] <- -1
+ # remove glazer
+ ALL <- ALL[ALL$data_source != "glazer",]
+ ALL$score.label <- NULL
+ good.uniprotIDs <- data.frame(
+   uniprotIDs=c("P15056", "P21802", "P07949",
+                "P04637", "Q09428", "O00555",
+                "Q14654", "Q99250"))
+
+ core.columns <- c("uniprotID", "ref", "alt", "pos.orig", "ENST", "wt.orig", "score")
+ for (i in 1:dim(good.uniprotIDs)[1]) {
+   GO <- ALL[grep(good.uniprotIDs$uniprotIDs[i], ALL$uniprotID),]
+   print(table(GO$score[!grepl('Itan', GO$data_source)]))
+ }
+
+ itan.aucs.1 <- c()
+ itan.aucs.2 <- c()
+ # split at random, add beni
+ ratio <- 0.25
+ good.uniprotIDs$gof.training <- NA
+ good.uniprotIDs$gof.testing <- NA
+ good.uniprotIDs$lof.training <- NA
+ good.uniprotIDs$lof.testing <- NA
+ for (seed in 0:4) {
+   split.dir <- paste0('ICC.seed.', seed, '/')
+   dir.create(split.dir)
+   for (i in 1:dim(good.uniprotIDs)[1]) {
+     GO <- ALL[grep(good.uniprotIDs$uniprotIDs[i], ALL$uniprotID),]
+     GO.gof <- GO[GO$score == 1,]
+     GO.lof <- GO[GO$score == -1,]
+     # split variants from GO and other
+     GO.non.itan.gof <- GO.gof[!grepl('Itan', GO.gof$data_source),]
+     GO.non.itan.lof <- GO.lof[!grepl('Itan', GO.lof$data_source),]
+     GO.itan.gof <- GO.gof[grepl('Itan', GO.gof$data_source),]
+     GO.itan.lof <- GO.lof[grepl('Itan', GO.lof$data_source),]
+
+     set.seed(seed)
+     # pick ratio of non-Itan variants as testing
+     if (floor(dim(GO.gof)[1] * ratio) > 0 &
+         floor(dim(GO.lof)[1] * ratio) > 0) {
+       GO.gof.testing <- sample(dim(GO.non.itan.gof)[1], min(dim(GO.non.itan.gof)[1], floor(dim(GO.gof)[1] * ratio)))
+       GO.lof.testing <- sample(dim(GO.non.itan.lof)[1], min(dim(GO.non.itan.lof)[1], floor(dim(GO.lof)[1] * ratio)))
+       GO.training <- rbind(GO.itan.gof,
+                            GO.itan.lof,
+                            GO.non.itan.gof[-GO.gof.testing,],
+                            GO.non.itan.lof[-GO.lof.testing,])
+       GO.testing <- rbind(GO.non.itan.gof[GO.gof.testing,],
+                           GO.non.itan.lof[GO.lof.testing,])
+
+       GO.training <- GO.training[sample(dim(GO.training)[1]),]
+       GO.testing <- GO.testing[sample(dim(GO.testing)[1]),]
+       # beni.training.beni <- beni.training[beni.training$score==0,]
+       if (sum(is.na(GO.testing$itan.beni)) > 0) {
+         print(good.uniprotIDs$uniprotIDs[i])
+         print(seed)
+       }
+       if (dim(GO.testing)[1] > 0) {
+         # print(paste0(good.uniprotIDs$uniprotIDs[i], ":", seed))
+         itan.aucs.1 <- c(itan.aucs.1, plot.AUC(GO.testing$score, 1-GO.testing$itan.gof)$auc)
+         itan.aucs.2 <- c(itan.aucs.2, plot.AUC(GO.testing$score, GO.testing$itan.lof/GO.testing$itan.gof)$auc)
+       } else {
+         itan.aucs.1 <- c(itan.aucs.1, NA)
+         itan.aucs.2 <- c(itan.aucs.2, NA)
+       }
+       dir.create(paste0(split.dir, good.uniprotIDs$uniprotIDs[i], '.itan.split'))
+       write.csv(GO.training, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".itan.split/training.csv"))
+       # write.csv(beni.training.beni, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".chps/beni.csv"))
+       write.csv(GO.testing, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".itan.split/testing.csv"))
+
+       good.uniprotIDs$gof.training[i] <- sum(GO.training$score==1)
+       good.uniprotIDs$lof.training[i] <- sum(GO.training$score==-1)
+       good.uniprotIDs$beni.training[i] <- sum(GO.training$score==0)
+       good.uniprotIDs$patho.training[i] <- sum(GO.training$score==3)
+
+       good.uniprotIDs$gof.testing[i] <- sum(GO.testing$score==1)
+       good.uniprotIDs$lof.testing[i] <- sum(GO.testing$score==-1)
+       good.uniprotIDs$beni.testing[i] <- sum(GO.testing$score==0)
+       good.uniprotIDs$patho.testing[i] <- sum(GO.testing$score==3)
+
+     }
+   }
+ }
+ write.csv(good.uniprotIDs, file = "good.uniprotIDs.itan.csv")
+
+ # # split at random, add beni
+ ratio <- 0.25
+ # good.uniprotIDs$gof.training <- NA
+ # good.uniprotIDs$gof.testing <- NA
+ # good.uniprotIDs$lof.training <- NA
+ # good.uniprotIDs$lof.testing <- NA
+ # source('~/Pipeline/AUROC.R')
+ #
+ # for (seed in 0:4) {
+ #   if (seed == 0) {
+ #     split.dir <- paste0('pfams.add.beni.', 0.8, '/')
+ #   } else {
+ #     split.dir <- paste0('pfams.add.beni.', 0.8, '.seed.', seed, '/')
+ #   }
+ #   dir.create(split.dir)
+ #   for (i in 1:dim(good.uniprotIDs)[1]) {
+ #     GO <- ALL[grep(good.uniprotIDs$uniprotIDs[i], ALL$uniprotID),]
+ #     GO.gof <- GO[GO$score == 1,]
+ #     GO.lof <- GO[GO$score == -1,]
+ #     # split variants from GO and other
+ #     non.itan.gof.idx <- which(GO.gof$data_source != 'Itan' & !is.na(GO.gof$itan.beni))
+ #     non.itan.lof.idx <- which(GO.lof$data_source != 'Itan' & !is.na(GO.lof$itan.beni))
+ #
+ #     GO.non.itan.gof <- GO.gof[non.itan.gof.idx,]
+ #     GO.non.itan.lof <- GO.lof[non.itan.lof.idx,]
+ #     GO.itan.gof <- GO.gof[-non.itan.gof.idx,]
+ #     GO.itan.lof <- GO.lof[-non.itan.lof.idx,]
+ #
+ #     set.seed(seed)
+ #     # pick ratio of variants as training
+ #     if (floor(dim(GO.gof)[1] * ratio) > 0 &
+ #         floor(dim(GO.lof)[1] * ratio) > 0) {
+ #       GO.gof.testing <- sample(dim(GO.non.itan.gof)[1], min(dim(GO.non.itan.gof)[1], floor(dim(GO.gof)[1] * ratio)))
+ #       GO.lof.testing <- sample(dim(GO.non.itan.lof)[1], min(dim(GO.non.itan.lof)[1], floor(dim(GO.lof)[1] * ratio)))
+ #       GO.training <- rbind(GO.itan.gof,
+ #                            GO.itan.lof,
+ #                            GO.non.itan.gof[-GO.gof.testing,],
+ #                            GO.non.itan.lof[-GO.lof.testing,])
+ #       GO.testing <- rbind(GO.non.itan.gof[GO.gof.testing,],
+ #                           GO.non.itan.lof[GO.lof.testing,])
+ #
+ #       GO.training <- GO.training[sample(dim(GO.training)[1]),]
+ #       GO.testing <- GO.testing[sample(dim(GO.testing)[1]),]
+ #       # beni.training.beni <- beni.training[beni.training$score==0,]
+ #       if (dim(GO.testing)[1] > 0) {
+ #         print(paste0(good.uniprotIDs$uniprotIDs[i], ":", seed))
+ #         itan.aucs.1 <- c(itan.aucs.1, plot.AUC(GO.testing$score, GO.testing$itan.gof)$auc)
+ #         itan.aucs.2 <- c(itan.aucs.2, plot.AUC(GO.testing$score, GO.testing$itan.gof/GO.testing$itan.lof)$auc)
+ #       } else {
+ #         itan.aucs.1 <- c(itan.aucs.1, NA)
+ #         itan.aucs.2 <- c(itan.aucs.2, NA)
+ #       }
+ #       dir.create(paste0(split.dir, good.uniprotIDs$uniprotIDs[i], '.itan.split.clean'))
+ #       write.csv(GO.training, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".itan.split.clean/training.csv"))
+ #       # write.csv(beni.training.beni, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".chps/beni.csv"))
+ #       write.csv(GO.testing, paste0(split.dir, good.uniprotIDs$uniprotIDs[i], ".itan.split.clean/testing.csv"))
+ #
+ #       good.uniprotIDs$gof.training[i] <- sum(GO.training$score==1)
+ #       good.uniprotIDs$lof.training[i] <- sum(GO.training$score==-1)
+ #       good.uniprotIDs$beni.training[i] <- sum(GO.training$score==0)
+ #       good.uniprotIDs$patho.training[i] <- sum(GO.training$score==3)
+ #
+ #       good.uniprotIDs$gof.testing[i] <- sum(GO.testing$score==1)
+ #       good.uniprotIDs$lof.testing[i] <- sum(GO.testing$score==-1)
+ #       good.uniprotIDs$beni.testing[i] <- sum(GO.testing$score==0)
+ #       good.uniprotIDs$patho.testing[i] <- sum(GO.testing$score==3)
+ #
+ #     }
+ #   }
+ # }
+ # write.csv(good.uniprotIDs, file = "good.uniprotIDs.itan.clean.csv")
+
+ # for SCN5A, remove glazer
+ for (seed in 0:4) {
+   if (seed == 0) {
+     split.dir <- paste0('pfams.add.beni.', 0.8, '/')
+   } else {
+     split.dir <- paste0('pfams.add.beni.', 0.8, '.seed.', seed, '/')
+   }
+   dir.create(split.dir)
+   GO <- ALL[grep('Q14524', ALL$uniprotID),]
+   GO <- GO[GO$data_source != 'glazer',]
+   GO.gof <- GO[GO$score == 1,]
+   GO.lof <- GO[GO$score == -1,]
+   # split variants from GO and other
+   non.itan.gof.idx <- which(!grepl('Itan', GO.gof$data_source) & !is.na(GO.gof$itan.beni))
+   non.itan.lof.idx <- which(!grepl('Itan', GO.lof$data_source) & !is.na(GO.lof$itan.beni))
+
+   GO.non.itan.gof <- GO.gof[non.itan.gof.idx,]
+   GO.non.itan.lof <- GO.lof[non.itan.lof.idx,]
+   GO.itan.gof <- GO.gof[-non.itan.gof.idx,]
+   GO.itan.lof <- GO.lof[-non.itan.lof.idx,]
+
+   set.seed(seed)
+   # pick ratio of non-Itan variants as testing
+   if (floor(dim(GO.gof)[1] * ratio) > 0 &
+       floor(dim(GO.lof)[1] * ratio) > 0) {
+     GO.gof.testing <- sample(dim(GO.non.itan.gof)[1], min(dim(GO.non.itan.gof)[1], floor(dim(GO.gof)[1] * ratio)))
+     GO.lof.testing <- sample(dim(GO.non.itan.lof)[1], min(dim(GO.non.itan.lof)[1], floor(dim(GO.lof)[1] * ratio)))
+     GO.training <- rbind(GO.itan.gof,
+                          GO.itan.lof,
+                          GO.non.itan.gof[-GO.gof.testing,],
+                          GO.non.itan.lof[-GO.lof.testing,])
+     GO.testing <- rbind(GO.non.itan.gof[GO.gof.testing,],
+                         GO.non.itan.lof[GO.lof.testing,])
+
+     GO.training <- GO.training[sample(dim(GO.training)[1]),]
+     GO.testing <- GO.testing[sample(dim(GO.testing)[1]),]
+     # beni.training.beni <- beni.training[beni.training$score==0,]
+     if (dim(GO.testing)[1] > 0) {
+       # print(paste0(good.uniprotIDs$uniprotIDs[i], ":", seed))
+       itan.aucs.1 <- c(itan.aucs.1, plot.AUC(GO.testing$score, GO.testing$itan.gof)$auc)
+       itan.aucs.2 <- c(itan.aucs.2, plot.AUC(GO.testing$score, GO.testing$itan.gof/GO.testing$itan.lof)$auc)
+     } else {
+       itan.aucs.1 <- c(itan.aucs.1, NA)
+       itan.aucs.2 <- c(itan.aucs.2, NA)
+     }
+     dir.create(paste0(split.dir, 'Q14524', '.clean.itan.split'))
+     write.csv(GO.training, paste0(split.dir, 'Q14524', ".clean.itan.split/training.csv"))
+     # write.csv(beni.training.beni, paste0(split.dir, 'Q14524', ".chps/beni.csv"))
+     write.csv(GO.testing, paste0(split.dir, 'Q14524', ".clean.itan.split/testing.csv"))
+   }
+ }
+
+ # for SCN5A, remove glazer, don't do itan split, just split
+ ratio <- 0.8
+ for (seed in 0:4) {
+   if (seed == 0) {
+     split.dir <- paste0('pfams.add.beni.', 0.8, '/')
+   } else {
+     split.dir <- paste0('pfams.add.beni.', 0.8, '.seed.', seed, '/')
+   }
+   dir.create(split.dir)
+   GO <- ALL[grep('Q14524', ALL$uniprotID),]
+   GO.itan <- GO[GO$data_source != 'glazer',]
+   GO.itan.gof <- GO.itan[GO.itan$score == 1,]
+   GO.itan.lof <- GO.itan[GO.itan$score == -1,]
+   set.seed(seed)
+   # pick ratio of variants as training
+   if (floor(dim(GO.itan.gof)[1] * ratio) > 0 &
+       floor(dim(GO.itan.lof)[1] * ratio) > 0) {
+     GO.itan.gof.training <- sample(dim(GO.itan.gof)[1], floor(dim(GO.itan.gof)[1] * ratio))
+     GO.itan.lof.training <- sample(dim(GO.itan.lof)[1], floor(dim(GO.itan.lof)[1] * ratio))
+     GO.itan.training <- rbind(GO.itan.gof[GO.itan.gof.training,],
+                               GO.itan.lof[GO.itan.lof.training,])
+     GO.itan.testing <- rbind(GO.itan.gof[-GO.itan.gof.training,],
+                              GO.itan.lof[-GO.itan.lof.training,])
+     GO.itan.training <- GO.itan.training[sample(dim(GO.itan.training)[1]),]
+     GO.itan.testing <- GO.itan.testing[sample(dim(GO.itan.testing)[1]),]
+
+     dir.create(paste0(split.dir, 'Q14524', '.clean'))
+     write.csv(GO.itan.training, paste0(split.dir, 'Q14524', ".clean/training.csv"))
+     # write.csv(beni.training.beni, paste0(split.dir, 'Q14524', ".chps/beni.csv"))
+     write.csv(GO.itan.testing, paste0(split.dir, 'Q14524', ".clean/testing.csv"))
+   }
+ }
+
+ # # for FGFR2, further clean data, only use CKB validated as testing
+ # fgfr2.check <- read.csv('fgfr2.check.csv')
+ # fgfr2.check$uniprotID <- 'P21802'
+ # source('/share/vault/Users/gz2294/Pipeline/dnv.table.to.uniprot.R')
+ # fgfr2.check <- dnv.table.to.uniprot.by.af2.uniprotID.parallel(fgfr2.check, 'aaChg', 'score', 'uniprotID', 'aaChg')
+ # source('/share/vault/Users/gz2294/Pipeline/uniprot.table.add.annotation.R')
+ # fgfr2.check <- uniprot.table.add.annotation.parallel(fgfr2.check$result.noNA, 'Itan')
+ #
preprocess.separate.gene.subset.R ADDED
@@ -0,0 +1,52 @@
+ good.uniprotIDs <- c('P15056', 'P21802', 'P07949', 'P04637', 'Q09428', 'O00555', 'Q14654', 'Q99250', 'Q14524.clean')
+ # split at random, add beni
+ good.uniprotIDs.df <- data.frame()
+ for (seed in 0:4) {
+   split.dir <- paste0('ICC.seed.', seed, '/')
+   ratios <- c(1, 2, 4, 6)
+   dir.create(split.dir)
+   for (i in 1:length(good.uniprotIDs)) {
+     GO.itan.training <- read.csv(paste0('ICC.seed.', 0, '/', good.uniprotIDs[i], '/training.csv'))
+
+     GO.itan.testing <- read.csv(paste0('ICC.seed.', 0 ,'/', good.uniprotIDs[i], '/testing.csv'))
+     GO.itan.gof <- GO.itan.training[GO.itan.training$score==1,]
+     GO.itan.lof <- GO.itan.training[GO.itan.training$score==-1,]
+     set.seed(seed)
+     # keep ratio/8 of each class as training
+     for (ratio in ratios) {
+       if (floor(dim(GO.itan.gof)[1] * ratio/8) > 0 &
+           floor(dim(GO.itan.lof)[1] * ratio/8) > 0) {
+         GO.itan.gof.training <- sample(dim(GO.itan.gof)[1], ceiling(dim(GO.itan.gof)[1] * ratio/8))
+         GO.itan.lof.training <- sample(dim(GO.itan.lof)[1], ceiling(dim(GO.itan.lof)[1] * ratio/8))
+         GO.itan.training <- rbind(GO.itan.gof[GO.itan.gof.training,],
+                                   GO.itan.lof[GO.itan.lof.training,])
+         GO.itan.training$split <- 'train'
+
+         GO.itan.training <- GO.itan.training[sample(dim(GO.itan.training)[1]),]
+         GO.itan.testing <- GO.itan.testing[sample(dim(GO.itan.testing)[1]),]
+
+         dir.create(paste0(split.dir, good.uniprotIDs[i], '.subset2.', ratio))
+         if (!file.exists(paste0(split.dir, good.uniprotIDs[i], '.subset2.', ratio, "/training.csv"))) {
+           print(good.uniprotIDs[i])
+           write.csv(GO.itan.training, paste0(split.dir, good.uniprotIDs[i], '.subset2.', ratio, "/training.csv"))
+         }
+         if (!file.exists(paste0(split.dir, good.uniprotIDs[i], '.subset2.', ratio, "/testing.csv"))) {
+           print(good.uniprotIDs[i])
+           write.csv(GO.itan.testing, paste0(split.dir, good.uniprotIDs[i], '.subset2.', ratio, "/testing.csv"))
+         }
+         good.uniprotIDs.df <- rbind(good.uniprotIDs.df,
+                                     data.frame(gene=good.uniprotIDs[i],
+                                                ratio=ratio,
+                                                seed=seed,
+                                                gof.training = sum(GO.itan.training$score==1 & GO.itan.training$split=='train'),
+                                                lof.training = sum(GO.itan.training$score==-1 & GO.itan.training$split=='train'),
+                                                gof.val = sum(GO.itan.training$score==1 & GO.itan.training$split=='val'),
+                                                lof.val = sum(GO.itan.training$score==-1 & GO.itan.training$split=='val'),
+                                                gof.testing = sum(GO.itan.testing$score==1),
+                                                lof.testing = sum(GO.itan.testing$score==-1)))
+       }
+     }
+   }
+ }
+
+ write.csv(good.uniprotIDs.df, file = "good.uniprotIDs.subsets.csv")
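
The ratio loop above keeps ratio/8 of each class (GoF, score 1; LoF, score -1) from the seed-0 training split before shuffling. For illustration only, a compact pandas equivalent of that class-stratified subsampling (the R script seeds once before both draws; this sketch reuses the seed per class):

import pandas as pd

def subsample_training(training: pd.DataFrame, ratio: int, seed: int) -> pd.DataFrame:
    # keep ceiling(len * ratio/8) rows of each class, then shuffle, as in the R loop
    frames = []
    for label in (1, -1):
        cls = training[training['score'] == label]
        n = -(-len(cls) * ratio // 8)  # integer ceiling of len * ratio/8
        frames.append(cls.sample(n=n, random_state=seed))
    out = pd.concat(frames).sample(frac=1, random_state=seed)  # shuffle rows
    out['split'] = 'train'
    return out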
pretrain.tgz.part-aa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d4f19e4fca0765d0ede2c93de380d0134c14c906ce3b75232893727c1ba0f9b
+ size 894753652
pretrain/testing.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ce066487c65d6a810759ed7087960959bc860e45409cdb2e8b946bdb37173f1
+ size 1138034
pretrain/training.0.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d1bfe529668cf8706daae5b69b602775add2787b7f88ad367810e3d08c97b53
+ size 24972339
pretrain/training.1.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4918e5ba8c2e4ccb27505a6517b32b228ccf4eacce4d4052c7562c80ce90245
+ size 22754591
pretrain/training.2.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfcc1224356ae0e684d1d8f64f4b24ad3d78b552f158f16e009bf2e30c7681e7
+ size 22809905
pretrain/training.3.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dec4591c575c60228349b2eac512afb67513bb5d7e78a4dc22f401192bb1b84c
+ size 22286420
pretrain/training.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc50e2d1946267babc3ccb267374d13196c1a145ff58476b5d33f9968e80f7da
+ size 93760937
ptm.small.csv.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1b4d1e38fe5fe797dd5189751869335e6ea3f2dfc6adeda91df3fa14b835184
+ size 2419426