Upload preprocessing script fit_vcf.py, which fits a given VCF to the SNPs required by the neural network
Browse files- preprocess/fit_vcf.py +117 -0
preprocess/fit_vcf.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
import pandas as pd
|
3 |
+
from loguru import logger
|
4 |
+
import pprint
|
5 |
+
import os
|
6 |
+
|
7 |
+
# Never truncate the column list when a DataFrame is printed or logged.
pd.options.display.max_columns = None

# How to get the script working:
#   - fill in the required variables in .env
#   - run in a terminal:
#       export $(grep -v '^#' .env | xargs)

# The nine fixed columns of a VCF record, in file order; per-sample genotype
# columns are appended after these.
standard_vcf_cols = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
|
16 |
+
|
17 |
+
|
18 |
+
def record_result(vcf_out_path, vcf_df, required_vcf_cols, samples):
    """Write ``vcf_df`` to ``vcf_out_path`` as a tab-separated VCF v4.2 text file.

    The output is a fixed block of ``##`` meta-information lines, one column
    header line built from ``required_vcf_cols``, and one record per row of
    ``vcf_df``. ``'vcf recorded'`` is printed once the file has been written.

    Args:
        vcf_out_path: Path of the file to create; an existing file is
            overwritten.
        vcf_df: DataFrame holding the nine fixed VCF columns (``#CHROM`` through
            ``FORMAT``) plus one column per entry of ``samples``.
        required_vcf_cols: Column names written, tab-joined, as the header
            line. Record fields are always the nine fixed columns followed by
            ``samples``, independent of this list.
        samples: Names of the genotype columns, in output order.

    Raises:
        KeyError: If a fixed column or a sample column is missing from
            ``vcf_df``.
    """
    meta_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=PASS,Description="All filters passed">',
        '##filedate=20230718',
        '##source="beagle.22Jul22.46e.jar"',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated ALT Allele Frequencies">',
        '##INFO=<ID=DR2,Number=A,Type=Float,Description="Dosage R-Squared: estimated squared correlation between estimated REF dose [P(RA) + 2*P(RR)] and true REF dose">',
        '##INFO=<ID=IMP,Number=0,Type=Flag,Description="Imputed marker">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=DS,Number=A,Type=Float,Description="estimated ALT dose [P(RA) + 2*P(AA)]">',
        '##contig=<ID=1,length=249250621>',
        '##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">',
        '##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">',
    ]
    fixed_cols = ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')

    with open(vcf_out_path, 'w', encoding='utf-8') as vcf_file:
        vcf_file.writelines(f'{line}\n' for line in meta_lines)
        vcf_file.write('\t'.join(required_vcf_cols) + '\n')
        for _, row in vcf_df.iterrows():
            fields = [f'{row[col]}' for col in fixed_cols]
            for sample in samples:
                genotype = row[f'{sample}']
                # A missing genotype (NaN/None) is written as the VCF missing
                # call instead of crashing str.join with a non-string value.
                fields.append('./.' if pd.isna(genotype) else str(genotype))
            vcf_file.write('\t'.join(fields) + '\n')
    print('vcf recorded')
|
56 |
+
|
57 |
+
|
58 |
+
def _find_genotype_column(columns, sample):
    """Return the ``[<N>]<sample>:GT`` header found in ``columns``, else ``None``."""
    wanted = f'{sample}:GT'
    for column in columns:
        index_part, bracket, label = str(column).partition(']')
        if bracket and label == wanted and index_part[:1] == '[' and index_part[1:].isdigit():
            return column
    return None


def convert_columns(snp_df, vcf_df, required_vcf_cols, samples):
    """Rename both tables to standard VCF column names and prepare them for merging.

    Both DataFrames are modified in place and are also returned.

    ``snp_df`` has ``CHR``/``bp``/``allele1``/``allele2`` renamed to
    ``#CHROM``/``POS``/``REF``/``ALT``, gains an empty (``None``) column per
    sample plus constant ``ID``/``QUAL``/``FILTER``/``INFO``/``FORMAT``
    columns, and loses every column not listed in ``required_vcf_cols``.

    ``vcf_df`` has its indexed headers (``#[1]CHROM``, ``[2]POS``, ...,
    ``[6](null)``) renamed to plain VCF names, and each ``[<N>]<sample>:GT``
    column renamed to the bare sample name.

    The key columns ``#CHROM``, ``POS``, ``REF`` and ``ALT`` are cast to ``str``
    in both tables, as is ``INFO`` in ``vcf_df``.

    Args:
        snp_df: SNP table with ``CHR``, ``bp``, ``allele1`` and ``allele2``
            columns.
        vcf_df: Table whose headers carry the ``[<N>]`` index prefixes.
        required_vcf_cols: Columns to keep in ``snp_df``.
        samples: Sample names whose genotype columns are needed.

    Returns:
        The ``(snp_df, vcf_df)`` pair.

    Raises:
        KeyError: If one of the key columns (or ``INFO`` in ``vcf_df``) is
            missing after renaming.
    """
    snp_df.rename(columns={'CHR': '#CHROM', 'bp': 'POS', 'allele1': 'REF', 'allele2': 'ALT'}, inplace=True)
    vcf_df.rename(columns={'#[1]CHROM': '#CHROM', '[2]POS': 'POS', '[3]ID': 'ID', '[4]REF': 'REF', '[5]ALT': 'ALT',
                           '[6](null)': 'INFO'}, inplace=True)

    for sample in samples:
        # Match on the sample name rather than on an assumed column index, so a
        # subset or reordering of the dumped samples still resolves correctly.
        gt_header = _find_genotype_column(vcf_df.columns, sample)
        if gt_header is None:
            logger.warning(f'no genotype column found for sample {sample}')
        else:
            vcf_df.rename(columns={gt_header: f'{sample}'}, inplace=True)
        snp_df[f'{sample}'] = None
    logger.info(f'snp_df.cols {snp_df.columns}')

    # Merge keys must share a dtype in both tables; compare them as strings.
    for key in ('#CHROM', 'POS', 'REF', 'ALT'):
        snp_df[key] = snp_df[key].astype(str)
        vcf_df[key] = vcf_df[key].astype(str)
    vcf_df['INFO'] = vcf_df['INFO'].astype(str)

    snp_df['ID'] = None
    snp_df['QUAL'] = '.'
    snp_df['FILTER'] = 'PASS'
    snp_df['INFO'] = '.'
    snp_df['FORMAT'] = 'GT'

    excess_cols = [col for col in snp_df.columns if col not in required_vcf_cols]
    snp_df.drop(excess_cols, axis=1, inplace=True)
    return snp_df, vcf_df
|
85 |
+
|
86 |
+
|
87 |
+
def main():
    r"""Build a VCF containing the SNPs of the input SNP table.

    Configuration is read from environment variables:

    - ``input_snp_path``: comma-separated SNP table, e.g.
      ``Annotations/UKBiobank/Input_SNPs_refseq.csv``.
    - ``target_vcf_snps``: tab-separated file with the necessary info from the
      VCF; it can be obtained with
      ``bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%INFO\t[%GT\t]\n' example.vcf.gz > example_snps.csv``
    - ``samples_to_extract``: comma-separated sample names.
    - ``fitted_vcf``: path of the VCF file to write.

    The SNP table is left-merged with the VCF dump on ``#CHROM``, ``POS``,
    ``REF`` and ``ALT``. SNPs with no matching VCF row keep a record with
    ``./.`` genotypes and ``.`` as ID.

    Raises:
        ValueError: If a required environment variable is unset or empty, or
            if ``samples_to_extract`` contains no sample name.
    """
    settings = {
        name: os.getenv(name)
        for name in ('input_snp_path', 'fitted_vcf', 'target_vcf_snps', 'samples_to_extract')
    }
    unset = [name for name, value in settings.items() if not value]
    if unset:
        # Fail before any work is done instead of with an opaque error later.
        raise ValueError(f'missing required environment variables: {", ".join(unset)}')

    input_snp = settings['input_snp_path']
    out_file = settings['fitted_vcf']
    target_vcf_snps = settings['target_vcf_snps']
    samples = [name.strip() for name in settings['samples_to_extract'].split(',') if name.strip()]
    if not samples:
        raise ValueError('samples_to_extract does not contain any sample name')
    logger.info(f'samples: {samples}')

    vcf_df = pd.read_csv(target_vcf_snps, delimiter='\t', dtype={'#[1]CHROM': str})
    snp_df = pd.read_csv(input_snp, delimiter=',')
    required_vcf_cols = standard_vcf_cols + samples
    logger.info(f'required cols {required_vcf_cols}, snp_df.cols {snp_df.columns}')

    snp_df, vcf_df = convert_columns(snp_df, vcf_df, required_vcf_cols, samples)
    logger.info(
        f'after convert snp_df shape {snp_df.shape}, {snp_df.columns}\n, fit_vcf shape {vcf_df.shape}, {vcf_df.columns}')

    merged = snp_df.merge(vcf_df, on=['#CHROM', 'POS', 'REF', 'ALT'], how='left', suffixes=('_snp', '_vcf'))
    # Keep the ID coming from the VCF and the placeholder INFO from the SNP table.
    merged = merged.rename(columns={'ID_vcf': 'ID', 'INFO_snp': 'INFO'})
    for sample in samples:
        merged = merged.rename(columns={f'{sample}_vcf': f'{sample}'})
        # Assign the result back: an in-place fillna on a selected column is
        # not guaranteed to update the parent frame.
        merged[f'{sample}'] = merged[f'{sample}'].fillna('./.')
    merged = merged[required_vcf_cols].copy()
    merged['ID'] = merged['ID'].fillna('.')

    pprint.pprint(merged)
    logger.info(f'final_df header {merged.columns} \n final_df shape: {merged.shape}')
    record_result(out_file, merged, required_vcf_cols, samples)
|
114 |
+
|
115 |
+
|
116 |
+
# Run the pipeline only when executed as a script, so that importing this
# module does not read the environment or write any file.
if __name__ == '__main__':
    main()
|
117 |
+
|