lnalinaf committed on
Commit
2be7364
·
verified ·
1 Parent(s): 81b5a62

Upload preprocessing fit_vcf.py to fit given vcf with required snps for neural network

Browse files
Files changed (1) hide show
  1. preprocess/fit_vcf.py +117 -0
preprocess/fit_vcf.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import pandas as pd
3
+ from loguru import logger
4
+ import pprint
5
+ import os
6
+
7
# Do not truncate columns when DataFrames are printed or logged.
pd.set_option('display.max_columns', None)

# To get the script working:
#   - fill the required variables in .env
#   - run in a terminal:
#       export $(grep -v '^#' .env | xargs)

# The nine fixed VCF columns; per-sample genotype columns are appended after them.
standard_vcf_cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
16
+
17
+
18
def record_result(vcf_out_path, vcf_df, required_vcf_cols, samples):
    """Write ``vcf_df`` to ``vcf_out_path`` as a tab-separated VCF text file.

    The output consists of a fixed block of ``##`` meta-information lines, one
    header line built from ``required_vcf_cols``, and one record per row of
    ``vcf_df``.

    Args:
        vcf_out_path: Path of the file to create (overwritten if it exists).
        vcf_df: DataFrame holding the nine fixed VCF columns (``#CHROM`` ..
            ``FORMAT``) plus one genotype column per entry of ``samples``.
        required_vcf_cols: Column names written, tab-joined, as the header line.
        samples: Sample names; each one must be a column of ``vcf_df``.

    Raises:
        KeyError: If a fixed column or a sample column is missing from
            ``vcf_df``. This is detected before the output file is opened, so
            no partial file is left behind.
    """
    # Hard-coded meta-information block written verbatim at the top of the file.
    meta_header = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=PASS,Description="All filters passed">',
        '##filedate=20230718',
        '##source="beagle.22Jul22.46e.jar"',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated ALT Allele Frequencies">',
        '##INFO=<ID=DR2,Number=A,Type=Float,Description="Dosage R-Squared: estimated squared correlation between estimated REF dose [P(RA) + 2*P(RR)] and true REF dose">',
        '##INFO=<ID=IMP,Number=0,Type=Flag,Description="Imputed marker">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=DS,Number=A,Type=Float,Description="estimated ALT dose [P(RA) + 2*P(AA)]">',
        '##contig=<ID=1,length=249250621>',
        '##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">',
        '##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">',
    ]
    fixed_cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    record_cols = fixed_cols + [f'{sample}' for sample in samples]

    # Select (and thereby validate) the columns once, in output order.
    records = vcf_df[record_cols]

    with open(vcf_out_path, 'w', encoding='utf-8') as vcf_file:
        vcf_file.writelines(f'{line}\n' for line in meta_header)
        vcf_file.write('\t'.join(required_vcf_cols) + '\n')
        # name=None yields plain tuples, so labels such as '#CHROM' need no
        # mangling into attribute names. Every value is stringified, so a
        # non-string genotype cannot break the join.
        vcf_file.writelines(
            '\t'.join(map(str, values)) + '\n'
            for values in records.itertuples(index=False, name=None)
        )
    print('vcf recorded')
56
+
57
+
58
def convert_columns(snp_df, vcf_df, required_vcf_cols, samples):
    """Give both frames VCF-style column names and string-typed merge keys.

    Both DataFrames are modified in place and also returned.

    Args:
        snp_df: SNP table with columns ``CHR``, ``bp``, ``allele1`` and
            ``allele2``; these are renamed to ``#CHROM``, ``POS``, ``REF``
            and ``ALT``.
        vcf_df: Table whose headers have the form ``#[1]CHROM``, ``[2]POS``,
            ... ``[6](null)`` followed by ``[N]<sample>:GT``.
        required_vcf_cols: Columns to keep in ``snp_df``; every other column
            is dropped.
        samples: Sample names, in the order their genotype columns appear in
            ``vcf_df`` (the first sample is expected at position 7).

    Returns:
        The tuple ``(snp_df, vcf_df)`` after conversion.
    """
    snp_df.rename(columns={'CHR': '#CHROM', 'bp': 'POS', 'allele1': 'REF', 'allele2': 'ALT'}, inplace=True)

    # Strip the "[N]" position prefix from the fixed columns and the
    # "[N]...:GT" wrapper from every sample column in a single rename.
    vcf_renames = {'#[1]CHROM': '#CHROM', '[2]POS': 'POS', '[3]ID': 'ID', '[4]REF': 'REF', '[5]ALT': 'ALT',
                   '[6](null)': 'INFO'}
    for idx, sample in enumerate(samples):
        vcf_renames[f'[{idx+7}]{sample}:GT'] = f'{sample}'
    vcf_df.rename(columns=vcf_renames, inplace=True)

    # Empty per-sample columns in snp_df, so each sample name survives the
    # excess-column drop at the end of this function.
    for sample in samples:
        snp_df[f'{sample}'] = None
    logger.info(f'snp_df.cols {snp_df.columns}')

    # Columns shared by both frames are compared as text, so cast both sides
    # to str to keep their dtypes aligned.
    for col in ('#CHROM', 'POS', 'REF', 'ALT'):
        snp_df[col] = snp_df[col].astype(str)
        vcf_df[col] = vcf_df[col].astype(str)
    vcf_df['INFO'] = vcf_df['INFO'].astype(str)

    # Placeholder values for the VCF fields the SNP table does not carry.
    snp_df['ID'] = None
    snp_df['QUAL'] = '.'
    snp_df['FILTER'] = 'PASS'
    snp_df['INFO'] = '.'
    snp_df['FORMAT'] = 'GT'

    excess_cols = [col for col in snp_df.columns if col not in required_vcf_cols]
    snp_df.drop(columns=excess_cols, inplace=True)
    return snp_df, vcf_df
85
+
86
+
87
def _require_env(name):
    """Return the value of environment variable ``name``; raise if unset or empty."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f'environment variable {name!r} is not set; fill it in .env and export it')
    return value


def main():
    """Fit a VCF extract to the required SNP list and write the result as a VCF.

    Configuration is read from environment variables:

    - ``input_snp_path``: comma-separated SNP table.
    - ``fitted_vcf``: path of the VCF file to write.
    - ``target_vcf_snps``: tab-separated extract of the source VCF.
    - ``samples_to_extract``: comma-separated sample names.

    Every SNP of the input table is kept (left merge on ``#CHROM``, ``POS``,
    ``REF``, ``ALT``). SNPs with no match in the VCF extract get ``./.``
    genotypes and ``.`` as ID.

    Raises:
        RuntimeError: If any of the environment variables is unset or empty.
    """
    input_snp = _require_env('input_snp_path')  # Annotations/UKBiobank/Input_SNPs_refseq.csv
    out_file = _require_env('fitted_vcf')
    # target_vcf_snps - .csv file with the necessary info from the VCF; it can be obtained with
    #   bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%INFO\t[%GT\t]\n' example.vcf.gz > example_snps.csv
    target_vcf_snps = _require_env('target_vcf_snps')
    # Tolerate spaces around names and a trailing comma in the sample list.
    samples = [name.strip() for name in _require_env('samples_to_extract').split(',') if name.strip()]
    logger.info(f'samples: {samples}')

    vcf_df = pd.read_csv(target_vcf_snps, delimiter='\t', dtype={'#[1]CHROM': str})
    snp_df = pd.read_csv(input_snp, delimiter=',')
    required_vcf_cols = standard_vcf_cols + samples
    logger.info(f'required cols {required_vcf_cols}, snp_df.cols {snp_df.columns}')
    snp_df, vcf_df = convert_columns(snp_df, vcf_df, required_vcf_cols, samples)
    logger.info(
        f'after convert snp_df shape {snp_df.shape}, {snp_df.columns}\n, fit_vcf shape {vcf_df.shape}, {vcf_df.columns}')

    merged = snp_df.merge(vcf_df, on=['#CHROM', 'POS', 'REF', 'ALT'], how='left', suffixes=('_snp', '_vcf'))
    # ID and genotypes come from the VCF side, INFO from the SNP side.
    renames = {'ID_vcf': 'ID', 'INFO_snp': 'INFO'}
    renames.update({f'{sample}_vcf': f'{sample}' for sample in samples})
    merged = merged.rename(columns=renames)[required_vcf_cols]

    # One DataFrame-level fillna with per-column values; chained
    # `merged[col].fillna(..., inplace=True)` is not guaranteed to write
    # back into the frame.
    fill_values = {f'{sample}': './.' for sample in samples}
    fill_values['ID'] = '.'
    merged = merged.fillna(value=fill_values)

    pprint.pprint(merged)
    logger.info(f'final_df header {merged.columns} \n final_df shape: {merged.shape}')
    record_result(out_file, merged, required_vcf_cols, samples)
114
+
115
+
116
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
117
+