Alina Fedorova
committed on
Commit
·
81b5a62
1
Parent(s):
0b7ce57
env via pydantic added
Browse files- environment.yml +4 -0
- preprocess/.env_paths +7 -0
- preprocess/run_beagle.py +32 -16
- preprocess/setting.py +23 -0
environment.yml
CHANGED
@@ -132,6 +132,7 @@ dependencies:
|
|
132 |
- zipp=3.15.0=pyhd8ed1ab_0
|
133 |
- zlib=1.2.11=h166bdaf_1014
|
134 |
- pip:
|
|
|
135 |
- anyio==3.7.0
|
136 |
- argon2-cffi==21.3.0
|
137 |
- argon2-cffi-bindings==21.2.0
|
@@ -193,11 +194,14 @@ dependencies:
|
|
193 |
- prompt-toolkit==3.0.38
|
194 |
- psutil==5.9.5
|
195 |
- ptyprocess==0.7.0
|
|
|
|
|
196 |
- pygments==2.15.1
|
197 |
- pyparsing==3.1.0
|
198 |
- pyrsistent==0.19.3
|
199 |
- pytest==7.4.0
|
200 |
- python-dateutil==2.8.2
|
|
|
201 |
- pytz==2023.3
|
202 |
- pyzmq==25.1.0
|
203 |
- qtconsole==5.4.3
|
|
|
132 |
- zipp=3.15.0=pyhd8ed1ab_0
|
133 |
- zlib=1.2.11=h166bdaf_1014
|
134 |
- pip:
|
135 |
+
- annotated-types==0.5.0
|
136 |
- anyio==3.7.0
|
137 |
- argon2-cffi==21.3.0
|
138 |
- argon2-cffi-bindings==21.2.0
|
|
|
194 |
- prompt-toolkit==3.0.38
|
195 |
- psutil==5.9.5
|
196 |
- ptyprocess==0.7.0
|
197 |
+
- pydantic==2.0.3
|
198 |
+
- pydantic-core==2.3.0
|
199 |
- pygments==2.15.1
|
200 |
- pyparsing==3.1.0
|
201 |
- pyrsistent==0.19.3
|
202 |
- pytest==7.4.0
|
203 |
- python-dateutil==2.8.2
|
204 |
+
- python-dotenv==0.21.1
|
205 |
- pytz==2023.3
|
206 |
- pyzmq==25.1.0
|
207 |
- qtconsole==5.4.3
|
preprocess/.env_paths
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
vcf='/absolute_path_to_your_file/example.vcf'
|
2 |
+
samples='/absolute_path_to_your_files/hapmap-ceu-all.lift.vcf'
|
3 |
+
conform='/absolute_your_path/conform-gt.24May16.cee.jar'
|
4 |
+
beagle='/absolute_your_path/beagle.22Jul22.46e.jar'
|
5 |
+
ref='/absolute_your_path/reference'
|
6 |
+
maps='/absolute_your_path/maps'
|
7 |
+
gb=20
|
preprocess/run_beagle.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
#!/usr/bin/env python
|
2 |
import re
|
3 |
-
import click
|
4 |
import os
|
5 |
import glob
|
|
|
|
|
6 |
"""
|
7 |
script for running Beagle 5.4
|
8 |
All kind of data for this script like human reference panel, genetic maps
|
@@ -45,7 +46,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
|
|
45 |
reference: files were downloaded from the Beagle human reference link
|
46 |
https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
|
47 |
for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
|
48 |
-
print('conform ', ref_file)
|
49 |
if re.search("chr(\d+)", ref_file):
|
50 |
chr_type = (re.search("chr(\d+)", ref_file))[1]
|
51 |
elif re.search("chrX", ref_file):
|
@@ -58,7 +58,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
|
|
58 |
def ensure_biallelic_ref(ref_dir):
|
59 |
for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
|
60 |
ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
|
61 |
-
print('ensure ', ref_file, ref_biall_path)
|
62 |
os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
|
63 |
os.system(f'bcftools index {ref_biall_path}.gz')
|
64 |
os.remove(ref_file) # remove initial ref file
|
@@ -78,25 +77,42 @@ def run_beagle(beagle, gb, map_dir, ref_dir):
|
|
78 |
f' out=imputed_{chr_type} map={map_file}')
|
79 |
|
80 |
|
81 |
-
@click.command()
|
82 |
-
@click.option('--vcf', help='Path to the target vcf file')
|
83 |
-
@click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
|
84 |
-
|
85 |
-
@click.option('--conform', help='Path to conform .jar file')
|
86 |
-
@click.option('--beagle', help='Path to beagle .jar file')
|
87 |
-
@click.option('--ref', help='Path to folder with reference genome:'
|
88 |
-
|
89 |
-
@click.option('--
|
90 |
-
|
91 |
-
@click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
|
92 |
-
def main(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
bgzip_and_index(vcf, samples)
|
94 |
if samples:
|
95 |
merge(vcf, samples)
|
96 |
cleaned_file = clean_and_gzip(vcf, samples) # returned cleaned file in .vcf.gz (gzip) format
|
97 |
ensure_biallelic_ref(ref)
|
98 |
run_conform(conform, cleaned_file, ref)
|
99 |
-
run_beagle(beagle, gb,
|
100 |
|
101 |
|
102 |
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
#!/usr/bin/env python
|
2 |
import re
|
|
|
3 |
import os
|
4 |
import glob
|
5 |
+
from setting import IMPUTATION_SETTINGS
|
6 |
+
|
7 |
"""
|
8 |
script for running Beagle 5.4
|
9 |
All kind of data for this script like human reference panel, genetic maps
|
|
|
46 |
reference: files were downloaded from the Beagle human reference link
|
47 |
https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
|
48 |
for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
|
|
|
49 |
if re.search("chr(\d+)", ref_file):
|
50 |
chr_type = (re.search("chr(\d+)", ref_file))[1]
|
51 |
elif re.search("chrX", ref_file):
|
|
|
58 |
def ensure_biallelic_ref(ref_dir):
|
59 |
for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
|
60 |
ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
|
|
|
61 |
os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
|
62 |
os.system(f'bcftools index {ref_biall_path}.gz')
|
63 |
os.remove(ref_file) # remove initial ref file
|
|
|
77 |
f' out=imputed_{chr_type} map={map_file}')
|
78 |
|
79 |
|
80 |
+
# @click.command()
|
81 |
+
# @click.option('--vcf', help='Path to the target vcf file')
|
82 |
+
# @click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
|
83 |
+
# 'contains data for at least 20 individuals', required=False)
|
84 |
+
# @click.option('--conform', help='Path to conform .jar file')
|
85 |
+
# @click.option('--beagle', help='Path to beagle .jar file')
|
86 |
+
# @click.option('--ref', help='Path to folder with reference genome:'
|
87 |
+
# ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
|
88 |
+
# @click.option('--maps', help='Path to folder with PLINK format genetic maps, files are expected to start with'
|
89 |
+
# '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
|
90 |
+
# @click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
|
91 |
+
def main():
    """Run the imputation pipeline with inputs taken from IMPUTATION_SETTINGS.

    Every setting is read once up front, then the helpers are invoked in a
    fixed order: bgzip_and_index, merge (only when ``samples`` is set),
    clean_and_gzip, ensure_biallelic_ref, run_conform and run_beagle.
    """
    cfg = IMPUTATION_SETTINGS
    vcf, samples, conform, beagle, ref, maps, gb = (
        cfg.vcf,
        cfg.samples,
        cfg.conform,
        cfg.beagle,
        cfg.ref,
        cfg.maps,
        cfg.gb,
    )

    bgzip_and_index(vcf, samples)
    if samples:
        # Extra samples are optional; merge them only when a path was given.
        merge(vcf, samples)

    # clean_and_gzip hands back the cleaned file in .vcf.gz (gzip) format.
    cleaned_file = clean_and_gzip(vcf, samples)

    ensure_biallelic_ref(ref)
    run_conform(conform, cleaned_file, ref)
    run_beagle(beagle, gb, maps, ref)
|
106 |
|
107 |
|
108 |
main()
|
109 |
+
"""
|
110 |
+
python preprocess/run_beagle.py
|
111 |
+
--vcf /Users/alina/Documents/longevity/genomes/antonkulaga.hg37.pickard.annotate_bcf_alldbsnp.vcf
|
112 |
+
--samples /Users/alina/Documents/longevity/genomes/hapmap-ceu-all.lift.vcf
|
113 |
+
--conform /Users/alina/tools/conform-gt.24May16.cee.jar
|
114 |
+
--beagle /Users/alina/tools/beagle.22Jul22.46e.jar
|
115 |
+
--ref /Users/alina/progproj/gennet/test_beagle/reference
|
116 |
+
--maps /Users/alina/progproj/gennet/test_beagle/maps --gb 20
|
117 |
+
|
118 |
+
"""
|
preprocess/setting.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Settings for the Beagle imputation pipeline.

Values are loaded by pydantic's ``BaseSettings`` from the ``.env_paths``
file that sits next to this module (see ``EnvBeagle.Config.env_file``).
"""
import os
from typing import Optional

# pydantic 2.x (environment.yml pins 2.0.3) no longer exports ``BaseSettings``
# from the top-level package, so ``from pydantic import BaseSettings`` fails
# at import time. The 1.x-compatible API, including ``BaseSettings`` with
# ``Config.env_file`` support, ships inside pydantic 2 as ``pydantic.v1``.
# Fall back to the top-level import for genuine pydantic 1.x installs that
# do not provide the ``pydantic.v1`` namespace.
try:
    from pydantic.v1 import BaseSettings, Field
except ImportError:
    from pydantic import BaseSettings, Field


class EnvBeagle(BaseSettings):
    """Paths and resources required by ``run_beagle.py``.

    All fields except ``samples`` are required and have no default, so
    instantiation fails with a validation error when one of them is missing.
    """

    vcf: str = Field(description='Path to the target vcf file')
    samples: Optional[str] = Field(default=None,
                                   description='Path to VCF with other samples for conform checks, not required if '
                                               'target VCF contains data for at least 20 individuals')
    conform: str = Field(description='Path to conform .jar file')
    beagle: str = Field(description='Path to beagle .jar file')
    ref: str = Field(description='Path to folder with reference genome:'
                                 ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
    maps: str = Field(description='Path to folder with PLINK format genetic maps, files are expected to start with'
                                  '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
    gb: int = Field(description='Number of gigabytes for running beagle')

    class Config:
        # Resolve the env file relative to this module rather than the
        # current working directory, so the script works from any location.
        env_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env_paths")


# Module-level singleton imported by run_beagle.py.
IMPUTATION_SETTINGS = EnvBeagle()
|