Alina Fedorova committed on
Commit
81b5a62
·
1 Parent(s): 0b7ce57

env via pydantic added

Browse files
environment.yml CHANGED
@@ -132,6 +132,7 @@ dependencies:
132
  - zipp=3.15.0=pyhd8ed1ab_0
133
  - zlib=1.2.11=h166bdaf_1014
134
  - pip:
 
135
  - anyio==3.7.0
136
  - argon2-cffi==21.3.0
137
  - argon2-cffi-bindings==21.2.0
@@ -193,11 +194,14 @@ dependencies:
193
  - prompt-toolkit==3.0.38
194
  - psutil==5.9.5
195
  - ptyprocess==0.7.0
 
 
196
  - pygments==2.15.1
197
  - pyparsing==3.1.0
198
  - pyrsistent==0.19.3
199
  - pytest==7.4.0
200
  - python-dateutil==2.8.2
 
201
  - pytz==2023.3
202
  - pyzmq==25.1.0
203
  - qtconsole==5.4.3
 
132
  - zipp=3.15.0=pyhd8ed1ab_0
133
  - zlib=1.2.11=h166bdaf_1014
134
  - pip:
135
+ - annotated-types==0.5.0
136
  - anyio==3.7.0
137
  - argon2-cffi==21.3.0
138
  - argon2-cffi-bindings==21.2.0
 
194
  - prompt-toolkit==3.0.38
195
  - psutil==5.9.5
196
  - ptyprocess==0.7.0
197
+ - pydantic==2.0.3
198
+ - pydantic-core==2.3.0
199
  - pygments==2.15.1
200
  - pyparsing==3.1.0
201
  - pyrsistent==0.19.3
202
  - pytest==7.4.0
203
  - python-dateutil==2.8.2
204
+ - python-dotenv==0.21.1
205
  - pytz==2023.3
206
  - pyzmq==25.1.0
207
  - qtconsole==5.4.3
preprocess/.env_paths ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ vcf='/absolute_path_to_your_file/example.vcf'
2
+ samples='/absolute_path_to_your_files/hapmap-ceu-all.lift.vcf'
3
+ conform='/absolute_your_path/conform-gt.24May16.cee.jar'
4
+ beagle='/absolute_your_path/beagle.22Jul22.46e.jar'
5
+ ref='/absolute_your_path/reference'
6
+ maps='/absolute_your_path/maps'
7
+ gb=20
preprocess/run_beagle.py CHANGED
@@ -1,8 +1,9 @@
1
  #!/usr/bin/env python
2
  import re
3
- import click
4
  import os
5
  import glob
 
 
6
  """
7
  script for running Beagle 5.4
8
  All kind of data for this script like human reference panel, genetic maps
@@ -45,7 +46,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
45
  reference: files was downloaded from from Beagle human reference link
46
  https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
47
  for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
48
- print('conform ', ref_file)
49
  if re.search("chr(\d+)", ref_file):
50
  chr_type = (re.search("chr(\d+)", ref_file))[1]
51
  elif re.search("chrX", ref_file):
@@ -58,7 +58,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
58
  def ensure_biallelic_ref(ref_dir):
59
  for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
60
  ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
61
- print('ensure ', ref_file, ref_biall_path)
62
  os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
63
  os.system(f'bcftools index {ref_biall_path}.gz')
64
  os.remove(ref_file) # remove initial ref file
@@ -78,25 +77,42 @@ def run_beagle(beagle, gb, map_dir, ref_dir):
78
  f' out=imputed_{chr_type} map={map_file}')
79
 
80
 
81
- @click.command()
82
- @click.option('--vcf', help='Path to the target vcf file')
83
- @click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
84
- 'contains data for at least 20 individuals', required=False)
85
- @click.option('--conform', help='Path to conform .jar file')
86
- @click.option('--beagle', help='Path to beagle .jar file')
87
- @click.option('--ref', help='Path to folder with reference genome:'
88
- ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
89
- @click.option('--map', help='Path to folder with PLINK format genetic maps, files are expected to start with'
90
- '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
91
- @click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
92
- def main(vcf, samples, conform, beagle, ref, map, gb):
 
 
 
 
 
 
 
93
  bgzip_and_index(vcf, samples)
94
  if samples:
95
  merge(vcf, samples)
96
  cleaned_file = clean_and_gzip(vcf, samples) # returned cleaned file in .vcf.gz (gzip) format
97
  ensure_biallelic_ref(ref)
98
  run_conform(conform, cleaned_file, ref)
99
- run_beagle(beagle, gb, map, ref)
100
 
101
 
102
  main()
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python
2
  import re
 
3
  import os
4
  import glob
5
+ from setting import IMPUTATION_SETTINGS
6
+
7
  """
8
  script for running Beagle 5.4
9
  All kind of data for this script like human reference panel, genetic maps
 
46
  reference: files was downloaded from from Beagle human reference link
47
  https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
48
  for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
 
49
  if re.search("chr(\d+)", ref_file):
50
  chr_type = (re.search("chr(\d+)", ref_file))[1]
51
  elif re.search("chrX", ref_file):
 
58
  def ensure_biallelic_ref(ref_dir):
59
  for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
60
  ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
 
61
  os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
62
  os.system(f'bcftools index {ref_biall_path}.gz')
63
  os.remove(ref_file) # remove initial ref file
 
77
  f' out=imputed_{chr_type} map={map_file}')
78
 
79
 
80
+ # @click.command()
81
+ # @click.option('--vcf', help='Path to the target vcf file')
82
+ # @click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
83
+ # 'contains data for at least 20 individuals', required=False)
84
+ # @click.option('--conform', help='Path to conform .jar file')
85
+ # @click.option('--beagle', help='Path to beagle .jar file')
86
+ # @click.option('--ref', help='Path to folder with reference genome:'
87
+ # ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
88
+ # @click.option('--maps', help='Path to folder with PLINK format genetic maps, files are expected to start with'
89
+ # '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
90
+ # @click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
91
+ def main():
92
+ vcf = IMPUTATION_SETTINGS.vcf
93
+ samples = IMPUTATION_SETTINGS.samples
94
+ conform = IMPUTATION_SETTINGS.conform
95
+ beagle = IMPUTATION_SETTINGS.beagle
96
+ ref = IMPUTATION_SETTINGS.ref
97
+ maps = IMPUTATION_SETTINGS.maps
98
+ gb = IMPUTATION_SETTINGS.gb
99
  bgzip_and_index(vcf, samples)
100
  if samples:
101
  merge(vcf, samples)
102
  cleaned_file = clean_and_gzip(vcf, samples) # returned cleaned file in .vcf.gz (gzip) format
103
  ensure_biallelic_ref(ref)
104
  run_conform(conform, cleaned_file, ref)
105
+ run_beagle(beagle, gb, maps, ref)
106
 
107
 
108
  main()
109
+ """
110
+ python preprocess/run_beagle.py
111
+ --vcf /Users/alina/Documents/longevity/genomes/antonkulaga.hg37.pickard.annotate_bcf_alldbsnp.vcf
112
+ --samples /Users/alina/Documents/longevity/genomes/hapmap-ceu-all.lift.vcf
113
+ --conform /Users/alina/tools/conform-gt.24May16.cee.jar
114
+ --beagle /Users/alina/tools/beagle.22Jul22.46e.jar
115
+ --ref /Users/alina/progproj/gennet/test_beagle/reference
116
+ --map /Users/alina/progproj/gennet/test_beagle/maps --gb 20
117
+
118
+ """
preprocess/setting.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from pydantic import BaseSettings, Field
3
+ import os
4
+
5
+
6
+ class EnvBeagle(BaseSettings):
7
+ vcf: str = Field(description='Path to the target vcf file')
8
+ samples: Optional[str] = Field(default=None,
9
+ description='Path to VCF with other samples for conform checks, not required if '
10
+ 'target VCF contains data for at least 20 individuals')
11
+ conform: str = Field(description='Path to conform .jar file')
12
+ beagle: str = Field(description='Path to beagle .jar file')
13
+ ref: str = Field(description='Path to folder with reference genome:'
14
+ ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
15
+ maps: str = Field(description='Path to folder with PLINK format genetic maps, files are expected to start with'
16
+ '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
17
+ gb: int = Field(description='Number of gigabytes for running beagle')
18
+
19
+ class Config:
20
+ env_file = os.path.dirname(os.path.abspath(__file__))+"/.env_paths"
21
+
22
+
23
+ IMPUTATION_SETTINGS = EnvBeagle()