Spaces:
No application file
No application file
File size: 3,919 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# Copyright 2010 by Tiago Antao. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Large file parsing of Genepop files.
The standard parser loads the whole file into memory. This parser
provides an iterator over data.
Classes:
- Record - Holds GenePop data.
Functions:
- read - Parses a GenePop record (file) into a Record object.
"""
def get_indiv(line):
    """Get individual's data from line.

    Arguments:
     - line - one individual's line: a name, a comma, then the genotypes
       separated by whitespace (spaces and/or tabs).

    Returns a tuple (indiv_name, allele_list, marker_len):
     - indiv_name is the text before the separating comma, returned as-is
       (it is not stripped).
     - allele_list holds one tuple of ints per locus: two alleles when the
       data parses as diploid, one allele otherwise (haploid).
     - marker_len is the number of digits per allele (2 or 3), inferred from
       the width of the first genotype.

    Raises ValueError if the line has no comma or a genotype is not numeric,
    and IndexError if no genotype follows the name.
    """
    # Genotypes never contain commas, so split on the LAST comma: this keeps
    # working when the individual's name itself contains a comma.
    indiv_name, marker_line = line.rsplit(",", 1)
    # str.split() with no argument handles tabs, runs of spaces and
    # leading/trailing whitespace in a single pass.
    markers = marker_line.split()
    # Width 2 (haploid) or 4 (diploid) means 2 digits per allele.
    marker_len = 2 if len(markers[0]) in (2, 4) else 3
    try:
        allele_list = [
            (int(marker[:marker_len]), int(marker[marker_len:]))
            for marker in markers
        ]
    except ValueError:  # Haploid: the second half is empty and int("") fails
        allele_list = [(int(marker[:marker_len]),) for marker in markers]
    return indiv_name, allele_list, marker_len
def read(handle):
    """Parse the header of a GenePop file and return a Record.

    Arguments:
     - handle is a file-like object that contains a GenePop record.

    Only the comment line, the loci names, the first "POP" marker and the
    first individual's line are consumed here. The marker and that line are
    pushed onto the record's stack, and the first individual's line is also
    used to set the record's marker length.
    """
    rec = Record(handle)
    rec.comment_line = next(handle).rstrip()

    # Loci may be listed one per line, or all on one line separated by a
    # space or by comma+space. Commas are dropped from this first loci line
    # before it is split on single spaces.
    first_loci_line = next(handle).rstrip()
    rec.loci_list.extend(first_loci_line.replace(",", "").split(" "))

    # Every following line is a locus name until the first POP marker
    # (or until the handle is exhausted, i.e. readline() returns "").
    for raw_line in iter(handle.readline, ""):
        entry = raw_line.rstrip()
        if entry.upper() == "POP":
            rec.stack.append("POP")
            break
        rec.loci_list.append(entry)

    # Peek at the first individual to learn the marker length, then keep
    # the line on the stack next to the POP marker.
    first_indiv_line = handle.readline().rstrip()
    _, _, rec.marker_len = get_indiv(first_indiv_line)
    rec.stack.append(first_indiv_line)
    return rec
class Record:
    """Hold information from a GenePop record.

    Members:
     - handle          The file-like object the data is read from.
     - marker_len      The marker length (2 or 3 digit code per allele).
     - comment_line    Comment line.
     - loci_list       List of loci names.
     - populations     Initialised to an empty list; not filled in by
       this class.
     - stack           Lines already taken from the handle; data_generator
       goes through these before reading on from the handle.
     - data_generator  Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    data_generator can either be () - an empty tuple - marking a new
    population or an individual. An individual is something like
    ('Ind1', [(1,1), (3,None), (200,201)]).
    In the case above the individual is called Ind1,
    has three diploid loci. For the second loci, one of the alleles
    is unknown.
    """

    def __init__(self, handle):
        """Initialize the class."""
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        self.populations = []
        self.stack = []

    def data_generator(self):
        """Extract population data.

        Yields an empty tuple for every "POP" line (case-insensitive) and
        an (indiv_name, allele_list) tuple for every individual, where an
        allele coded as 0 is reported as None. Blank lines are skipped.
        """
        # Go through the lines buffered on the stack first, then carry on
        # with whatever is left in the handle.
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if not line:
                    # A blank line (e.g. a trailing empty line) carries no
                    # data and cannot be parsed as an individual.
                    continue
                if line.upper() == "POP":
                    yield ()
                    continue
                indiv_name, allele_list, _ = get_indiv(line)
                clean_list = [
                    tuple(None if al == 0 else al for al in locus)
                    for locus in allele_list
                ]
                yield indiv_name, clean_list
|