Spaces:
No application file
No application file
File size: 12,738 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 |
# Copyright 2006 by Sean Davis, National Cancer Institute, NIH.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parse Unigene flat file format files such as the Hs.data file.
Here is an overview of the flat file format that this parser deals with:
Line types/qualifiers::
ID UniGene cluster ID
TITLE Title for the cluster
GENE Gene symbol
CYTOBAND Cytological band
EXPRESS Tissues of origin for ESTs in cluster
RESTR_EXPR Single tissue or development stage contributes
more than half the total EST frequency for this gene.
GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
T if a non-templated polyA tail is found among
a cluster's sequences; else
I if templated As are found in genomic sequence or
S if a canonical polyA signal is found on
the genomic sequence
GENE_ID Entrez gene identifier associated with at least one
sequence in this cluster;
to be used instead of LocusLink.
LOCUSLINK LocusLink identifier associated with at least one
sequence in this cluster;
deprecated in favor of GENE_ID
HOMOL Homology;
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
on the arabidopsis genome.
STS STS
ACC= GenBank/EMBL/DDBJ accession number of STS
[optional field]
UNISTS= identifier in NCBI's UNISTS database
TXMAP Transcript map interval
MARKER= Marker found on at least one sequence in this
cluster
RHPANEL= Radiation Hybrid panel used to place marker
PROTSIM Protein Similarity data for the sequence with
highest-scoring protein similarity in this cluster
ORG= Organism
PROTGI= Sequence GI of protein
PROTID= Sequence ID of protein
PCT= Percent alignment
ALN= length of aligned region (aa)
SCOUNT Number of sequences in the cluster
SEQUENCE Sequence
ACC= GenBank/EMBL/DDBJ accession number of sequence
NID= Unique nucleotide sequence identifier (gi)
PID= Unique protein sequence identifier (used for
non-ESTs)
CLONE= Clone identifier (used for ESTs only)
END= End (5'/3') of clone insert read (used for
ESTs only)
LID= Library ID; see Hs.lib.info for library name
and tissue
MGC= 5' CDS-completeness indicator; if present, the
clone associated with this sequence is believed
CDS-complete. A value greater than 511 is the gi
of the CDS-complete mRNA matched by the EST,
otherwise the value is an indicator of the
reliability of the test indicating CDS
completeness; higher values indicate more
reliable CDS-completeness predictions.
SEQTYPE= Description of the nucleotide sequence.
Possible values are mRNA, EST and HTC.
TRACE= The Trace ID of the EST sequence, as provided by
NCBI Trace Archive
"""
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):
     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
                    the clone associated with this sequence
                    is believed CDS-complete. A value greater than 511
                    is the gi of the CDS-complete mRNA matched by the EST,
                    otherwise the value is an indicator of the reliability
                    of the test indicating CDS completeness;
                    higher values indicate more reliable CDS-completeness
                    predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
                    are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                    Trace Archive

    Additional attributes:
     - text         The raw text the object was built from ("" if none).
     - is_image     True when the CLONE value starts with "IMAGE".
     - image        The CLONE value with its first six characters removed,
                    set only when ``is_image`` is True.

    All values are stored as strings, exactly as they appear in the line.
    """

    def __init__(self, text=None):
        """Initialize the class.

        If ``text`` is given it is parsed immediately; otherwise every
        attribute keeps its empty default.
        """
        # Always define ``text`` so that repr() works on an empty object too.
        self.text = ""
        self.acc = ""
        self.nid = ""
        self.lid = ""
        self.pid = ""
        self.clone = ""
        self.image = ""
        self.is_image = False
        self.end = ""
        self.mgc = ""
        self.seqtype = ""
        self.trace = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set attributes from the "; "-separated ``KEY=value`` fields.

        Raises ValueError if a field does not contain "=".
        """
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, sep, val = part.partition("=")
            if not sep:
                raise ValueError(f"Cannot parse SEQUENCE field {part!r}")
            if key == "CLONE" and val.startswith("IMAGE"):
                self.is_image = True
                # Skip the 5-letter "IMAGE" prefix plus the one separator
                # character that follows it.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene SequenceLine object as a string."""
        return self.text
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The raw text is kept in the ``text`` attribute ("" if none was given).
    All values are stored as strings, exactly as they appear in the line.
    """

    def __init__(self, text=None):
        """Initialize the class.

        If ``text`` is given it is parsed immediately; otherwise every
        attribute keeps its empty default.
        """
        # Always define ``text`` so that repr() works on an empty object too.
        self.text = ""
        self.org = ""
        self.protgi = ""
        self.protid = ""
        self.pct = ""
        self.aln = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set attributes from the "; "-separated ``KEY=value`` fields.

        Raises ValueError if a field does not contain "=".
        """
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, sep, val = part.partition("=")
            if not sep:
                raise ValueError(f"Cannot parse PROTSIM field {part!r}")
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene ProtsimLine object as a string."""
        return self.text
class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The raw text is kept in the ``text`` attribute ("" if none was given).
    All values are stored as strings, exactly as they appear in the line.
    """

    def __init__(self, text=None):
        """Initialize the class.

        If ``text`` is given it is parsed immediately; otherwise every
        attribute keeps its empty default.
        """
        # Always define ``text`` so that repr() works on an empty object too.
        self.text = ""
        self.acc = ""
        self.unists = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set attributes from the whitespace-separated ``KEY=value`` fields.

        Raises ValueError if a field does not contain "=".
        """
        # split() without an argument tolerates repeated or trailing blanks,
        # which split(" ") would turn into empty, unparsable fields.
        for part in text.split():
            # Split on the first "=" only, so a value may itself contain "=".
            key, sep, val = part.partition("=")
            if not sep:
                raise ValueError(f"Cannot parse STS field {part!r}")
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene STSLine object as a string."""
        return self.text
class Record:
    """Hold the contents of a single UniGene record.

    A freshly created record carries these attributes::

        ID            ''   ID line
        species       ''   Hs, Bt, etc.
        title         ''   TITLE line
        symbol        ''   GENE line
        cytoband      ''   CYTOBAND line
        express       []   EXPRESS line, as a list of strings
        restr_expr    ''   RESTR_EXPR line
        gnm_terminus  ''   GNM_TERMINUS line
        gene_id       ''   GENE_ID line
        locuslink     ''   LOCUSLINK line
        homol         ''   HOMOL line
        chromosome    ''   CHROMOSOME line
        protsim       []   PROTSIM entries (ProtsimLine objects)
        sequence      []   SEQUENCE entries (SequenceLine objects)
        sts           []   STS entries (STSLine objects)
        txmap         []   TXMAP entries

    """

    def __init__(self):
        """Create an empty record: blank strings and empty lists."""
        # Single-valued fields up to the first list-valued one.
        self.ID = self.species = self.title = self.symbol = self.cytoband = ""
        # Tissues named on the EXPRESS line.
        self.express = []
        # Remaining single-valued fields.
        self.restr_expr = self.gnm_terminus = self.gene_id = ""
        self.locuslink = self.homol = self.chromosome = ""
        # Repeatable line types; each attribute gets its own fresh list.
        self.protsim, self.sequence = [], []
        self.sts, self.txmap = [], []

    def __repr__(self):
        """Return a short debugging string: class name, ID, symbol, title."""
        return "<{}> {} {} {}".format(
            type(self).__name__, self.ID, self.symbol, self.title
        )
def parse(handle):
    """Iterate over the UniGene records in a file holding several records.

    This is a generator: each record obtained from ``handle`` is yielded in
    turn, and iteration stops as soon as no further record is returned.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
def read(handle):
    """Read and load a UniGene record, one record per file.

    Returns the single record found in ``handle``.

    Raises ValueError if ``handle`` contains no record, or if any data is
    left in ``handle`` after the first record.
    """
    record = _read(handle)
    if not record:
        # The messages name UniGene; they previously said "SwissProt".
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
# Everything below is private
def _read(handle):
UG_INDENT = 12
record = None
for line in handle:
tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
line = line.rstrip()
if tag == "ID":
record = Record()
record.ID = value
record.species = record.ID.split(".")[0]
elif tag == "TITLE":
record.title = value
elif tag == "GENE":
record.symbol = value
elif tag == "GENE_ID":
record.gene_id = value
elif tag == "LOCUSLINK":
record.locuslink = value
elif tag == "HOMOL":
if value == "YES":
record.homol = True
elif value == "NO":
record.homol = True
else:
raise ValueError(f"Cannot parse HOMOL line {line}")
elif tag == "EXPRESS":
record.express = [word.strip() for word in value.split("|")]
elif tag == "RESTR_EXPR":
record.restr_expr = [word.strip() for word in value.split("|")]
elif tag == "CHROMOSOME":
record.chromosome = value
elif tag == "CYTOBAND":
record.cytoband = value
elif tag == "PROTSIM":
protsim = ProtsimLine(value)
record.protsim.append(protsim)
elif tag == "SCOUNT":
scount = int(value)
elif tag == "SEQUENCE":
sequence = SequenceLine(value)
record.sequence.append(sequence)
elif tag == "STS":
sts = STSLine(value)
record.sts.append(sts)
elif tag == "//":
if len(record.sequence) != scount:
raise ValueError(
"The number of sequences specified in the record "
"(%d) does not agree with the number of sequences found (%d)"
% (scount, len(record.sequence))
)
return record
else:
raise ValueError(f"Unknown tag {tag}")
if record:
raise ValueError("Unexpected end of stream.")
|