Spaces:
No application file
No application file
# Copyright 2006 by Sean Davis, National Cancer Institute, NIH.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parse Unigene flat file format files such as the Hs.data file. | |
Here is an overview of the flat file format that this parser deals with: | |
Line types/qualifiers:: | |
ID UniGene cluster ID | |
TITLE Title for the cluster | |
GENE Gene symbol | |
CYTOBAND Cytological band | |
EXPRESS Tissues of origin for ESTs in cluster | |
RESTR_EXPR Single tissue or development stage contributes | |
more than half the total EST frequency for this gene. | |
GNM_TERMINUS genomic confirmation of presence of a 3' terminus; | |
T if a non-templated polyA tail is found among | |
a cluster's sequences; else | |
I if templated As are found in genomic sequence or | |
S if a canonical polyA signal is found on | |
the genomic sequence | |
GENE_ID Entrez gene identifier associated with at least one | |
sequence in this cluster; | |
to be used instead of LocusLink. | |
LOCUSLINK LocusLink identifier associated with at least one | |
sequence in this cluster; | |
deprecated in favor of GENE_ID | |
HOMOL Homology; | |
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping | |
on the arabidopsis genome. | |
STS STS | |
ACC= GenBank/EMBL/DDBJ accession number of STS | |
[optional field] | |
UNISTS= identifier in NCBI's UNISTS database | |
TXMAP Transcript map interval | |
MARKER= Marker found on at least one sequence in this | |
cluster | |
RHPANEL= Radiation Hybrid panel used to place marker | |
PROTSIM Protein Similarity data for the sequence with | |
highest-scoring protein similarity in this cluster | |
ORG= Organism | |
PROTGI= Sequence GI of protein | |
PROTID= Sequence ID of protein | |
PCT= Percent alignment | |
ALN= length of aligned region (aa) | |
SCOUNT Number of sequences in the cluster | |
SEQUENCE Sequence | |
ACC= GenBank/EMBL/DDBJ accession number of sequence | |
NID= Unique nucleotide sequence identifier (gi) | |
PID= Unique protein sequence identifier (used for | |
non-ESTs) | |
CLONE= Clone identifier (used for ESTs only) | |
END= End (5'/3') of clone insert read (used for | |
ESTs only) | |
LID= Library ID; see Hs.lib.info for library name | |
and tissue | |
MGC= 5' CDS-completeness indicator; if present, the | |
clone associated with this sequence is believed | |
CDS-complete. A value greater than 511 is the gi | |
of the CDS-complete mRNA matched by the EST, | |
otherwise the value is an indicator of the | |
reliability of the test indicating CDS | |
completeness; higher values indicate more | |
reliable CDS-completeness predictions. | |
SEQTYPE= Description of the nucleotide sequence. | |
Possible values are mRNA, EST and HTC. | |
TRACE= The Trace ID of the EST sequence, as provided by | |
NCBI Trace Archive | |
""" | |
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):
     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
       the clone associated with this sequence
       is believed CDS-complete. A value greater than 511
       is the gi of the CDS-complete mRNA matched by the EST,
       otherwise the value is an indicator of the reliability
       of the test indicating CDS completeness;
       higher values indicate more reliable CDS-completeness
       predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
       are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
       Trace Archive

    Additional attributes set by this class:
     - text         The raw text the object was built from ("" if none)
     - is_image     True when the CLONE value starts with "IMAGE"
     - image        The CLONE value with its first six characters (the
       "IMAGE:" prefix) removed; "" when the clone is not an IMAGE clone

    """

    def __init__(self, text=None):
        """Initialize the class.

        ``text`` is the part of a SEQUENCE line after the tag column, made
        of ``KEY=value`` fields separated by ``"; "``.  When omitted, every
        attribute keeps its empty default.
        """
        self.acc = ""
        self.nid = ""
        self.lid = ""
        self.pid = ""
        self.clone = ""
        self.image = ""
        self.is_image = False
        self.end = ""
        self.mgc = ""
        self.seqtype = ""
        self.trace = ""
        # Always define ``text`` so that __repr__ works on an empty instance.
        self.text = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from the ``KEY=value`` fields in ``text``.

        Raises ValueError if a field does not contain ``=``.
        """
        for part in text.split("; "):
            # Split on the first "=" only, so values may themselves hold "=".
            key, separator, val = part.partition("=")
            if not separator:
                raise ValueError(f"Cannot parse SEQUENCE field {part!r}")
            if key == "CLONE" and val.startswith("IMAGE"):
                self.is_image = True
                self.image = val[6:]  # drop the leading "IMAGE:" prefix
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene SequenceLine object as a string."""
        return self.text
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The raw text the object was built from is kept in ``text`` ("" if the
    object was created without text).
    """

    def __init__(self, text=None):
        """Initialize the class.

        ``text`` is the part of a PROTSIM line after the tag column, made
        of ``KEY=value`` fields separated by ``"; "``.  When omitted, every
        attribute keeps its empty default.
        """
        self.org = ""
        self.protgi = ""
        self.protid = ""
        self.pct = ""
        self.aln = ""
        # Always define ``text`` so that __repr__ works on an empty instance.
        self.text = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from the ``KEY=value`` fields in ``text``.

        Raises ValueError if a field does not contain ``=``.
        """
        for part in text.split("; "):
            # Split on the first "=" only, so values may themselves hold "=".
            key, separator, val = part.partition("=")
            if not separator:
                raise ValueError(f"Cannot parse PROTSIM field {part!r}")
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene ProtsimLine object as a string."""
        return self.text
class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The raw text the object was built from is kept in ``text`` ("" if the
    object was created without text).
    """

    def __init__(self, text=None):
        """Initialize the class.

        ``text`` is the part of an STS line after the tag column, made of
        whitespace-separated ``KEY=value`` fields.  When omitted, every
        attribute keeps its empty default.
        """
        self.acc = ""
        self.unists = ""
        # Always define ``text`` so that __repr__ works on an empty instance.
        self.text = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Populate the attributes from the ``KEY=value`` fields in ``text``.

        Raises ValueError if a field does not contain ``=``.
        """
        # split() without an argument tolerates runs of whitespace between
        # fields instead of producing empty, unparsable parts.
        for part in text.split():
            # Split on the first "=" only, so values may themselves hold "=".
            key, separator, val = part.partition("=")
            if not separator:
                raise ValueError(f"Cannot parse STS field {part!r}")
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene STSLine object as a string."""
        return self.text
class Record:
    """Hold the contents of a single UniGene cluster record.

    A freshly created record has the following attributes::

        ID            ''   ID line
        species       ''   Hs, Bt, etc.
        title         ''   TITLE line
        symbol        ''   GENE line
        cytoband      ''   CYTOBAND line
        express       []   EXPRESS line; the parser in this module splits
                           it on '|' into a list of strings
        restr_expr    ''   RESTR_EXPR line
        gnm_terminus  ''   GNM_TERMINUS line
        gene_id       ''   GENE_ID line
        locuslink     ''   LOCUSLINK line
        homol         ''   HOMOL line
        chromosome    ''   CHROMOSOME line
        protsim       []   PROTSIM entries (ProtsimLine objects)
        sequence      []   SEQUENCE entries (SequenceLine objects)
        sts           []   STS entries (STSLine objects)
        txmap         []   TXMAP entries

    """

    def __init__(self):
        """Initialize the class."""
        # (attribute name, factory for its empty default), in the order the
        # attributes are created.  Calling the factory per attribute gives
        # every list attribute its own independent list object.
        empty_defaults = (
            ("ID", str),  # ID line
            ("species", str),  # Hs, Bt, etc.
            ("title", str),  # TITLE line
            ("symbol", str),  # GENE line
            ("cytoband", str),  # CYTOBAND line
            ("express", list),  # EXPRESS line
            ("restr_expr", str),  # RESTR_EXPR line
            ("gnm_terminus", str),  # GNM_TERMINUS line
            ("gene_id", str),  # GENE_ID line
            ("locuslink", str),  # LOCUSLINK line
            ("homol", str),  # HOMOL line
            ("chromosome", str),  # CHROMOSOME line
            ("protsim", list),  # PROTSIM entries
            ("sequence", list),  # SEQUENCE entries
            ("sts", list),  # STS entries
            ("txmap", list),  # TXMAP entries
        )
        for attribute, make_default in empty_defaults:
            setattr(self, attribute, make_default())

    def __repr__(self):
        """Represent the UniGene Record object as a string for debugging."""
        return "<{}> {} {} {}".format(
            self.__class__.__name__, self.ID, self.symbol, self.title
        )
def parse(handle):
    """Iterate over the UniGene records in a file holding several records.

    This is a generator: it repeatedly reads one record from ``handle``
    and yields it, stopping as soon as no further record is returned.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
def read(handle):
    """Read and load a UniGene record, one record per file.

    Returns the single record found in ``handle``.

    Raises ValueError if the handle contains no record, or if any data
    remains in the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
# Everything below is private | |
def _read(handle):
    """Read one UniGene record from ``handle`` (PRIVATE).

    Lines are consumed up to and including the ``//`` record terminator.
    Each line is split into a tag (the first 12 columns) and a value (the
    rest of the line), both stripped of trailing whitespace.

    Returns the populated Record, or None if the handle yields no lines.

    Raises ValueError if:
     - a line other than ID is met before any ID line,
     - a HOMOL value is neither YES nor NO,
     - the record ends without a SCOUNT line, or SCOUNT disagrees with the
       number of SEQUENCE lines,
     - an unknown tag is met,
     - the handle is exhausted in the middle of a record.
    """
    UG_INDENT = 12  # width of the fixed tag column
    record = None
    scount = None  # set by the SCOUNT line; checked at the record terminator
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split(".")[0]
        elif record is None:
            # Every other tag stores into the current record, so it cannot
            # be handled before an ID line has created one.
            raise ValueError(f"Found {tag!r} line before any ID line")
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError(f"Cannot parse HOMOL line {line}")
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # Kept as the raw text of the line; no further parsing is done.
            record.txmap.append(value)
        elif tag == "//":
            if scount is None:
                raise ValueError(
                    f"Record {record.ID} ended without a SCOUNT line"
                )
            if len(record.sequence) != scount:
                raise ValueError(
                    "The number of sequences specified in the record "
                    "(%d) does not agree with the number of sequences found (%d)"
                    % (scount, len(record.sequence))
                )
            return record
        else:
            raise ValueError(f"Unknown tag {tag}")
    if record:
        raise ValueError("Unexpected end of stream.")