aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2006 by Sean Davis, National Cancer Institute, NIH.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parse Unigene flat file format files such as the Hs.data file.
Here is an overview of the flat file format that this parser deals with:
Line types/qualifiers::
ID UniGene cluster ID
TITLE Title for the cluster
GENE Gene symbol
CYTOBAND Cytological band
EXPRESS Tissues of origin for ESTs in cluster
RESTR_EXPR Single tissue or development stage contributes
more than half the total EST frequency for this gene.
GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
T if a non-templated polyA tail is found among
a cluster's sequences; else
I if templated As are found in genomic sequence or
S if a canonical polyA signal is found on
the genomic sequence
GENE_ID Entrez gene identifier associated with at least one
sequence in this cluster;
to be used instead of LocusLink.
LOCUSLINK LocusLink identifier associated with at least one
sequence in this cluster;
deprecated in favor of GENE_ID
HOMOL Homology;
CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
on the arabidopsis genome.
STS STS
ACC= GenBank/EMBL/DDBJ accession number of STS
[optional field]
UNISTS= identifier in NCBI's UNISTS database
TXMAP Transcript map interval
MARKER= Marker found on at least one sequence in this
cluster
RHPANEL= Radiation Hybrid panel used to place marker
PROTSIM Protein Similarity data for the sequence with
highest-scoring protein similarity in this cluster
ORG= Organism
PROTGI= Sequence GI of protein
PROTID= Sequence ID of protein
PCT= Percent alignment
ALN= length of aligned region (aa)
SCOUNT Number of sequences in the cluster
SEQUENCE Sequence
ACC= GenBank/EMBL/DDBJ accession number of sequence
NID= Unique nucleotide sequence identifier (gi)
PID= Unique protein sequence identifier (used for
non-ESTs)
CLONE= Clone identifier (used for ESTs only)
END= End (5'/3') of clone insert read (used for
ESTs only)
LID= Library ID; see Hs.lib.info for library name
and tissue
MGC= 5' CDS-completeness indicator; if present, the
clone associated with this sequence is believed
CDS-complete. A value greater than 511 is the gi
of the CDS-complete mRNA matched by the EST,
otherwise the value is an indicator of the
reliability of the test indicating CDS
completeness; higher values indicate more
reliable CDS-completeness predictions.
SEQTYPE= Description of the nucleotide sequence.
Possible values are mRNA, EST and HTC.
TRACE= The Trace ID of the EST sequence, as provided by
NCBI Trace Archive
"""
class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):
     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
                    the clone associated with this sequence
                    is believed CDS-complete. A value greater than 511
                    is the gi of the CDS-complete mRNA matched by the EST,
                    otherwise the value is an indicator of the reliability
                    of the test indicating CDS completeness;
                    higher values indicate more reliable CDS-completeness
                    predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
                    are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                    Trace Archive

    Additional attributes:
     - text         The raw text the object was built from ("" if none).
     - is_image     True when the CLONE value starts with "IMAGE".
     - image        The CLONE value with its first six characters removed
                    (only set when is_image is True).
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the value part of a SEQUENCE line (fields of the form
           KEY=value separated by "; "), or None for an empty object.

        Raises ValueError if a field of text does not contain "=".
        """
        # Always define text so that repr() also works on an empty object.
        self.text = ""
        self.acc = ""
        self.nid = ""
        self.lid = ""
        self.pid = ""
        self.clone = ""
        self.image = ""
        self.is_image = False
        self.end = ""
        self.mgc = ""
        self.seqtype = ""
        self.trace = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per KEY=value field found in text."""
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            if key == "CLONE" and val.startswith("IMAGE"):
                self.is_image = True
                # Skip the five-letter prefix and the single character that
                # follows it (e.g. "IMAGE:12345" gives "12345").
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene SequenceLine object as a string."""
        return self.text
class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    The raw text the object was built from is kept in the ``text``
    attribute ("" if the object was created without text).
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the value part of a PROTSIM line (fields of the form
           KEY=value separated by "; "), or None for an empty object.

        Raises ValueError if a field of text does not contain "=".
        """
        # Always define text so that repr() also works on an empty object.
        self.text = ""
        self.org = ""
        self.protgi = ""
        self.protid = ""
        self.pct = ""
        self.aln = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per KEY=value field found in text."""
        for part in text.split("; "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene ProtsimLine object as a string."""
        return self.text
class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    The raw text the object was built from is kept in the ``text``
    attribute ("" if the object was created without text).
    """

    def __init__(self, text=None):
        """Initialize the class.

        Arguments:
         - text - the value part of an STS line (fields of the form
           KEY=value separated by a single space), or None for an empty
           object.

        Raises ValueError if a field of text does not contain "=".
        """
        # Always define text so that repr() also works on an empty object.
        self.text = ""
        self.acc = ""
        self.unists = ""
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per KEY=value field found in text."""
        # Unlike the other line types, STS fields are separated by a space.
        for part in text.split(" "):
            # Split on the first "=" only, so a value may itself contain "=".
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return UniGene STSLine object as a string."""
        return self.text
class Record:
    """Hold the fields of a single Unigene record.

    A new record starts out empty; these are the attributes it carries::

        ID            ID line
        species       Hs, Bt, etc.
        title         TITLE line
        symbol        GENE line
        cytoband      CYTOBAND line
        express       EXPRESS line, as a list of strings
        restr_expr    RESTR_EXPR line
        gnm_terminus  GNM_TERMINUS line
        gene_id       GENE_ID line
        locuslink     LOCUSLINK line
        homol         HOMOL line
        chromosome    CHROMOSOME line
        protsim       PROTSIM entries, list of ProtsimLine
        sequence      SEQUENCE entries, list of SequenceLine
        sts           STS entries, list of STSLine
        txmap         TXMAP entries, list

    """

    def __init__(self):
        """Create an empty record: "" for scalar fields, [] for lists."""
        # Identification and naming fields.
        self.ID = self.species = self.title = self.symbol = self.cytoband = ""
        # Tissues of origin; one string per tissue.
        self.express = []
        # Remaining single-valued fields.
        self.restr_expr = self.gnm_terminus = self.gene_id = ""
        self.locuslink = self.homol = self.chromosome = ""
        # Repeated line types; each gets its own fresh list.
        self.protsim, self.sequence, self.sts, self.txmap = [], [], [], []

    def __repr__(self):
        """Return a short debugging string: class name, ID, symbol, title."""
        label = type(self).__name__
        return f"<{label}> {self.ID} {self.symbol} {self.title}"
def parse(handle):
    """Iterate over the UniGene records in a file holding several of them.

    Records are read from handle one after another and yielded as they are
    completed; iteration stops as soon as no further record can be read.
    """
    entry = _read(handle)
    while entry:
        yield entry
        entry = _read(handle)
def read(handle):
    """Read and load a UniGene record, one record per file.

    Arguments:
     - handle - file-like object to read from; it is iterated line by line
       for the record and then read() once to check for trailing data.

    Returns the single record found in handle.

    Raises ValueError if handle holds no record, or if any data is left
    after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now, so anything that
    # is still left in the handle means the file holds more than one record.
    if handle.read():
        raise ValueError("More than one UniGene record found")
    return record
# Everything below is private
def _read(handle):
    """Read one UniGene record from handle (PRIVATE).

    Each line is split at a fixed column: the first 12 characters hold the
    tag and the rest of the line holds the value. A record starts at its
    ID line and ends at the "//" line.

    Arguments:
     - handle - iterable of text lines, consumed up to and including the
       "//" line of the first record.

    Returns the populated Record, or None if handle yields no lines.

    Raises ValueError if:
     - a line appears before the ID line of the record,
     - a tag is not recognised,
     - a HOMOL value is neither "YES" nor "NO",
     - the SCOUNT line is missing, is not an integer, or disagrees with the
       number of SEQUENCE lines,
     - a PROTSIM, SEQUENCE or STS field is malformed,
     - the input ends before the "//" line of a started record.
    """
    UG_INDENT = 12  # width of the tag column
    record = None
    scount = None  # stays None until the SCOUNT line has been seen
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            # The species code is the part of the ID before the first dot.
            record.species = record.ID.split(".")[0]
            continue
        if record is None:
            # Every other line type needs a record to store its value in.
            raise ValueError(f"Found line before the ID line of a record: {line}")
        if tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError(f"Cannot parse HOMOL line {line}")
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # Kept as the raw value text of the line.
            record.txmap.append(value)
        elif tag == "//":
            if scount is None:
                raise ValueError(f"Missing SCOUNT line in record {record.ID}")
            if len(record.sequence) != scount:
                raise ValueError(
                    "The number of sequences specified in the record "
                    "(%d) does not agree with the number of sequences found (%d)"
                    % (scount, len(record.sequence))
                )
            return record
        else:
            raise ValueError(f"Unknown tag {tag}")
    # Running out of lines is only fine if no record had been started.
    if record:
        raise ValueError("Unexpected end of stream.")