Source code for tag.transcript

#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2016 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import print_function
from collections import defaultdict
from sys import stderr
import tag


# One day I may write a program to crawl the Sequence Ontology and build a more
# comprehensive version of this set of terms. For now, this should handle >95%
# of GFF3 files in the wild.
type_terms = set([
    'mRNA',
    'transcript',
    'primary_transcript',
    'lnc_RNA',
    'tRNA',
    'rRNA',
])


def _emplace_pmrna(mrnas, parent, strict=False):
    """Retrieve the primary mRNA and discard all others."""
    mrnas.sort(key=lambda m: (m.cdslen, m.get_attribute('ID')))
    pmrna = mrnas.pop()
    if strict:
        parent.children = [pmrna]
    else:
        parent.children = [c for c in parent.children if c not in mrnas]


def _emplace_transcript(transcripts, parent):
    """Retrieve the primary transcript and discard all others."""
    transcripts.sort(key=lambda t: (len(t), t.get_attribute('ID')))
    pt = transcripts.pop()
    parent.children = [pt]


[docs]def primary_mrna(entrystream, parenttype='gene'):
    """
    Select a single mRNA as a representative for each protein-coding gene.

    The primary mRNA is the one with the longest translation product. In cases
    where multiple isoforms have the same translated length, the feature ID is
    used for sorting.

    This function **does not** return only mRNA features, it returns all GFF3
    entry types (pragmas, features, sequences, etc). The function **does**
    modify the gene features that pass through to ensure that they have at most
    a single mRNA feature.

    >>> reader = tag.GFF3Reader(tag.tests.data_stream('pdom-withseq.gff3'))
    >>> filter = tag.transcript.primary_mrna(reader)
    >>> for gene in tag.select.features(filter, type='gene'):
    ...    assert gene.num_children == 1
    """
    for entry in entrystream:
        if not isinstance(entry, tag.Feature):
            yield entry
            continue

        for parent in tag.select.features(entry, parenttype, traverse=True):
            mrnas = [f for f in parent.children if f.type == 'mRNA']
            if len(mrnas) == 0:
                continue
            _emplace_pmrna(mrnas, parent)
        yield entry


def _get_primary_type(ttypes, parent, logstream=stderr):
    """Check for multiple transcript types and, if possible, select one."""
    if len(ttypes) > 1:
        if logstream:  # pragma: no branch
            message = '[tag::transcript::primary_transcript]'
            message += ' WARNING: feature {:s}'.format(parent.slug)
            message += ' has multiple associated transcript types'
            message += ' {}'.format(ttypes)
            print(message, file=logstream)
        if 'mRNA' not in ttypes:
            message = (
                'cannot resolve multiple transcript types if "mRNA" is'
                ' not one of those types {}'.format(ttypes)
            )
            raise Exception(message)
        ttypes = ['mRNA']
    return ttypes[0]


[docs]def primary_transcript(entrystream, parenttype='gene', logstream=stderr):
    """
    Select a single transcript as a representative for each gene.

    This function is a generalization of the `primary_mrna` function that
    attempts, under certain conditions, to select a single transcript as a
    representative for each gene. If a gene encodes multiple transcript types,
    one of those types must be **mRNA** or the function will complain loudly
    and fail.

    For mRNAs, the primary transcript is selected according to translated
    length. For all other transcript types, the length of the transcript
    feature itself is used. I'd be eager to hear suggestions for alternative
    selection criteria.

    Like the `primary_mrna` function, this function **does not** return only
    transcript features. It **does** modify gene features to ensure that each
    has at most one transcript feature.

    >>> reader = tag.GFF3Reader(
    ...     tag.tests.data_stream('psyllid-mixed-gene.gff3.gz')
    ... )
    >>> gene_filter = tag.select.features(reader, type='gene')
    >>> trans_filter = tag.transcript.primary_transcript(gene_filter)
    >>> for gene in trans_filter:
    ...     assert gene.num_children == 1

    In cases where the direct children of a gene feature have heterogenous
    types, the `primary_mrna` function will only discard mRNA features. This
    function, however, will discard all direct children of the gene that are
    not the primary transcript, including non-transcript children. This is a
    retty subtle distinction, and anecdotal experience suggests that cases in
    which the distinction actually matters are extremely rare.
    """
    for entry in entrystream:
        if not isinstance(entry, tag.Feature):
            yield entry
            continue

        for parent in tag.select.features(entry, parenttype, traverse=True):
            if parent.num_children == 0:
                continue

            transcripts = defaultdict(list)
            for child in parent.children:
                if child.type in type_terms:
                    transcripts[child.type].append(child)

            if len(transcripts) == 0:
                continue

            ttypes = list(transcripts.keys())
            ttype = _get_primary_type(ttypes, parent)
            transcript_list = transcripts[ttype]

            if ttype == 'mRNA':
                _emplace_pmrna(transcript_list, parent, strict=True)
            else:
                _emplace_transcript(transcript_list, parent)

        yield entry