Source code for tag.bae

#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2019 Battelle National Biodefense Institute.
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import division
from collections import defaultdict
import tag


def encodes_cds(feature):
    for subfeature in feature:
        if subfeature.type == 'CDS':
            return True
    return False


[docs]def collapse_locus(features):
    """Collapse redundant annotation information.

    Given a locus, collapse any gene predictions with identical coordinates
    into a single prediction, noting the annotation sources that support the
    collapsed gene model.
    """
    features_to_keep = list()
    cds_by_coord = defaultdict(list)
    for cds in tag.select.features(features, type='CDS', traverse=True):
        cds_by_coord[cds.range].append(cds)

    for coord in sorted(cds_by_coord):
        cdss = cds_by_coord[coord]
        support = ','.join([c.source for c in cdss])
        newcds = tag.Feature(
            cdss[0].seqid, 'CDS', coord.start, coord.end, source='tag::bae',
            score=len(cdss), strand=cdss[0].strand,
            attrstr='support_from=' + support
        )
        features_to_keep.append(newcds)

    for feature in features:
        if not encodes_cds(feature):
            features_to_keep.append(feature)

    for feature in sorted(features_to_keep):
        yield feature


[docs]def collapse_stream(locusstream):
    """Feature stream for collapsing bacterial annotation data."""
    for seqid, interval, locus in locusstream:
        for feature in collapse_locus(locus):
            yield feature
        yield tag.Directive('###')


[docs]def eval_locus(features):
    """Evaluate congruence between gene predictions from different sources.

    Gene predictions are assumed to be microbial protein-coding genes, marked
    as type `CDS`. Any reference protein alignments can be included as features
    of type `translated_nucleotide_match`. Compute the agreement between
    different sources of start/stop codon position, entire ORF position, and
    coverage from reference proteins.
    """
    starts = defaultdict(int)
    ends = defaultdict(int)
    intervals = defaultdict(int)
    coverage = defaultdict(int)

    numorfs = len(list(
        tag.select.features(features, type='CDS', traverse=True)
    ))
    selector = tag.select.features(
        features, type=set(['CDS', 'translated_nucleotide_match']),
        traverse=True,
    )
    for feature in selector:
        starts[feature.start] += 1
        ends[feature.end] += 1
        intervals[feature.range] += 1

    aligns = tag.select.features(features, type='translated_nucleotide_match')
    ranges = [a.range for a in aligns]
    for block in tag.Range.merge_overlapping(ranges):
        for feat in tag.select.features(features, type='CDS', traverse=True):
            bpoverlap = feat.range.overlap_extent(block)
            coverage[feat] += bpoverlap

    for feature in tag.select.features(features, type='CDS', traverse=True):
        start_confirmed = starts[feature.start] - 1
        start_shared = starts[feature.start] / sum(starts.values())
        end_confirmed = ends[feature.end] - 1
        end_shared = ends[feature.end] / sum(ends.values())
        interval_confirmed = intervals[feature.range] - 1
        interval_shared = intervals[feature.range] / sum(intervals.values())
        prot_coverage = coverage[feature] / len(feature)
        feature.add_attribute('start_confirmed', start_confirmed)
        feature.add_attribute('start_shared', start_shared)
        feature.add_attribute('end_confirmed', end_confirmed)
        feature.add_attribute('end_shared', end_shared)
        feature.add_attribute('orf_confirmed', interval_confirmed)
        feature.add_attribute('orf_shared', interval_shared)
        feature.add_attribute('locus_orfs', numorfs)
        feature.add_attribute('protein_coverage', prot_coverage)


[docs]def eval_stream(locusstream):
    """Feature stream for bacterial annotation evaluation."""
    for seqid, interval, locus in locusstream:
        eval_locus(locus)
        for feature in locus:
            yield feature
        yield tag.Directive('###')