Source code for tag.bae

#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2019 Battelle National Biodefense Institute.
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import division
from collections import defaultdict
import tag


def encodes_cds(feature):
    for subfeature in feature:
        if subfeature.type == 'CDS':
            return True
    return False


[docs]def collapse_locus(features): """Collapse redundant annotation information. Given a locus, collapse any gene predictions with identical coordinates into a single prediction, noting the annotation sources that support the collapsed gene model. """ features_to_keep = list() cds_by_coord = defaultdict(list) for cds in tag.select.features(features, type='CDS', traverse=True): cds_by_coord[cds.range].append(cds) for coord in sorted(cds_by_coord): cdss = cds_by_coord[coord] support = ','.join([c.source for c in cdss]) newcds = tag.Feature( cdss[0].seqid, 'CDS', coord.start, coord.end, source='tag::bae', score=len(cdss), strand=cdss[0].strand, attrstr='support_from=' + support ) features_to_keep.append(newcds) for feature in features: if not encodes_cds(feature): features_to_keep.append(feature) for feature in sorted(features_to_keep): yield feature
[docs]def collapse_stream(locusstream): """Feature stream for collapsing bacterial annotation data.""" for seqid, interval, locus in locusstream: for feature in collapse_locus(locus): yield feature yield tag.Directive('###')
[docs]def eval_locus(features): """Evaluate congruence between gene predictions from different sources. Gene predictions are assumed to be microbial protein-coding genes, marked as type `CDS`. Any reference protein alignments can be included as features of type `translated_nucleotide_match`. Compute the agreement between different sources of start/stop codon position, entire ORF position, and coverage from reference proteins. """ starts = defaultdict(int) ends = defaultdict(int) intervals = defaultdict(int) coverage = defaultdict(int) numorfs = len(list( tag.select.features(features, type='CDS', traverse=True) )) selector = tag.select.features( features, type=set(['CDS', 'translated_nucleotide_match']), traverse=True, ) for feature in selector: starts[feature.start] += 1 ends[feature.end] += 1 intervals[feature.range] += 1 aligns = tag.select.features(features, type='translated_nucleotide_match') ranges = [a.range for a in aligns] for block in tag.Range.merge_overlapping(ranges): for feat in tag.select.features(features, type='CDS', traverse=True): bpoverlap = feat.range.overlap_extent(block) coverage[feat] += bpoverlap for feature in tag.select.features(features, type='CDS', traverse=True): start_confirmed = starts[feature.start] - 1 start_shared = starts[feature.start] / sum(starts.values()) end_confirmed = ends[feature.end] - 1 end_shared = ends[feature.end] / sum(ends.values()) interval_confirmed = intervals[feature.range] - 1 interval_shared = intervals[feature.range] / sum(intervals.values()) prot_coverage = coverage[feature] / len(feature) feature.add_attribute('start_confirmed', start_confirmed) feature.add_attribute('start_shared', start_shared) feature.add_attribute('end_confirmed', end_confirmed) feature.add_attribute('end_shared', end_shared) feature.add_attribute('orf_confirmed', interval_confirmed) feature.add_attribute('orf_shared', interval_shared) feature.add_attribute('locus_orfs', numorfs) feature.add_attribute('protein_coverage', prot_coverage)
[docs]def eval_stream(locusstream): """Feature stream for bacterial annotation evaluation.""" for seqid, interval, locus in locusstream: eval_locus(locus) for feature in locus: yield feature yield tag.Directive('###')