Skip to content

Commit

Permalink
Merge pull request #63 from standage/cli/sum
Browse files Browse the repository at this point in the history
Added simple summary script to CLI
  • Loading branch information
standage authored Jun 21, 2017
2 parents 7111850 + 9b0ac68 commit 94686ad
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 1 deletion.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## [0.3.3] - 2017-06-21
### Fixed
- Added the `extent` query that was missing from the index implementation.
- Aliased `index.keys()` to `index.seqids`.

### Added
- Script `tag sum` to provide very basic summaries of genomic GFF3 files.

## [0.3.2] - 2017-04-19
### Added
- Pseudo-features for better handling and sorting of top-level multi-features.
Expand Down
1 change: 1 addition & 0 deletions tag/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
'gff3': tag.cli.gff3.main,
'occ': tag.cli.occ.main,
'pmrna': tag.cli.pmrna.main,
'sum': tag.cli.sum.main,
}


Expand Down
4 changes: 3 additions & 1 deletion tag/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from . import gff3
from . import occ
from . import pmrna
from . import sum


def parser():
Expand All @@ -22,9 +23,10 @@ def parser():
parser.add_argument('-l', '--logfile', metavar='FILE', default=sys.stderr,
type=argparse.FileType('w'))
subparsers = parser.add_subparsers(dest='cmd', metavar='cmd',
help='gff3 | occ | pmrna')
help='gff3 | occ | pmrna | sum')
tag.cli.gff3.subparser(subparsers)
tag.cli.occ.subparser(subparsers)
tag.cli.pmrna.subparser(subparsers)
tag.cli.sum.subparser(subparsers)

return parser
49 changes: 49 additions & 0 deletions tag/cli/sum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2017 Daniel Standage <[email protected]>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import print_function
import argparse
from collections import defaultdict
import tag


def subparser(subparsers):
    """Register the ``sum`` subcommand on an argparse subparsers object.

    :param subparsers: object returned by
                       ``ArgumentParser.add_subparsers()``; a parser named
                       ``sum`` taking one positional ``gff3`` argument is
                       added to it.
    """
    sum_parser = subparsers.add_parser(
        'sum', description='Briefly summarize a GFF3 file'
    )
    sum_parser.add_argument('gff3', help='input file')


def main(args):
    """Print a brief summary of the GFF3 file named by ``args.gff3``.

    The file is loaded into a ``tag.index.Index``. The report, written to
    standard output, contains:

    - the number of sequence IDs in the index and the sum of
      ``end - start`` over each sequence's extent;
    - the total number of feature entries visited;
    - for every feature type (sorted by name), the number of entries and
      the largest ``len()`` observed for an entry of that type.

    :param args: parsed command-line arguments; only ``args.gff3`` is read.
    """
    index = tag.index.Index()
    index.consume_file(args.gff3)
    # Set after loading, as in the original flow; presumably keeps inferred
    # entries out of the iteration below -- TODO confirm against tag.index.
    index.yield_inferred = False

    seqids = list(index.seqids)
    total_length = sum(
        end - start for start, end in map(index.extent, seqids)
    )

    type_counts = defaultdict(int)
    type_maxlens = defaultdict(int)
    for toplevel in tag.select.features(index):
        # Iterating a feature yields the entries that get tallied.
        for entry in toplevel:
            ftype = entry.type
            length = len(entry)
            type_counts[ftype] += 1
            if length > type_maxlens[ftype]:
                type_maxlens[ftype] = length

    lines = [
        'Summary for file "{}":'.format(args.gff3),
        ' - {} annotated sequences'.format(len(seqids)) +
        ' for a total length of {} bp'.format(total_length),
        ' - {} annotated features'.format(sum(type_counts.values())) +
        ' (or feature entries)',
    ]
    for ftype in sorted(type_counts):
        lines.append(
            ' - {} entries of type {}'.format(type_counts[ftype], ftype) +
            ', maximum length: {} bp'.format(type_maxlens[ftype])
        )

    print('\n'.join(lines).strip())
9 changes: 9 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,12 @@ def test_pmrna(capsys):
out, err = capsys.readouterr()
exp_out = tag.pkgdata('nanosplice-primary.gff3').read()
assert out.strip() == exp_out.strip()


def test_sum(capsys):
    """Run ``tag sum`` on a bundled GFF3 file and compare to stored output."""
    gff3 = 'tests/testdata/GCF_001639295.1_ASM163929v1_genomic.gff.gz'
    cli_args = tag.cli.parser().parse_args(['sum', gff3])
    tag.cli.sum.main(cli_args)

    # Only standard output is checked; standard error is ignored.
    stdout, _ = capsys.readouterr()
    expected = tag.pkgdata('sum-test-out.txt').read()
    assert stdout.strip() == expected.strip()
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/testdata/sum-test-out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Summary for file "tests/testdata/GCF_001639295.1_ASM163929v1_genomic.gff.gz":
- 232 annotated sequences for a total length of 2414608 bp
- 4298 annotated features (or feature entries)
- 1973 entries of type CDS, maximum length: 12025 bp
- 1 entries of type RNase_P_RNA, maximum length: 274 bp
- 1 entries of type SRP_RNA, maximum length: 316 bp
- 40 entries of type exon, maximum length: 2857 bp
- 2012 entries of type gene, maximum length: 12025 bp
- 4 entries of type rRNA, maximum length: 2857 bp
- 232 entries of type region, maximum length: 74332 bp
- 2 entries of type repeat_region, maximum length: 3602 bp
- 33 entries of type tRNA, maximum length: 107 bp

0 comments on commit 94686ad

Please sign in to comment.