Skip to content

Commit

Permalink
Full fragment coverage mode for proper pairs (#246)
Browse files Browse the repository at this point in the history
* Add initial idea for full fragment coverage mode

* Adds bam file to test fragment-mode

* Simplify read filtering in fragment-mode

* Passes the fragment-mode arg to coverage()

* Removes unnecessary 0-indexing correction (start is 1-indexed)

* Adds regression test for fragment mode

* Uses tabs instead of spaces in expected test output

* Creates new bam file for testing fragment-mode

* Regenerates test bam files for fragment-mode

* Updates test expectation

* Fixes whitespace in expectation

* Removes seen_ids hash set. Improves arg string

---------

Co-authored-by: Ludvig <[email protected]>
  • Loading branch information
LudvigOlsen and Ludvig authored Jan 17, 2025
1 parent a827b97 commit b54ce45
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 6 deletions.
14 changes: 14 additions & 0 deletions functional-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,20 @@ assert_in_stderr "--max-frag-len was lower than --min-frag-len."
assert_exit_code 2


# fragment-mode
run fragment_mode $exe t --fragment-mode tests/full-fragment-pairs.bam
assert_equal "$(zcat < t.per-base.bed.gz)" "chr22:20000000-23000000 0 17318 0
chr22:20000000-23000000 17318 17320 1
chr22:20000000-23000000 17320 17420 2
chr22:20000000-23000000 17420 17756 1
chr22:20000000-23000000 17756 52130 0
chr22:20000000-23000000 52130 52135 1
chr22:20000000-23000000 52135 52235 2
chr22:20000000-23000000 52235 52546 1
chr22:20000000-23000000 52546 3000001 0"
assert_exit_code 0


unset MOSDEPTH_Q0
unset MOSDEPTH_Q1
unset MOSDEPTH_Q2
Expand Down
29 changes: 23 additions & 6 deletions mosdepth.nim
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t,
targets: seq[Target], mapq: int = -1, min_len: int = -1,
max_len: int = int.high, eflag: uint16 = 1796, iflag: uint16 = 0,
read_groups: seq[string] = (@[]), fast_mode: bool = false,
last_tid: var int = -1): int =
fragment_mode: bool = false, last_tid: var int = -1): int =
# depth updates arr in-place and yields the tid for each chrom.
# returns -1 if the chrom is not found in the bam header
# returns -2 if the chrom was found in the header, but there was no data for it
Expand Down Expand Up @@ -276,7 +276,7 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t,
# rec: --------------
# mate: ------------
# handle overlapping mate pairs.
if (not fast_mode) and rec.flag.proper_pair and (
if (not fast_mode) and (not fragment_mode) and rec.flag.proper_pair and (
not rec.flag.supplementary):
if rec.b.core.tid == rec.b.core.mtid and rec.stop > rec.matepos and
# First case is partial overlap, second case is complete overlap
Expand Down Expand Up @@ -325,9 +325,20 @@ proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t,
last_pos = p.pos
if pair_depth != 0: echo $rec.qname & ":" & $rec & " " &
$mate.qname & ":" & $mate & " " & $pair_depth

if fast_mode:
arr[rec.start] += 1
arr[rec.stop] -= 1

elif fragment_mode:
# only count read1 from proper pairs
if rec.flag.read2 or (not rec.flag.proper_pair) or rec.flag.supplementary:
continue

var fragment_start = min(rec.start, rec.matepos)
arr[fragment_start] += 1
arr[fragment_start + abs(rec.isize)] -= 1

else:
inc_coverage(rec.cigar, rec.start.int, arr)

Expand Down Expand Up @@ -563,8 +574,7 @@ proc to_tuples(targets: seq[Target]): seq[tuple[name: string, length: int]] =
result[i] = (t.name, t.length.int)

proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int, eflag: uint16, iflag: uint16, region: string, thresholds: seq[int],
fast_mode: bool, args: Table[string, docopt.Value],
use_median: bool = false, use_d4: bool = false) =
fast_mode: bool, args: Table[string, docopt.Value], use_median: bool = false, fragment_mode: bool = false, use_d4: bool = false) =
# windows are either from regions, or fixed-length windows.
# we assume the input is sorted by chrom.
var
Expand Down Expand Up @@ -676,7 +686,7 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int,
continue
rchrom = region_t(chrom: target.name)
var tid = coverage(bam, arr, rchrom, targets, mapq, min_len, max_len, eflag,
iflag, read_groups = read_groups, fast_mode = fast_mode,
iflag, read_groups = read_groups, fast_mode = fast_mode, fragment_mode = fragment_mode,
last_tid = last_tid)
if tid == -1: continue # -1 means that chrom is not even in the bam
if tid != -2: # -2 means there were no reads in the bam
Expand Down Expand Up @@ -872,6 +882,7 @@ Other options:
-F --flag <FLAG> exclude reads with any of the bits in FLAG set [default: 1796]
-i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
-x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
-a --fragment-mode count the coverage of the full fragment including the full insert (proper pairs only).
-q --quantize <segments> write quantized output see docs for description.
-Q --mapq <mapq> mapping quality threshold. reads with a quality less than this value are ignored [default: 0]
-l --min-frag-len <min-frag-len> minimum insert size. reads with a smaller insert size than this are ignored [default: -1]
Expand Down Expand Up @@ -904,12 +915,17 @@ Other options:
region: string
thresholds: seq[int] = threshold_args($args["--thresholds"])
fast_mode: bool = args["--fast-mode"]
fragment_mode: bool = args["--fragment-mode"]
use_median: bool = args["--use-median"]
when defined(d4):
var use_d4: bool = args["--d4"] and not args["--no-per-base"]
else:
var use_d4: bool = false

if fragment_mode and fast_mode:
stderr.write_line("[mosdepth] error only one of --fast-mode and --fragment-mode can be specified")
quit(2)

if $args["--by"] != "nil":
region = $args["--by"]
else:
Expand Down Expand Up @@ -948,4 +964,5 @@ Other options:
check_chrom(chrom, bam.hdr.targets)

main(bam, chrom, mapq, min_len, max_len, eflag, iflag, region, thresholds,
fast_mode, args, use_median = use_median, use_d4 = use_d4)
fast_mode, args, use_median = use_median, fragment_mode = fragment_mode, use_d4 = use_d4)

Binary file added tests/full-fragment-pairs.bam
Binary file not shown.
Binary file added tests/full-fragment-pairs.bam.bai
Binary file not shown.

0 comments on commit b54ce45

Please sign in to comment.