# edIted

#!/usr/bin/env python3

''' edIted: statistical comparison of RNA editing '''

import gzip
import logging
import re
from argparse import ArgumentParser, FileType
from bisect import bisect
from collections import Counter, defaultdict as dd
from functools import total_ordering
from itertools import chain
from math import ceil, log10, pow
from signal import signal, SIGPIPE, SIG_DFL
from sys import stdout
from typing import Any, Iterator, Tuple

import numpy as np
from scipy.stats import dirichlet


__author__ = 'George R Young'
__maintainer__ = 'George R Young'
__email__ = 'george.young@crick.ac.uk'
__version__ = '0.9'
__stamp__ = 'GY200916'
__status__ = 'Development'
__license__ = 'MIT license'


# Numerical edge cases (division by zero, overflow, underflow, invalid
# operations) are expected in the likelihood calculations; keep numpy quiet
np.seterr(all='ignore')
# Restore the default SIGPIPE action so that a downstream reader closing the
# pipe (e.g. `| head`) ends the program quietly
signal(SIGPIPE, SIG_DFL)


# Globals #####################################################################


# Complement of each nucleotide
rc = dict(zip('ACGT', 'TGCA'))
# The three alternative nucleotides for each nucleotide, in alphabetical order
other_nts = {nt: 'ACGT'.replace(nt, '') for nt in 'ACGT'}
# Run of decimal digits; used to read the length that follows a `+`/`-` indel
# marker in the mpileup read-bases column
indel_re = re.compile('[0-9]+')


# Classes #####################################################################


class Empty:
    ''' Attribute-less placeholder type. An instance can have its `__class__`
    reassigned, which yields an object of another class without that class's
    `__init__` ever having run '''


@total_ordering
class Pileup(object):
    ''' A class for holding per-base stranded mpileup data

    One instance holds one tab-separated mpileup line. The chromosome,
    position and reference base are read when the instance is created; the
    per-strand base counts (`base_counts`) and per-read base probabilities
    (`base_probs`) are only built, by `_parse()`, the first time one of them
    is accessed. Both are dicts keyed by strand (True for '+', False for '-')
    and then by nucleotide; bases read on the '-' strand are stored as their
    complement.

    Instances are equal/ordered by `(chrom, pos)`, and instances at the same
    position can be added together (or passed to `sum()`) to pool them '''

    # Attributes that `_parse()` creates on first access
    _lazy_attributes = ('base_counts', 'base_probs')

    def __init__(self, mpileup: str) -> None:
        ''' Split a raw mpileup line and record chromosome (column 1),
        position (column 2) and reference base (column 3) '''
        self.mpileup = mpileup.strip().split('\t')
        self.chrom = self.mpileup[0]
        self.pos = int(self.mpileup[1])
        self.ref = self.mpileup[2]

    def _parse(self) -> None:
        ''' Build `base_counts` and `base_probs` from the read-bases (column
        5), base-quality (column 6) and strand (column `args.ts`) fields.

        Raises ValueError if the strand column cannot be found or does not
        start with '+' or '-', or if the read-bases column contains a symbol
        this parser does not know '''
        # Work on locals and publish them only once parsing has succeeded, so
        # a failed parse never leaves half-filled tables on the instance
        counts = {s: dict.fromkeys('ACGT', 0) for s in (True, False)}
        probs = {s: {b: [] for b in 'ACGT'} for s in (True, False)}
        mapping = self.mpileup[4]
        quals = self.mpileup[5]
        ts_error = 'check the position of the `TS` information in the mpileup'
        try:
            strand = self.mpileup[args.ts - 1].split(',')
        except IndexError:
            raise ValueError(ts_error) from None
        if strand[0] not in '+-':
            raise ValueError(ts_error)
        strand = [s == '+' for s in strand]
        # i walks the read-bases string; j indexes the per-read quality and
        # strand entries, which only advance when a read symbol is consumed
        i, j = 0, 0
        while i < len(mapping):
            symbol = mapping[i]
            if symbol in '.,ACGTacgt':
                # `.`/`,` stand for the reference base
                b = self.ref if symbol in '.,' else symbol.upper()
                if (s := strand[j]) is False:
                    b = rc[b]
                # NOTE(review): the quality character's code point is used
                # directly as the Phred score, with no ASCII offset removed.
                # If the input is Phred+33 encoded this understates the error
                # probability - confirm before changing, as it alters results
                p = pow(10, -ord(quals[j]) / 10)
                counts[s][b] += 1
                probs[s][b].append(1 - p)
                # spread the error probability evenly over the other bases
                for a in other_nts[b]:
                    probs[s][a].append(p / 3)
                i += 1
                j += 1
            elif symbol in '<>*#Nn':
                # occupies a quality/strand slot but is not counted
                i += 1
                j += 1
            elif symbol == '^':
                # read-start marker plus the one character that follows it
                i += 2
            elif symbol == '$':
                # read-end marker
                i += 1
            elif symbol in '-+':
                # indel: sign, length in digits, then that many characters
                result = indel_re.match(mapping, i + 1)
                if result is None:
                    raise ValueError(
                        f'malformed indel at {self.chrom}:{self.pos}')
                i = result.end() + int(result.group())
            else:
                # without this branch an unknown symbol would never advance
                # `i` and the loop would spin forever
                raise ValueError(
                    f'unexpected symbol {symbol!r} in the mpileup read bases '
                    f'at {self.chrom}:{self.pos}')
        self.base_counts = counts
        self.base_probs = probs

    def _is_valid_operand(self, other: object) -> bool:
        ''' True if `other` is exactly the same type and has a location '''
        return type(self) is type(other) and \
            hasattr(other, 'chrom') and hasattr(other, 'pos')

    def __eq__(self, other: object) -> bool:
        ''' Equal when chromosome and position match '''
        if not self._is_valid_operand(other):
            return NotImplemented
        return (self.chrom, self.pos) == (other.chrom, other.pos)

    def __lt__(self, other: object) -> bool:
        ''' Order by chromosome name, then position '''
        if not self._is_valid_operand(other):
            return NotImplemented
        return (self.chrom, self.pos) < (other.chrom, other.pos)

    def __getitem__(self, ktuple: Tuple[str, bool]) -> int:
        ''' `pileup[base, strand]` -> count of `base` on `strand` '''
        base, strand = ktuple
        return self.base_counts[strand][base]

    def __getattr__(self, name: str) -> Any:
        ''' Called only when normal attribute lookup fails. Parses the raw
        line on first access to one of the lazily-built attributes; any other
        missing name raises AttributeError.

        Parsing is attempted only when the raw line is present: instances
        produced by `__add__` have none, and looking it up here would
        otherwise re-enter this method without end '''
        if name in self._lazy_attributes and 'mpileup' in self.__dict__:
            self._parse()
            return self.__dict__[name]
        raise AttributeError(f'{name} is not a Pileup attribute')

    def __add__(self, other: object) -> object:
        ''' Pool two pileups of the same position: counts are summed and the
        per-read probability lists concatenated. Raises ValueError if the two
        do not refer to the same position '''
        if not self == other:
            raise ValueError('Pileup objects do not refer to the same position')
        cls = type(self)
        obj = cls.__new__(cls)  # bypass __init__: there is no line to split
        obj.chrom, obj.pos, obj.ref = self.chrom, self.pos, self.ref
        obj.base_counts = {True: {}, False: {}}
        obj.base_probs = {True: {}, False: {}}
        for strand in (True, False):
            for base in 'ACGT':
                obj.base_counts[strand][base] = \
                    self[base, strand] + other[base, strand]
                obj.base_probs[strand][base] = \
                    self.base_probs[strand][base] + \
                    other.base_probs[strand][base]
        return obj

    def __radd__(self, other: object) -> object:
        ''' Support `sum()`, whose initial value is the integer 0 '''
        if other == 0:
            return self
        return self.__add__(other)

    def depth(self, strand=None) -> int:
        ''' Number of counted (A/C/G/T) bases on `strand`, or on both strands
        when `strand` is None '''
        if strand is not None:
            return sum(self.base_counts[strand].values())
        return sum(sum(per_strand.values())
                   for per_strand in self.base_counts.values())

    def is_potential_edit(self, a: str, strand: bool) -> bool:
        ''' True if the count of base `a` on `strand` is at least
        max(ceil(depth * args.min_edited), args.min_alt_depth), is below
        ceil(depth * args.max_edited), and the depth on `strand` is at least
        2.5% (rounded up) of the depth on the opposite strand '''
        d = self.depth(strand)
        return max(ceil(d * args.min_edited), args.min_alt_depth) \
            <= self[a, strand] \
            < ceil(d * args.max_edited) \
            and d >= ceil(self.depth(not strand) * 0.025)

    def a(self, strand: bool, noise: float) -> np.ndarray:
        ''' unscaled alpha values for scipy.stats.dirichlet

        For each base (A, C, G, T order): the mean of its per-read
        probabilities on `strand` (0.0 if there are none), plus `noise / 3`
        for every base other than the reference base as seen on `strand` '''
        b = self.ref if strand else rc[self.ref]
        return np.array([
            (0.0 if k == b else noise / 3) + (sum(v) / len(v) if v else 0.0)
            for k, v in self.base_probs[strand].items()])

    def x(self, strand: bool) -> np.ndarray:
        ''' quantile values for scipy.stats.dirichlet

        For each base (A, C, G, T order): the mean of its per-read
        probabilities on `strand`, or 0.0 if there are none '''
        return np.array([
            sum(v) / len(v) if v else 0.0
            for v in self.base_probs[strand].values()])

    def __str__(self) -> str:
        ''' Tab-separated chrom, pos, ref, then `plus:minus` counts for A, C,
        G and T '''
        return f'{self.chrom}\t{self.pos}\t{self.ref}\t' + \
            f'{self.base_counts[True]["A"]}:{self.base_counts[False]["A"]}\t' + \
            f'{self.base_counts[True]["C"]}:{self.base_counts[False]["C"]}\t' + \
            f'{self.base_counts[True]["G"]}:{self.base_counts[False]["G"]}\t' + \
            f'{self.base_counts[True]["T"]}:{self.base_counts[False]["T"]}'


class Blacklist(object):
    ''' A class for holding blacklisted regions

    Regions are stored per chromosome as a sorted list of `(start, stop)`
    tuples with both ends inclusive; a region that overlaps stored ones is
    merged with them as it is added. Membership is tested with
    `(chrom, pos) in blacklist` '''

    def __init__(self) -> None:
        self.blacklist = dd(list)

    def update(self, chrom: str, start: int, stop: int) -> None:
        ''' Add the inclusive region `start`..`stop` on `chrom`, merging it
        with any stored regions it overlaps '''
        regions = self.blacklist[chrom]
        left = right = bisect(regions, (start, stop))
        if right != len(regions):
            if start == regions[right][0]:
                # the next region starts at the same place and sorts after
                # (start, stop), so it ends later: completely contained
                return
            # stretch rightwards to cover overlapping regions
            while right != len(regions) and stop >= regions[right][0]:
                stop = max(stop, regions[right][1])
                right += 1
        if left > 0 and start <= regions[left - 1][1]:
            # stretch leftwards to cover the overlapping region
            left -= 1
            start = regions[left][0]
            stop = max(stop, regions[left][1])
        if left == right:
            regions.insert(left, (start, stop))
        else:
            # replace every absorbed region with the merged one
            regions[left:right] = [(start, stop)]

    def __contains__(self, key) -> bool:
        ''' `(chrom, pos) in blacklist` -> True if `pos` lies inside a stored
        region on `chrom` '''
        chrom, pos = key
        # .get() avoids creating an empty entry for every chromosome queried
        regions = self.blacklist.get(chrom)
        if not regions:
            return False
        loc = bisect(regions, (pos, pos))
        # Region starting at or before `pos`. The `loc > 0` guard matters:
        # index -1 would silently wrap around to the last region
        if loc > 0 and regions[loc - 1][1] >= pos:
            return True
        # Region starting exactly at `pos` but sorting after (pos, pos)
        return loc < len(regions) and regions[loc][0] == pos

    def __len__(self) -> int:
        ''' Total number of stored regions over all chromosomes '''
        return sum(len(v) for v in self.blacklist.values())


# Functions ###################################################################


def shared_elements(*inputs: Iterator) -> Iterator[list]:
    ''' Walk several sorted iterables in step and yield a list (one item per
    iterable, in argument order) each time every iterable is positioned on an
    equal item. Stops as soon as any iterable runs out '''
    exhausted = object()
    streams = [chain(source, (exhausted,)) for source in inputs]
    heads = [next(stream) for stream in streams]
    while True:
        if any(head is exhausted for head in heads):
            return
        lowest = min(heads)
        if all(head == lowest for head in heads):
            # every stream agrees: emit the group and move them all on
            yield heads
            heads = [next(stream) for stream in streams]
            continue
        # otherwise only the streams lagging at the lowest item catch up
        heads = [next(stream) if head == lowest else head
                 for stream, head in zip(streams, heads)]


def align_elements(*inputs: Iterator, missing: Any=None) -> Iterator[tuple]:
    ''' Walk several sorted iterables in step and yield one tuple per distinct
    item, lowest first. Each tuple has one slot per iterable (in argument
    order) holding that iterable's item if it equals the current lowest item,
    and `missing` otherwise. Runs until every iterable is used up '''
    exhausted = object()
    streams = [chain(source, (exhausted,)) for source in inputs]
    heads = [next(stream) for stream in streams]
    while any(head is not exhausted for head in heads):
        lowest = min(head for head in heads if head is not exhausted)
        yield tuple(head if head == lowest else missing for head in heads)
        # only the streams that supplied the lowest item move on
        heads = [next(stream) if head == lowest else head
                 for stream, head in zip(streams, heads)]


def detect(tests: Tuple[Pileup], ctrls: Tuple[Pileup]=tuple()) -> bool:
    ''' Conduct comparisons of Dirichlet models of base frequency to determine
    if a site is edited or, with control data, differentially edited

    `tests` holds the test pileups for one position (one per replicate that
    covers it) and `ctrls` the control pileups, if any. Reads the module-level
    `args`, `ref`, `_ref`, `alt` and `background_probs`.

    When the site passes, one tab-separated line (chrom, 0-based start, end,
    name, score, strand and per-replicate `alt_depth` pairs) is printed to
    `args.output` and True is returned; in every other case False is
    returned and nothing is printed '''
    # The reference base decides which strand could carry the edit: the '+'
    # strand where the reference is `ref`, the '-' strand where it is the
    # complement of `ref`
    site_ref = tests[0].ref
    if site_ref == ref:
        strand = True
    elif site_ref == _ref:
        strand = False
    else:
        return False
    # every replicate must be deep enough on that strand
    if not all(t.depth(strand) >= args.min_depth for t in tests):
        return False
    test = sum(tests)
    if not test.is_potential_edit(alt, strand):
        return False
    # number of replicates showing at least one edited base
    n_edited = 1 if len(tests) == 1 else \
        sum(t[alt, strand] > 0 for t in tests)
    if n_edited < args.reps:
        return False
    # if we're doing a differential assessment, check the control data
    if ctrls:
        ctrl = sum(ctrls)
        try:
            if ctrl.depth(strand) < args.min_depth or \
                    (test[alt, strand] / test.depth(strand)) / \
                    (ctrl[alt, strand] / ctrl.depth(strand)) < args.min_fold:
                return False
        except ZeroDivisionError:
            # e.g. no edited bases in the control, so the fold difference is
            # undefined; let the statistical test decide
            pass
    try:
        # determine if the base displays significant editing
        scale = test.depth(strand)
        test_x = test.x(strand)
        test_a = test.a(strand, args.detect_noise) * scale
        Z = log10(
            dirichlet.pdf(test_x, test_a)
            / dirichlet.pdf(test_x, background_probs[ref] * scale))
        # down-weight by the fraction of replicates that show the edit
        Z *= n_edited / len(tests)
        if Z > args.z_score:
            if ctrls:
                # determine if the editing is differential
                scale = min(test.depth(strand), ctrl.depth(strand))
                test_a = test.a(strand, args.differential_noise) * scale
                ctrl_x = ctrl.x(strand)
                ctrl_a = ctrl.a(strand, args.differential_noise) * scale
                Z = log10(
                    (dirichlet.pdf(test_x, test_a) * dirichlet.pdf(ctrl_x, ctrl_a))
                    / (dirichlet.pdf(ctrl_x, test_a) * dirichlet.pdf(test_x, ctrl_a)))
                Z *= n_edited / len(tests)
                if Z > args.z_score:
                    print(
                        f'{test.chrom}\t'
                        f'{test.pos - 1}\t'  # half-open 0-based ... [0,1)
                        f'{test.pos}\t'
                        f'{ref}{alt}'
                        f'_{test[alt, strand]}_{test.depth(strand)}'
                        f'_{ctrl[alt, strand]}_{ctrl.depth(strand)}\t'
                        f'{Z:.3f}\t'
                        f'{"+" if strand else "-"}\t'
                        f'''{",".join(f'{t[alt, strand]}_{t.depth(strand)}' for t in tests)}\t'''
                        f'''{",".join(f'{c[alt, strand]}_{c.depth(strand)}' for c in ctrls)}''',
                        file=args.output, flush=True)
                    return True
            else:
                print(
                    f'{test.chrom}\t'
                    f'{test.pos - 1}\t'  # half-open 0-based ... [0,1)
                    f'{test.pos}\t'
                    f'{ref}{alt}_{test[alt, strand]}_{test.depth(strand)}\t'
                    f'{Z:.3f}\t'
                    f'{"+" if strand else "-"}\t'
                    f'''{",".join(f'{t[alt, strand]}_{t.depth(strand)}' for t in tests)}''',
                    file=args.output, flush=True)
                return True
    except ValueError:
        # Swallow occasional numpy math errors (e.g. log10 of a non-positive
        # ratio); the site is simply not reported
        pass
    return False


###############################################################################


if __name__ == '__main__':

    # Command-line interface ##################################################

    parser = ArgumentParser(
        prog='edIted',
        description='edIted: statistical comparison of RNA editing',
        epilog=f'%(prog)s v{__version__}_{__stamp__} ({__status__})')
    add_option = parser.add_argument

    # inputs and output
    add_option(
        '-t', '--test', required=True, nargs='+',
        help='test/treated mpileup file(s), with multiple files treated as \
        biological replicates')
    add_option(
        '-c', '--control', nargs='+',
        help='optional control/untreated mpileup file(s), which will make \
        edIted run a differential editing analysis')
    add_option(
        '-R', '--region', default='', type=str,
        help='only run edIted on a specific sequence (e.g. "chr1")')
    add_option(
        '-o', '--output', default=stdout, type=FileType('w'),
        help='write output to this file instead of piping to stdout')

    # what to look for
    add_option(
        '-e', '--edit', default='AG', type=str,
        help='search for this type of editing (default %(default)s)')
    add_option(
        '-r', '--reps', default=1, type=int,
        help='where replicates are given, the minimum number in which editing \
        must be found (default %(default)s)')

    # statistical model
    add_option(
        '--detect_noise', default=0.001, type=float,
        help='fixed error probability to spread between alt bases for \
        modelling sequencing-independent error sources when detecting editing \
        sites (default %(default)s)')
    add_option(
        '--differential_noise', default=0.01, type=float,
        help='fixed error probability to spread between alt bases for \
        modelling sequencing-independent error sources when determining \
        differential editing (default %(default)s)')
    add_option(
        '--z_score', default=2.58, type=float,
        help='threshold Z score for discriminating the goodness of fit of \
        Dirichlet models of nucleotide frequencies (default %(default)s)')

    # site filters
    add_option(
        '--min_fold', default=2.0, type=float,
        help='minimum fold difference required for differential editing to be \
        reported at a base (default %(default)s)')
    add_option(
        '--min_depth', default=5, type=int,
        help='required coverage (in each sample if biological replicates are \
        provided) for a base to be considered (default %(default)s)')
    add_option(
        '--min_alt_depth', default=2, type=int,
        help='minimum alt count for editing to be considered (see also \
        `--min_edited`) (default %(default)s)')
    add_option(
        '--min_edited', default=0.01, type=float,
        help='minimum proportion (0 - <1) of bases at a site that must \
        display editing (see also `--min_alt_depth`) (default %(default)s)')
    add_option(
        '--max_edited', default=0.9, type=float,
        help='filter away edits with a frequency (>0 - 1) exceeding this \
        value to prevent inclusion of SNPs that are not passed through \
        `--blacklist` (default %(default)s)')

    # input layout, exclusions and verbosity
    add_option(
        '--ts', default=7, type=int,
        help='column in the mpileup inputs in which the TS (originating \
        transcript strand) data can be found (default %(default)s)')
    add_option(
        '--blacklist', nargs='+',
        help='file(s) containing regions (BED format) to exclude from \
        analyses, e.g. splice sites, ENCODE blacklisted regions, etc')
    add_option(
        '-q', '--quiet', action='store_true',
        help='silence progress reporting')

    args = parser.parse_args()

    # Sanity-check option values that argparse's type conversion alone
    # cannot vet
    if not 0.0 <= args.min_edited < 1.0:
        raise ValueError('--min_edited must be >=0.0 & <1.0')
    if not 0.0 < args.max_edited <= 1.0:
        # the message must name the option actually being checked
        raise ValueError('--max_edited must be >0.0 & <=1.0')
    if not 0 < args.reps <= len(args.test):
        raise ValueError('--reps must be >= 1 & <= number of test replicates')
    args.edit = args.edit.upper()
    if len(args.edit) != 2:
        raise ValueError('--edit must be passed two nucleotides, e.g. "AG"')
    if not all(c in 'ACGT' for c in args.edit):
        raise ValueError('--edit nucleotides must be A, C, G, or T')
    # `ref` -> `alt` is the edit on the transcribed strand; `_ref` is how the
    # unedited base reads in the reference when the transcript is on '-'
    ref, alt = args.edit
    _ref = rc[ref]
    ref_bases = ref + _ref
    if args.region:
        # prefix identifying mpileup lines of the requested sequence; the tab
        # stops e.g. "chr1" from also matching "chr10"
        line_start = args.region + '\t'

    # Setup logging system ####################################################

    # Timestamped, tab-separated progress messages on a stream handler;
    # `--quiet` raises the threshold so that only errors are shown
    logstream = logging.StreamHandler()
    logstream.setFormatter(logging.Formatter(
        fmt='%(asctime)s\t%(message)s', datefmt='%y%m%d %H:%M:%S'))
    logging.basicConfig(
        handlers=[logstream],
        level=logging.ERROR if args.quiet else logging.INFO)
    logger = logging.getLogger()
    logger.info(f'edIted v{__version__}_{__stamp__} ({__status__})')

    ###########################################################################

    if args.blacklist:
        logger.info('Preparing blacklist')
        B = Blacklist()
        for f in args.blacklist:
            logger.info(f'. {f}')
            regions = 0
            # `with` closes the file once it has been read
            with gzip.open(f, 'rt') as bed:
                for line in bed:
                    fields = line.strip().split('\t')
                    # skip blank lines, comments and browser/track header
                    # lines, none of which describe a region
                    if not fields[0] or fields[0].startswith('#') \
                            or fields[0].split()[0] in ('track', 'browser'):
                        continue
                    chrom, start, stop, *_ = fields
                    if args.region and chrom != args.region:
                        continue
                    # BED is 0-based half-open; store 1-based inclusive
                    B.update(chrom, int(start) + 1, int(stop))
                    regions += 1
            logger.info(f'.. {regions} regions')
        logger.info(f'{len(B)} unique blacklisted regions prepared')

    ###########################################################################

    logger.info('Determining background error rates')
    # alt_freq[r][b]: number of times base b was observed where the
    # strand-oriented reference base is r
    alt_freq = {k: Counter(dict.fromkeys('ACGT', 0)) for k in 'ACGT'}
    aligned_pileups = shared_elements(
        *((Pileup(line) for line in gzip.open(f, 'rt')) for f in args.test))
    # sample at most the first million positions present in every test file
    for _, replicates in zip(range(1_000_000), aligned_pileups):
        pooled = sum(replicates)
        if pooled.ref == 'N':
            continue
        alt_freq[pooled.ref].update(pooled.base_counts[True])
        alt_freq[rc[pooled.ref]].update(pooled.base_counts[False])
    # turn the counts into per-base frequencies (A, C, G, T order), adding a
    # share of the fixed detection noise to every non-reference base
    background_probs = {}
    for base in 'ACGT':
        total = sum(alt_freq[base].values())
        background_probs[base] = np.array([
            (v / total) + (0.0 if k == base else args.detect_noise / 3)
            for k, v in alt_freq[base].items()])
    del aligned_pileups, alt_freq, base, total

    ###########################################################################

    significant_bases = 0

    def read_pileups(path):
        ''' Yield a Pileup for each line of the gzipped mpileup file `path`,
        keeping only lines of the `--region` sequence when one was given. The
        file is closed once the generator is exhausted or discarded '''
        with gzip.open(path, 'rt') as handle:
            for line in handle:
                # `line_start` only exists when --region was given
                if not args.region or line.startswith(line_start):
                    yield Pileup(line)

    n_tests = len(args.test)

    if args.control:
        # Differential analysis pipeline

        logger.info('Scanning for strand-specific differential base modification')
        print('track name=edIted description="Stranded differential RNA modifications" useScore=1',
              file=args.output, flush=True)

        if n_tests == 1 == len(args.control):
            # one test vs one control: only positions present in both
            aligned_pileups = shared_elements(
                read_pileups(args.test[0]), read_pileups(args.control[0]))
        else:
            # replicates: every position, with None where a file has no data
            aligned_pileups = align_elements(
                *(read_pileups(f) for f in args.test),
                *(read_pileups(f) for f in args.control))
        for base_pileup in aligned_pileups:
            test_pileup = \
                tuple(i for i in base_pileup[:n_tests] if i is not None)
            if len(test_pileup) < args.reps \
                    or test_pileup[0].ref not in ref_bases \
                    or (args.blacklist
                        and (test_pileup[0].chrom, test_pileup[0].pos) in B):
                continue
            ctrl_pileups = \
                tuple(i for i in base_pileup[n_tests:] if i is not None)
            if ctrl_pileups and detect(test_pileup, ctrl_pileups):
                significant_bases += 1

    else:
        # Detect modifications only

        logger.info('Scanning for strand-specific base modifications')
        print('track name=edIted description="Stranded RNA modifications" useScore=1',
            file=args.output, flush=True)

        if n_tests == 1:
            # a single file needs no alignment: wrap each pileup in a 1-tuple
            aligned_pileups = \
                ((pileup,) for pileup in read_pileups(args.test[0]))
        else:
            aligned_pileups = align_elements(
                *(read_pileups(f) for f in args.test))
        for base_pileup in aligned_pileups:
            base_pileup = \
                tuple(i for i in base_pileup[:n_tests] if i is not None)
            if len(base_pileup) >= args.reps \
                    and base_pileup[0].ref in ref_bases \
                    and (not args.blacklist
                         or (base_pileup[0].chrom, base_pileup[0].pos) not in B) \
                    and detect(base_pileup):
                significant_bases += 1

    ###########################################################################

    logger.info(f'{significant_bases} significant bases found')
    logger.info('Finished')