#!/usr/bin/env python
# Guruprasad Ananda
# Refactored 2011 to use numpy instead of rpy, Kanwei Li
"""
This tool provides the SQL "group by" functionality.
Arguments:
    1 output file name
    2 input file name
    3 grouping column
    4 ignore case (1/0)
    5 ascii codes to delete (comma separated list, or "None"): input lines
      starting with one of these characters are dropped
    6... op,col,do_round,default
"""

from __future__ import print_function

import random
import subprocess
import sys
import tempfile
from itertools import groupby

import numpy


def float_wdefault(s, d, c):
    """
    Convert the strings in list s to floats, in place, and return s.

    s: list of strings; it is modified element by element
    d: replacement for entries that cannot be parsed as float; when d is
       None such an entry is reported through stop_err instead
    c: column number, only used in the error message
    """
    for idx, raw in enumerate(s):
        try:
            s[idx] = float(raw)
        except ValueError:
            if d is None:
                # no default configured: report the offending value and column
                stop_err("non float value '%s' found in colum %d" % (s[idx], c))
            else:
                s[idx] = d
    return s


def stop_err(msg):
    """Write msg to stderr (unchanged, no newline added) and exit with status 1."""
    sys.stderr.write(msg)
    raise SystemExit(1)


def mode(data):
    """
    Return the most frequent value(s) of data as a comma separated string.

    All values that share the highest count are included, each converted
    with str(), in the order in which they were first seen in data.
    """
    tally = {}
    for item in data:
        tally[item] = tally.get(item, 0) + 1
    highest = max(tally.values())
    return ",".join(str(item) for item, seen in tally.items() if seen == highest)


def main():
    """
    Group a tab-delimited file by one column and aggregate other columns.

    The configuration is read from sys.argv (see the module docstring):
    output file, input file, 1-based grouping column, ignore-case flag,
    ASCII codes of line-leading characters to drop, and any number of
    "op,col,do_round,default" specifications.

    The input is sorted on the grouping column with the external "sort"
    command, consecutive lines sharing a key are grouped, and one line per
    group is written to the output file: the key followed by one
    tab-separated value per operation. A summary of the requested
    operations is printed to stdout.

    For the error conditions it detects, the function writes a message to
    stderr and exits with status 1 through stop_err.
    """
    inputfile = sys.argv[2]
    ignorecase = int(sys.argv[4])
    ops = []
    cols = []
    round_val = []
    default_val = []

    # remove comment lines: every line whose first character is one of the
    # given ASCII codes is left out of a cleaned copy of the input
    if sys.argv[5] != "None":
        asciitodelete = sys.argv[5]
        if asciitodelete:
            newinputfile = "input_cleaned.tsv"
            with open(inputfile) as oldfile, open(newinputfile, "w") as newfile:
                asciitodelete = {chr(int(_)) for _ in asciitodelete.split(",")}
                for line in oldfile:
                    if line[0] not in asciitodelete:
                        newfile.write(line)
            inputfile = newinputfile

    # get operations and options in separate arrays
    for var in sys.argv[6:]:
        op, col, do_round, default = var.split(",")
        ops.append(op)
        cols.append(col)
        round_val.append(do_round)
        default_val.append(float(default) if default != "" else None)

    # At this point, ops, cols and rounds will look something like this:
    # ops:  ['mean', 'min', 'c']
    # cols: ['1', '3', '4']
    # round_val: ['no', 'yes', 'no']
    # default_val: [0, 1, None]

    try:
        group_col = int(sys.argv[3]) - 1
    except (IndexError, ValueError):
        stop_err("Group column not specified.")

    # cXX from galaxy is 1-based; convert once instead of once per line
    col_indexes = [int(col) - 1 for col in cols]

    def group_key(line):
        """Return the grouping value of a line (lower-cased if ignorecase)."""
        try:
            item = line.rstrip("\r\n").split("\t")[group_col]
        except IndexError:
            stop_err("The following line didn't have %s columns: %s" % (group_col + 1, line))
        if ignorecase == 1:
            return item.lower()
        return item

    # sort file into a temporary file; the context manager closes (and
    # thereby removes) it on every exit path, including stop_err
    with tempfile.NamedTemporaryFile(mode="r") as tmpfile:
        # The -k option for the Posix sort command is as follows:
        # -k, --key=POS1[,POS2]
        # start a key at POS1, end it at POS2 (origin 1)
        # In other words, column positions start at 1 rather than 0, so
        # we need to add 1 to group_col.
        # If POS2 is not specified, the newer versions of sort will consider
        # the entire line for sorting. To prevent this, we set POS2=POS1.
        group_col_str = str(group_col + 1)
        command_line = ["sort", "-t", "\t", "-k%s,%s" % (group_col_str, group_col_str)]
        if ignorecase == 1:
            # options go before the file operand so that sort implementations
            # which do not permute their arguments still honour the flag
            command_line.append("-f")
        command_line += ["-o", tmpfile.name, inputfile]

        try:
            subprocess.check_output(command_line, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            stop_err("Sorting input dataset resulted in error: %s: %s" % (e.returncode, e.output.decode()))

        with open(sys.argv[1], "w") as fout:
            # groupby only merges adjacent lines, hence the sort above
            for key, line_list in groupby(tmpfile, key=group_key):
                op_vals = [[] for _ in ops]
                out_str = key

                # collect, per operation, the raw values of its column
                for line in line_list:
                    fields = line.split("\t")
                    for i, col in enumerate(col_indexes):
                        try:
                            op_vals[i].append(fields[col].strip())
                        except IndexError:
                            stop_err(
                                'Could not access the value for column %s on line: "%s". Make sure file is tab-delimited.\n'
                                % (col + 1, line)
                            )

                # Generate string for each op for this group
                for i, op in enumerate(ops):
                    data = op_vals[i]
                    rval = ""
                    if op == "mode":
                        rval = mode(data)
                    elif op == "length":
                        rval = len(data)
                    elif op == "random":
                        rval = random.choice(data)
                    elif op in ["cat", "cat_uniq"]:
                        if op == "cat_uniq":
                            data = numpy.unique(data)
                        rval = ",".join(data)
                    elif op == "unique":
                        rval = len(numpy.unique(data))
                    else:
                        # some kind of numpy fn
                        try:
                            # report the column of *this* operation, not the
                            # one left over from the line loop above
                            data = float_wdefault(data, default_val[i], col_indexes[i] + 1)
                        except ValueError:
                            stop_err("Operation %s expected number values but got %s instead.\n" % (op, data))
                        # NOTE(review): op comes straight from the command line
                        # and is looked up as an arbitrary numpy attribute
                        rval = getattr(numpy, op)(data)
                        if round_val[i] == "yes":
                            rval = int(round(rval))
                        else:
                            rval = "%g" % rval
                    out_str += "\t%s" % rval

                fout.write(out_str + "\n")

    # Generate a useful info message.
    display_names = {
        "cat": "concat",
        "cat_uniq": "concat_distinct",
        "length": "count",
        "unique": "count_distinct",
        "random": "randomly_pick",
    }
    msg = "--Group by c%d: " % (group_col + 1)
    for op, col in zip(ops, cols):
        msg += "%s[c%s] " % (display_names.get(op, op), col)

    print(msg)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()