#!/usr/bin/env python """ Collect and report statistics on job run times To use metrics (which provide more accurate information), see: https://github.com/galaxyproject/galaxy/blob/dev/config/job_metrics_conf.xml.sample If you do not have metrics enabled, use the `--source history` option to use the less accurate job_state_history table. Examples -------- # Stats for Nate's runs of the Bowtie 2 tool installed from the Tool Shed (all # versions): % ./runtime_stats.py -c galaxy.ini -u nate@bx.psu.edu 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/' # Stats for all runs of the Bowtie 2 tool installed from the Tool Shed (version # 0.4 only): % ./runtime_stats.py -c galaxy.ini 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/0.4' # Stats for all runs of the Bowtie 2 tool installed from the Tool Shed but we # don't feel like figuring out or typing the long ID (matches any tool with # '/bowtie2/' in its full ID): % ./runtime_stats.py -c galaxy.ini --like 'bowtie2' # Stats for all runs of Tophat 2 that took longer than 2 minutes but less than # 2 days: % ./runtime_stats.py -c galaxy.ini --like -m $((2 * 60)) -M $((2 * 24 * 60 * 60)) 'tophat2' """ import argparse import os import re import sys import numpy import psycopg2 from sqlalchemy.engine import url galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) sys.path.insert(1, os.path.join(galaxy_root, "lib")) import galaxy.config from galaxy.util.script import ( app_properties_from_args, populate_config_args, ) DATA_SOURCES = ("metrics", "history") METRICS_SQL = """ SELECT metric_value FROM job_metric_numeric jmn JOIN job j ON jmn.job_id = j.id WHERE j.state = 'ok' AND jmn.plugin = 'core' AND jmn.metric_name = 'runtime_seconds' {tool_clause} {user_clause} {time_clause} """ HISTORY_SQL = """ SELECT ctimes[1] - ctimes[2] AS delta FROM (SELECT jsh.job_id, array_agg(jsh.create_time ORDER BY jsh.create_time DESC) AS ctimes FROM job_state_history jsh JOIN job j ON jsh.job_id = j.id WHERE jsh.state IN ('running','ok') AND j.state = 'ok' {tool_clause} {user_clause} GROUP BY jsh.job_id) AS t_arrs {time_clause} """ def parse_arguments(): parser = argparse.ArgumentParser(description="Generate walltime statistics") parser.add_argument("tool_id", help="Tool (by ID) to collect stats about") parser.add_argument( "--like", action="store_true", default=False, help='Use SQL `LIKE` operator to find a shed-installed tool using the tool\'s "short" id', ) populate_config_args(parser) parser.add_argument("-d", "--debug", action="store_true", default=False, help="Print extra info") parser.add_argument("-m", "--min", type=int, default=-1, help="Ignore runtimes less than MIN seconds") parser.add_argument("-M", "--max", type=int, default=-1, help="Ignore runtimes greater than MAX seconds") parser.add_argument("-u", "--user", help="Return stats for only this user (id, email, or username)") parser.add_argument( "-s", "--source", default="metrics", help="Runtime data source (SOURCES: {})".format(", ".join(DATA_SOURCES)) ) args = parser.parse_args() if args.like and "/" in args.tool_id: print("ERROR: Do not use --like with a tool shed tool id (the tool id should not contain `/` characters)") sys.exit(2) args.source = args.source.lower() if args.source not in ("metrics", "history"): print("ERROR: Data source `{}` unknown, valid source are: {}".format(args.source, ", ".join(DATA_SOURCES))) app_properties = app_properties_from_args(args) config = galaxy.config.Configuration(**app_properties) uri = config.database_connection names = {"database": "dbname", "username": "user"} args.connect_args = url.make_url(uri).translate_connect_args(**names) if args.debug: print("Got options:") for key, value in vars(args).items(): print(f"{key}: {value}") return args def query( tool_id=None, user=None, like=None, source="metrics", connect_args=None, debug=False, min=-1, max=-1, **kwargs ): connect_arg_str = "" for k, v in connect_args.items(): connect_arg_str += f"{k}={v}" pc = psycopg2.connect(connect_arg_str) cur = pc.cursor() if user: try: user_id = int(user) except ValueError: if "@" not in user: field = "username" else: field = "email" sql = "SELECT id FROM galaxy_user WHERE {} = {}".format(field, "%s") cur.execute(sql, (user,)) if debug: print("Executed:") print(cur.query) row = cur.fetchone() if row: user_id = row[0] else: print(f"Invalid user: {user}") sys.exit(1) if like: query_tool_id = f"%/{tool_id}/%" elif "/" in tool_id and not re.match(r"\d+\.\d+", tool_id.split("/")[-1]): query_tool_id = f"{tool_id}%" like = True else: query_tool_id = tool_id sql_args = [query_tool_id] if like: tool_clause = "AND j.tool_id LIKE %s" else: tool_clause = "AND j.tool_id = %s" if user: user_clause = "AND j.user_id = %s" sql_args.append(user_id) else: user_clause = "" if source == "metrics": if min > 0 and max > 0: time_clause = """AND metric_value > %s AND metric_value < %s""" sql_args.append(min) sql_args.append(max) elif min > 0: time_clause = "AND metric_value > %s" sql_args.append(min) elif max > 0: time_clause = "AND metric_value < %s" sql_args.append(max) else: time_clause = "" sql = METRICS_SQL elif source == "history": if min > 0 and max > 0: time_clause = """WHERE ctimes[1] - ctimes[2] > interval %s AND ctimes[1] - ctimes[2] < interval %s""" sql_args.append(f"{min} seconds") sql_args.append(f"{max} seconds") elif min > 0: time_clause = "WHERE ctimes[1] - ctimes[2] > interval %s" sql_args.append(f"{min} seconds") elif max > 0: time_clause = "WHERE ctimes[1] - ctimes[2] < interval %s" sql_args.append(f"{max} seconds") else: time_clause = "" sql = HISTORY_SQL sql = sql.format(tool_clause=tool_clause, user_clause=user_clause, time_clause=time_clause) cur.execute(sql, sql_args) if debug: print("Executed:") print(cur.query) print(f"Query returned {cur.rowcount} rows") if source == "metrics": times = numpy.array([r[0] for r in cur if r[0]]) elif source == "history": times = numpy.array([r[0].total_seconds() for r in cur if r[0]]) print(f"Collected {times.size} times") if times.size == 0: return if user: print(f"Displaying statistics for user {user}") stats = ( ("Mean runtime", numpy.mean(times)), ("Standard deviation", numpy.std(times)), ("Minimum runtime", times.min()), ("Maximum runtime", times.max()), ) for name, seconds in stats: hours, minutes = nice_times(seconds) msg = name + f" is {seconds:0.0f} seconds" if minutes: msg += f" (={minutes:0.2f} minutes)" if hours: msg += f" (={hours:0.2f} hours)" print(msg) def nice_times(seconds): if seconds < 60 * 60: hours = None if seconds < 60: minutes = None else: minutes = seconds / 60 else: minutes = seconds / 60 hours = seconds / 60 / 60 return hours, minutes def main(): args = parse_arguments() query(**vars(args)) if __name__ == "__main__": main()