diff --git a/misc/regression/cpuusage.py b/misc/regression/cpuusage.py new file mode 100644 index 000000000..b30ed708c --- /dev/null +++ b/misc/regression/cpuusage.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +# +# +# Copyright 2016, NICTA +# +# This software may be distributed and modified according to the terms of +# the BSD 2-Clause license. Note that NO WARRANTY is provided. +# See "LICENSE_BSD2.txt" for details. +# +# @TAG(NICTA_BSD) +# + +''' +Monitors the total CPU usage of a process and its children. Usage is similar +to the UNIX `time` utility. + +NB: In order to get up-to-date information, we don't use interfaces such + as os.times, which only includes terminated and waited-for children. + Instead, we poll the process tree regularly. This means that when a + child process dies, its CPU time since the last poll is lost and not + included in the total time. + + Hence the total reported time will be an underestimate of the true + CPU usage, especially for short-lived child processes. +''' + +from __future__ import print_function +import os +import psutil +import signal +import subprocess +import sys +import threading +import time +import warnings + +try: + import psutil + if not hasattr(psutil.Process, "children") and hasattr(psutil.Process, "get_children"): + psutil.Process.children = psutil.Process.get_children + if not hasattr(psutil.Process, "memory_maps") and hasattr(psutil.Process, "get_memory_maps"): + psutil.Process.memory_maps = psutil.Process.get_memory_maps + +except ImportError: + print("Error: 'psutil' module not available. Run\n" + "\n" + " pip install --user psutil\n" + "\n" + "to install.", file=sys.stderr) + sys.exit(1) + +class Poller(threading.Thread): + '''Subclass of threading.Thread that monitors CPU usage of another process. + Use run() to start the process. + Use cpu_usage() to retrieve the latest estimate of CPU usage.''' + def __init__(self, pid): + super(Poller, self).__init__() + # Daemonise ourselves to avoid delaying exit of the process of our + # calling thread. + self.daemon = True + self.pid = pid + self.finished = False + self.started = threading.Semaphore(0) + self.proc = None + + # Reported stat. + self.cpu = 0.0 + + # Remember CPU times of recently seen children. + # This is to prevent double-counting for child processes. + self.current_children = {} # {(pid, create_time): CPU time} + # CPU time of dead children is recorded here. + self.old_children_cpu = 0.0 + + def run(self): + def update(): + total = 0.0 + + # Fetch process's usage. + try: + if self.proc is None: + self.proc = psutil.Process(self.pid) + total += sum(self.proc.cpu_times()) + + # Fetch children's usage. + new_current_children = {} + for c in self.proc.children(recursive=True): + try: + t = sum(c.cpu_times()) + new_current_children[(c.pid, c.create_time())] = t + total += t + except psutil.NoSuchProcess: + pass + except psutil.AccessDenied: + pass + + # For children that are no longer running, remember their + # most recently recorded CPU time. + reaped_cpu = 0.0 + for c_id, c_t in self.current_children.iteritems(): + if c_id not in new_current_children: + reaped_cpu += c_t + self.old_children_cpu += reaped_cpu + self.current_children = new_current_children + total += self.old_children_cpu + + except psutil.AccessDenied as err: + warnings.warn("access denied: pid=%d" % err.pid, RuntimeWarning) + + if total < self.cpu: + try: + cmd = repr(' '.join(self.proc.cmdline())) + except Exception: + cmd = '??' + warnings.warn("cpu non-monotonic: %.15f -> %.15f, pid=%d, cmd=%s" % + (self.cpu, total, self.pid, cmd), + RuntimeWarning) + return total + + # Fetch a sample, and notify others that we have started. + self.cpu = update() + self.started.release() + + # Poll the process periodically. + # + # We poll quickly at the beginning and use exponential backout + # to try and get better stats on short-lived processes. + # + polling_interval = 0.01 + max_interval = 0.5 + while not self.finished: + time.sleep(polling_interval) + try: + self.cpu = update() + except psutil.NoSuchProcess: + break + if polling_interval < max_interval: + polling_interval = min(polling_interval * 1.5, max_interval) + + def cpu_usage(self): + return self.cpu + + def __enter__(self): + return self + + def __exit__(self, *_): + self.finished = True + +def process_poller(pid): + '''Initiate polling of a subprocess. This is intended to be used in a + `with` block.''' + # Create a new thread and start it up. + p = Poller(pid) + p.start() + + # Wait for the thread to record at least one sample before continuing. + p.started.acquire() + + return p + +def main(): + if len(sys.argv) <= 1 or sys.argv[1] in ['-?', '--help']: + print('Usage: %s command args...\n Measure total CPU ' \ + 'usage of a command' % sys.argv[0], file=sys.stderr) + return -1 + + # Run the command requested. + try: + p = subprocess.Popen(sys.argv[1:]) + except OSError: + print('command not found', file=sys.stderr) + return -1 + + cpu = 0 + m = process_poller(p.pid) + while True: + try: + p.returncode = p.wait() + break + except KeyboardInterrupt: + # The user Ctrl-C-ed us. The child should have received SIGINT; + # continue waiting for it to finish + pass + + print('Total cpu %f seconds' % m.cpu_usage(), file=sys.stderr) + + return p.returncode + +if __name__ == '__main__': + sys.exit(main()) diff --git a/misc/regression/regression.dtd b/misc/regression/regression.dtd index eafdd69c6..c39c5c487 100644 --- a/misc/regression/regression.dtd +++ b/misc/regression/regression.dtd @@ -11,21 +11,25 @@ + + + + diff --git a/misc/regression/run_tests.py b/misc/regression/run_tests.py index 018566362..59be3a86c 100755 --- a/misc/regression/run_tests.py +++ b/misc/regression/run_tests.py @@ -12,14 +12,13 @@ # # Very simple command-line test runner. # -# Ignores timeouts. -# from __future__ import print_function import argparse import atexit import datetime +import cpuusage import fnmatch import memusage import os @@ -33,19 +32,20 @@ import subprocess import sys import testspec import threading +import time import traceback import xml.etree.ElementTree as ET -# Try importing psutil. -PS_UTIL_AVAILABLE = False try: import psutil - from psutil import NoSuchProcess - PS_UTIL_AVAILABLE = True if not hasattr(psutil.Process, "children") and hasattr(psutil.Process, "get_children"): + # psutil API change psutil.Process.children = psutil.Process.get_children except ImportError: - pass + print("Error: failed to import psutil module.\n" + "To install psutil, try:\n" + " pip install --user psutil", file=sys.stderr) + sys.exit(2) ANSI_RESET = "\033[0m" ANSI_RED = "\033[31;1m" @@ -75,27 +75,24 @@ def which(file): # are looking at it, but not where a PID has been reused. # def kill_family(parent_pid): - if not PS_UTIL_AVAILABLE: - return - # Find process. try: process = psutil.Process(parent_pid) - except NoSuchProcess: + except psutil.NoSuchProcess: # Race. Nothing more to do. return # SIGSTOP everyone first. try: process.suspend() - except NoSuchProcess: + except psutil.NoSuchProcess: # Race. Nothing more to do. return process_list = [process] for child in process.children(recursive=True): try: child.suspend() - except NoSuchProcess: + except psutil.NoSuchProcess: # Race. pass else: @@ -109,10 +106,11 @@ def kill_family(parent_pid): # # Run a single test. # -# Return a tuple of (success, log, time_taken, memory_usage). +# Return a dict of (name, status, output, cpu time, elapsed time, memory usage). +# This is placed onto the given queue. # # Log only contains the output if verbose is *false*; otherwise, the -# log is output to stdout where we can't easily get to it. +# log is output to stdout where we can't easily get to it. # def run_test(test, status_queue, verbose=False): # Construct the base command. @@ -142,6 +140,7 @@ def run_test(test, status_queue, verbose=False): # Start the command. peak_mem_usage = None + cpu_usage = None try: process = subprocess.Popen(command, stdout=output, stderr=subprocess.STDOUT, stdin=subprocess.PIPE, @@ -161,6 +160,7 @@ def run_test(test, status_queue, verbose=False): # Setup an alarm at the timeout. was_timeout = [False] # Wrap in list to prevent do_timeout getting the wrong variable scope + was_cpu_timeout = [False] # as above def do_timeout(): was_timeout[0] = True kill_family(process.pid) @@ -168,11 +168,31 @@ def run_test(test, status_queue, verbose=False): if test.timeout > 0: timer.start() - # Wait for the command to finish. - with memusage.process_poller(process.pid) as m: - (output, _) = process.communicate() - peak_mem_usage = m.peak_mem_usage() - process_running = False + with cpuusage.process_poller(process.pid) as c: + # Also set a CPU timeout. We poll the cpu usage periodically. + def cpu_timeout(): + interval = min(0.5, test.cpu_timeout / 10.0) + while process_running: + if c.cpu_usage() > test.cpu_timeout: + was_cpu_timeout[0] = True + kill_family(process.pid) + break + time.sleep(interval) + + if test.cpu_timeout > 0: + cpu_timer = threading.Thread(target=cpu_timeout) + cpu_timer.daemon = True + cpu_timer.start() + + with memusage.process_poller(process.pid) as m: + # Wait for the command to finish. + (output, _) = process.communicate() + peak_mem_usage = m.peak_mem_usage() + cpu_usage = c.cpu_usage() + + process_running = False + if test.cpu_timeout > 0: + cpu_timer.join() # Cancel the alarm. Small race here (if the timer fires just after the # process finished), but the returncode of our process should still be 0, @@ -185,7 +205,7 @@ def run_test(test, status_queue, verbose=False): output = output.decode() if process.returncode == 0: status = "pass" - elif was_timeout[0]: + elif was_timeout[0] or was_cpu_timeout[0]: status = "TIMEOUT" else: status = "FAILED" @@ -193,6 +213,7 @@ def run_test(test, status_queue, verbose=False): 'status': status, 'output': output, 'real_time': datetime.datetime.now() - start_time, + 'cpu_time': cpu_usage, 'mem_usage': peak_mem_usage}) # Print a status line. @@ -202,15 +223,21 @@ def print_test_line_start(test_name, legacy=False): print(" Started %-25s " % (test_name + " ...")) sys.stdout.flush() -def print_test_line(test_name, color, status, time_taken, mem, legacy=False): - if mem: +def print_test_line(test_name, color, status, real_time=None, cpu_time=None, mem=None, legacy=False): + if mem is not None: # Report memory usage in gigabytes. mem = '%5.2fGB' % round(float(mem) / 1024 / 1024 / 1024, 2) - if time_taken: - # Strip milliseconds for better printing. - time_taken = datetime.timedelta(seconds=int(time_taken.total_seconds())) - time_taken = '%8s' % time_taken - extras = ', '.join(filter(None, [time_taken, mem])) + + if real_time is not None: + # Format times as H:MM:SS; strip milliseconds for better printing. + real_time = datetime.timedelta(seconds=int(real_time.total_seconds())) + real_time = '%8s real' % real_time + + if cpu_time is not None: + cpu_time = datetime.timedelta(seconds=int(cpu_time)) + cpu_time = '%8s cpu' % cpu_time + + extras = ', '.join(filter(None, [real_time, cpu_time, mem])) # Print status line. if legacy: @@ -251,6 +278,7 @@ def main(): help="list known tests") parser.add_argument("--legacy", action="store_true", help="use legacy 'IsaMakefile' specs") + # --legacy-status used by top-level regression-v2 script parser.add_argument("--legacy-status", action="store_true", help="emulate legacy (sequential code) status lines") parser.add_argument("-v", "--verbose", action="store_true", @@ -290,17 +318,6 @@ def main(): parser.error("Unknown test names: %s" % (", ".join(sorted(bad_names)))) tests_to_run = [t for t in tests if t.name in desired_names] - # If running at least one test, and psutil is not available, print a warning. - if len(tests_to_run) > 0 and not PS_UTIL_AVAILABLE: - print("\n" - "Warning: 'psutil' module not available. Processes may not be correctly\n" - "stopped. Run\n" - "\n" - " pip install --user psutil\n" - "\n" - "to install.\n" - "\n") - # Run the tests. print("Running %d test(s)..." % len(tests_to_run)) failed_tests = set() @@ -351,7 +368,7 @@ def main(): # Non-blocked but depends on a failed test. Remove it. if len(real_depends & failed_tests) > 0: wipe_tty_status() - print_test_line(t.name, ANSI_YELLOW, "skipped", None, None, args.legacy_status) + print_test_line(t.name, ANSI_YELLOW, "skipped", legacy=args.legacy_status) failed_tests.add(t.name) del tests_queue[i] break @@ -360,20 +377,23 @@ def main(): try: while True: info = status_queue.get(block=True, timeout=0.1337) # Built-in pause - name = info['name'] - del current_jobs[name] + name, status = info['name'], info['status'] - status, log, time_taken, mem = info['status'], info['output'], info['real_time'], info['mem_usage'] test_results[name] = info + del current_jobs[name] # Print result. wipe_tty_status() if status != 'pass': failed_tests.add(name) - print_test_line(name, ANSI_RED, "%s *" % status, time_taken, mem, args.legacy_status) + status += " *" + colour = ANSI_RED else: passed_tests.add(name) - print_test_line(name, ANSI_GREEN, status, time_taken, mem, args.legacy_status) + colour = ANSI_GREEN + print_test_line(name, colour, status, + real_time=info['real_time'], cpu_time=info['cpu_time'], mem=info['mem_usage'], + legacy=args.legacy_status) except Queue.Empty: pass wipe_tty_status() diff --git a/misc/regression/testspec.py b/misc/regression/testspec.py index a7d6cbb5a..ad0000022 100755 --- a/misc/regression/testspec.py +++ b/misc/regression/testspec.py @@ -31,13 +31,15 @@ class TestEnv(): self.pwd = pwd self.cwd = "." self.timeout = 0 + self.cpu_timeout = 0 self.depends = set() class Test(): - def __init__(self, name, command, timeout=0, cwd="", depends=None): + def __init__(self, name, command, timeout=0, cpu_timeout=0, cwd="", depends=None): self.name = name self.command = command self.timeout = timeout + self.cpu_timeout = cpu_timeout self.cwd = cwd if depends == None: @@ -53,6 +55,12 @@ def parse_attributes(tag, env, strict=True): except: if strict: raise + if tag.get("cpu-timeout"): + try: + env.cpu_timeout = float(tag.get("cpu-timeout")) + except: + if strict: + raise if tag.get("cwd"): env.cwd = tag.get("cwd") if tag.get("depends"): @@ -64,6 +72,7 @@ def parse_test(doc, env, strict=True): parse_attributes(doc, env, strict=strict) return Test(doc.get("name"), doc.text.strip(), timeout=env.timeout, + cpu_timeout=env.cpu_timeout, cwd=os.path.normpath(os.path.join(env.pwd, env.cwd)), depends=env.depends) @@ -347,8 +356,8 @@ def main(): # Print results for test in tests: - print("\"%s\" [timeout=%d, parents=%s, cwd=%s]" % ( - test.command, test.timeout, ",".join(test.depends), test.cwd)) + print("\"%s\" [timeout=%d, cpu-timeout=%g, parents=%s, cwd=%s]" % ( + test.command, test.timeout, test.cpu_timeout, ",".join(test.depends), test.cwd)) if __name__ == "__main__":