lh-l4v/misc/autostop/stop.c

421 lines
11 KiB
C

/*
* Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
*
* SPDX-License-Identifier: BSD-2-Clause
*/
/*
* Running 64-bit Isabelle is a dangerous task. In particular, it is liable to
* suck up all your RAM and send your system into swap-death on quite a regular
* basis.
*
* This Linux utility will regularly scan the system for signs of swap-death
* (i.e., low memory and high pagefault rate and high load average) and send
* SIGSTOP to processes suspected of being the cause.
*
* When it triggers, it will write to syslog stating the process stopped.
*
* 2012 David Greenaway
*/
#define _GNU_SOURCE /* for asprintf */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/sysinfo.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <dirent.h>
#include <unistd.h>
#include <signal.h>
#include <syslog.h>
#include <string.h>
/* A system load-average considered "dangerous". */
#define DANGEROUS_LOAD 3.0
/* A system load-average considered "very dangerous". */
#define VERY_DANGEROUS_LOAD 10.0
/* A percentage of RAM considered to be "dangerously low". */
#define DANGEROUS_FREE_RAM 0.02
/* A number of page-faults per second considered to be heavy swapping. */
#define DANGEROUS_FAULTS_PER_SECOND 100
/* A minimum Linux OOM score for a process to be considered for stopping. */
#define MIN_OOM_SCORE 3
/* Number of seconds we sleep between each system probe. */
#define SLEEP_TIME 5
/* Number of seconds we sleep for after stopping a process before considering
* stopping another. */
#define SLEEP_AFTER_STOP_SECONDS 15
/* Scheduling priority we should run at. */
#define SCHED_PRIO (-10)
/* Misc OS constants. */
#define MAX_PATH_SIZE 1024
#define MAX_LINE_SIZE 1024
#define LINUX_SYSINFO_LOADS_SCALE 65536
/*
 * Report an unrecoverable error and terminate the program.
 *
 * Writes `str` followed by a newline to standard output and then exits with
 * status 1. This function does not return.
 */
void fatal(const char *str)
{
    puts(str);
    exit(1);
}
/*
 * Get the name of a process from its PID.
 *
 * Reads the start of /proc/[pid]/cmdline into `output` and always
 * NUL-terminates it. Entries in cmdline are NUL delimited, so when `output`
 * is used as a C string it holds just the first argument (cut short to
 * len - 1 bytes if the argument is longer than the buffer). A process whose
 * cmdline is empty yields an empty string.
 *
 * pid:    process to look up.
 * output: destination buffer.
 * len:    size of `output` in bytes.
 *
 * Returns 0 on success, or -1 if `len` is zero or the file could not be
 * opened or read.
 */
int name_of(int pid, char *output, size_t len)
{
    /* "/proc/" + a decimal int + "/cmdline" fits comfortably in 64 bytes. */
    char path[64];

    if (len == 0) {
        return -1;
    }
    output[0] = '\0';

    int written = snprintf(path, sizeof path, "/proc/%d/cmdline", pid);
    if (written < 0 || (size_t)written >= sizeof path) {
        return -1;
    }

    /* Open the process's command line details from /proc. */
    FILE *f = fopen(path, "r");
    if (f == NULL) {
        return -1;
    }

    /* Count single bytes so a short read still tells us how much arrived,
     * and keep one byte back for the terminator. */
    size_t got = fread(output, 1, len - 1, f);
    int failed = ferror(f);
    fclose(f);

    output[got] = '\0';
    return failed ? -1 : 0;
}
/*
 * Iterate through processes in the system.
 *
 * Walks the directories of /proc whose names parse as a positive PID and
 * calls `proc_fn(pid, data)` for each selected one.
 *
 * limit:   NULL to visit every process. Otherwise a NULL-terminated array of
 *          names; a process is visited only if the name reported by
 *          name_of(), or the part of that name after its last '/', equals
 *          one of the entries.
 * proc_fn: callback invoked with each selected PID and `data`.
 * data:    opaque pointer passed through to `proc_fn` unchanged.
 *
 * Exits the program with status 1 if /proc cannot be opened.
 */
void iterate_processes(char **limit, void (*proc_fn)(int, void *), void *data)
{
    /* Open /proc */
    DIR *proc_dir = opendir("/proc");
    if (proc_dir == NULL) {
        fprintf(stderr, "Could not open /proc.");
        exit(1);
    }

    /* Read through processes. */
    struct dirent *e;
    while ((e = readdir(proc_dir)) != NULL) {
        /* Skip non-directories. d_type is an enumeration, not a bit mask,
         * so it has to be compared for equality. */
        if (e->d_type != DT_DIR) {
            continue;
        }

        /* Process? Non-numeric entries such as "self" parse as 0. */
        int p = atoi(e->d_name);
        if (p <= 0) {
            continue;
        }

        if (limit != NULL) {
            /* Zero-filled, with one byte held back, so the buffer is a valid
             * C string however many bytes name_of() stores in it. */
            char name[PATH_MAX] = { 0 };
            if (name_of(p, name, PATH_MAX - 1) != 0) {
                /* This process doesn't have a name. Poor thing. */
                continue;
            }

            /* The final path component is the same for every candidate, so
             * work it out once. */
            const char *last_slash = strrchr(name, '/');
            const char *base = (last_slash != NULL) ? last_slash + 1 : NULL;

            /* Determine if this process matches any of the processes we
             * should be considering. */
            int matched = 0;
            for (char **l = limit; *l != NULL; l++) {
                if (strcmp(name, *l) == 0
                    || (base != NULL && strcmp(base, *l) == 0)) {
                    matched = 1;
                    break;
                }
            }
            if (!matched) {
                continue;
            }
#if DEBUG
            printf("Considering %s...\n", name);
#endif
        }

        proc_fn(p, data);
    }

    /* Cleanup. */
    closedir(proc_dir);
}
/* Results accumulated by test_process() over one scan of the process list. */
struct test_data {
/* PID of the best candidate found so far; main() starts it at -1 (none). */
int worst_pid;
/* Highest OOM score seen on an active process. main() seeds it with
 * MIN_OOM_SCORE, so only processes scoring above that are recorded. */
unsigned long worst_oom_score;
/* Running sum of the fault counter test_process() reads from each
 * process's /proc/[pid]/stat entry. */
long long total_faults;
};
/*
 * Read a single unsigned decimal value from the file at `path`.
 *
 * Returns 0 and stores the value in `*value` on success, or -1 if the file
 * cannot be opened or does not start with a number.
 */
static int read_proc_ulong(const char *path, unsigned long *value)
{
    FILE *f = fopen(path, "r");
    if (f == NULL) {
        return -1;
    }
    int n = fscanf(f, "%lu", value);
    fclose(f);
    return (n == 1) ? 0 : -1;
}

/*
 * Extract the state character (field 3) and the fault counter in field 12
 * (majflt, per proc(5)) from a /proc/[pid]/stat file.
 *
 * Field 2 is the command name in parentheses and may itself contain spaces
 * or parentheses, so parsing resumes after the LAST ')' in the record rather
 * than splitting on whitespace from the start.
 *
 * Returns 0 on success, or -1 if the file cannot be opened or parsed.
 */
static int read_proc_stat(const char *path, char *state, unsigned long *majflt)
{
    enum { STAT_BUF_SIZE = 512 };
    char record[STAT_BUF_SIZE];

    FILE *f = fopen(path, "r");
    if (f == NULL) {
        return -1;
    }
    size_t got = fread(record, 1, sizeof record - 1, f);
    fclose(f);
    record[got] = '\0';

    const char *comm_end = strrchr(record, ')');
    if (comm_end == NULL) {
        return -1;
    }

    int n = sscanf(comm_end + 1,
                   " %c %*d %*d %*d %*d %*d %*u %*u %*u %lu",
                   state, majflt);
    return (n == 2) ? 0 : -1;
}

/*
 * Per-process callback for iterate_processes().
 *
 * Reads the OOM score, state and fault counter of process `p` from /proc and
 * folds them into the struct test_data pointed to by `d`: the fault counter
 * is added to total_faults, and the process becomes the new worst candidate
 * if it is neither stopped ('T') nor a zombie ('Z') and its OOM score exceeds
 * the current worst.
 *
 * Processes come and go while /proc is being scanned, so a process whose
 * files cannot be opened or parsed is skipped and leaves `d` untouched.
 */
void test_process(int p, void *d)
{
    struct test_data *data = d;
    char path[MAX_PATH_SIZE];
    unsigned long oom_score = 0;
    unsigned long pagefaults = 0;
    char state = '\0';

    /* Read OOM score of process. */
    snprintf(path, sizeof path, "/proc/%d/oom_score", p);
    if (read_proc_ulong(path, &oom_score) != 0) {
        return;
    }

    /* Read state and pagefault information about the process. */
    snprintf(path, sizeof path, "/proc/%d/stat", p);
    if (read_proc_stat(path, &state, &pagefaults) != 0) {
        return;
    }

    /* Are we in an active state? */
    int process_active = (state != 'T' && state != 'Z');

    /* Collate data. */
    data->total_faults += pagefaults;
    if (process_active && oom_score > data->worst_oom_score) {
        data->worst_oom_score = oom_score;
        data->worst_pid = p;
    }
}
/*
 * Parse the decimal integer found at `buf` after any run of leading spaces.
 *
 * Returns whatever strtol() yields for the remaining text, which is 0 when
 * no digits are present.
 */
static long int parse_meminfo_int(char *buf)
{
    const char *digits = buf + strspn(buf, " ");
    return strtol(digits, NULL, 10);
}
/*
 * Report total and reclaimable memory as listed in /proc/meminfo.
 *
 * total: receives the "MemTotal:" value.
 * free:  receives the sum of the "MemFree:" and "Cached:" values.
 *
 * Values are passed through in whatever unit /proc/meminfo uses (kB
 * according to proc(5)). A key that is missing from the file contributes 0.
 * Exits the program with status 1 if /proc/meminfo cannot be opened.
 */
static void get_free_memory(unsigned long *total, unsigned long *free)
{
    /* Keys are matched up to and including the colon. The amount of padding
     * that follows differs from key to key, so it must not be part of the
     * comparison; parse_meminfo_int() skips it. */
    static const char total_key[] = "MemTotal:";
    static const char free_key[] = "MemFree:";
    static const char cached_key[] = "Cached:";

    char buf[MAX_LINE_SIZE];
    unsigned long memtotal = 0;
    unsigned long memfree = 0;
    unsigned long memcached = 0;

    /* Read meminfo file. */
    FILE *f = fopen("/proc/meminfo", "r");
    if (f == NULL) {
        fprintf(stderr, "Could not open /proc/meminfo.");
        exit(1);
    }

    while (fgets(buf, MAX_LINE_SIZE, f) != NULL) {
        /* Comparing from the start of the line keeps "SwapCached:" from
         * being mistaken for "Cached:". */
        if (strncmp(buf, total_key, sizeof total_key - 1) == 0) {
            memtotal = parse_meminfo_int(buf + sizeof total_key - 1);
        } else if (strncmp(buf, free_key, sizeof free_key - 1) == 0) {
            memfree = parse_meminfo_int(buf + sizeof free_key - 1);
        } else if (strncmp(buf, cached_key, sizeof cached_key - 1) == 0) {
            memcached = parse_meminfo_int(buf + sizeof cached_key - 1);
        }
    }
    fclose(f);

    *total = memtotal;
    *free = memfree + memcached;
}
/*
 * Decide whether the system currently looks like it is thrashing.
 *
 * last_fault_count: fault total from the previous probe; a value of 0 or
 *                   less means there is no previous probe to compare with,
 *                   and the fault delta is taken as 0.
 * this_fault_count: fault total from the current probe.
 *
 * Returns 1 only when all of the following hold, otherwise 0:
 *   - the free fraction of RAM is not above DANGEROUS_FREE_RAM;
 *   - the first load average from sysinfo() is not below DANGEROUS_LOAD;
 *   - either the fault delta reaches DANGEROUS_FAULTS_PER_SECOND * SLEEP_TIME
 *     or the load is not below VERY_DANGEROUS_LOAD.
 * Also returns 0 if sysinfo() itself fails.
 */
int is_system_unstable(
    long long last_fault_count,
    long long this_fault_count)
{
    struct sysinfo si;
    if (sysinfo(&si) != 0) {
        return 0;
    }

    /* Fraction of RAM that is free. */
    unsigned long ram_total, ram_free;
    get_free_memory(&ram_total, &ram_free);
    const double free_ram = ram_free / (double)ram_total;

    /* Faults since the previous probe, if there was one. */
    const long long faults =
        (last_fault_count > 0) ? (this_fault_count - last_fault_count) : 0;

    /* System load, converted from the kernel's fixed-point representation. */
    const double system_load = si.loads[0] / (double)LINUX_SYSINFO_LOADS_SCALE;

#if DEBUG
    /* Print information. */
    printf("[RAM: %5.1lf] [LOAD: %5.1lf] [FAULTS: %5lld]\n",
           free_ram * 100.0, system_load, faults);
#endif

    /* Written as negated comparisons so that a non-finite free_ram is
     * classified exactly as a chain of early returns would classify it. */
    const int memory_low = !(free_ram > DANGEROUS_FREE_RAM);
    const int load_high = !(system_load < DANGEROUS_LOAD);
    const int thrashing =
        faults >= DANGEROUS_FAULTS_PER_SECOND * SLEEP_TIME
        || !(system_load < VERY_DANGEROUS_LOAD);

    return memory_low && load_high && thrashing;
}
/*
 * Determine what signal the given string parses to.
 *
 * Accepts the names SIGABRT, SIGSTOP, SIGTERM and SIGKILL, or the decimal
 * number this platform assigns to one of those signals. Signal numbers are
 * not the same on every architecture, so the numeric forms are derived from
 * the signal constants instead of being written out by hand. The string "17"
 * is additionally kept as a legacy alias for SIGSTOP.
 *
 * input:   text to parse.
 * signal:  receives the signal number on success.
 * signame: receives a pointer to the signal's name (a string literal) on
 *          success.
 *
 * Returns 0 on success. Returns 1, leaving both outputs untouched, if
 * `input` is not recognised.
 */
int parse_signal(const char *input, int *signal, const char **signame)
{
    static const struct {
        int number;
        const char *name;
    } known[] = {
        { SIGABRT, "SIGABRT" },
        { SIGSTOP, "SIGSTOP" },
        { SIGTERM, "SIGTERM" },
        { SIGKILL, "SIGKILL" },
    };

    for (size_t i = 0; i < sizeof known / sizeof known[0]; i++) {
        char digits[16];
        snprintf(digits, sizeof digits, "%d", known[i].number);

        if (strcmp(input, known[i].name) == 0 || strcmp(input, digits) == 0) {
            *signal = known[i].number;
            *signame = known[i].name;
            return 0;
        }
    }

    /* Legacy alias: "17" has always been accepted as SIGSTOP, whatever the
     * platform's real numbering is. */
    if (strcmp(input, "17") == 0) {
        *signal = SIGSTOP;
        *signame = "SIGSTOP";
        return 0;
    }

    return 1;
}
/*
 * Print command-line help to standard output.
 *
 * argc, argv: the program's arguments; argv[0] is shown as the program name
 *             when argc is positive, otherwise "autostop" is used.
 */
void usage(int argc, char **argv)
{
    const char *progname = (argc > 0) ? argv[0] : "autostop";

    printf("\n"
           "usage: %s [<SIGNAL>] [<processes>]\n\n"
           "Monitors the system for high load and sends a signal to (hopefully)\n"
           "the culprit process.\n\n"
           "<SIGNAL> must be one of SIGABRT, SIGKILL, SIGSTOP or SIGTERM\n"
           "(default: SIGSTOP).\n"
           "If you don't pass a list of candidate processes, all are considered.\n\n",
           progname);
}
/*
 * Program entry point.
 *
 * argv[1], if present, names the signal to send (SIGSTOP when absent); an
 * unrecognised value prints the usage text and exits with status 1. Any
 * further arguments restrict the candidate processes to those names.
 *
 * The program then loops forever: every SLEEP_TIME seconds it scans the
 * process list, and if the system looks unstable it signals the candidate
 * with the highest OOM score, logging the action to syslog. After a signal
 * has been delivered no further one is sent until SLEEP_AFTER_STOP_SECONDS
 * have passed.
 */
int main(int argc, char **argv)
{
    /* Determine which signal to send. */
    int sig = SIGSTOP;
    const char *sig_name = "SIGSTOP";
    if (argc >= 2 && parse_signal(argv[1], &sig, &sig_name) != 0) {
        usage(argc, argv);
        return 1;
    }

    /* Determine the list of candidate processes. */
    char **suspects = (argc > 2) ? argv + 2 : NULL;

    /* Set our scheduling priority higher. */
    (void)setpriority(PRIO_PROCESS, 0, SCHED_PRIO);

    long long previous_faults = 0;
    int cooldown = 0;

    for (;;) {
        /* Collect data. */
        struct test_data d = {
            .worst_pid = -1,
            .worst_oom_score = MIN_OOM_SCORE,
            .total_faults = 0,
        };
        iterate_processes(suspects, test_process, &d);

        /* Act only if things are looking bad, there is a candidate, and we
         * haven't recently stopped something. The stability check comes
         * first so that it runs on every probe. */
        if (is_system_unstable(previous_faults, d.total_faults)
            && d.worst_pid != -1
            && cooldown == 0) {
#if DEBUG
            printf("Sending %s to pid %d.\n", sig_name, d.worst_pid);
#endif
            if (kill(d.worst_pid, sig) == 0) {
                syslog(LOG_ALERT,
                       "auto-stop: Sending %s to pid %d to prevent system melt-down.\n", sig_name, d.worst_pid);
                cooldown = SLEEP_AFTER_STOP_SECONDS / SLEEP_TIME;
            }
        }

        if (cooldown > 0) {
            cooldown--;
        }
        previous_faults = d.total_faults;
        sleep(SLEEP_TIME);
    }

    return 0;
}