[pve-devel] [PATCH qemu-server 2/7] qmeventd: add last-ditch effort SIGKILL cleanup

Stefan Reiter s.reiter at proxmox.com
Thu Sep 3 10:58:46 CEST 2020


'alarm' is used to schedule an additionaly cleanup round 5 seconds after
sending SIGTERM via terminate_client. This then calls SIGKILL, making
sure that the QEMU process is *really* dead and won't be left behind in
an undetermined state.

This shouldn't be an issue under normal circumstances, but can help
avoid dead processes lying around if QEMU hangs after SIGTERM.

Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
---

Unsure about this one, it's technically not necessary, but might help avoid
situations where a QEMU process is left over.

Also, is PID reuse an issue? Suppose QEMU quits correctly on SIGTERM, another
process starts up with the now free PID, and 5 seconds after we SIGKILL it. Not
sure how this could be solved though...

 qmeventd/qmeventd.c | 54 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/qmeventd/qmeventd.c b/qmeventd/qmeventd.c
index 5616ae4..ed657b8 100644
--- a/qmeventd/qmeventd.c
+++ b/qmeventd/qmeventd.c
@@ -63,6 +63,8 @@ static int verbose = 0;
 static int epoll_fd = 0;
 static const char *progname;
 GHashTable *vm_clients; // key=vmid (freed on remove), value=*Client (free manually)
+GSList *forced_cleanups;
+volatile sig_atomic_t alarm_triggered = 0;
 
 /*
  * Helper functions
@@ -470,6 +472,12 @@ terminate_client(struct Client *client)
 
     int err = kill(client->pid, SIGTERM);
     log_neg(err, "kill");
+
+    // resets any other alarms, but will fire eventually and cleanup all
+    pid_t *pid_ptr = malloc(sizeof(pid_t));
+    *pid_ptr = client->pid;
+    forced_cleanups = g_slist_prepend(forced_cleanups, (void *)pid_ptr);
+    alarm(5);
 }
 
 void
@@ -545,6 +553,47 @@ handle_client(struct Client *client)
 }
 
 
+/*
+ * SIGALRM and cleanup handling
+ *
+ * terminate_client will set an alarm for 5 seconds and add its client's PID to
+ * the forced_cleanups list - when the timer expires, we iterate the list and
+ * attempt to issue SIGKILL to all processes which haven't yet stopped.
+ */
+
+static void
+alarm_handler(__attribute__((unused)) int signum)
+{
+    alarm_triggered = 1;
+}
+
+static void
+sigkill(void *pid_ptr, __attribute__((unused)) void *unused)
+{
+    pid_t pid = *((pid_t *)pid_ptr);
+    int err = kill(pid, SIGKILL);
+    if (err < 0) {
+	if (errno != ESRCH) {
+	    fprintf(stderr, "SIGKILL cleanup of pid '%d' failed - %s\n",
+		    pid, strerror(errno));
+	}
+    } else {
+	fprintf(stderr, "cleanup failed, terminating pid '%d' with SIGKILL\n", pid);
+    }
+}
+
+static void
+handle_forced_cleanup()
+{
+    if (alarm_triggered) {
+	alarm_triggered = 0;
+	g_slist_foreach(forced_cleanups, sigkill, NULL);
+	g_slist_free_full(forced_cleanups, free);
+	forced_cleanups = NULL;
+    }
+}
+
+
 int
 main(int argc, char *argv[])
 {
@@ -577,6 +626,7 @@ main(int argc, char *argv[])
     }
 
     signal(SIGCHLD, SIG_IGN);
+    signal(SIGALRM, alarm_handler);
 
     socket_path = argv[optind];
 
@@ -612,7 +662,7 @@ main(int argc, char *argv[])
     for(;;) {
 	nevents = epoll_wait(epoll_fd, events, 1, -1);
 	if (nevents < 0 && errno == EINTR) {
-	    // signal happened, try again
+	    handle_forced_cleanup();
 	    continue;
 	}
 	bail_neg(nevents, "epoll_wait");
@@ -630,5 +680,7 @@ main(int argc, char *argv[])
 		handle_client((struct Client *)events[n].data.ptr);
 	    }
 	}
+
+	handle_forced_cleanup();
     }
 }
-- 
2.20.1






More information about the pve-devel mailing list