25aafc
From de87d273266404f3d8fc079efda3d325f6031cc5 Mon Sep 17 00:00:00 2001
25aafc
Message-Id: <de87d273266404f3d8fc079efda3d325f6031cc5@dist-git>
25aafc
From: Michal Privoznik <mprivozn@redhat.com>
25aafc
Date: Wed, 9 Oct 2019 14:27:44 +0200
25aafc
Subject: [PATCH] virCommand: use procfs to learn opened FDs
25aafc
MIME-Version: 1.0
25aafc
Content-Type: text/plain; charset=UTF-8
25aafc
Content-Transfer-Encoding: 8bit
25aafc
25aafc
When spawning a child process, between fork() and exec() we close
25aafc
all file descriptors and keep only those the caller wants us to
25aafc
pass onto the child. The problem is how we do that. Currently, we
25aafc
get the limit of opened files and then iterate through each one
25aafc
of them and either close() it or make it survive exec(). This
25aafc
approach is suboptimal (although, not that much in default
25aafc
configurations where the limit is pretty low - 1024). We have
25aafc
/proc where we can learn what FDs we hold open and thus we can
25aafc
selectively close only those.
25aafc
25aafc
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
25aafc
Reviewed-by: Ján Tomko <jtomko@redhat.com>
25aafc
(cherry picked from commit 432faf259b696043ee5d7e8f657d855419a9a3fa)
25aafc
25aafc
https://bugzilla.redhat.com/show_bug.cgi?id=1759904
25aafc
25aafc
Signed-off-by: Michal Privoznik <mprivozn@redhat.com>
25aafc
Message-Id: <b9a09b65205ad8d2b001a55c329ec36c0e4f3183.1570623892.git.mprivozn@redhat.com>
25aafc
Reviewed-by: Jiri Denemark <jdenemar@redhat.com>
25aafc
---
25aafc
 src/util/vircommand.c | 86 +++++++++++++++++++++++++++++++++++++++----
25aafc
 1 file changed, 78 insertions(+), 8 deletions(-)
25aafc
25aafc
diff --git a/src/util/vircommand.c b/src/util/vircommand.c
25aafc
index 02dbe0fc76..2d0c987fe2 100644
25aafc
--- a/src/util/vircommand.c
25aafc
+++ b/src/util/vircommand.c
25aafc
@@ -492,27 +492,97 @@ virExecCommon(virCommandPtr cmd, gid_t *groups, int ngroups)
25aafc
     return ret;
25aafc
 }
25aafc
 
25aafc
+# ifdef __linux__
25aafc
+/* On Linux, we can utilize procfs and read the table of opened
25aafc
+ * FDs and selectively close only those FDs we don't want to pass
25aafc
+ * onto child process (well, the one we will exec soon since this
25aafc
+ * is called from the child). */
25aafc
+static int
25aafc
+virCommandMassCloseGetFDsLinux(virCommandPtr cmd ATTRIBUTE_UNUSED,
25aafc
+                               virBitmapPtr fds)
25aafc
+{
25aafc
+    DIR *dp = NULL;
25aafc
+    struct dirent *entry;
25aafc
+    const char *dirName = "/proc/self/fd";
25aafc
+    int rc;
25aafc
+    int ret = -1;
25aafc
+
25aafc
+    if (virDirOpen(&dp, dirName) < 0)
25aafc
+        return -1;
25aafc
+
25aafc
+    while ((rc = virDirRead(dp, &entry, dirName)) > 0) {
25aafc
+        int fd;
25aafc
+
25aafc
+        if (virStrToLong_i(entry->d_name, NULL, 10, &fd) < 0) {
25aafc
+            virReportError(VIR_ERR_INTERNAL_ERROR,
25aafc
+                           _("unable to parse FD: %s"),
25aafc
+                           entry->d_name);
25aafc
+            goto cleanup;
25aafc
+        }
25aafc
+
25aafc
+        if (virBitmapSetBit(fds, fd) < 0) {
25aafc
+            virReportError(VIR_ERR_INTERNAL_ERROR,
25aafc
+                           _("unable to set FD as open: %d"),
25aafc
+                           fd);
25aafc
+            goto cleanup;
25aafc
+        }
25aafc
+    }
25aafc
+
25aafc
+    if (rc < 0)
25aafc
+        goto cleanup;
25aafc
+
25aafc
+    ret = 0;
25aafc
+ cleanup:
25aafc
+    VIR_DIR_CLOSE(dp);
25aafc
+    return ret;
25aafc
+}
25aafc
+
25aafc
+# else /* !__linux__ */
25aafc
+
25aafc
+static int
25aafc
+virCommandMassCloseGetFDsGeneric(virCommandPtr cmd ATTRIBUTE_UNUSED,
25aafc
+                                 virBitmapPtr fds)
25aafc
+{
25aafc
+    virBitmapSetAll(fds);
25aafc
+    return 0;
25aafc
+}
25aafc
+# endif /* !__linux__ */
25aafc
+
25aafc
 static int
25aafc
 virCommandMassClose(virCommandPtr cmd,
25aafc
                     int childin,
25aafc
                     int childout,
25aafc
                     int childerr)
25aafc
 {
25aafc
+    VIR_AUTOPTR(virBitmap) fds = NULL;
25aafc
     int openmax = sysconf(_SC_OPEN_MAX);
25aafc
-    int fd;
25aafc
-    int tmpfd;
25aafc
+    int fd = -1;
25aafc
 
25aafc
-    if (openmax < 0) {
25aafc
-        virReportSystemError(errno,  "%s",
25aafc
-                             _("sysconf(_SC_OPEN_MAX) failed"));
25aafc
+    /* In general, it is not safe to call malloc() between fork() and exec()
25aafc
+     * because the child might have forked at the worst possible time, i.e.
25aafc
+     * when another thread was in malloc() and thus held its lock. That is to
25aafc
+     * say, POSIX does not mandate malloc() to be async-safe. Fortunately,
25aafc
+     * glibc developers are aware of this and made malloc() async-safe.
25aafc
+     * Therefore we can safely allocate memory here (and transitively call
25aafc
+     * opendir/readdir) without a deadlock. */
25aafc
+
25aafc
+    if (!(fds = virBitmapNew(openmax)))
25aafc
         return -1;
25aafc
-    }
25aafc
 
25aafc
-    for (fd = 3; fd < openmax; fd++) {
25aafc
+# ifdef __linux__
25aafc
+    if (virCommandMassCloseGetFDsLinux(cmd, fds) < 0)
25aafc
+        return -1;
25aafc
+# else
25aafc
+    if (virCommandMassCloseGetFDsGeneric(cmd, fds) < 0)
25aafc
+        return -1;
25aafc
+# endif
25aafc
+
25aafc
+    fd = virBitmapNextSetBit(fds, 2);
25aafc
+    for (; fd >= 0; fd = virBitmapNextSetBit(fds, fd)) {
25aafc
         if (fd == childin || fd == childout || fd == childerr)
25aafc
             continue;
25aafc
         if (!virCommandFDIsSet(cmd, fd)) {
25aafc
-            tmpfd = fd;
25aafc
+            int tmpfd = fd;
25aafc
             VIR_MASS_CLOSE(tmpfd);
25aafc
         } else if (virSetInherit(fd, true) < 0) {
25aafc
             virReportSystemError(errno, _("failed to preserve fd %d"), fd);
25aafc
-- 
25aafc
2.23.0
25aafc