f6265e
From c1c6d0b586e5556a37b9b813afbb4e6a24921adf Mon Sep 17 00:00:00 2001
f6265e
From: Eduardo Otubo <otubo@redhat.com>
f6265e
Date: Wed, 23 Jan 2019 12:30:21 +0100
f6265e
Subject: net: Wait for dhclient to daemonize before reading lease file
f6265e
f6265e
RH-Author: Eduardo Otubo <otubo@redhat.com>
f6265e
Message-id: <20190123123021.32708-1-otubo@redhat.com>
f6265e
Patchwork-id: 84095
f6265e
O-Subject: [RHEL-7.7 cloud-init PATCH] net: Wait for dhclient to daemonize before reading lease file
f6265e
Bugzilla: 1632967
f6265e
RH-Acked-by: Vitaly Kuznetsov <vkuznets@redhat.com>
f6265e
RH-Acked-by: Miroslav Rezanina <mrezanin@redhat.com>
f6265e
f6265e
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
f6265e
Brew: https://bugzilla.redhat.com/show_bug.cgi?id=1632967
f6265e
Tested: Me and upstream
f6265e
f6265e
commit fdadcb5fae51f4e6799314ab98e3aec56c79b17c
f6265e
Author: Jason Zions <jasonzio@microsoft.com>
f6265e
Date:   Tue Jan 15 21:37:17 2019 +0000
f6265e
f6265e
    net: Wait for dhclient to daemonize before reading lease file
f6265e
f6265e
    cloud-init uses dhclient to fetch the DHCP lease so it can extract
f6265e
    DHCP options.  dhclient creates the leasefile, then writes to it;
f6265e
    simply waiting for the leasefile to appear creates a race between
f6265e
    dhclient and cloud-init. Instead, wait for dhclient to be parented by
f6265e
    init. At that point, we know it has written to the leasefile, so it's
f6265e
    safe to copy the file and kill the process.
f6265e
f6265e
    cloud-init creates a temporary directory in which to execute dhclient,
f6265e
    and deletes that directory after it has killed the process. If
f6265e
    cloud-init abandons waiting for dhclient to daemonize, it will still
f6265e
    attempt to delete the temporary directory, but will not report an
f6265e
    exception should that attempt fail.
f6265e
f6265e
    LP: #1794399
f6265e
f6265e
Signed-off-by: Eduardo Otubo <otubo@redhat.com>
f6265e
Signed-off-by: Miroslav Rezanina <mrezanin@redhat.com>
f6265e
---
f6265e
 cloudinit/net/dhcp.py              | 44 +++++++++++++++++++++++++++-----------
f6265e
 cloudinit/net/tests/test_dhcp.py   | 15 ++++++++++---
f6265e
 cloudinit/temp_utils.py            |  4 ++--
f6265e
 cloudinit/tests/test_temp_utils.py | 18 +++++++++++++++-
f6265e
 cloudinit/util.py                  | 16 +++++++++++++-
f6265e
 tests/unittests/test_util.py       |  6 ++++++
f6265e
 6 files changed, 83 insertions(+), 20 deletions(-)
f6265e
f6265e
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
f6265e
index 0db991d..c98a97c 100644
f6265e
--- a/cloudinit/net/dhcp.py
f6265e
+++ b/cloudinit/net/dhcp.py
f6265e
@@ -9,6 +9,7 @@ import logging
f6265e
 import os
f6265e
 import re
f6265e
 import signal
f6265e
+import time
f6265e
 
f6265e
 from cloudinit.net import (
f6265e
     EphemeralIPv4Network, find_fallback_nic, get_devicelist,
f6265e
@@ -127,7 +128,9 @@ def maybe_perform_dhcp_discovery(nic=None):
f6265e
     if not dhclient_path:
f6265e
         LOG.debug('Skip dhclient configuration: No dhclient command found.')
f6265e
         return []
f6265e
-    with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
f6265e
+    with temp_utils.tempdir(rmtree_ignore_errors=True,
f6265e
+                            prefix='cloud-init-dhcp-',
f6265e
+                            needs_exe=True) as tdir:
f6265e
         # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
f6265e
         return dhcp_discovery(dhclient_path, nic, tdir)
f6265e
 
f6265e
@@ -195,24 +198,39 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
f6265e
            '-pf', pid_file, interface, '-sf', '/bin/true']
f6265e
     util.subp(cmd, capture=True)
f6265e
 
f6265e
-    # dhclient doesn't write a pid file until after it forks when it gets a
f6265e
-    # proper lease response. Since cleandir is a temp directory that gets
f6265e
-    # removed, we need to wait for that pidfile creation before the
f6265e
-    # cleandir is removed, otherwise we get FileNotFound errors.
f6265e
+    # Wait for pid file and lease file to appear, and for the process
f6265e
+    # named by the pid file to daemonize (have pid 1 as its parent). If we
f6265e
+    # try to read the lease file before daemonization happens, we might try
f6265e
+    # to read it before the dhclient has actually written it. We also have
f6265e
+    # to wait until the dhclient has become a daemon so we can be sure to
f6265e
+    # kill the correct process, thus freeing cleandir to be deleted back
f6265e
+    # up the callstack.
f6265e
     missing = util.wait_for_files(
f6265e
         [pid_file, lease_file], maxwait=5, naplen=0.01)
f6265e
     if missing:
f6265e
         LOG.warning("dhclient did not produce expected files: %s",
f6265e
                     ', '.join(os.path.basename(f) for f in missing))
f6265e
         return []
f6265e
-    pid_content = util.load_file(pid_file).strip()
f6265e
-    try:
f6265e
-        pid = int(pid_content)
f6265e
-    except ValueError:
f6265e
-        LOG.debug(
f6265e
-            "pid file contains non-integer content '%s'", pid_content)
f6265e
-    else:
f6265e
-        os.kill(pid, signal.SIGKILL)
f6265e
+
f6265e
+    ppid = 'unknown'
f6265e
+    for _ in range(0, 1000):
f6265e
+        pid_content = util.load_file(pid_file).strip()
f6265e
+        try:
f6265e
+            pid = int(pid_content)
f6265e
+        except ValueError:
f6265e
+            pass
f6265e
+        else:
f6265e
+            ppid = util.get_proc_ppid(pid)
f6265e
+            if ppid == 1:
f6265e
+                LOG.debug('killing dhclient with pid=%s', pid)
f6265e
+                os.kill(pid, signal.SIGKILL)
f6265e
+                return parse_dhcp_lease_file(lease_file)
f6265e
+        time.sleep(0.01)
f6265e
+
f6265e
+    LOG.error(
f6265e
+        'dhclient(pid=%s, parentpid=%s) failed to daemonize after %s seconds',
f6265e
+        pid_content, ppid, 0.01 * 1000
f6265e
+    )
f6265e
     return parse_dhcp_lease_file(lease_file)
f6265e
 
f6265e
 
f6265e
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
f6265e
index cd3e732..79e8842 100644
f6265e
--- a/cloudinit/net/tests/test_dhcp.py
f6265e
+++ b/cloudinit/net/tests/test_dhcp.py
f6265e
@@ -145,16 +145,20 @@ class TestDHCPDiscoveryClean(CiTestCase):
f6265e
               'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
f6265e
             dhcp_discovery(dhclient_script, 'eth9', tmpdir))
f6265e
         self.assertIn(
f6265e
-            "pid file contains non-integer content ''", self.logs.getvalue())
f6265e
+            "dhclient(pid=, parentpid=unknown) failed "
f6265e
+            "to daemonize after 10.0 seconds",
f6265e
+            self.logs.getvalue())
f6265e
         m_kill.assert_not_called()
f6265e
 
f6265e
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
f6265e
     @mock.patch('cloudinit.net.dhcp.os.kill')
f6265e
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
f6265e
     @mock.patch('cloudinit.net.dhcp.util.subp')
f6265e
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
f6265e
                                                                   m_subp,
f6265e
                                                                   m_wait,
f6265e
-                                                                  m_kill):
f6265e
+                                                                  m_kill,
f6265e
+                                                                  m_getppid):
f6265e
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
f6265e
         tmpdir = self.tmp_dir()
f6265e
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
f6265e
@@ -164,6 +168,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
f6265e
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
f6265e
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
f6265e
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
f6265e
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
f6265e
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
f6265e
         self.assertEqual(
f6265e
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
f6265e
@@ -173,9 +178,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
f6265e
             self.logs.getvalue())
f6265e
         m_kill.assert_not_called()
f6265e
 
f6265e
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
f6265e
     @mock.patch('cloudinit.net.dhcp.os.kill')
f6265e
     @mock.patch('cloudinit.net.dhcp.util.subp')
f6265e
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
f6265e
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
f6265e
         """dhcp_discovery brings up the interface and runs dhclient.
f6265e
 
f6265e
         It also returns the parsed dhcp.leases file generated in the sandbox.
f6265e
@@ -197,6 +203,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
f6265e
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
f6265e
         my_pid = 1
f6265e
         write_file(pid_file, "%d\n" % my_pid)
f6265e
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
f6265e
 
f6265e
         self.assertItemsEqual(
f6265e
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
f6265e
@@ -355,3 +362,5 @@ class TestEphemeralDhcpNoNetworkSetup(HttprettyTestCase):
f6265e
             self.assertEqual(fake_lease, lease)
f6265e
         # Ensure that dhcp discovery occurs
f6265e
         m_dhcp.called_once_with()
f6265e
+
f6265e
+# vi: ts=4 expandtab
f6265e
diff --git a/cloudinit/temp_utils.py b/cloudinit/temp_utils.py
f6265e
index c98a1b5..346276e 100644
f6265e
--- a/cloudinit/temp_utils.py
f6265e
+++ b/cloudinit/temp_utils.py
f6265e
@@ -81,7 +81,7 @@ def ExtendedTemporaryFile(**kwargs):
f6265e
 
f6265e
 
f6265e
 @contextlib.contextmanager
f6265e
-def tempdir(**kwargs):
f6265e
+def tempdir(rmtree_ignore_errors=False, **kwargs):
f6265e
     # This seems like it was only added in python 3.2
f6265e
     # Make it since its useful...
f6265e
     # See: http://bugs.python.org/file12970/tempdir.patch
f6265e
@@ -89,7 +89,7 @@ def tempdir(**kwargs):
f6265e
     try:
f6265e
         yield tdir
f6265e
     finally:
f6265e
-        shutil.rmtree(tdir)
f6265e
+        shutil.rmtree(tdir, ignore_errors=rmtree_ignore_errors)
f6265e
 
f6265e
 
f6265e
 def mkdtemp(**kwargs):
f6265e
diff --git a/cloudinit/tests/test_temp_utils.py b/cloudinit/tests/test_temp_utils.py
f6265e
index ffbb92c..4a52ef8 100644
f6265e
--- a/cloudinit/tests/test_temp_utils.py
f6265e
+++ b/cloudinit/tests/test_temp_utils.py
f6265e
@@ -2,8 +2,9 @@
f6265e
 
f6265e
 """Tests for cloudinit.temp_utils"""
f6265e
 
f6265e
-from cloudinit.temp_utils import mkdtemp, mkstemp
f6265e
+from cloudinit.temp_utils import mkdtemp, mkstemp, tempdir
f6265e
 from cloudinit.tests.helpers import CiTestCase, wrap_and_call
f6265e
+import os
f6265e
 
f6265e
 
f6265e
 class TestTempUtils(CiTestCase):
f6265e
@@ -98,4 +99,19 @@ class TestTempUtils(CiTestCase):
f6265e
         self.assertEqual('/fake/return/path', retval)
f6265e
         self.assertEqual([{'dir': '/run/cloud-init/tmp'}], calls)
f6265e
 
f6265e
+    def test_tempdir_error_suppression(self):
f6265e
+        """test tempdir suppresses errors during directory removal."""
f6265e
+
f6265e
+        with self.assertRaises(OSError):
f6265e
+            with tempdir(prefix='cloud-init-dhcp-') as tdir:
f6265e
+                os.rmdir(tdir)
f6265e
+                # As a result, the directory is already gone,
f6265e
+                # so shutil.rmtree should raise OSError
f6265e
+
f6265e
+        with tempdir(rmtree_ignore_errors=True,
f6265e
+                     prefix='cloud-init-dhcp-') as tdir:
f6265e
+            os.rmdir(tdir)
f6265e
+            # Since the directory is already gone, shutil.rmtree would raise
f6265e
+            # OSError, but we suppress that
f6265e
+
f6265e
 # vi: ts=4 expandtab
f6265e
diff --git a/cloudinit/util.py b/cloudinit/util.py
f6265e
index 7800f7b..a84112a 100644
f6265e
--- a/cloudinit/util.py
f6265e
+++ b/cloudinit/util.py
f6265e
@@ -2861,7 +2861,6 @@ def mount_is_read_write(mount_point):
f6265e
     mount_opts = result[-1].split(',')
f6265e
     return mount_opts[0] == 'rw'
f6265e
 
f6265e
-
f6265e
 def udevadm_settle(exists=None, timeout=None):
f6265e
     """Invoke udevadm settle with optional exists and timeout parameters"""
f6265e
     settle_cmd = ["udevadm", "settle"]
f6265e
@@ -2875,5 +2874,20 @@ def udevadm_settle(exists=None, timeout=None):
f6265e
 
f6265e
     return subp(settle_cmd)
f6265e
 
f6265e
+def get_proc_ppid(pid):
f6265e
+    """
f6265e
+    Return the parent pid of a process.
f6265e
+    """
f6265e
+    ppid = 0
f6265e
+    try:
f6265e
+        contents = load_file("/proc/%s/stat" % pid, quiet=True)
f6265e
+    except IOError as e:
f6265e
+        LOG.warning('Failed to load /proc/%s/stat. %s', pid, e)
f6265e
+    if contents:
f6265e
+        parts = contents.split(" ", 4)
f6265e
+        # man proc says
f6265e
+        #  ppid %d     (4) The PID of the parent.
f6265e
+        ppid = int(parts[3])
f6265e
+    return ppid
f6265e
 
f6265e
 # vi: ts=4 expandtab
f6265e
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
f6265e
index 5a14479..8aebcd6 100644
f6265e
--- a/tests/unittests/test_util.py
f6265e
+++ b/tests/unittests/test_util.py
f6265e
@@ -1114,6 +1114,12 @@ class TestLoadShellContent(helpers.TestCase):
f6265e
                 'key3="val3 #tricky"',
f6265e
                 ''])))
f6265e
 
f6265e
+    def test_get_proc_ppid(self):
f6265e
+        """get_proc_ppid returns correct parent pid value."""
f6265e
+        my_pid = os.getpid()
f6265e
+        my_ppid = os.getppid()
f6265e
+        self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
f6265e
+
f6265e
 
f6265e
 class TestGetProcEnv(helpers.TestCase):
f6265e
     """test get_proc_env."""
f6265e
-- 
f6265e
1.8.3.1
f6265e