46a734
From ca5b83cee7b45bf56eec258db739cb5fe51b3231 Mon Sep 17 00:00:00 2001
46a734
From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com>
46a734
Date: Mon, 26 Apr 2021 07:28:39 -0700
46a734
Subject: [PATCH 6/7] Azure: Retry net metadata during nic attach for
46a734
 non-timeout errs (#878)
46a734
46a734
RH-Author: Eduardo Otubo <otubo@redhat.com>
46a734
RH-MergeRequest: 45: Add support for userdata on Azure from IMDS
46a734
RH-Commit: [6/7] 4e6e44f017d5ffcb72ac8959a94f80c71fef9560
46a734
RH-Bugzilla: 2023940
46a734
RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
46a734
RH-Acked-by: Mohamed Gamal Morsy <mmorsy@redhat.com>
46a734
46a734
When network interfaces are hot-attached to the VM, attempting to get
46a734
network metadata might return 410 (or 500, 503 etc) because the info
46a734
is not yet available. In those cases, we retry getting the metadata
46a734
before giving up. The only case where we can move on to wait for more
46a734
nic attach events is if the call times out despite retries, which
46a734
means the interface is not likely a primary interface, and we should
46a734
try for more nic attach events.
46a734
---
46a734
 cloudinit/sources/DataSourceAzure.py          | 65 +++++++++++--
46a734
 tests/unittests/test_datasource/test_azure.py | 95 ++++++++++++++++---
46a734
 2 files changed, 140 insertions(+), 20 deletions(-)
46a734
46a734
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
46a734
index 6d7954ee..d0be6d84 100755
46a734
--- a/cloudinit/sources/DataSourceAzure.py
46a734
+++ b/cloudinit/sources/DataSourceAzure.py
46a734
@@ -17,6 +17,7 @@ from time import sleep
46a734
 from xml.dom import minidom
46a734
 import xml.etree.ElementTree as ET
46a734
 from enum import Enum
46a734
+import requests
46a734
 
46a734
 from cloudinit import dmi
46a734
 from cloudinit import log as logging
46a734
@@ -665,7 +666,9 @@ class DataSourceAzure(sources.DataSource):
46a734
             self,
46a734
             fallback_nic,
46a734
             retries,
46a734
-            md_type=metadata_type.compute):
46a734
+            md_type=metadata_type.compute,
46a734
+            exc_cb=retry_on_url_exc,
46a734
+            infinite=False):
46a734
         """
46a734
         Wrapper for get_metadata_from_imds so that we can have flexibility
46a734
         in which IMDS api-version we use. If a particular instance of IMDS
46a734
@@ -685,7 +688,8 @@ class DataSourceAzure(sources.DataSource):
46a734
                         fallback_nic=fallback_nic,
46a734
                         retries=0,
46a734
                         md_type=md_type,
46a734
-                        api_version=IMDS_VER_WANT
46a734
+                        api_version=IMDS_VER_WANT,
46a734
+                        exc_cb=exc_cb
46a734
                     )
46a734
                 except UrlError as err:
46a734
                     LOG.info(
46a734
@@ -708,7 +712,9 @@ class DataSourceAzure(sources.DataSource):
46a734
             fallback_nic=fallback_nic,
46a734
             retries=retries,
46a734
             md_type=md_type,
46a734
-            api_version=IMDS_VER_MIN
46a734
+            api_version=IMDS_VER_MIN,
46a734
+            exc_cb=exc_cb,
46a734
+            infinite=infinite
46a734
         )
46a734
 
46a734
     def device_name_to_device(self, name):
46a734
@@ -938,6 +944,9 @@ class DataSourceAzure(sources.DataSource):
46a734
         is_primary = False
46a734
         expected_nic_count = -1
46a734
         imds_md = None
46a734
+        metadata_poll_count = 0
46a734
+        metadata_logging_threshold = 1
46a734
+        metadata_timeout_count = 0
46a734
 
46a734
         # For now, only a VM's primary NIC can contact IMDS and WireServer. If
46a734
         # DHCP fails for a NIC, we have no mechanism to determine if the NIC is
46a734
@@ -962,14 +971,48 @@ class DataSourceAzure(sources.DataSource):
46a734
                                     % (ifname, e), logger_func=LOG.error)
46a734
             raise
46a734
 
46a734
+        # Retry polling network metadata for a limited duration only when the
46a734
+        # calls fail due to timeout. This is because the platform drops packets
46a734
+        # going towards IMDS when it is not a primary nic. If the calls fail
46a734
+        # due to other issues like 410, 503 etc, then it means we are primary
46a734
+        # but IMDS service is unavailable at the moment. Retry indefinitely in
46a734
+        # those cases since we cannot move on without the network metadata.
46a734
+        def network_metadata_exc_cb(msg, exc):
46a734
+            nonlocal metadata_timeout_count, metadata_poll_count
46a734
+            nonlocal metadata_logging_threshold
46a734
+
46a734
+            metadata_poll_count = metadata_poll_count + 1
46a734
+
46a734
+            # Log when needed but back off exponentially to avoid exploding
46a734
+            # the log file.
46a734
+            if metadata_poll_count >= metadata_logging_threshold:
46a734
+                metadata_logging_threshold *= 2
46a734
+                report_diagnostic_event(
46a734
+                    "Ran into exception when attempting to reach %s "
46a734
+                    "after %d polls." % (msg, metadata_poll_count),
46a734
+                    logger_func=LOG.error)
46a734
+
46a734
+                if isinstance(exc, UrlError):
46a734
+                    report_diagnostic_event("poll IMDS with %s failed. "
46a734
+                                            "Exception: %s and code: %s" %
46a734
+                                            (msg, exc.cause, exc.code),
46a734
+                                            logger_func=LOG.error)
46a734
+
46a734
+            if exc.cause and isinstance(exc.cause, requests.Timeout):
46a734
+                metadata_timeout_count = metadata_timeout_count + 1
46a734
+                return (metadata_timeout_count <= 10)
46a734
+            return True
46a734
+
46a734
         # Primary nic detection will be optimized in the future. The fact that
46a734
         # primary nic is being attached first helps here. Otherwise each nic
46a734
         # could add several seconds of delay.
46a734
         try:
46a734
             imds_md = self.get_imds_data_with_api_fallback(
46a734
                 ifname,
46a734
-                5,
46a734
-                metadata_type.network
46a734
+                0,
46a734
+                metadata_type.network,
46a734
+                network_metadata_exc_cb,
46a734
+                True
46a734
             )
46a734
         except Exception as e:
46a734
             LOG.warning(
46a734
@@ -2139,7 +2182,9 @@ def _generate_network_config_from_fallback_config() -> dict:
46a734
 def get_metadata_from_imds(fallback_nic,
46a734
                            retries,
46a734
                            md_type=metadata_type.compute,
46a734
-                           api_version=IMDS_VER_MIN):
46a734
+                           api_version=IMDS_VER_MIN,
46a734
+                           exc_cb=retry_on_url_exc,
46a734
+                           infinite=False):
46a734
     """Query Azure's instance metadata service, returning a dictionary.
46a734
 
46a734
     If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
46a734
@@ -2158,7 +2203,7 @@ def get_metadata_from_imds(fallback_nic,
46a734
     kwargs = {'logfunc': LOG.debug,
46a734
               'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
46a734
               'func': _get_metadata_from_imds,
46a734
-              'args': (retries, md_type, api_version,)}
46a734
+              'args': (retries, exc_cb, md_type, api_version, infinite)}
46a734
     if net.is_up(fallback_nic):
46a734
         return util.log_time(**kwargs)
46a734
     else:
46a734
@@ -2176,14 +2221,16 @@ def get_metadata_from_imds(fallback_nic,
46a734
 @azure_ds_telemetry_reporter
46a734
 def _get_metadata_from_imds(
46a734
         retries,
46a734
+        exc_cb,
46a734
         md_type=metadata_type.compute,
46a734
-        api_version=IMDS_VER_MIN):
46a734
+        api_version=IMDS_VER_MIN,
46a734
+        infinite=False):
46a734
     url = "{}?api-version={}".format(md_type.value, api_version)
46a734
     headers = {"Metadata": "true"}
46a734
     try:
46a734
         response = readurl(
46a734
             url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
46a734
-            retries=retries, exception_cb=retry_on_url_exc)
46a734
+            retries=retries, exception_cb=exc_cb, infinite=infinite)
46a734
     except Exception as e:
46a734
         # pylint:disable=no-member
46a734
         if isinstance(e, UrlError) and e.code == 400:
46a734
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
46a734
index d9817d84..c4a8e08d 100644
46a734
--- a/tests/unittests/test_datasource/test_azure.py
46a734
+++ b/tests/unittests/test_datasource/test_azure.py
46a734
@@ -448,7 +448,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
46a734
             "http://169.254.169.254/metadata/instance?api-version="
46a734
             "2019-06-01", exception_cb=mock.ANY,
46a734
             headers=mock.ANY, retries=mock.ANY,
46a734
-            timeout=mock.ANY)
46a734
+            timeout=mock.ANY, infinite=False)
46a734
 
46a734
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
46a734
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
46a734
@@ -467,7 +467,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
46a734
             "http://169.254.169.254/metadata/instance/network?api-version="
46a734
             "2019-06-01", exception_cb=mock.ANY,
46a734
             headers=mock.ANY, retries=mock.ANY,
46a734
-            timeout=mock.ANY)
46a734
+            timeout=mock.ANY, infinite=False)
46a734
 
46a734
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
46a734
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
46a734
@@ -486,7 +486,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
46a734
             "http://169.254.169.254/metadata/instance?api-version="
46a734
             "2019-06-01", exception_cb=mock.ANY,
46a734
             headers=mock.ANY, retries=mock.ANY,
46a734
-            timeout=mock.ANY)
46a734
+            timeout=mock.ANY, infinite=False)
46a734
 
46a734
     @mock.patch(MOCKPATH + 'readurl', autospec=True)
46a734
     @mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
46a734
@@ -511,7 +511,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
46a734
         m_readurl.assert_called_with(
46a734
             self.network_md_url, exception_cb=mock.ANY,
46a734
             headers={'Metadata': 'true'}, retries=2,
46a734
-            timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
46a734
+            timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)
46a734
 
46a734
     @mock.patch('cloudinit.url_helper.time.sleep')
46a734
     @mock.patch(MOCKPATH + 'net.is_up', autospec=True)
46a734
@@ -2694,15 +2694,22 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
46a734
 
46a734
         def nic_attach_ret(nl_sock, nics_found):
46a734
             nonlocal m_attach_call_count
46a734
-            if m_attach_call_count == 0:
46a734
-                m_attach_call_count = m_attach_call_count + 1
46a734
+            m_attach_call_count = m_attach_call_count + 1
46a734
+            if m_attach_call_count == 1:
46a734
                 return "eth0"
46a734
-            return "eth1"
46a734
+            elif m_attach_call_count == 2:
46a734
+                return "eth1"
46a734
+            raise RuntimeError("Must have found primary nic by now.")
46a734
+
46a734
+        # Simulate two NICs by adding the same one twice.
46a734
+        md = {
46a734
+            "interface": [
46a734
+                IMDS_NETWORK_METADATA['interface'][0],
46a734
+                IMDS_NETWORK_METADATA['interface'][0]
46a734
+            ]
46a734
+        }
46a734
 
46a734
-        def network_metadata_ret(ifname, retries, type):
46a734
-            # Simulate two NICs by adding the same one twice.
46a734
-            md = IMDS_NETWORK_METADATA
46a734
-            md['interface'].append(md['interface'][0])
46a734
+        def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
46a734
             if ifname == "eth0":
46a734
                 return md
46a734
             raise requests.Timeout('Fake connection timeout')
46a734
@@ -2724,6 +2731,72 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
46a734
         self.assertEqual(1, m_imds.call_count)
46a734
         self.assertEqual(2, m_link_up.call_count)
46a734
 
46a734
+    @mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
46a734
+    @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
46a734
+    def test_check_if_nic_is_primary_retries_on_failures(
46a734
+            self, m_dhcpv4, m_imds):
46a734
+        """Retry polling for network metadata on all failures except timeout"""
46a734
+        dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
46a734
+        lease = {
46a734
+            'interface': 'eth9', 'fixed-address': '192.168.2.9',
46a734
+            'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
46a734
+            'unknown-245': '624c3620'}
46a734
+
46a734
+        eth0Retries = []
46a734
+        eth1Retries = []
46a734
+        # Simulate two NICs by adding the same one twice.
46a734
+        md = {
46a734
+            "interface": [
46a734
+                IMDS_NETWORK_METADATA['interface'][0],
46a734
+                IMDS_NETWORK_METADATA['interface'][0]
46a734
+            ]
46a734
+        }
46a734
+
46a734
+        def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
46a734
+            nonlocal eth0Retries, eth1Retries
46a734
+
46a734
+            # Simulate readurl functionality with retries and
46a734
+            # exception callbacks so that the callback logic can be
46a734
+            # validated.
46a734
+            if ifname == "eth0":
46a734
+                cause = requests.HTTPError()
46a734
+                for _ in range(0, 15):
46a734
+                    error = url_helper.UrlError(cause=cause, code=410)
46a734
+                    eth0Retries.append(exc_cb("No goal state.", error))
46a734
+            else:
46a734
+                cause = requests.Timeout('Fake connection timeout')
46a734
+                for _ in range(0, 10):
46a734
+                    error = url_helper.UrlError(cause=cause)
46a734
+                    eth1Retries.append(exc_cb("Connection timeout", error))
46a734
+                # Should stop retrying after 10 retries
46a734
+                eth1Retries.append(exc_cb("Connection timeout", error))
46a734
+                raise cause
46a734
+            return md
46a734
+
46a734
+        m_imds.side_effect = network_metadata_ret
46a734
+
46a734
+        dhcp_ctx = mock.MagicMock(lease=lease)
46a734
+        dhcp_ctx.obtain_lease.return_value = lease
46a734
+        m_dhcpv4.return_value = dhcp_ctx
46a734
+
46a734
+        is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
46a734
+        self.assertEqual(True, is_primary)
46a734
+        self.assertEqual(2, expected_nic_count)
46a734
+
46a734
+        # All Eth0 errors are non-timeout errors. So we should have been
46a734
+        # retrying indefinitely until success.
46a734
+        for i in eth0Retries:
46a734
+            self.assertTrue(i)
46a734
+
46a734
+        is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
46a734
+        self.assertEqual(False, is_primary)
46a734
+
46a734
+        # All Eth1 errors are timeout errors. Retry happens for a max of 10 and
46a734
+        # then we should have moved on assuming it is not the primary nic.
46a734
+        for i in range(0, 10):
46a734
+            self.assertTrue(eth1Retries[i])
46a734
+        self.assertFalse(eth1Retries[10])
46a734
+
46a734
     @mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
46a734
     def test_wait_for_link_up_returns_if_already_up(
46a734
             self, m_is_link_up):
46a734
-- 
46a734
2.27.0
46a734