owenh / rpms / openvswitch

Forked from rpms/openvswitch 2 months ago
Clone
a93e80
From 4ee0f6af9e601cbb5f69a486526d1011314bbfed Mon Sep 17 00:00:00 2001
a93e80
From: Ben Pfaff <blp@ovn.org>
a93e80
Date: Thu, 19 Mar 2020 17:53:10 -0700
a93e80
Subject: [PATCH 01/15] ofproto-dpif-xlate: Fix recirculation when in_port is
a93e80
 OFPP_CONTROLLER.
a93e80
a93e80
[ upstream commit c5a910dd92ecbad24f86b4c59b4ff8105b5149fd ]
a93e80
a93e80
Recirculation usually requires finding the pre-recirculation input port.
a93e80
Packets sent by the controller, with in_port of OFPP_CONTROLLER or
a93e80
OFPP_NONE, do not have a real input port data structure, only a port
a93e80
number.  The code in xlate_lookup_ofproto_() mishandled this case,
a93e80
failing to return the ofproto data structure.  This commit fixes the
a93e80
problem and adds a test to guard against regression.
a93e80
a93e80
Reported-by: Numan Siddique <numans@ovn.org>
a93e80
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2020-March/368642.html
a93e80
Tested-by: Numan Siddique <numans@ovn.org>
a93e80
Acked-by: Numan Siddique <numans@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1775160
a93e80
Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
a93e80
---
a93e80
 ofproto/ofproto-dpif-xlate.c | 25 +++++++++++++++++++++----
a93e80
 tests/ofproto-dpif.at        | 30 ++++++++++++++++++++++++++++++
a93e80
 2 files changed, 51 insertions(+), 4 deletions(-)
a93e80
a93e80
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
a93e80
index 4407f9c97a..54cfbfbdff 100644
a93e80
--- a/ofproto/ofproto-dpif-xlate.c
a93e80
+++ b/ofproto/ofproto-dpif-xlate.c
a93e80
@@ -1516,15 +1516,32 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
a93e80
             return NULL;
a93e80
         }
a93e80
 
a93e80
-        /* If recirculation was initiated due to bond (in_port = OFPP_NONE)
a93e80
-         * then frozen state is static and xport_uuid is not defined, so xport
a93e80
-         * cannot be restored from frozen state. */
a93e80
-        if (recirc_id_node->state.metadata.in_port != OFPP_NONE) {
a93e80
+        ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
a93e80
+        if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
a93e80
             struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
a93e80
             xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
a93e80
             if (xport && xport->xbridge && xport->xbridge->ofproto) {
a93e80
                 goto out;
a93e80
             }
a93e80
+        } else {
a93e80
+            /* OFPP_NONE and OFPP_CONTROLLER are not real ports.  They indicate
a93e80
+             * that the packet originated from the controller via an OpenFlow
a93e80
+             * "packet-out".  The right thing to do is to find just the
a93e80
+             * ofproto.  There is no xport, which is OK.
a93e80
+             *
a93e80
+             * OFPP_NONE can also indicate that a bond caused recirculation. */
a93e80
+            struct uuid uuid = recirc_id_node->state.ofproto_uuid;
a93e80
+            const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid);
a93e80
+            if (bridge && bridge->ofproto) {
a93e80
+                if (errorp) {
a93e80
+                    *errorp = NULL;
a93e80
+                }
a93e80
+                *xportp = NULL;
a93e80
+                if (ofp_in_port) {
a93e80
+                    *ofp_in_port = in_port;
a93e80
+                }
a93e80
+                return bridge->ofproto;
a93e80
+            }
a93e80
         }
a93e80
     }
a93e80
 
a93e80
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
a93e80
index ff1cc93707..d444cf0844 100644
a93e80
--- a/tests/ofproto-dpif.at
a93e80
+++ b/tests/ofproto-dpif.at
a93e80
@@ -5171,6 +5171,36 @@ AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: 2
a93e80
 OVS_VSWITCHD_STOP
a93e80
 AT_CLEANUP
a93e80
 
a93e80
+# Checks for regression against a bug in which OVS dropped packets
a93e80
+# with in_port=CONTROLLER when they were recirculated (because
a93e80
+# CONTROLLER isn't a real port and could not be looked up).
a93e80
+AT_SETUP([ofproto-dpif - packet-out recirculation])
a93e80
+OVS_VSWITCHD_START
a93e80
+add_of_ports br0 1 2
a93e80
+
a93e80
+AT_DATA([flows.txt], [dnl
a93e80
+table=0 ip actions=mod_dl_dst:83:83:83:83:83:83,ct(table=1)
a93e80
+table=1 ip actions=ct(commit),output:2
a93e80
+])
a93e80
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
a93e80
+
a93e80
+packet=ffffffffffff00102030405008004500001c00000000401100000a000002ffffffff0035111100080000
a93e80
+AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet actions=table"])
a93e80
+
a93e80
+# Dumps out the flow table, extracts the number of packets that have gone
a93e80
+# through the (single) flow in table 1, and returns success if it's exactly 1.
a93e80
+#
a93e80
+# If this remains 0, then the recirculation isn't working properly since the
a93e80
+# packet never goes through flow in table 1.
a93e80
+check_flows () {
a93e80
+    n=$(ovs-ofctl dump-flows br0 table=1 | sed -n 's/.*n_packets=\([[0-9]]\{1,\}\).*/\1/p')
a93e80
+    echo "n_packets=$n"
a93e80
+    test "$n" = 1
a93e80
+}
a93e80
+OVS_WAIT_UNTIL([check_flows], [ovs dump-flows br0])
a93e80
+
a93e80
+OVS_VSWITCHD_STOP
a93e80
+AT_CLEANUP
a93e80
 
a93e80
 AT_SETUP([ofproto-dpif - debug_slow action])
a93e80
 OVS_VSWITCHD_START
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 71f25b7920093daa59827a0a4be4095309aec6ff Mon Sep 17 00:00:00 2001
a93e80
From: Timothy Redaelli <tredaelli@redhat.com>
a93e80
Date: Thu, 19 Mar 2020 20:05:39 +0100
a93e80
Subject: [PATCH 02/15] bugtool: Fix for Python3.
a93e80
a93e80
Currently ovs-bugtool tool doesn't start on Python 3.
a93e80
This commit fixes ovs-bugtool to make it works on Python 3.
a93e80
a93e80
Replaced StringIO.StringIO with io.BytesIO since the script is
a93e80
processing binary data.
a93e80
a93e80
Reported-at: https://bugzilla.redhat.com/1809241
a93e80
Reported-by: Flavio Leitner <fbl@sysclose.org>
a93e80
Signed-off-by: Timothy Redaelli <tredaelli@redhat.com>
a93e80
Co-authored-by: William Tu <u9012063@gmail.com>
a93e80
Signed-off-by: William Tu <u9012063@gmail.com>
a93e80
(cherry picked from commit 9e6c00bca9af29031d0e160d33174b7ae99b9244)
a93e80
---
a93e80
 utilities/bugtool/ovs-bugtool.in | 48 +++++++++++++++++---------------
a93e80
 1 file changed, 25 insertions(+), 23 deletions(-)
a93e80
a93e80
diff --git a/utilities/bugtool/ovs-bugtool.in b/utilities/bugtool/ovs-bugtool.in
a93e80
index e55bfc2ed5..47f3c4629f 100755
a93e80
--- a/utilities/bugtool/ovs-bugtool.in
a93e80
+++ b/utilities/bugtool/ovs-bugtool.in
a93e80
@@ -33,8 +33,7 @@
a93e80
 # or func_output().
a93e80
 #
a93e80
 
a93e80
-import StringIO
a93e80
-import commands
a93e80
+from io import BytesIO
a93e80
 import fcntl
a93e80
 import getopt
a93e80
 import hashlib
a93e80
@@ -48,7 +47,7 @@ import warnings
a93e80
 import zipfile
a93e80
 from select import select
a93e80
 from signal import SIGTERM
a93e80
-from subprocess import PIPE, Popen
a93e80
+from subprocess import PIPE, Popen, check_output
a93e80
 
a93e80
 from xml.dom.minidom import getDOMImplementation, parse
a93e80
 
a93e80
@@ -348,7 +347,7 @@ def collect_data():
a93e80
             cap = v['cap']
a93e80
             if 'cmd_args' in v:
a93e80
                 if 'output' not in v.keys():
a93e80
-                    v['output'] = StringIOmtime()
a93e80
+                    v['output'] = BytesIOmtime()
a93e80
                 if v['repeat_count'] > 0:
a93e80
                     if cap not in process_lists:
a93e80
                         process_lists[cap] = []
a93e80
@@ -373,20 +372,23 @@ def collect_data():
a93e80
         if 'filename' in v and v['filename'].startswith('/proc/'):
a93e80
             # proc files must be read into memory
a93e80
             try:
a93e80
-                f = open(v['filename'], 'r')
a93e80
+                f = open(v['filename'], 'rb')
a93e80
                 s = f.read()
a93e80
                 f.close()
a93e80
                 if check_space(cap, v['filename'], len(s)):
a93e80
-                    v['output'] = StringIOmtime(s)
a93e80
+                    v['output'] = BytesIOmtime(s)
a93e80
             except:
a93e80
                 pass
a93e80
         elif 'func' in v:
a93e80
             try:
a93e80
                 s = v['func'](cap)
a93e80
             except Exception as e:
a93e80
-                s = str(e)
a93e80
+                s = str(e).encode()
a93e80
             if check_space(cap, k, len(s)):
a93e80
-                v['output'] = StringIOmtime(s)
a93e80
+                if isinstance(s, str):
a93e80
+                    v['output'] = BytesIOmtime(s.encode())
a93e80
+                else:
a93e80
+                    v['output'] = BytesIOmtime(s)
a93e80
 
a93e80
 
a93e80
 def main(argv=None):
a93e80
@@ -704,7 +706,7 @@ exclude those logs from the archive.
a93e80
 
a93e80
     # permit the user to filter out data
a93e80
     # We cannot use iteritems, since we modify 'data' as we pass through
a93e80
-    for (k, v) in sorted(data.items()):
a93e80
+    for (k, v) in data.items():
a93e80
         cap = v['cap']
a93e80
         if 'filename' in v:
a93e80
             key = k[0]
a93e80
@@ -721,7 +723,7 @@ exclude those logs from the archive.
a93e80
 
a93e80
     # include inventory
a93e80
     data['inventory.xml'] = {'cap': None,
a93e80
-                        'output': StringIOmtime(make_inventory(data, subdir))}
a93e80
+                        'output': BytesIOmtime(make_inventory(data, subdir))}
a93e80
 
a93e80
     # create archive
a93e80
     if output_fd == -1:
a93e80
@@ -782,7 +784,7 @@ def dump_scsi_hosts(cap):
a93e80
 
a93e80
 
a93e80
 def module_info(cap):
a93e80
-    output = StringIO.StringIO()
a93e80
+    output = BytesIO()
a93e80
     modules = open(PROC_MODULES, 'r')
a93e80
     procs = []
a93e80
 
a93e80
@@ -806,7 +808,7 @@ def multipathd_topology(cap):
a93e80
 
a93e80
 
a93e80
 def dp_list():
a93e80
-    output = StringIO.StringIO()
a93e80
+    output = BytesIO()
a93e80
     procs = [ProcOutput([OVS_DPCTL, 'dump-dps'],
a93e80
              caps[CAP_NETWORK_STATUS][MAX_TIME], output)]
a93e80
 
a93e80
@@ -828,7 +830,7 @@ def collect_ovsdb():
a93e80
             if os.path.isfile(OPENVSWITCH_COMPACT_DB):
a93e80
                 os.unlink(OPENVSWITCH_COMPACT_DB)
a93e80
 
a93e80
-            output = StringIO.StringIO()
a93e80
+            output = BytesIO()
a93e80
             max_time = 5
a93e80
             procs = [ProcOutput(['ovsdb-tool', 'compact',
a93e80
                                 OPENVSWITCH_CONF_DB, OPENVSWITCH_COMPACT_DB],
a93e80
@@ -871,7 +873,7 @@ def fd_usage(cap):
a93e80
 
a93e80
 
a93e80
 def dump_rdac_groups(cap):
a93e80
-    output = StringIO.StringIO()
a93e80
+    output = BytesIO()
a93e80
     procs = [ProcOutput([MPPUTIL, '-a'], caps[cap][MAX_TIME], output)]
a93e80
 
a93e80
     run_procs([procs])
a93e80
@@ -896,7 +898,7 @@ def load_plugins(just_capabilities=False, filter=None):
a93e80
         for node in nodelist:
a93e80
             if node.nodeType == node.TEXT_NODE:
a93e80
                 rc += node.data
a93e80
-        return rc.encode()
a93e80
+        return rc
a93e80
 
a93e80
     def getBoolAttr(el, attr, default=False):
a93e80
         ret = default
a93e80
@@ -1037,7 +1039,7 @@ def make_tar(subdir, suffix, output_fd, output_file):
a93e80
                     s = os.stat(v['filename'])
a93e80
                     ti.mtime = s.st_mtime
a93e80
                     ti.size = s.st_size
a93e80
-                    tf.addfile(ti, open(v['filename']))
a93e80
+                    tf.addfile(ti, open(v['filename'], 'rb'))
a93e80
             except:
a93e80
                 pass
a93e80
     finally:
a93e80
@@ -1095,12 +1097,12 @@ def make_inventory(inventory, subdir):
a93e80
     s.setAttribute('date', time.strftime('%c'))
a93e80
     s.setAttribute('hostname', platform.node())
a93e80
     s.setAttribute('uname', ' '.join(platform.uname()))
a93e80
-    s.setAttribute('uptime', commands.getoutput(UPTIME))
a93e80
+    s.setAttribute('uptime', check_output(UPTIME).decode())
a93e80
     document.getElementsByTagName(INVENTORY_XML_ROOT)[0].appendChild(s)
a93e80
 
a93e80
     map(lambda k_v: inventory_entry(document, subdir, k_v[0], k_v[1]),
a93e80
         inventory.items())
a93e80
-    return document.toprettyxml()
a93e80
+    return document.toprettyxml().encode()
a93e80
 
a93e80
 
a93e80
 def inventory_entry(document, subdir, k, v):
a93e80
@@ -1301,7 +1303,7 @@ class ProcOutput(object):
a93e80
             line = self.proc.stdout.readline()
a93e80
         else:
a93e80
             line = self.proc.stdout.read(self.bufsize)
a93e80
-        if line == '':
a93e80
+        if line == b'':
a93e80
             # process exited
a93e80
             self.proc.stdout.close()
a93e80
             self.status = self.proc.wait()
a93e80
@@ -1391,13 +1393,13 @@ def get_free_disk_space(path):
a93e80
     return s.f_frsize * s.f_bfree
a93e80
 
a93e80
 
a93e80
-class StringIOmtime(StringIO.StringIO):
a93e80
-    def __init__(self, buf=''):
a93e80
-        StringIO.StringIO.__init__(self, buf)
a93e80
+class BytesIOmtime(BytesIO):
a93e80
+    def __init__(self, buf=b''):
a93e80
+        BytesIO.__init__(self, buf)
a93e80
         self.mtime = time.time()
a93e80
 
a93e80
     def write(self, s):
a93e80
-        StringIO.StringIO.write(self, s)
a93e80
+        BytesIO.write(self, s)
a93e80
         self.mtime = time.time()
a93e80
 
a93e80
 
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 914d885061c9f7e7e6e5f921065301e08837e122 Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Fri, 28 Feb 2020 18:07:04 -0800
a93e80
Subject: [PATCH 03/15] raft-rpc: Fix message format.
a93e80
a93e80
[ upstream commit 78c8011f58daec41ec97440f2e42795699322742 ]
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft-rpc.c | 2 +-
a93e80
 1 file changed, 1 insertion(+), 1 deletion(-)
a93e80
a93e80
diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
a93e80
index 18c83fe9c2..dd14d81091 100644
a93e80
--- a/ovsdb/raft-rpc.c
a93e80
+++ b/ovsdb/raft-rpc.c
a93e80
@@ -544,8 +544,8 @@ raft_format_install_snapshot_request(
a93e80
     ds_put_format(s, " last_index=%"PRIu64, rq->last_index);
a93e80
     ds_put_format(s, " last_term=%"PRIu64, rq->last_term);
a93e80
     ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid));
a93e80
-    ds_put_cstr(s, " last_servers=");
a93e80
     ds_put_format(s, " election_timer=%"PRIu64, rq->election_timer);
a93e80
+    ds_put_cstr(s, " last_servers=");
a93e80
 
a93e80
     struct hmap servers;
a93e80
     struct ovsdb_error *error =
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 8ff30dfee6cb075e36ed38b77695ff03321ce12b Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Fri, 28 Feb 2020 18:07:05 -0800
a93e80
Subject: [PATCH 04/15] ovsdb-server: Don't disconnect clients after raft
a93e80
 install_snapshot.
a93e80
a93e80
[ upstream commit f0c8b44c5832c36989fad78927407fc14e64ce46 ]
a93e80
a93e80
When "schema" field is found in read_db(), there can be two cases:
a93e80
1. There is a schema change in clustered DB and the "schema" is the new one.
a93e80
2. There is a install_snapshot RPC happened, which caused log compaction on the
a93e80
server and the next log is just the snapshot, which always constains "schema"
a93e80
field, even though the schema hasn't been changed.
a93e80
a93e80
The current implementation doesn't handle case 2), and always assume the schema
a93e80
is changed hence disconnect all clients of the server. It can cause stability
a93e80
problem when there are big number of clients connected when this happens in
a93e80
a large scale environment.
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/ovsdb-server.c   |  3 ++-
a93e80
 tests/ovsdb-cluster.at | 56 ++++++++++++++++++++++++++++++++++++++++++
a93e80
 2 files changed, 58 insertions(+), 1 deletion(-)
a93e80
a93e80
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
a93e80
index b6957d7300..d416f1b606 100644
a93e80
--- a/ovsdb/ovsdb-server.c
a93e80
+++ b/ovsdb/ovsdb-server.c
a93e80
@@ -543,7 +543,8 @@ parse_txn(struct server_config *config, struct db *db,
a93e80
           struct ovsdb_schema *schema, const struct json *txn_json,
a93e80
           const struct uuid *txnid)
a93e80
 {
a93e80
-    if (schema) {
a93e80
+    if (schema && (!db->db->schema || strcmp(schema->version,
a93e80
+                                             db->db->schema->version))) {
a93e80
         /* We're replacing the schema (and the data).  Destroy the database
a93e80
          * (first grabbing its storage), then replace it with the new schema.
a93e80
          * The transaction must also include the replacement data.
a93e80
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
a93e80
index 3a0bd4579e..5b6188b96f 100644
a93e80
--- a/tests/ovsdb-cluster.at
a93e80
+++ b/tests/ovsdb-cluster.at
a93e80
@@ -273,6 +273,62 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s4 cluster/status $schema_name | grep "Ele
a93e80
 
a93e80
 AT_CLEANUP
a93e80
 
a93e80
+
a93e80
+AT_BANNER([OVSDB cluster install snapshot RPC])
a93e80
+
a93e80
+AT_SETUP([OVSDB cluster - install snapshot RPC])
a93e80
+AT_KEYWORDS([ovsdb server positive unix cluster snapshot])
a93e80
+
a93e80
+n=3
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+ordinal_schema > schema
a93e80
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
a93e80
+cid=`ovsdb-tool db-cid s1.db`
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+for i in `seq 2 $n`; do
a93e80
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
a93e80
+done
a93e80
+
a93e80
+on_exit 'kill `cat *.pid`'
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
a93e80
+done
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
a93e80
+done
a93e80
+
a93e80
+# Kill one follower (s2) and write some data to cluster, so that the follower is falling behind
a93e80
+printf "\ns2: stopping\n"
a93e80
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s2], [s2.pid])
a93e80
+
a93e80
+AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
a93e80
+      {"op": "insert",
a93e80
+       "table": "simple",
a93e80
+       "row": {"i": 1}}]]'], [0], [ignore], [ignore])
a93e80
+
a93e80
+# Compact leader online to generate snapshot
a93e80
+AT_CHECK([ovs-appctl -t "`pwd`"/s1 ovsdb-server/compact])
a93e80
+
a93e80
+# Start the follower s2 again.
a93e80
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb s2.db])
a93e80
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name connected])
a93e80
+
a93e80
+# A client transaction through s2. During this transaction, there will be a
a93e80
+# install_snapshot RPC because s2 detects it is behind and s1 doesn't have the
a93e80
+# pre_log_index requested by s2 because it is already compacted.
a93e80
+# After the install_snapshot RPC process, the transaction through s2 should
a93e80
+# succeed.
a93e80
+AT_CHECK([ovsdb-client transact unix:s2.ovsdb '[["idltest",
a93e80
+      {"op": "insert",
a93e80
+       "table": "simple",
a93e80
+       "row": {"i": 1}}]]'], [0], [ignore], [ignore])
a93e80
+
a93e80
+for i in `seq $n`; do
a93e80
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
a93e80
+done
a93e80
+
a93e80
+AT_CLEANUP
a93e80
+
a93e80
 
a93e80
 
a93e80
 OVS_START_SHELL_HELPERS
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From e732012d7be335650398ff03c2431c64b2c4aaba Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Fri, 28 Feb 2020 18:07:06 -0800
a93e80
Subject: [PATCH 05/15] raft: Fix raft_is_connected() when there is no leader
a93e80
 yet.
a93e80
a93e80
[ upstream commit adc64ab057345f7004c44bf92363b9adda862134 ]
a93e80
a93e80
If there is never a leader known by the current server, it's status
a93e80
should be "disconnected" to the cluster. Without this patch, when
a93e80
a server in cluster is restarted, before it successfully connecting
a93e80
back to the cluster it will appear as connected, which is wrong.
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft.c           | 10 ++++++++--
a93e80
 tests/ovsdb-cluster.at | 35 +++++++++++++++++++++++++++++++++++
a93e80
 2 files changed, 43 insertions(+), 2 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 4789bc4f22..6cd7b0041a 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -298,6 +298,11 @@ struct raft {
a93e80
     bool had_leader;            /* There has been leader elected since last
a93e80
                                    election initiated. This is to help setting
a93e80
                                    candidate_retrying. */
a93e80
+
a93e80
+    /* For all. */
a93e80
+    bool ever_had_leader;       /* There has been leader elected since the raft
a93e80
+                                   is initialized, meaning it is ever
a93e80
+                                   connected. */
a93e80
 };
a93e80
 
a93e80
 /* All Raft structures. */
a93e80
@@ -1024,7 +1029,8 @@ raft_is_connected(const struct raft *raft)
a93e80
             && !raft->joining
a93e80
             && !raft->leaving
a93e80
             && !raft->left
a93e80
-            && !raft->failed);
a93e80
+            && !raft->failed
a93e80
+            && raft->ever_had_leader);
a93e80
     VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false");
a93e80
     return ret;
a93e80
 }
a93e80
@@ -2519,7 +2525,7 @@ static void
a93e80
 raft_set_leader(struct raft *raft, const struct uuid *sid)
a93e80
 {
a93e80
     raft->leader_sid = *sid;
a93e80
-    raft->had_leader = true;
a93e80
+    raft->ever_had_leader = raft->had_leader = true;
a93e80
     raft->candidate_retrying = false;
a93e80
 }
a93e80
 
a93e80
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
a93e80
index 5b6188b96f..0aa4564480 100644
a93e80
--- a/tests/ovsdb-cluster.at
a93e80
+++ b/tests/ovsdb-cluster.at
a93e80
@@ -179,6 +179,41 @@ AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
a93e80
 ovsdb_test_cluster_disconnect 5 leader yes
a93e80
 AT_CLEANUP
a93e80
 
a93e80
+AT_SETUP([OVSDB cluster - initial status should be disconnected])
a93e80
+AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
a93e80
+
a93e80
+n=3
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+ordinal_schema > schema
a93e80
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
a93e80
+cid=`ovsdb-tool db-cid s1.db`
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+for i in `seq 2 $n`; do
a93e80
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
a93e80
+done
a93e80
+
a93e80
+on_exit 'kill `cat *.pid`'
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
a93e80
+done
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
a93e80
+done
a93e80
+
a93e80
+# Stop all servers, and start the s1 only, to test initial connection status
a93e80
+# when there is no leader yet.
a93e80
+for i in `seq 1 $n`; do
a93e80
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
a93e80
+done
a93e80
+i=1
a93e80
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
a93e80
+
a93e80
+# The initial status should be disconnected. So wait should fail.
a93e80
+AT_CHECK([ovsdb_client_wait --timeout=1 unix:s$i.ovsdb $schema_name connected], [142], [ignore], [ignore])
a93e80
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
a93e80
+
a93e80
+AT_CLEANUP
a93e80
+
a93e80
 
a93e80
 
a93e80
 AT_BANNER([OVSDB cluster election timer change])
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 053b78c8d60ffb4d212fd7894f91be52027f291f Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Fri, 28 Feb 2020 18:07:07 -0800
a93e80
Subject: [PATCH 06/15] raft: Avoid busy loop during leader election.
a93e80
a93e80
[ upstream commit 3ae90e1899c5a05148ea1870d9bb4ac3c05e3a19 ]
a93e80
a93e80
When a server doesn't see a leader yet, e.g. during leader re-election,
a93e80
if a transaction comes from a client, it will cause 100% CPU busy loop.
a93e80
With debug log enabled it is like:
a93e80
a93e80
2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
a93e80
...
a93e80
a93e80
The problem is that in ovsdb_trigger_try(), all cluster errors are treated
a93e80
as temporary error and retry immediately. This patch fixes it by introducing
a93e80
'run_triggers_now', which tells if a retry is needed immediately. When the
a93e80
cluster error is with detail 'not leader', we don't immediately retry, but
a93e80
will wait for the next poll event to trigger the retry. When 'not leader'
a93e80
status changes, there must be a event, i.e. raft RPC that changes the
a93e80
status, so the trigger is guaranteed to be triggered, without busy loop.
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/ovsdb.c       |  2 +-
a93e80
 ovsdb/ovsdb.h       |  1 +
a93e80
 ovsdb/transaction.c |  2 +-
a93e80
 ovsdb/trigger.c     | 11 +++++++++--
a93e80
 4 files changed, 12 insertions(+), 4 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
a93e80
index cfc96b32f8..7e683e6815 100644
a93e80
--- a/ovsdb/ovsdb.c
a93e80
+++ b/ovsdb/ovsdb.c
a93e80
@@ -414,7 +414,7 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage)
a93e80
     db->storage = storage;
a93e80
     ovs_list_init(&db->monitors);
a93e80
     ovs_list_init(&db->triggers);
a93e80
-    db->run_triggers = false;
a93e80
+    db->run_triggers_now = db->run_triggers = false;
a93e80
 
a93e80
     shash_init(&db->tables);
a93e80
     if (schema) {
a93e80
diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h
a93e80
index 32e5333163..5c30a83d92 100644
a93e80
--- a/ovsdb/ovsdb.h
a93e80
+++ b/ovsdb/ovsdb.h
a93e80
@@ -83,6 +83,7 @@ struct ovsdb {
a93e80
     /* Triggers. */
a93e80
     struct ovs_list triggers;   /* Contains "struct ovsdb_trigger"s. */
a93e80
     bool run_triggers;
a93e80
+    bool run_triggers_now;
a93e80
 
a93e80
     struct ovsdb_table *rbac_role;
a93e80
 
a93e80
diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c
a93e80
index 369436bffb..8ffefcf7c9 100644
a93e80
--- a/ovsdb/transaction.c
a93e80
+++ b/ovsdb/transaction.c
a93e80
@@ -967,7 +967,7 @@ ovsdb_txn_complete(struct ovsdb_txn *txn)
a93e80
 {
a93e80
     if (!ovsdb_txn_is_empty(txn)) {
a93e80
 
a93e80
-        txn->db->run_triggers = true;
a93e80
+        txn->db->run_triggers_now = txn->db->run_triggers = true;
a93e80
         ovsdb_monitors_commit(txn->db, txn);
a93e80
         ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs));
a93e80
         ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit));
a93e80
diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c
a93e80
index 7e62e90ae3..0372302af4 100644
a93e80
--- a/ovsdb/trigger.c
a93e80
+++ b/ovsdb/trigger.c
a93e80
@@ -141,7 +141,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now)
a93e80
     struct ovsdb_trigger *t, *next;
a93e80
 
a93e80
     bool run_triggers = db->run_triggers;
a93e80
-    db->run_triggers = false;
a93e80
+    db->run_triggers_now = db->run_triggers = false;
a93e80
 
a93e80
     bool disconnect_all = false;
a93e80
 
a93e80
@@ -160,7 +160,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now)
a93e80
 void
a93e80
 ovsdb_trigger_wait(struct ovsdb *db, long long int now)
a93e80
 {
a93e80
-    if (db->run_triggers) {
a93e80
+    if (db->run_triggers_now) {
a93e80
         poll_immediate_wake();
a93e80
     } else {
a93e80
         long long int deadline = LLONG_MAX;
a93e80
@@ -319,9 +319,16 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now)
a93e80
             if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) {
a93e80
                 /* Temporary error.  Transition back to "initialized" state to
a93e80
                  * try again. */
a93e80
+                char *err_s = ovsdb_error_to_string(error);
a93e80
+                VLOG_DBG("cluster error %s", err_s);
a93e80
+
a93e80
                 jsonrpc_msg_destroy(t->reply);
a93e80
                 t->reply = NULL;
a93e80
                 t->db->run_triggers = true;
a93e80
+                if (!strstr(err_s, "not leader")) {
a93e80
+                    t->db->run_triggers_now = true;
a93e80
+                }
a93e80
+                free(err_s);
a93e80
                 ovsdb_error_destroy(error);
a93e80
             } else {
a93e80
                 /* Permanent error.  Transition to "completed" state to report
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From cc3d02699203e2fe9d9fd384d09e268ba614828d Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Fri, 28 Feb 2020 18:07:10 -0800
a93e80
Subject: [PATCH 07/15] raft: Fix next_index in install_snapshot reply
a93e80
 handling.
a93e80
a93e80
[ upstream commit 877618fc833273d1e29e012b5e925d51cba80ff5 ]
a93e80
a93e80
When a leader handles install_snapshot reply, the next_index for
a93e80
the follower should be log_start instead of log_end, because there
a93e80
can be new entries added in leader's log after initiating the
a93e80
install_snapshot procedure.  Also, it should send all the accumulated
a93e80
entries to follower in the following append-request message, instead
a93e80
of sending 0 entries, to speed up the converge.
a93e80
a93e80
Without this fix, there is no functional problem, but it takes
a93e80
uncessary extra rounds of append-requests responsed with "inconsistency"
a93e80
by follower, although finally will be converged.
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft.c | 5 +++--
a93e80
 1 file changed, 3 insertions(+), 2 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 6cd7b0041a..fa04d8c80b 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -3998,8 +3998,9 @@ raft_handle_install_snapshot_reply(
a93e80
     VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s "
a93e80
                  " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid),
a93e80
                  s->nickname, rpy->last_term, rpy->last_index);
a93e80
-    s->next_index = raft->log_end;
a93e80
-    raft_send_append_request(raft, s, 0, "snapshot installed");
a93e80
+    s->next_index = raft->log_start;
a93e80
+    raft_send_append_request(raft, s, raft->log_end - s->next_index,
a93e80
+                             "snapshot installed");
a93e80
 }
a93e80
 
a93e80
 /* Returns true if 'raft' has grown enough since the last snapshot that
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 9c76350e271546eedfeb18720975e35b4e36e1f1 Mon Sep 17 00:00:00 2001
a93e80
From: Han Zhou <hzhou@ovn.org>
a93e80
Date: Thu, 5 Mar 2020 23:48:45 -0800
a93e80
Subject: [PATCH 08/15] raft: Fix the problem of stuck in candidate role
a93e80
 forever.
a93e80
a93e80
[ upstream commit 25a7e5547f1e107db0f032ad269f447c57401531 ]
a93e80
a93e80
Sometimes a server can stay in candidate role forever, even if the server
a93e80
already see the new leader and handles append-requests normally. However,
a93e80
because of the wrong role, it appears as disconnected from cluster and
a93e80
so the clients are disconnected.
a93e80
a93e80
This problem happens when 2 servers become candidates in the same
a93e80
term, and one of them is elected as leader in that term. It can be
a93e80
reproduced by the test cases added in this patch.
a93e80
a93e80
The root cause is that the current implementation only changes role to
a93e80
follower when a bigger term is observed (in raft_receive_term__()).
a93e80
According to the RAFT paper, if another candidate becomes leader with
a93e80
the same term, the candidate should change to follower.
a93e80
a93e80
This patch fixes it by changing the role to follower when leader
a93e80
is being updated in raft_update_leader().
a93e80
a93e80
Signed-off-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ben Pfaff <blp@ovn.org>
a93e80
a93e80
Resolves: #1836305
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft.c           | 19 +++++++++++++--
a93e80
 tests/ovsdb-cluster.at | 55 ++++++++++++++++++++++++++++++++++++++++++
a93e80
 2 files changed, 72 insertions(+), 2 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index fa04d8c80b..6452182ba6 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -73,7 +73,8 @@ enum raft_failure_test {
a93e80
     FT_CRASH_BEFORE_SEND_EXEC_REQ,
a93e80
     FT_CRASH_AFTER_SEND_EXEC_REQ,
a93e80
     FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
a93e80
-    FT_DELAY_ELECTION
a93e80
+    FT_DELAY_ELECTION,
a93e80
+    FT_DONT_SEND_VOTE_REQUEST
a93e80
 };
a93e80
 static enum raft_failure_test failure_test;
a93e80
 
a93e80
@@ -1647,6 +1648,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
a93e80
     }
a93e80
 
a93e80
     ovs_assert(raft->role != RAFT_LEADER);
a93e80
+
a93e80
     raft->role = RAFT_CANDIDATE;
a93e80
     /* If there was no leader elected since last election, we know we are
a93e80
      * retrying now. */
a93e80
@@ -1690,7 +1692,9 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
a93e80
                 .leadership_transfer = leadership_transfer,
a93e80
             },
a93e80
         };
a93e80
-        raft_send(raft, &rq;;
a93e80
+        if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
a93e80
+            raft_send(raft, &rq;;
a93e80
+        }
a93e80
     }
a93e80
 
a93e80
     /* Vote for ourselves. */
a93e80
@@ -2966,6 +2970,15 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
a93e80
         };
a93e80
         ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
a93e80
     }
a93e80
+    if (raft->role == RAFT_CANDIDATE) {
a93e80
+        /* Section 3.4: While waiting for votes, a candidate may
a93e80
+         * receive an AppendEntries RPC from another server claiming to
a93e80
+         * be leader. If the leader’s term (included in its RPC) is at
a93e80
+         * least as large as the candidate’s current term, then the
a93e80
+         * candidate recognizes the leader as legitimate and returns to
a93e80
+         * follower state. */
a93e80
+        raft->role = RAFT_FOLLOWER;
a93e80
+    }
a93e80
     return true;
a93e80
 }
a93e80
 
a93e80
@@ -4674,6 +4687,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
a93e80
                 raft_reset_election_timer(raft);
a93e80
             }
a93e80
         }
a93e80
+    } else if (!strcmp(test, "dont-send-vote-request")) {
a93e80
+        failure_test = FT_DONT_SEND_VOTE_REQUEST;
a93e80
     } else if (!strcmp(test, "clear")) {
a93e80
         failure_test = FT_NO_TEST;
a93e80
         unixctl_command_reply(conn, "test dismissed");
a93e80
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
a93e80
index 0aa4564480..9714545151 100644
a93e80
--- a/tests/ovsdb-cluster.at
a93e80
+++ b/tests/ovsdb-cluster.at
a93e80
@@ -527,6 +527,61 @@ AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
a93e80
 ovsdb_cluster_failure_test 2 2 3 crash-after-receiving-append-request-update
a93e80
 AT_CLEANUP
a93e80
 
a93e80
+
a93e80
+AT_SETUP([OVSDB cluster - competing candidates])
a93e80
+AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates])
a93e80
+
a93e80
+n=3
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+ordinal_schema > schema
a93e80
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
a93e80
+cid=`ovsdb-tool db-cid s1.db`
a93e80
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
a93e80
+for i in `seq 2 $n`; do
a93e80
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
a93e80
+done
a93e80
+
a93e80
+on_exit 'kill `cat *.pid`'
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
a93e80
+done
a93e80
+for i in `seq $n`; do
a93e80
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
a93e80
+done
a93e80
+
a93e80
+# We need to simulate the situation when 2 candidates starts election with same
a93e80
+# term.
a93e80
+#
a93e80
+# Before triggering leader election, tell follower s2 don't send vote request (simulating
a93e80
+# vote-request lost or not handled in time), and tell follower s3 to delay
a93e80
+# election timer to make sure s3 doesn't send vote-request before s2 enters
a93e80
+# term 2.
a93e80
+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test dont-send-vote-request], [0], [ignore])
a93e80
+AT_CHECK([ovs-appctl -t "`pwd`"/s3 cluster/failure-test delay-election], [0], [ignore])
a93e80
+
a93e80
+# Restart leader, which will become follower, and both old followers will start
a93e80
+# election as candidate. The new follower (old leader) will vote one of them,
a93e80
+# and the other candidate should step back as follower as again.
a93e80
+kill -9 `cat s1.pid`
a93e80
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb s1.db])
a93e80
+
a93e80
+# Tell s1 to delay election timer so that it won't start election before s3
a93e80
+# becomes candidate.
a93e80
+AT_CHECK([ovs-appctl -t "`pwd`"/s1 cluster/failure-test delay-election], [0], [ignore])
a93e80
+
a93e80
+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s1 cluster/status $schema_name | grep "Term: 2"])
a93e80
+
a93e80
+for i in `seq $n`; do
a93e80
+    OVS_WAIT_WHILE([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep "candidate"])
a93e80
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
a93e80
+done
a93e80
+
a93e80
+for i in `seq $n`; do
a93e80
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
a93e80
+done
a93e80
+
a93e80
+AT_CLEANUP
a93e80
+
a93e80
 
a93e80
 AT_BANNER([OVSDB - cluster tests])
a93e80
 
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 5c38ccd52fb3925e82eda20f1897ec02abb390d9 Mon Sep 17 00:00:00 2001
a93e80
From: Ilya Maximets <i.maximets@ovn.org>
a93e80
Date: Mon, 4 May 2020 21:55:41 +0200
a93e80
Subject: [PATCH 09/15] raft: Fix leak of the incomplete command.
a93e80
a93e80
[ upstream commit 168beb87ca63056e8896b09a60031565b7b60728 ]
a93e80
a93e80
Function raft_command_initiate() returns correctly referenced command
a93e80
instance.  'n_ref' equals 1 for complete commands and 2 for incomplete
a93e80
commands because one more reference is in raft->commands list.
a93e80
raft_handle_execute_command_request__() leaks the reference by not
a93e80
returning pointer anywhere and not unreferencing incomplete commands.
a93e80
a93e80
 792 bytes in 11 blocks are definitely lost in loss record 258 of 262
a93e80
    at 0x483BB1A: calloc (vg_replace_malloc.c:762)
a93e80
    by 0x44BA32: xcalloc (util.c:121)
a93e80
    by 0x422E5F: raft_command_create_incomplete (raft.c:2038)
a93e80
    by 0x422E5F: raft_command_initiate (raft.c:2061)
a93e80
    by 0x428651: raft_handle_execute_command_request__ (raft.c:4161)
a93e80
    by 0x428651: raft_handle_execute_command_request (raft.c:4177)
a93e80
    by 0x428651: raft_handle_rpc (raft.c:4230)
a93e80
    by 0x428651: raft_conn_run (raft.c:1445)
a93e80
    by 0x428DEA: raft_run (raft.c:1803)
a93e80
    by 0x407392: main_loop (ovsdb-server.c:226)
a93e80
    by 0x407392: main (ovsdb-server.c:469)
a93e80
a93e80
Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
Acked-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: William Tu <u9012063@gmail.com>
a93e80
a93e80
Resolves: #1836307
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft.c | 4 +---
a93e80
 1 file changed, 1 insertion(+), 3 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 6452182ba6..1505814138 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -4163,9 +4163,7 @@ raft_handle_execute_command_request__(
a93e80
     cmd->sid = rq->common.sid;
a93e80
 
a93e80
     enum raft_command_status status = cmd->status;
a93e80
-    if (status != RAFT_CMD_INCOMPLETE) {
a93e80
-        raft_command_unref(cmd);
a93e80
-    }
a93e80
+    raft_command_unref(cmd);
a93e80
     return status;
a93e80
 }
a93e80
 
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 3d9b529afb098531190d57d6f35d1622bb4093cd Mon Sep 17 00:00:00 2001
a93e80
From: Zhen Wang <zhewang@nvidia.com>
a93e80
Date: Mon, 30 Mar 2020 17:21:04 -0700
a93e80
Subject: [PATCH 10/15] raft: Disable RAFT jsonrpc inactivity probe.
a93e80
a93e80
[ upstream commit 1600e0040caded7eaa9b1f41926f9619d8e0ec8d ]
a93e80
a93e80
With the scale test of 640 nodes k8s cluster, raft DB nodes' jsonrpc
a93e80
session got closed due to the timeout of default 5 seconds probe.
a93e80
It will cause disturbance of the raft cluster. Since we already have
a93e80
the heartbeat for RAFT, just disable the probe between the servers
a93e80
to avoid the unnecessary jsonrpc inactivity probe.
a93e80
a93e80
Acked-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Zhen Wang <zhewang@nvidia.com>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
a93e80
Resolves: #1836308
a93e80
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
a93e80
---
a93e80
 ovsdb/raft.c | 1 +
a93e80
 1 file changed, 1 insertion(+)
a93e80
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 1505814138..395cc56113 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -938,6 +938,7 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
a93e80
                                               &conn->sid);
a93e80
     conn->incoming = incoming;
a93e80
     conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
a93e80
+    jsonrpc_session_set_probe_interval(js, 0);
a93e80
 }
a93e80
 
a93e80
 /* Starts the local server in an existing Raft cluster, using the local copy of
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 8b155475749cdb7a1817810d447e4cf6598cb6fa Mon Sep 17 00:00:00 2001
a93e80
From: Aaron Conole <aconole@redhat.com>
a93e80
Date: Fri, 15 May 2020 16:36:18 -0400
a93e80
Subject: [PATCH 11/15] netdev-linux: Update LAG in all cases.
a93e80
a93e80
In some cases, when processing a netlink change event, it's possible for
a93e80
an alternate part of OvS (like the IPv6 endpoint processing) to hold an
a93e80
active netdev interface.  This creates a race-condition, where sometimes
a93e80
the OvS change processing will take the normal path.  This doesn't work
a93e80
because the netdev device object won't actually be enslaved to the
a93e80
ovs-system (for instance, a linux bond) and ingress qdisc entries will
a93e80
be missing.
a93e80
a93e80
To address this, we update the LAG information in ALL cases where
a93e80
LAG information could come in.
a93e80
a93e80
Fixes: d22f8927c3c9 ("netdev-linux: monitor and offload LAG slaves to TC")
a93e80
Cc: Marcelo Leitner <mleitner@redhat.com>
a93e80
Cc: John Hurley <john.hurley@netronome.com>
a93e80
Acked-by: Roi Dayan <roid@mellanox.com>
a93e80
Signed-off-by: Aaron Conole <aconole@redhat.com>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
---
a93e80
 lib/netdev-linux.c | 11 +++++------
a93e80
 1 file changed, 5 insertions(+), 6 deletions(-)
a93e80
a93e80
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
a93e80
index c6f3d27409..2bf8d4c477 100644
a93e80
--- a/lib/netdev-linux.c
a93e80
+++ b/lib/netdev-linux.c
a93e80
@@ -659,10 +659,6 @@ netdev_linux_update_lag(struct rtnetlink_change *change)
a93e80
 {
a93e80
     struct linux_lag_slave *lag;
a93e80
 
a93e80
-    if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
a93e80
-        return;
a93e80
-    }
a93e80
-
a93e80
     if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
a93e80
         lag = shash_find_data(&lag_shash, change->ifname);
a93e80
 
a93e80
@@ -760,8 +756,11 @@ netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
a93e80
                     netdev_linux_update(netdev, nsid, &change);
a93e80
                     ovs_mutex_unlock(&netdev->mutex);
a93e80
                 }
a93e80
-                else if (!netdev_ && change.ifname) {
a93e80
-                    /* Netdev is not present in OvS but its master could be. */
a93e80
+
a93e80
+                if (change.ifname &&
a93e80
+                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {
a93e80
+
a93e80
+                    /* Need to try updating the LAG information. */
a93e80
                     ovs_mutex_lock(&lag_mutex);
a93e80
                     netdev_linux_update_lag(&change);
a93e80
                     ovs_mutex_unlock(&lag_mutex);
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From d14e39f81bec29064a58df0177ce457765305f8b Mon Sep 17 00:00:00 2001
a93e80
From: Aaron Conole <aconole@redhat.com>
a93e80
Date: Fri, 15 May 2020 16:36:19 -0400
a93e80
Subject: [PATCH 12/15] netdev-offload-tc: Re-fetch block ID after probing.
a93e80
a93e80
It's possible that block_id could changes after the probe for block
a93e80
support.  Therefore, fetch the block_id again after the probe.
a93e80
a93e80
Fixes: edc2055a2bf7 ("netdev-offload-tc: Flush rules on ingress block when init tc flow api")
a93e80
Cc: Dmytro Linkin <dmitrolin@mellanox.com>
a93e80
Acked-by: Roi Dayan <roid@mellanox.com>
a93e80
Co-authored-by: Marcelo Leitner <mleitner@redhat.com>
a93e80
Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
a93e80
Signed-off-by: Aaron Conole <aconole@redhat.com>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
---
a93e80
 lib/netdev-offload-tc.c | 2 ++
a93e80
 1 file changed, 2 insertions(+)
a93e80
a93e80
diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
a93e80
index 550e440b3a..f577311aec 100644
a93e80
--- a/lib/netdev-offload-tc.c
a93e80
+++ b/lib/netdev-offload-tc.c
a93e80
@@ -1922,6 +1922,8 @@ netdev_tc_init_flow_api(struct netdev *netdev)
a93e80
 
a93e80
     if (ovsthread_once_start(&block_once)) {
a93e80
         probe_tc_block_support(ifindex);
a93e80
+        /* Need to re-fetch block id as it depends on feature availability. */
a93e80
+        block_id = get_block_id_from_netdev(netdev);
a93e80
         ovsthread_once_done(&block_once);
a93e80
     }
a93e80
 
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From fb32a78921e50b1ffa0c52f873167f68622e8723 Mon Sep 17 00:00:00 2001
a93e80
From: Ilya Maximets <i.maximets@ovn.org>
a93e80
Date: Fri, 22 May 2020 18:31:19 +0200
a93e80
Subject: [PATCH 13/15] ovsdb: Add raft memory usage to memory report.
a93e80
a93e80
[ upstream commit 3423cd97f88fe6a8de8b649d79fe6ac83bce94d1 ]
a93e80
a93e80
Memory reports could be found in logs or by calling 'memory/show'
a93e80
appctl command.  For ovsdb-server it includes information about db
a93e80
cells, monitor connections with their backlog size, etc.  But it
a93e80
doesn't contain any information about memory consumed by raft.
a93e80
Backlogs of raft connections could be insanely large because of
a93e80
snapshot installation requests that simply contains the whole database.
a93e80
In not that healthy clusters where one of ovsdb servers is not able to
a93e80
timely handle all the incoming raft traffic, backlog on a sender's side
a93e80
could cause significant memory consumption issues.
a93e80
a93e80
Adding new 'raft-connections' and 'raft-backlog' counters to the
a93e80
memory report to better track such conditions.
a93e80
a93e80
Acked-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
a93e80
Related: #1834838
a93e80
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
a93e80
---
a93e80
 ovsdb/ovsdb.c   |  4 ++++
a93e80
 ovsdb/raft.c    | 16 ++++++++++++++++
a93e80
 ovsdb/raft.h    |  2 ++
a93e80
 ovsdb/storage.c | 10 ++++++++++
a93e80
 ovsdb/storage.h |  3 +++
a93e80
 5 files changed, 35 insertions(+)
a93e80
a93e80
diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
a93e80
index 7e683e6815..2da117cb36 100644
a93e80
--- a/ovsdb/ovsdb.c
a93e80
+++ b/ovsdb/ovsdb.c
a93e80
@@ -502,6 +502,10 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage)
a93e80
     }
a93e80
 
a93e80
     simap_increase(usage, "cells", cells);
a93e80
+
a93e80
+    if (db->storage) {
a93e80
+        ovsdb_storage_get_memory_usage(db->storage, usage);
a93e80
+    }
a93e80
 }
a93e80
 
a93e80
 struct ovsdb_table *
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 395cc56113..6ca63b4352 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -36,6 +36,7 @@
a93e80
 #include "ovsdb/log.h"
a93e80
 #include "raft-rpc.h"
a93e80
 #include "random.h"
a93e80
+#include "simap.h"
a93e80
 #include "socket-util.h"
a93e80
 #include "stream.h"
a93e80
 #include "timeval.h"
a93e80
@@ -1014,6 +1015,21 @@ raft_get_sid(const struct raft *raft)
a93e80
     return &raft->sid;
a93e80
 }
a93e80
 
a93e80
+/* Adds memory consumption info to 'usage' for later use by memory_report(). */
a93e80
+void
a93e80
+raft_get_memory_usage(const struct raft *raft, struct simap *usage)
a93e80
+{
a93e80
+    struct raft_conn *conn;
a93e80
+    int cnt = 0;
a93e80
+
a93e80
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
a93e80
+        simap_increase(usage, "raft-backlog",
a93e80
+                       jsonrpc_session_get_backlog(conn->js));
a93e80
+        cnt++;
a93e80
+    }
a93e80
+    simap_increase(usage, "raft-connections", cnt);
a93e80
+}
a93e80
+
a93e80
 /* Returns true if 'raft' has completed joining its cluster, has not left or
a93e80
  * initiated leaving the cluster, does not have failed disk storage, and is
a93e80
  * apparently connected to the leader in a healthy way (or is itself the
a93e80
diff --git a/ovsdb/raft.h b/ovsdb/raft.h
a93e80
index 3d448995af..99d5307e54 100644
a93e80
--- a/ovsdb/raft.h
a93e80
+++ b/ovsdb/raft.h
a93e80
@@ -67,6 +67,7 @@
a93e80
 struct json;
a93e80
 struct ovsdb_log;
a93e80
 struct raft;
a93e80
+struct simap;
a93e80
 struct sset;
a93e80
 
a93e80
 #define RAFT_MAGIC "CLUSTER"
a93e80
@@ -113,6 +114,7 @@ const struct uuid *raft_get_cid(const struct raft *);
a93e80
 const struct uuid *raft_get_sid(const struct raft *);
a93e80
 bool raft_is_connected(const struct raft *);
a93e80
 bool raft_is_leader(const struct raft *);
a93e80
+void raft_get_memory_usage(const struct raft *, struct simap *usage);
a93e80
 
a93e80
 /* Joining a cluster. */
a93e80
 bool raft_is_joining(const struct raft *);
a93e80
diff --git a/ovsdb/storage.c b/ovsdb/storage.c
a93e80
index e26252b066..7b4ad16f60 100644
a93e80
--- a/ovsdb/storage.c
a93e80
+++ b/ovsdb/storage.c
a93e80
@@ -26,6 +26,7 @@
a93e80
 #include "ovsdb.h"
a93e80
 #include "raft.h"
a93e80
 #include "random.h"
a93e80
+#include "simap.h"
a93e80
 #include "timeval.h"
a93e80
 #include "util.h"
a93e80
 
a93e80
@@ -188,6 +189,15 @@ ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage)
a93e80
     return storage->raft ? raft_get_applied_index(storage->raft) : 0;
a93e80
 }
a93e80
 
a93e80
+void
a93e80
+ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage,
a93e80
+                               struct simap *usage)
a93e80
+{
a93e80
+    if (storage->raft) {
a93e80
+        raft_get_memory_usage(storage->raft, usage);
a93e80
+    }
a93e80
+}
a93e80
+
a93e80
 void
a93e80
 ovsdb_storage_run(struct ovsdb_storage *storage)
a93e80
 {
a93e80
diff --git a/ovsdb/storage.h b/ovsdb/storage.h
a93e80
index 8a9bbab709..a223968912 100644
a93e80
--- a/ovsdb/storage.h
a93e80
+++ b/ovsdb/storage.h
a93e80
@@ -23,6 +23,7 @@
a93e80
 struct json;
a93e80
 struct ovsdb_schema;
a93e80
 struct ovsdb_storage;
a93e80
+struct simap;
a93e80
 struct uuid;
a93e80
 
a93e80
 struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw,
a93e80
@@ -39,6 +40,8 @@ bool ovsdb_storage_is_leader(const struct ovsdb_storage *);
a93e80
 const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *);
a93e80
 const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *);
a93e80
 uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *);
a93e80
+void ovsdb_storage_get_memory_usage(const struct ovsdb_storage *,
a93e80
+                                    struct simap *usage);
a93e80
 
a93e80
 void ovsdb_storage_run(struct ovsdb_storage *);
a93e80
 void ovsdb_storage_wait(struct ovsdb_storage *);
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 92a1e56c8a37927441fb1742e6054a9118654ef0 Mon Sep 17 00:00:00 2001
a93e80
From: Ilya Maximets <i.maximets@ovn.org>
a93e80
Date: Thu, 14 May 2020 22:10:45 +0200
a93e80
Subject: [PATCH 14/15] ovsdb-server: Fix schema leak while reading db.
a93e80
a93e80
[ upstream commit 16e3a80cf646f6c53d22ef98599d5aecb8310414 ]
a93e80
a93e80
parse_txn() function doesn't always take ownership of the 'schema'
a93e80
passed.  So, if the schema of the clustered db has same version as the
a93e80
one that already in use, parse_txn() will not use it, resulting with a
a93e80
memory leak:
a93e80
a93e80
 7,827 (56 direct, 7,771 indirect) bytes in 1 blocks are definitely lost
a93e80
    at 0x483BB1A: calloc (vg_replace_malloc.c:762)
a93e80
    by 0x44AD02: xcalloc (util.c:121)
a93e80
    by 0x40E70E: ovsdb_schema_create (ovsdb.c:41)
a93e80
    by 0x40EA6D: ovsdb_schema_from_json (ovsdb.c:217)
a93e80
    by 0x415EDD: ovsdb_storage_read (storage.c:280)
a93e80
    by 0x408968: read_db (ovsdb-server.c:607)
a93e80
    by 0x40733D: main_loop (ovsdb-server.c:227)
a93e80
    by 0x40733D: main (ovsdb-server.c:469)
a93e80
a93e80
While we could put ovsdb_schema_destroy() in a few places inside
a93e80
'parse_txn()', from the users' point of view it seems better to have a
a93e80
constant argument and just clone the 'schema' if needed.  The caller
a93e80
will be responsible for destroying the 'schema' it owns.
a93e80
a93e80
Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
a93e80
Acked-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
a93e80
Related: #1834838
a93e80
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
a93e80
---
a93e80
 ovsdb/ovsdb-server.c | 5 +++--
a93e80
 1 file changed, 3 insertions(+), 2 deletions(-)
a93e80
a93e80
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
a93e80
index d416f1b606..ef4e996df2 100644
a93e80
--- a/ovsdb/ovsdb-server.c
a93e80
+++ b/ovsdb/ovsdb-server.c
a93e80
@@ -540,7 +540,7 @@ close_db(struct server_config *config, struct db *db, char *comment)
a93e80
 
a93e80
 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
a93e80
 parse_txn(struct server_config *config, struct db *db,
a93e80
-          struct ovsdb_schema *schema, const struct json *txn_json,
a93e80
+          const struct ovsdb_schema *schema, const struct json *txn_json,
a93e80
           const struct uuid *txnid)
a93e80
 {
a93e80
     if (schema && (!db->db->schema || strcmp(schema->version,
a93e80
@@ -565,7 +565,7 @@ parse_txn(struct server_config *config, struct db *db,
a93e80
              ? xasprintf("database %s schema changed", db->db->name)
a93e80
              : xasprintf("database %s connected to storage", db->db->name)));
a93e80
 
a93e80
-        ovsdb_replace(db->db, ovsdb_create(schema, NULL));
a93e80
+        ovsdb_replace(db->db, ovsdb_create(ovsdb_schema_clone(schema), NULL));
a93e80
 
a93e80
         /* Force update to schema in _Server database. */
a93e80
         db->row_uuid = UUID_ZERO;
a93e80
@@ -614,6 +614,7 @@ read_db(struct server_config *config, struct db *db)
a93e80
         } else {
a93e80
             error = parse_txn(config, db, schema, txn_json, &txnid);
a93e80
             json_destroy(txn_json);
a93e80
+            ovsdb_schema_destroy(schema);
a93e80
             if (error) {
a93e80
                 break;
a93e80
             }
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
a93e80
From 3168eba559cbce28937be4e785c3337030694455 Mon Sep 17 00:00:00 2001
a93e80
From: Ilya Maximets <i.maximets@ovn.org>
a93e80
Date: Fri, 22 May 2020 22:36:27 +0200
a93e80
Subject: [PATCH 15/15] raft: Avoid sending equal snapshots.
a93e80
a93e80
[ upstream commit 8c2c503bdb0da1ce6044a53d462f905fd4f8acf5 ]
a93e80
a93e80
Snapshots are huge.  In some cases we could receive several outdated
a93e80
append replies from the remote server.  This could happen in high
a93e80
scale cases if the remote server is overloaded and not able to process
a93e80
all the raft requests in time.  As an action to each outdated append
a93e80
reply we're sending full database snapshot.  While remote server is
a93e80
already overloaded those snapshots will stuck in jsonrpc backlog for
a93e80
a long time making it grow up to few GB.  Since remote server wasn't
a93e80
able to timely process incoming messages it will likely not able to
a93e80
process snapshots leading to the same situation with low chances to
a93e80
recover.  Remote server will likely stuck in 'candidate' state, other
a93e80
servers will grow their memory consumption due to growing jsonrpc
a93e80
backlogs:
a93e80
a93e80
jsonrpc|INFO|excessive sending backlog, jsonrpc: ssl:192.16.0.3:6644,
a93e80
             num of msgs: 3795, backlog: 8838994624.
a93e80
a93e80
This patch is trying to avoid that situation by avoiding sending of
a93e80
equal snapshot install requests.  This helps maintain reasonable memory
a93e80
consumption and allows the cluster to recover on a larger scale.
a93e80
a93e80
Acked-by: Han Zhou <hzhou@ovn.org>
a93e80
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
a93e80
a93e80
Related: #1834838
a93e80
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
a93e80
---
a93e80
 ovsdb/raft-private.c |  1 +
a93e80
 ovsdb/raft-private.h |  4 ++++
a93e80
 ovsdb/raft.c         | 39 ++++++++++++++++++++++++++++++++++++++-
a93e80
 3 files changed, 43 insertions(+), 1 deletion(-)
a93e80
a93e80
diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c
a93e80
index 26d39a087f..9468fdaf4a 100644
a93e80
--- a/ovsdb/raft-private.c
a93e80
+++ b/ovsdb/raft-private.c
a93e80
@@ -137,6 +137,7 @@ raft_server_destroy(struct raft_server *s)
a93e80
     if (s) {
a93e80
         free(s->address);
a93e80
         free(s->nickname);
a93e80
+        free(s->last_install_snapshot_request);
a93e80
         free(s);
a93e80
     }
a93e80
 }
a93e80
diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
a93e80
index ac8656d42f..1f366b4ab3 100644
a93e80
--- a/ovsdb/raft-private.h
a93e80
+++ b/ovsdb/raft-private.h
a93e80
@@ -27,6 +27,7 @@
a93e80
 
a93e80
 struct ds;
a93e80
 struct ovsdb_parser;
a93e80
+struct raft_install_snapshot_request;
a93e80
 
a93e80
 /* Formatting server IDs and cluster IDs for use in human-readable logs.  Do
a93e80
  * not use these in cases where the whole server or cluster ID is needed; use
a93e80
@@ -83,6 +84,9 @@ struct raft_server {
a93e80
     bool replied;            /* Reply to append_request was received from this
a93e80
                                 node during current election_timeout interval.
a93e80
                                 */
a93e80
+    /* Copy of the last install_snapshot_request sent to this server. */
a93e80
+    struct raft_install_snapshot_request *last_install_snapshot_request;
a93e80
+
a93e80
     /* For use in adding and removing servers: */
a93e80
     struct uuid requester_sid;  /* Nonzero if requested via RPC. */
a93e80
     struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
a93e80
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
a93e80
index 6ca63b4352..8df386fa19 100644
a93e80
--- a/ovsdb/raft.c
a93e80
+++ b/ovsdb/raft.c
a93e80
@@ -1421,8 +1421,20 @@ raft_conn_run(struct raft *raft, struct raft_conn *conn)
a93e80
     jsonrpc_session_run(conn->js);
a93e80
 
a93e80
     unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js);
a93e80
-    bool just_connected = (new_seqno != conn->js_seqno
a93e80
+    bool reconnected = new_seqno != conn->js_seqno;
a93e80
+    bool just_connected = (reconnected
a93e80
                            && jsonrpc_session_is_connected(conn->js));
a93e80
+
a93e80
+    if (reconnected) {
a93e80
+        /* Clear 'last_install_snapshot_request' since it might not reach the
a93e80
+         * destination or server was restarted. */
a93e80
+        struct raft_server *server = raft_find_server(raft, &conn->sid);
a93e80
+        if (server) {
a93e80
+            free(server->last_install_snapshot_request);
a93e80
+            server->last_install_snapshot_request = NULL;
a93e80
+        }
a93e80
+    }
a93e80
+
a93e80
     conn->js_seqno = new_seqno;
a93e80
     if (just_connected) {
a93e80
         if (raft->joining) {
a93e80
@@ -3296,6 +3308,31 @@ raft_send_install_snapshot_request(struct raft *raft,
a93e80
             .election_timer = raft->election_timer, /* use latest value */
a93e80
         }
a93e80
     };
a93e80
+
a93e80
+    if (s->last_install_snapshot_request) {
a93e80
+        struct raft_install_snapshot_request *old, *new;
a93e80
+
a93e80
+        old = s->last_install_snapshot_request;
a93e80
+        new = &rpc.install_snapshot_request;
a93e80
+        if (   old->term           == new->term
a93e80
+            && old->last_index     == new->last_index
a93e80
+            && old->last_term      == new->last_term
a93e80
+            && old->last_servers   == new->last_servers
a93e80
+            && old->data           == new->data
a93e80
+            && old->election_timer == new->election_timer
a93e80
+            && uuid_equals(&old->last_eid, &new->last_eid)) {
a93e80
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
a93e80
+
a93e80
+            VLOG_WARN_RL(&rl, "not sending exact same install_snapshot_request"
a93e80
+                              " to server %s again", s->nickname);
a93e80
+            return;
a93e80
+        }
a93e80
+    }
a93e80
+    free(s->last_install_snapshot_request);
a93e80
+    CONST_CAST(struct raft_server *, s)->last_install_snapshot_request
a93e80
+        = xmemdup(&rpc.install_snapshot_request,
a93e80
+                  sizeof rpc.install_snapshot_request);
a93e80
+
a93e80
     raft_send(raft, &rpc;;
a93e80
 }
a93e80
 
a93e80
-- 
a93e80
2.25.1
a93e80
a93e80
diff --git a/dpdk/drivers/bus/pci/linux/pci_vfio.c b/dpdk/drivers/bus/pci/linux/pci_vfio.c
a93e80
index 64cd84a689..ba60e7ce99 100644
a93e80
--- a/dpdk/drivers/bus/pci/linux/pci_vfio.c
a93e80
+++ b/dpdk/drivers/bus/pci/linux/pci_vfio.c
a93e80
@@ -149,6 +149,38 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
a93e80
 	return 0;
a93e80
 }
a93e80
 
a93e80
+/* enable PCI bus memory space */
a93e80
+static int
a93e80
+pci_vfio_enable_bus_memory(int dev_fd)
a93e80
+{
a93e80
+	uint16_t cmd;
a93e80
+	int ret;
a93e80
+
a93e80
+	ret = pread64(dev_fd, &cmd, sizeof(cmd),
a93e80
+		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
a93e80
+		      PCI_COMMAND);
a93e80
+
a93e80
+	if (ret != sizeof(cmd)) {
a93e80
+		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
a93e80
+		return -1;
a93e80
+	}
a93e80
+
a93e80
+	if (cmd & PCI_COMMAND_MEMORY)
a93e80
+		return 0;
a93e80
+
a93e80
+	cmd |= PCI_COMMAND_MEMORY;
a93e80
+	ret = pwrite64(dev_fd, &cmd, sizeof(cmd),
a93e80
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
a93e80
+		       PCI_COMMAND);
a93e80
+
a93e80
+	if (ret != sizeof(cmd)) {
a93e80
+		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
a93e80
+		return -1;
a93e80
+	}
a93e80
+
a93e80
+	return 0;
a93e80
+}
a93e80
+
a93e80
 /* set PCI bus mastering */
a93e80
 static int
a93e80
 pci_vfio_set_bus_master(int dev_fd, bool op)
a93e80
@@ -427,6 +459,11 @@ pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
a93e80
 		return -1;
a93e80
 	}
a93e80
 
a93e80
+	if (pci_vfio_enable_bus_memory(vfio_dev_fd)) {
a93e80
+		RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n");
a93e80
+		return -1;
a93e80
+	}
a93e80
+
a93e80
 	/* set bus mastering for the device */
a93e80
 	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
a93e80
 		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
a93e80
diff --git a/dpdk/lib/librte_vhost/vhost_user.c b/dpdk/lib/librte_vhost/vhost_user.c
a93e80
index 40c4520c08..8954f7930e 100644
a93e80
--- a/dpdk/lib/librte_vhost/vhost_user.c
a93e80
+++ b/dpdk/lib/librte_vhost/vhost_user.c
a93e80
@@ -206,7 +206,7 @@ vhost_backend_cleanup(struct virtio_net *dev)
a93e80
 			dev->inflight_info->addr = NULL;
a93e80
 		}
a93e80
 
a93e80
-		if (dev->inflight_info->fd > 0) {
a93e80
+		if (dev->inflight_info->fd >= 0) {
a93e80
 			close(dev->inflight_info->fd);
a93e80
 			dev->inflight_info->fd = -1;
a93e80
 		}
a93e80
@@ -1408,6 +1408,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
a93e80
 				"failed to alloc dev inflight area\n");
a93e80
 			return RTE_VHOST_MSG_RESULT_ERR;
a93e80
 		}
a93e80
+		dev->inflight_info->fd = -1;
a93e80
 	}
a93e80
 
a93e80
 	num_queues = msg->payload.inflight.num_queues;
a93e80
@@ -1433,6 +1434,16 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
a93e80
 	}
a93e80
 	memset(addr, 0, mmap_size);
a93e80
 
a93e80
+	if (dev->inflight_info->addr) {
a93e80
+		munmap(dev->inflight_info->addr, dev->inflight_info->size);
a93e80
+		dev->inflight_info->addr = NULL;
a93e80
+	}
a93e80
+
a93e80
+	if (dev->inflight_info->fd >= 0) {
a93e80
+		close(dev->inflight_info->fd);
a93e80
+		dev->inflight_info->fd = -1;
a93e80
+	}
a93e80
+
a93e80
 	dev->inflight_info->addr = addr;
a93e80
 	dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size;
a93e80
 	dev->inflight_info->fd = msg->fds[0] = fd;
a93e80
@@ -1515,10 +1526,13 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
a93e80
 				"failed to alloc dev inflight area\n");
a93e80
 			return RTE_VHOST_MSG_RESULT_ERR;
a93e80
 		}
a93e80
+		dev->inflight_info->fd = -1;
a93e80
 	}
a93e80
 
a93e80
-	if (dev->inflight_info->addr)
a93e80
+	if (dev->inflight_info->addr) {
a93e80
 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
a93e80
+		dev->inflight_info->addr = NULL;
a93e80
+	}
a93e80
 
a93e80
 	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
a93e80
 		    fd, mmap_offset);
a93e80
@@ -1527,8 +1541,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
a93e80
 		return RTE_VHOST_MSG_RESULT_ERR;
a93e80
 	}
a93e80
 
a93e80
-	if (dev->inflight_info->fd)
a93e80
+	if (dev->inflight_info->fd >= 0) {
a93e80
 		close(dev->inflight_info->fd);
a93e80
+		dev->inflight_info->fd = -1;
a93e80
+	}
a93e80
 
a93e80
 	dev->inflight_info->fd = fd;
a93e80
 	dev->inflight_info->addr = addr;
a93e80
@@ -2059,10 +2075,10 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
a93e80
 	size = msg->payload.log.mmap_size;
a93e80
 	off  = msg->payload.log.mmap_offset;
a93e80
 
a93e80
-	/* Don't allow mmap_offset to point outside the mmap region */
a93e80
-	if (off > size) {
a93e80
+	/* Check for mmap size and offset overflow. */
a93e80
+	if (off >= -size) {
a93e80
 		RTE_LOG(ERR, VHOST_CONFIG,
a93e80
-			"log offset %#"PRIx64" exceeds log size %#"PRIx64"\n",
a93e80
+			"log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
a93e80
 			off, size);
a93e80
 		return RTE_VHOST_MSG_RESULT_ERR;
a93e80
 	}
a93e80
@@ -2526,7 +2542,7 @@ static int
a93e80
 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
a93e80
 			struct VhostUserMsg *msg)
a93e80
 {
a93e80
-	uint16_t vring_idx;
a93e80
+	uint32_t vring_idx;
a93e80
 
a93e80
 	switch (msg->request.master) {
a93e80
 	case VHOST_USER_SET_VRING_KICK:
a93e80
diff --git a/dpdk/lib/librte_vhost/virtio_net.c b/dpdk/lib/librte_vhost/virtio_net.c
a93e80
index ac2842b2d2..33f10258cf 100644
a93e80
--- a/dpdk/lib/librte_vhost/virtio_net.c
a93e80
+++ b/dpdk/lib/librte_vhost/virtio_net.c
a93e80
@@ -1086,6 +1086,8 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev,
a93e80
 						  VHOST_ACCESS_RW);
a93e80
 
a93e80
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
a93e80
+		if (unlikely(!desc_addrs[i]))
a93e80
+			return -1;
a93e80
 		if (unlikely(lens[i] != descs[avail_idx + i].len))
a93e80
 			return -1;
a93e80
 	}
a93e80
@@ -1841,6 +1843,8 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
a93e80
 	}
a93e80
 
a93e80
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
a93e80
+		if (unlikely(!desc_addrs[i]))
a93e80
+			return -1;
a93e80
 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
a93e80
 			return -1;
a93e80
 	}