From 45f4abd6befa50b129fc49b331b42c280f37199a Mon Sep 17 00:00:00 2001 From: Mark Reynolds Date: Thu, 13 Jun 2019 13:01:35 -0400 Subject: [PATCH] Issue 49602 - Revise replication status messages Bug Description: All agreement status messages start with "Error (##)" followed by a text string. Even success states start with "Error", and this is confusing. Added new attributes to display the status in a JSON format for easier parsing for applications: replicaLastUpdateStatusJSON replicaLastInitStatusJSON Design Doc: https://www.port389.org/docs/389ds/design/repl-agmt-status-design.html https://pagure.io/389-ds-base/issue/49602 Reviewed by: firstyear(Thanks!) --- .../suites/replication/single_master_test.py | 19 +++-- ldap/schema/01core389.ldif | 2 + ldap/servers/plugins/replication/repl5_agmt.c | 84 ++++++++++++++++--- .../plugins/replication/repl5_protocol_util.c | 13 +-- 4 files changed, 96 insertions(+), 22 deletions(-) diff --git a/dirsrvtests/tests/suites/replication/single_master_test.py b/dirsrvtests/tests/suites/replication/single_master_test.py index 5b73e23ae..78f849da7 100644 --- a/dirsrvtests/tests/suites/replication/single_master_test.py +++ b/dirsrvtests/tests/suites/replication/single_master_test.py @@ -23,6 +23,7 @@ from lib389._constants import (ReplicaRole, DEFAULT_SUFFIX, REPLICAID_MASTER_1, REPLICATION_BIND_METHOD, REPLICATION_TRANSPORT, DEFAULT_BACKUPDIR, RA_NAME, RA_BINDDN, RA_BINDPW, RA_METHOD, RA_TRANSPORT_PROT, defaultProperties) +import json pytestmark = pytest.mark.tier1 @@ -95,7 +96,7 @@ def test_mail_attr_repl(topo_r): consumer.start() log.info("Make a search for mail attribute in attempt to crash server") - consumer.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(mail=testuser@redhat.com)", ["mail"]) + c_user.get_attr_val("mail") log.info("Make sure that server hasn't crashed") repl.test_replication(master, consumer) @@ -111,11 +112,13 @@ def test_lastupdate_attr_before_init(topo_nr): 1. Check nsds5replicaLastUpdateStart value 2. Check nsds5replicaLastUpdateEnd value 3. Check nsds5replicaLastUpdateStatus value + 4. Check nsds5replicaLastUpdateStatusJSON is parsable :expectedresults: 1. nsds5replicaLastUpdateStart should be equal to 0 2. nsds5replicaLastUpdateEnd should be equal to 0 3. nsds5replicaLastUpdateStatus should not be equal - to "0 Replica acquired successfully: Incremental update started" + to "Replica acquired successfully: Incremental update started" + 4. Success """ master = topo_nr.ins["standalone1"] @@ -139,11 +142,15 @@ def test_lastupdate_attr_before_init(topo_nr): with pytest.raises(Exception): repl.wait_for_replication(master, consumer, timeout=5) - assert agmt.get_attr_val_bytes('nsds5replicaLastUpdateStart') == b"19700101000000Z" - assert agmt.get_attr_val_bytes("nsds5replicaLastUpdateEnd") == b"19700101000000Z" - assert b"Replica acquired successfully" not in agmt.get_attr_val_bytes("nsds5replicaLastUpdateStatus") - + assert agmt.get_attr_val_utf8('nsds5replicaLastUpdateStart') == "19700101000000Z" + assert agmt.get_attr_val_utf8("nsds5replicaLastUpdateEnd") == "19700101000000Z" + assert "replica acquired successfully" not in agmt.get_attr_val_utf8_l("nsds5replicaLastUpdateStatus") + # make sure the JSON attribute is parsable + json_status = agmt.get_attr_val_utf8("nsds5replicaLastUpdateStatusJSON") + if json_status is not None: + json_obj = json.loads(json_status) + log.debug("JSON status message: {}".format(json_obj)) if __name__ == '__main__': # Run isolated diff --git a/ldap/schema/01core389.ldif b/ldap/schema/01core389.ldif index 993fa4a6d..7bf4acc5b 100644 --- a/ldap/schema/01core389.ldif +++ b/ldap/schema/01core389.ldif @@ -312,6 +312,8 @@ attributeTypes: ( 2.16.840.1.113730.3.1.2341 NAME 'nsslapd-changelogmaxentries' attributeTypes: ( 2.16.840.1.113730.3.1.2344 NAME 'nsslapd-tls-check-crl' DESC 'Check CRL when opening outbound TLS connections. Valid options are none, peer, all.' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' ) attributeTypes: ( 2.16.840.1.113730.3.1.2353 NAME 'nsslapd-encryptionalgorithm' DESC 'The encryption algorithm used to encrypt the changelog' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' ) attributeTypes: ( 2.16.840.1.113730.3.1.2084 NAME 'nsSymmetricKey' DESC 'A symmetric key - currently used by attribute encryption' SYNTAX 1.3.6.1.4.1.1466.115.121.1.40 SINGLE-VALUE X-ORIGIN 'attribute encryption' ) +attributeTypes: ( 2.16.840.1.113730.3.1.2364 NAME 'nsds5replicaLastInitStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' ) +attributeTypes: ( 2.16.840.1.113730.3.1.2365 NAME 'nsds5replicaLastUpdateStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' ) # # objectclasses # diff --git a/ldap/servers/plugins/replication/repl5_agmt.c b/ldap/servers/plugins/replication/repl5_agmt.c index 53e6708c8..8e4586d8b 100644 --- a/ldap/servers/plugins/replication/repl5_agmt.c +++ b/ldap/servers/plugins/replication/repl5_agmt.c @@ -60,7 +60,11 @@ #define DEFAULT_TIMEOUT 120 /* (seconds) default outbound LDAP connection */ #define DEFAULT_FLOWCONTROL_WINDOW 1000 /* #entries sent without acknowledgment */ #define DEFAULT_FLOWCONTROL_PAUSE 2000 /* msec of pause when #entries sent witout acknowledgment */ -#define STATUS_LEN 1024 +#define STATUS_LEN 2048 +#define STATUS_GOOD "green" +#define STATUS_WARNING "amber" +#define STATUS_BAD "red" + struct changecounter { @@ -93,11 +97,13 @@ typedef struct repl5agmt time_t last_update_start_time; /* Local start time of last update session */ time_t last_update_end_time; /* Local end time of last update session */ char last_update_status[STATUS_LEN]; /* Status of last update. Format = numeric code textual description */ + char last_update_status_json[STATUS_LEN]; PRBool update_in_progress; PRBool is_enabled; time_t last_init_start_time; /* Local start time of last total init */ time_t last_init_end_time; /* Local end time of last total init */ char last_init_status[STATUS_LEN]; /* Status of last total init. Format = numeric code textual description */ + char last_init_status_json[STATUS_LEN]; PRLock *lock; Object *consumerRUV; /* last RUV received from the consumer - used for changelog purging */ CSN *consumerSchemaCSN; /* last schema CSN received from the consumer */ @@ -2443,6 +2449,21 @@ agmt_set_last_init_end(Repl_Agmt *ra, time_t end_time) } } +static void +agmt_set_last_update_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc) +{ + char ts[SLAPI_TIMESTAMP_BUFSIZE]; + time_t now; + + time(&now); + strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now)); + PR_snprintf(ra->last_update_status_json, STATUS_LEN, + "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", " + "\"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}", + state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc), + ts, ra->last_update_status); +} + void agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *message) { @@ -2463,19 +2484,29 @@ agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *m PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) %s%s - LDAP error: %s%s%s%s", ldaprc, message ? message : "", message ? "" : " - ", slapi_err2string(ldaprc), replmsg ? " (" : "", replmsg ? replmsg : "", replmsg ? ")" : ""); + agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc); } /* ldaprc == LDAP_SUCCESS */ else if (replrc != 0) { if (replrc == NSDS50_REPL_REPLICA_BUSY) { PR_snprintf(ra->last_update_status, STATUS_LEN, - "Error (%d) Can't acquire busy replica", replrc); + "Error (%d) Can't acquire busy replica (%s)", + replrc, message ? message : ""); + agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc); + } else if (replrc == NSDS50_REPL_TRANSIENT_ERROR || replrc == NSDS50_REPL_BACKOFF) { + PR_snprintf(ra->last_update_status, STATUS_LEN, + "Error (%d) Can't acquire replica (%s)", + replrc, message ? message : ""); + agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc); } else if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) { PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (0) Replication session successful"); + agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc); } else if (replrc == NSDS50_REPL_DISABLED) { PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) Incremental update aborted: " "Replication agreement for %s\n can not be updated while the replica is disabled.\n" "(If the suffix is disabled you must enable it then restart the server for replication to take place).", replrc, ra->long_name ? ra->long_name : "a replica"); + agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc); /* Log into the errors log, as "ra->long_name" is not accessible from the caller */ slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Incremental update aborted: Replication agreement for \"%s\" " @@ -2487,17 +2518,35 @@ agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *m PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) Replication error acquiring replica: %s%s(%s)", replrc, message ? message : "", message ? " " : "", protocol_response2string(replrc)); + agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc); } } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */ { PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (0) Replica acquired successfully: %s", message); + agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc); } else { /* agmt_set_last_update_status(0,0,NULL) to reset agmt */ ra->last_update_status[0] = '\0'; + ra->last_update_status_json[0] = '\0'; } } } +static void +agmt_set_last_init_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc, int connrc) +{ + char ts[SLAPI_TIMESTAMP_BUFSIZE]; + time_t now; + + time(&now); + strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now)); + PR_snprintf(ra->last_init_status_json, STATUS_LEN, + "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", \"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", " + "\"conn_rc\": \"%d\", \"conn_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}", + state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc), + connrc, conn_result2string(connrc), ts, ra->last_init_status); +} + void agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, const char *message) { @@ -2523,16 +2572,16 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con replmsg = NULL; } } - PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s%sLDAP error: %s%s%s%s%s", + PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)%s%sLDAP error: %s%s%s%s%s", ldaprc, message ? message : "", message ? "" : " - ", slapi_err2string(ldaprc), replmsg ? " - " : "", replmsg ? replmsg : "", connrc ? " - " : "", connrc ? connmsg : ""); + agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc); } /* ldaprc == LDAP_SUCCESS */ else if (replrc != 0) { if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) { - PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s", - ldaprc, "Replication session successful"); + PR_snprintf(ra->last_init_status, STATUS_LEN, "Replication session successful"); } else if (replrc == NSDS50_REPL_DISABLED) { if (agmt_is_enabled(ra)) { slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: " @@ -2543,6 +2592,7 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con "Replication agreement for \"%s\" can not be updated while the suffix is disabled.\n" "You must enable it then restart the server for replication to take place).", replrc, ra->long_name ? ra->long_name : "a replica"); + agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc); } else { /* You do not need to restart the server after enabling the agreement */ slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: " @@ -2551,6 +2601,7 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) Total update aborted: " "Replication agreement for \"%s\" can not be updated while the agreement is disabled.", replrc, ra->long_name ? ra->long_name : "a replica"); + agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc); } } else { PR_snprintf(ra->last_init_status, STATUS_LEN, @@ -2558,19 +2609,21 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con replrc, protocol_response2string(replrc), message ? " - " : "", message ? message : "", connrc ? " - " : "", connrc ? connmsg : ""); + agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc); } } else if (connrc != CONN_OPERATION_SUCCESS) { PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) connection error: %s%s%s", connrc, connmsg, message ? " - " : "", message ? message : ""); - } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */ - { + agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc); + } else if (message != NULL) { /* replrc == NSDS50_REPL_REPLICA_READY == 0 */ PR_snprintf(ra->last_init_status, STATUS_LEN, - "Error (%d) %s", - ldaprc, message); + "Error (%d) %s", ldaprc, message); + agmt_set_last_init_status_json(ra, STATUS_GOOD, ldaprc, replrc, connrc); } else { /* agmt_set_last_init_status(0,0,NULL) to reset agmt */ - PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)", ldaprc); + ra->last_init_status[0] = '\0'; + ra->last_init_status_json[0] = '\0'; } } } @@ -2705,10 +2758,20 @@ get_agmt_status(Slapi_PBlock *pb __attribute__((unused)), agmt_get_changecount_string(ra, changecount_string, sizeof(changecount_string)); slapi_entry_add_string(e, "nsds5replicaChangesSentSinceStartup", changecount_string); if (ra->last_update_status[0] == '\0') { + char status_msg[STATUS_LEN]; + char ts[SLAPI_TIMESTAMP_BUFSIZE]; + time_t now; + time(&now); + strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now)); slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus", "Error (0) No replication sessions started since server startup"); + PR_snprintf(status_msg, STATUS_LEN, + "{\"state\": \"green\", \"ldap_rc\": \"0\", \"ldap_rc_text\": \"success\", \"repl_rc\": \"0\", \"repl_rc_text\": \"replica acquired\", " + "\"date\": \"%s\", \"message\": \"Error (0) No replication sessions started since server startup\"}", ts); + slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", status_msg); } else { slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus", ra->last_update_status); + slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", ra->last_update_status_json); } slapi_entry_add_string(e, "nsds5replicaUpdateInProgress", ra->update_in_progress ? "TRUE" : "FALSE"); @@ -2724,6 +2787,7 @@ get_agmt_status(Slapi_PBlock *pb __attribute__((unused)), if (ra->last_init_status[0] != '\0') { slapi_entry_add_string(e, "nsds5replicaLastInitStatus", ra->last_init_status); + slapi_entry_add_string(e, "nsds5replicaLastInitStatusJSON", ra->last_init_status_json); } } bail: diff --git a/ldap/servers/plugins/replication/repl5_protocol_util.c b/ldap/servers/plugins/replication/repl5_protocol_util.c index a48d4d02a..bb9f9e7e1 100644 --- a/ldap/servers/plugins/replication/repl5_protocol_util.c +++ b/ldap/servers/plugins/replication/repl5_protocol_util.c @@ -374,13 +374,13 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv) "has the same Replica ID as this one. " "Replication is aborting.\n", agmt_get_long_name(prp->agmt)); - agmt_set_last_update_status(prp->agmt, 0, 0, - "Unable to aquire replica: the replica has the same " + agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_REPLICAID_ERROR, + "Unable to acquire replica: the replica has the same " "Replica ID as this one. Replication is aborting."); return_value = ACQUIRE_FATAL_ERROR; break; case NSDS50_REPL_BACKOFF: - /* A replication sesssion hook on the replica + /* A replication session hook on the replica * wants us to go into backoff mode. */ slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "acquire_replica - " @@ -487,9 +487,8 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv) "%s: Unable to obtain current CSN. " "Replication is aborting.\n", agmt_get_long_name(prp->agmt)); - agmt_set_last_update_status(prp->agmt, 0, 0, - "Unable to obtain current CSN. " - "Replication is aborting."); + agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_INTERNAL_ERROR, + "Unable to obtain current CSN. Replication is aborting."); return_value = ACQUIRE_FATAL_ERROR; } } @@ -665,6 +664,8 @@ protocol_response2string(int response) return "transient warning"; case NSDS50_REPL_RUV_ERROR: return "RUV error"; + case NSDS50_REPL_REPLICA_NO_RESPONSE: + return "no response received"; default: return "unknown error"; } -- 2.21.0