Blame SOURCES/0180-multipath-tools-Add-rbd-checker.patch

4728c8
From e28c340ed961409700d46a1cb9a820a8b7a4d016 Mon Sep 17 00:00:00 2001
4728c8
From: Mike Christie <mchristi@redhat.com>
4728c8
Date: Thu, 11 Aug 2016 02:12:12 -0500
4728c8
Subject: [PATCH 04/11] multipath-tools: Add rbd checker.
4728c8
4728c8
For BZ 1348372 from upstream commit:
4728c8
4728c8
commit d1cad5649b6fcf9027d43ca0405c900080133e32
4728c8
Author: Mike Christie <mchristi@redhat.com>
4728c8
Date:   Mon Aug 8 07:01:49 2016 -0500
4728c8
4728c8
    multipath-tools: Add rbd checker.
4728c8
4728c8
    This checker currently only handles the case where a path is failed
4728c8
    due to it being blacklisted by the ceph cluster. The specific use
4728c8
    case for me is when LIO exports rbd images through multiple LIO
4728c8
    instances.
4728c8
4728c8
    The problem it handles is when rbd instance1 has the exclusive lock
4728c8
    but becomes unreachable: another host in the cluster will take over
4728c8
    and blacklist instance1. This prevents it from sending stale IO
4728c8
    and corrupting data.
4728c8
4728c8
    Later, when the host is reachable, we will want to failback to it.
4728c8
    To do this, the checker will detect we were blacklisted, unmap the old
4728c8
    image which will make sure old IO is failed, and then remap the
4728c8
    image
4728c8
    and unblacklist the host. multipathd will then handle this like a
4728c8
    path being removed and re-added.
4728c8
4728c8
--------
4728c8
4728c8
Porting notes:
4728c8
Added rbd to multipath.conf.annotated.
4728c8
4728c8
Signed-off-by: Mike Christie <mchristi@redhat.com>
4728c8
---
4728c8
 libmultipath/checkers/Makefile |    7 
4728c8
 libmultipath/checkers/rbd.c    |  639 +++++++++++++++++++++++++++++++++++++++++
4728c8
 multipath.conf.annotated       |    4 
4728c8
 multipath/multipath.conf.5     |    3 
4728c8
 4 files changed, 651 insertions(+), 2 deletions(-)
4728c8
 create mode 100644 libmultipath/checkers/rbd.c
4728c8
4728c8
Index: multipath-tools-130222/libmultipath/checkers/Makefile
4728c8
===================================================================
4728c8
--- multipath-tools-130222.orig/libmultipath/checkers/Makefile
4728c8
+++ multipath-tools-130222/libmultipath/checkers/Makefile
4728c8
@@ -14,10 +14,17 @@ LIBS= \
4728c8
 	libcheckhp_sw.so \
4728c8
 	libcheckrdac.so
4728c8
 
4728c8
+ifeq ($(shell test -r /usr/include/rados/librados.h && echo 1),1)
4728c8
+LIBS += libcheckrbd.so
4728c8
+endif
4728c8
+
4728c8
 CFLAGS += -fPIC -I..
4728c8
 
4728c8
 all: $(LIBS)
4728c8
 
4728c8
+libcheckrbd.so: rbd.o
4728c8
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lrados -ludev
4728c8
+
4728c8
 libcheckdirectio.so: libsg.o directio.o
4728c8
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -laio
4728c8
 
4728c8
Index: multipath-tools-130222/libmultipath/checkers/rbd.c
4728c8
===================================================================
4728c8
--- /dev/null
4728c8
+++ multipath-tools-130222/libmultipath/checkers/rbd.c
4728c8
@@ -0,0 +1,639 @@
4728c8
+/*
4728c8
+ * Copyright (c) 2016 Red Hat
4728c8
+ * Copyright (c) 2004 Christophe Varoqui
4728c8
+ *
4728c8
+ * Code based off of tur.c and ceph's krbd.cc
4728c8
+ */
4728c8
+#define _GNU_SOURCE
4728c8
+#include <stdio.h>
4728c8
+#include <stdlib.h>
4728c8
+#include <string.h>
4728c8
+#include <unistd.h>
4728c8
+#include <fcntl.h>
4728c8
+#include <errno.h>
4728c8
+#include <pthread.h>
4728c8
+#include <libudev.h>
4728c8
+#include <ifaddrs.h>
4728c8
+#include <sys/types.h>
4728c8
+#include <sys/stat.h>
4728c8
+#include <sys/ioctl.h>
4728c8
+#include <sys/time.h>
4728c8
+#include <sys/wait.h>
4728c8
+
4728c8
+#include "rados/librados.h"
4728c8
+
4728c8
+#include "structs.h"
4728c8
+#include "checkers.h"
4728c8
+
4728c8
+#include "../libmultipath/debug.h"
4728c8
+#include "../libmultipath/uevent.h"
4728c8
+
4728c8
+struct rbd_checker_context;
4728c8
+typedef int (thread_fn)(struct rbd_checker_context *ct, char *msg);
4728c8
+
4728c8
+#define RBD_MSG(msg, fmt, args...) snprintf(msg, CHECKER_MSG_LEN, fmt, ##args);
4728c8
+
4728c8
+struct rbd_checker_context {
4728c8
+	int rbd_bus_id;
4728c8
+	char *client_addr;
4728c8
+	char *config_info;
4728c8
+	char *snap;
4728c8
+	char *pool;
4728c8
+	char *image;
4728c8
+	char *username;
4728c8
+	int remapped;
4728c8
+	int blacklisted;
4728c8
+
4728c8
+	rados_t cluster;
4728c8
+
4728c8
+	int state;
4728c8
+	int running;
4728c8
+	time_t time;
4728c8
+	thread_fn *fn;
4728c8
+	pthread_t thread;
4728c8
+	pthread_mutex_t lock;
4728c8
+	pthread_cond_t active;
4728c8
+	pthread_spinlock_t hldr_lock;
4728c8
+	int holders;
4728c8
+	char message[CHECKER_MSG_LEN];
4728c8
+};
4728c8
+
4728c8
+int libcheck_init(struct checker * c)
4728c8
+{
4728c8
+	struct rbd_checker_context *ct;
4728c8
+	struct udev_device *block_dev;
4728c8
+	struct udev_device *bus_dev;
4728c8
+	struct udev *udev;
4728c8
+	struct stat sb;
4728c8
+	const char *block_name, *addr, *config_info;
4728c8
+	const char *image, *pool, *snap, *username;
4728c8
+	char sysfs_path[PATH_SIZE];
4728c8
+	int ret;
4728c8
+
4728c8
+	ct = malloc(sizeof(struct rbd_checker_context));
4728c8
+	if (!ct)
4728c8
+		return 1;
4728c8
+	memset(ct, 0, sizeof(struct rbd_checker_context));
4728c8
+	ct->holders = 1;
4728c8
+	pthread_cond_init(&ct->active, NULL);
4728c8
+	pthread_mutex_init(&ct->lock, NULL);
4728c8
+	pthread_spin_init(&ct->hldr_lock, PTHREAD_PROCESS_PRIVATE);
4728c8
+	c->context = ct;
4728c8
+
4728c8
+	/*
4728c8
+	 * The rbd block layer sysfs device is not linked to the rbd bus
4728c8
+	 * device that we interact with, so figure that out now.
4728c8
+	 */
4728c8
+	if (fstat(c->fd, &sb) != 0)
4728c8
+		goto free_ct;
4728c8
+
4728c8
+	udev = udev_new();
4728c8
+	if (!udev)
4728c8
+		goto free_ct;
4728c8
+
4728c8
+	block_dev = udev_device_new_from_devnum(udev, 'b', sb.st_rdev);
4728c8
+	if (!block_dev)
4728c8
+		goto free_udev;
4728c8
+
4728c8
+	block_name  = udev_device_get_sysname(block_dev);
4728c8
+	ret = sscanf(block_name, "rbd%d", &ct->rbd_bus_id);
4728c8
+
4728c8
+	udev_device_unref(block_dev);
4728c8
+	if (ret != 1)
4728c8
+		goto free_udev;
4728c8
+
4728c8
+	snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/rbd/devices/%d",
4728c8
+		 ct->rbd_bus_id);
4728c8
+	bus_dev = udev_device_new_from_syspath(udev, sysfs_path);
4728c8
+	if (!bus_dev)
4728c8
+		goto free_udev;
4728c8
+
4728c8
+	addr = udev_device_get_sysattr_value(bus_dev, "client_addr");
4728c8
+	if (!addr) {
4728c8
+		condlog(0, "Could not find client_addr in rbd sysfs. Try "
4728c8
+			"updating kernel");
4728c8
+		goto free_dev;
4728c8
+	}
4728c8
+
4728c8
+	ct->client_addr = strdup(addr);
4728c8
+	if (!ct->client_addr)
4728c8
+		goto free_dev;
4728c8
+
4728c8
+	config_info = udev_device_get_sysattr_value(bus_dev, "config_info");
4728c8
+	if (!config_info)
4728c8
+		goto free_addr;
4728c8
+
4728c8
+	ct->config_info = strdup(config_info);
4728c8
+	if (!ct->config_info)
4728c8
+		goto free_addr;
4728c8
+
4728c8
+	username = strstr(config_info, "name=");
4728c8
+	if (username) {
4728c8
+		char *end;
4728c8
+		int len;
4728c8
+
4728c8
+		username += 5;
4728c8
+		end = strchr(username, ',');
4728c8
+		if (!end)
4728c8
+			goto free_info;
4728c8
+		len = end - username;
4728c8
+
4728c8
+		ct->username = malloc(len + 1);
4728c8
+		if (!ct->username)
4728c8
+			goto free_info;
4728c8
+		strncpy(ct->username, username, len);
4728c8
+		ct->username[len] = '\0';
4728c8
+	}
4728c8
+
4728c8
+	image = udev_device_get_sysattr_value(bus_dev, "name");
4728c8
+	if (!image)
4728c8
+		goto free_username;
4728c8
+
4728c8
+	ct->image = strdup(image);
4728c8
+	if (!ct->image)
4728c8
+		goto free_info;
4728c8
+
4728c8
+	pool = udev_device_get_sysattr_value(bus_dev, "pool");
4728c8
+	if (!pool)
4728c8
+		goto free_image;
4728c8
+
4728c8
+	ct->pool = strdup(pool);
4728c8
+	if (!ct->pool)
4728c8
+		goto free_image;
4728c8
+
4728c8
+	snap = udev_device_get_sysattr_value(bus_dev, "current_snap");
4728c8
+	if (!snap)
4728c8
+		goto free_pool;
4728c8
+
4728c8
+	if (strcmp("-", snap)) {
4728c8
+		ct->snap = strdup(snap);
4728c8
+		if (!ct->snap)
4728c8
+			goto free_pool;
4728c8
+	}
4728c8
+
4728c8
+	if (rados_create(&ct->cluster, NULL) < 0) {
4728c8
+		condlog(0, "Could not create rados cluster");
4728c8
+		goto free_snap;
4728c8
+	}
4728c8
+
4728c8
+	if (rados_conf_read_file(ct->cluster, NULL) < 0) {
4728c8
+		condlog(0, "Could not read rados conf");
4728c8
+		goto shutdown_rados;
4728c8
+	}
4728c8
+
4728c8
+	ret = rados_connect(ct->cluster);
4728c8
+	if (ret < 0) {
4728c8
+		condlog(0, "Could not connect to rados cluster");
4728c8
+		goto shutdown_rados;
4728c8
+	}
4728c8
+
4728c8
+	udev_device_unref(bus_dev);
4728c8
+	udev_unref(udev);
4728c8
+
4728c8
+	condlog(3, "rbd%d checker init %s %s/%s@%s %s", ct->rbd_bus_id,
4728c8
+		ct->client_addr, ct->pool, ct->image, ct->snap ? ct->snap : "-",
4728c8
+		ct->username ? ct->username : "none");
4728c8
+	return 0;
4728c8
+
4728c8
+shutdown_rados:
4728c8
+	rados_shutdown(ct->cluster);
4728c8
+free_snap:
4728c8
+	if (ct->snap)
4728c8
+		free(ct->snap);
4728c8
+free_pool:
4728c8
+	free(ct->pool);
4728c8
+free_image:
4728c8
+	free(ct->image);
4728c8
+free_username:
4728c8
+	if (ct->username)
4728c8
+		free(ct->username);
4728c8
+free_info:
4728c8
+	free(ct->config_info);
4728c8
+free_addr:
4728c8
+	free(ct->client_addr);
4728c8
+free_dev:
4728c8
+	udev_device_unref(bus_dev);
4728c8
+free_udev:
4728c8
+	udev_unref(udev);
4728c8
+free_ct:
4728c8
+	free(ct);
4728c8
+	return 1;
4728c8
+}
4728c8
+
4728c8
+void cleanup_context(struct rbd_checker_context *ct)
4728c8
+{
4728c8
+	pthread_mutex_destroy(&ct->lock);
4728c8
+	pthread_cond_destroy(&ct->active);
4728c8
+	pthread_spin_destroy(&ct->hldr_lock);
4728c8
+
4728c8
+	rados_shutdown(ct->cluster);
4728c8
+
4728c8
+	if (ct->username)
4728c8
+		free(ct->username);
4728c8
+	if (ct->snap)
4728c8
+		free(ct->snap);
4728c8
+	free(ct->pool);
4728c8
+	free(ct->image);
4728c8
+	free(ct->config_info);
4728c8
+	free(ct->client_addr);
4728c8
+	free(ct);
4728c8
+}
4728c8
+
4728c8
+void libcheck_free(struct checker * c)
4728c8
+{
4728c8
+	if (c->context) {
4728c8
+		struct rbd_checker_context *ct = c->context;
4728c8
+		int holders;
4728c8
+		pthread_t thread;
4728c8
+
4728c8
+		pthread_spin_lock(&ct->hldr_lock);
4728c8
+		ct->holders--;
4728c8
+		holders = ct->holders;
4728c8
+		thread = ct->thread;
4728c8
+		pthread_spin_unlock(&ct->hldr_lock);
4728c8
+		if (holders)
4728c8
+			pthread_cancel(thread);
4728c8
+		else
4728c8
+			cleanup_context(ct);
4728c8
+		c->context = NULL;
4728c8
+	}
4728c8
+}
4728c8
+
4728c8
+static int rbd_is_blacklisted(struct rbd_checker_context *ct, char *msg)
4728c8
+{
4728c8
+	char *addr_tok, *start, *save;
4728c8
+	char *cmd[2];
4728c8
+	char *blklist, *stat;
4728c8
+	size_t blklist_len, stat_len;
4728c8
+	int ret;
4728c8
+	char *end;
4728c8
+
4728c8
+	cmd[0] = "{\"prefix\": \"osd blacklist ls\"}";
4728c8
+	cmd[1] = NULL;
4728c8
+
4728c8
+	ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
4728c8
+				&blklist, &blklist_len, &stat, &stat_len);
4728c8
+	if (ret < 0) {
4728c8
+		RBD_MSG(msg, "rbd checker failed: mon command failed %d",
4728c8
+			ret);
4728c8
+		return ret;
4728c8
+	}
4728c8
+
4728c8
+	if (!blklist || !blklist_len)
4728c8
+		goto free_bufs;
4728c8
+
4728c8
+	/*
4728c8
+	 * parse list of addrs with the format
4728c8
+	 * ipv4:port/nonce date time\n
4728c8
+	 * or
4728c8
+	 * [ipv6]:port/nonce date time\n
4728c8
+	 */
4728c8
+	ret = 0;
4728c8
+	for (start = blklist; ; start = NULL) {
4728c8
+		addr_tok = strtok_r(start, "\n", &save);
4728c8
+		if (!addr_tok || !strlen(addr_tok))
4728c8
+			break;
4728c8
+
4728c8
+		end = strchr(addr_tok, ' ');
4728c8
+		if (!end) {
4728c8
+			RBD_MSG(msg, "rbd%d checker failed: invalid blacklist %s",
4728c8
+				 ct->rbd_bus_id, addr_tok);
4728c8
+			break;
4728c8
+		}
4728c8
+		*end = '\0';
4728c8
+
4728c8
+		if (!strcmp(addr_tok, ct->client_addr)) {
4728c8
+			ct->blacklisted = 1;
4728c8
+			RBD_MSG(msg, "rbd%d checker: %s is blacklisted",
4728c8
+				ct->rbd_bus_id, ct->client_addr);
4728c8
+			ret = 1;
4728c8
+			break;
4728c8
+		}
4728c8
+	}
4728c8
+
4728c8
+free_bufs:
4728c8
+	rados_buffer_free(blklist);
4728c8
+	rados_buffer_free(stat);
4728c8
+	return ret;
4728c8
+}
4728c8
+
4728c8
+int rbd_check(struct rbd_checker_context *ct, char *msg)
4728c8
+{
4728c8
+	if (ct->blacklisted || rbd_is_blacklisted(ct, msg) == 1)
4728c8
+		return PATH_DOWN;
4728c8
+
4728c8
+	RBD_MSG(msg, "rbd checker reports path is up");
4728c8
+	/*
4728c8
+	 * Path may have issues, but the ceph cluster is at least
4728c8
+	 * accepting IO, so we can attempt to do IO.
4728c8
+	 *
4728c8
+	 * TODO: in future versions, we can run other tests to
4728c8
+	 * verify OSDs and networks.
4728c8
+	 */
4728c8
+	return PATH_UP;
4728c8
+}
4728c8
+
4728c8
+int safe_write(int fd, const void *buf, size_t count)
4728c8
+{
4728c8
+	while (count > 0) {
4728c8
+		ssize_t r = write(fd, buf, count);
4728c8
+		if (r < 0) {
4728c8
+			if (errno == EINTR)
4728c8
+				continue;
4728c8
+			return -errno;
4728c8
+		}
4728c8
+		count -= r;
4728c8
+		buf = (char *)buf + r;
4728c8
+	}
4728c8
+	return 0;
4728c8
+}
4728c8
+
4728c8
+static int sysfs_write_rbd_bus(const char *which, const char *buf,
4728c8
+			       size_t buf_len)
4728c8
+{
4728c8
+	char sysfs_path[PATH_SIZE];
4728c8
+	int fd;
4728c8
+	int r;
4728c8
+
4728c8
+	/* we require newer kernels so single_major should always be there */
4728c8
+	snprintf(sysfs_path, sizeof(sysfs_path),
4728c8
+		 "/sys/bus/rbd/%s_single_major", which);
4728c8
+	fd = open(sysfs_path, O_WRONLY);
4728c8
+	if (fd < 0)
4728c8
+		return -errno;
4728c8
+
4728c8
+	r = safe_write(fd, buf, buf_len);
4728c8
+	close(fd);
4728c8
+	return r;
4728c8
+}
4728c8
+
4728c8
+static int rbd_remap(struct rbd_checker_context *ct)
4728c8
+{
4728c8
+	char *argv[11];
4728c8
+	pid_t pid;
4728c8
+	int ret = 0, i = 0;
4728c8
+	int status;
4728c8
+
4728c8
+	pid = fork();
4728c8
+	switch (pid) {
4728c8
+	case 0:
4728c8
+		argv[i++] = "rbd";
4728c8
+		argv[i++] = "map";
4728c8
+		argv[i++] = "-o noshare";
4728c8
+		if (ct->username) {
4728c8
+			argv[i++] = "--id";
4728c8
+			argv[i++] = ct->username;
4728c8
+		}
4728c8
+		argv[i++] = "--pool";
4728c8
+		argv[i++] = ct->pool;
4728c8
+		if (ct->snap) {
4728c8
+			argv[i++] = "--snap";
4728c8
+			argv[i++] = ct->snap;
4728c8
+		}
4728c8
+		argv[i++] = ct->image;
4728c8
+		argv[i] = NULL;
4728c8
+
4728c8
+		ret = execvp(argv[0], argv);
4728c8
+		condlog(0, "Error executing rbd: %s", strerror(errno));
4728c8
+		exit(-1);
4728c8
+	case -1:
4728c8
+		condlog(0, "fork failed: %s", strerror(errno));
4728c8
+		return -1;
4728c8
+	default:
4728c8
+		ret = -1;
4728c8
+		wait(&status);
4728c8
+		if (WIFEXITED(status)) {
4728c8
+			status = WEXITSTATUS(status);
4728c8
+			if (status == 0)
4728c8
+				ret = 0;
4728c8
+			else
4728c8
+				condlog(0, "rbd failed with %d", status);
4728c8
+		}
4728c8
+	}
4728c8
+
4728c8
+	return ret;
4728c8
+}
4728c8
+
4728c8
+static int sysfs_write_rbd_remove(const char *buf, int buf_len)
4728c8
+{
4728c8
+	return sysfs_write_rbd_bus("remove", buf, buf_len);
4728c8
+}
4728c8
+
4728c8
+static int rbd_rm_blacklist(struct rbd_checker_context *ct)
4728c8
+{
4728c8
+	char *cmd[2];
4728c8
+	char *stat, *cmd_str;
4728c8
+	size_t stat_len;
4728c8
+	int ret;
4728c8
+
4728c8
+	ret = asprintf(&cmd_str, "{\"prefix\": \"osd blacklist\", \"blacklistop\": \"rm\", \"addr\": \"%s\"}",
4728c8
+		       ct->client_addr);
4728c8
+	if (ret == -1)
4728c8
+		return -ENOMEM;
4728c8
+
4728c8
+	cmd[0] = cmd_str;
4728c8
+	cmd[1] = NULL;
4728c8
+
4728c8
+	ret = rados_mon_command(ct->cluster, (const char **)cmd, 1, "", 0,
4728c8
+				NULL, 0, &stat, &stat_len);
4728c8
+	if (ret < 0) {
4728c8
+		condlog(1, "rbd%d repair failed to remove blacklist for %s %d",
4728c8
+			ct->rbd_bus_id, ct->client_addr, ret);
4728c8
+		goto free_cmd;
4728c8
+	}
4728c8
+
4728c8
+	condlog(1, "rbd%d repair rm blacklist for %s",
4728c8
+	       ct->rbd_bus_id, ct->client_addr);
4728c8
+	free(stat);
4728c8
+free_cmd:
4728c8
+	free(cmd_str);
4728c8
+	return ret;
4728c8
+}
4728c8
+
4728c8
+static int rbd_repair(struct rbd_checker_context *ct, char *msg)
4728c8
+{
4728c8
+	char del[17];
4728c8
+	int ret;
4728c8
+
4728c8
+	if (!ct->blacklisted)
4728c8
+		return PATH_UP;
4728c8
+
4728c8
+	if (!ct->remapped) {
4728c8
+		ret = rbd_remap(ct);
4728c8
+		if (ret) {
4728c8
+			RBD_MSG(msg, "rbd%d repair failed to remap. Err %d",
4728c8
+				ct->rbd_bus_id, ret);
4728c8
+			return PATH_DOWN;
4728c8
+		}
4728c8
+	}
4728c8
+	ct->remapped = 1;
4728c8
+
4728c8
+	snprintf(del, sizeof(del), "%d force", ct->rbd_bus_id);
4728c8
+	ret = sysfs_write_rbd_remove(del, strlen(del) + 1);
4728c8
+	if (ret) {
4728c8
+		RBD_MSG(msg, "rbd%d repair failed to clean up. Err %d",
4728c8
+			ct->rbd_bus_id, ret);
4728c8
+		return PATH_DOWN;
4728c8
+	}
4728c8
+
4728c8
+	ret = rbd_rm_blacklist(ct);
4728c8
+	if (ret) {
4728c8
+		RBD_MSG(msg, "rbd%d repair could not remove blacklist entry. Err %d",
4728c8
+			ct->rbd_bus_id, ret);
4728c8
+		return PATH_DOWN;
4728c8
+	}
4728c8
+
4728c8
+	ct->remapped = 0;
4728c8
+	ct->blacklisted = 0;
4728c8
+
4728c8
+	RBD_MSG(msg, "rbd%d has been repaired", ct->rbd_bus_id);
4728c8
+	return PATH_UP;
4728c8
+}
4728c8
+
4728c8
+#define rbd_thread_cleanup_push(ct) pthread_cleanup_push(cleanup_func, ct)
4728c8
+#define rbd_thread_cleanup_pop(ct) pthread_cleanup_pop(1)
4728c8
+
4728c8
+void cleanup_func(void *data)
4728c8
+{
4728c8
+	int holders;
4728c8
+	struct rbd_checker_context *ct = data;
4728c8
+	pthread_spin_lock(&ct->hldr_lock);
4728c8
+	ct->holders--;
4728c8
+	holders = ct->holders;
4728c8
+	ct->thread = 0;
4728c8
+	pthread_spin_unlock(&ct->hldr_lock);
4728c8
+	if (!holders)
4728c8
+		cleanup_context(ct);
4728c8
+}
4728c8
+
4728c8
+void *rbd_thread(void *ctx)
4728c8
+{
4728c8
+	struct rbd_checker_context *ct = ctx;
4728c8
+	int state;
4728c8
+
4728c8
+	condlog(3, "rbd%d thread starting up", ct->rbd_bus_id);
4728c8
+
4728c8
+	ct->message[0] = '\0';
4728c8
+	/* This thread can be canceled, so setup clean up */
4728c8
+	rbd_thread_cleanup_push(ct)
4728c8
+
4728c8
+	/* checker start up */
4728c8
+	pthread_mutex_lock(&ct->lock);
4728c8
+	ct->state = PATH_PENDING;
4728c8
+	pthread_mutex_unlock(&ct->lock);
4728c8
+
4728c8
+	state = ct->fn(ct, ct->message);
4728c8
+
4728c8
+	/* checker done */
4728c8
+	pthread_mutex_lock(&ct->lock);
4728c8
+	ct->state = state;
4728c8
+	pthread_mutex_unlock(&ct->lock);
4728c8
+	pthread_cond_signal(&ct->active);
4728c8
+
4728c8
+	condlog(3, "rbd%d thead finished, state %s", ct->rbd_bus_id,
4728c8
+		checker_state_name(state));
4728c8
+	rbd_thread_cleanup_pop(ct);
4728c8
+	return ((void *)0);
4728c8
+}
4728c8
+
4728c8
+static void rbd_timeout(struct timespec *tsp)
4728c8
+{
4728c8
+	struct timeval now;
4728c8
+
4728c8
+	gettimeofday(&now, NULL);
4728c8
+	tsp->tv_sec = now.tv_sec;
4728c8
+	tsp->tv_nsec = now.tv_usec * 1000;
4728c8
+	tsp->tv_nsec += 1000000; /* 1 millisecond */
4728c8
+}
4728c8
+
4728c8
+static int rbd_exec_fn(struct checker *c, thread_fn *fn)
4728c8
+{
4728c8
+	struct rbd_checker_context *ct = c->context;
4728c8
+	struct timespec tsp;
4728c8
+	pthread_attr_t attr;
4728c8
+	int rbd_status, r;
4728c8
+
4728c8
+	if (c->sync)
4728c8
+		return rbd_check(ct, c->message);
4728c8
+	/*
4728c8
+	 * Async mode
4728c8
+	 */
4728c8
+	r = pthread_mutex_lock(&ct->lock);
4728c8
+	if (r != 0) {
4728c8
+		condlog(2, "rbd%d mutex lock failed with %d", ct->rbd_bus_id,
4728c8
+			r);
4728c8
+		MSG(c, "rbd%d thread failed to initialize", ct->rbd_bus_id);
4728c8
+		return PATH_WILD;
4728c8
+	}
4728c8
+
4728c8
+	if (ct->running) {
4728c8
+		/* Check if checker is still running */
4728c8
+		if (ct->thread) {
4728c8
+			condlog(3, "rbd%d thread not finished", ct->rbd_bus_id);
4728c8
+			rbd_status = PATH_PENDING;
4728c8
+		} else {
4728c8
+			/* checker done */
4728c8
+			ct->running = 0;
4728c8
+			rbd_status = ct->state;
4728c8
+			strncpy(c->message, ct->message, CHECKER_MSG_LEN);
4728c8
+			c->message[CHECKER_MSG_LEN - 1] = '\0';
4728c8
+		}
4728c8
+		pthread_mutex_unlock(&ct->lock);
4728c8
+	} else {
4728c8
+		/* Start new checker */
4728c8
+		ct->state = PATH_UNCHECKED;
4728c8
+		ct->fn = fn;
4728c8
+		pthread_spin_lock(&ct->hldr_lock);
4728c8
+		ct->holders++;
4728c8
+		pthread_spin_unlock(&ct->hldr_lock);
4728c8
+		setup_thread_attr(&attr, 32 * 1024, 1);
4728c8
+		r = pthread_create(&ct->thread, &attr, rbd_thread, ct);
4728c8
+		if (r) {
4728c8
+			pthread_mutex_unlock(&ct->lock);
4728c8
+			ct->thread = 0;
4728c8
+			ct->holders--;
4728c8
+			condlog(3, "rbd%d failed to start rbd thread, using sync mode",
4728c8
+				ct->rbd_bus_id);
4728c8
+			return fn(ct, c->message);
4728c8
+		}
4728c8
+		pthread_attr_destroy(&attr);
4728c8
+		rbd_timeout(&tsp);
4728c8
+		r = pthread_cond_timedwait(&ct->active, &ct->lock, &tsp);
4728c8
+		rbd_status = ct->state;
4728c8
+		strncpy(c->message, ct->message,CHECKER_MSG_LEN);
4728c8
+		c->message[CHECKER_MSG_LEN -1] = '\0';
4728c8
+		pthread_mutex_unlock(&ct->lock);
4728c8
+
4728c8
+		if (ct->thread &&
4728c8
+		    (rbd_status == PATH_PENDING || rbd_status == PATH_UNCHECKED)) {
4728c8
+			condlog(3, "rbd%d thread still running",
4728c8
+				ct->rbd_bus_id);
4728c8
+			ct->running = 1;
4728c8
+			rbd_status = PATH_PENDING;
4728c8
+		}
4728c8
+	}
4728c8
+
4728c8
+	return rbd_status;
4728c8
+}
4728c8
+
4728c8
+void libcheck_repair(struct checker * c)
4728c8
+{
4728c8
+	struct rbd_checker_context *ct = c->context;
4728c8
+
4728c8
+	if (!ct || !ct->blacklisted)
4728c8
+		return;
4728c8
+	rbd_exec_fn(c, rbd_repair);
4728c8
+}
4728c8
+
4728c8
+int libcheck_check(struct checker * c)
4728c8
+{
4728c8
+	struct rbd_checker_context *ct = c->context;
4728c8
+
4728c8
+	if (!ct)
4728c8
+		return PATH_UNCHECKED;
4728c8
+
4728c8
+	if (ct->blacklisted)
4728c8
+		return PATH_DOWN;
4728c8
+
4728c8
+	return rbd_exec_fn(c, rbd_check);
4728c8
+}
4728c8
Index: multipath-tools-130222/multipath.conf.annotated
4728c8
===================================================================
4728c8
--- multipath-tools-130222.orig/multipath.conf.annotated
4728c8
+++ multipath-tools-130222/multipath.conf.annotated
4728c8
@@ -97,7 +97,7 @@
4728c8
 #	# scope   : multipath & multipathd
4728c8
 #	# desc    : the default method used to determine the paths' state
4728c8
 #	# values  : readsector0|tur|emc_clariion|hp_sw|directio|rdac|
4728c8
-#	            cciss_tur|hp_tur
4728c8
+#	            cciss_tur|hp_tur|rbd
4728c8
 #	# default : directio
4728c8
 #	#
4728c8
 #	path_checker	directio
4728c8
@@ -493,7 +493,7 @@
4728c8
 #		# scope   : multipathd & multipathd
4728c8
 #		# desc    : path checking algorithm to use to check path state
4728c8
 #		# values  : readsector0|tur|emc_clariion|hp_sw|directio|rdac|
4728c8
-#		#           cciss_tur|hp_tur
4728c8
+#		#           cciss_tur|hp_tur|rbd
4728c8
 #		#
4728c8
 #		path_checker		directio
4728c8
 #
4728c8
Index: multipath-tools-130222/multipath/multipath.conf.5
4728c8
===================================================================
4728c8
--- multipath-tools-130222.orig/multipath/multipath.conf.5
4728c8
+++ multipath-tools-130222/multipath/multipath.conf.5
4728c8
@@ -284,6 +284,9 @@ Check the path state for LSI/Engenio/Net
4728c8
 .B directio
4728c8
 Read the first sector with direct I/O.
4728c8
 .TP
4728c8
+.B rbd
4728c8
+Check if the path is in the Ceph blacklist.
4728c8
+.TP
4728c8
 Default value is \fIdirectio\fR.
4728c8
 .RE
4728c8
 .TP