diff --git a/.gitignore b/.gitignore index 0d0b884..66693a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/nvme-stas-1.1.6.tar.gz +SOURCES/nvme-stas-2.1.1.tar.gz diff --git a/.nvme-stas.metadata b/.nvme-stas.metadata index 487f8a9..7fcad34 100644 --- a/.nvme-stas.metadata +++ b/.nvme-stas.metadata @@ -1 +1 @@ -829d844d8ee2f797fdbf557be1815310cd37a1e3 SOURCES/nvme-stas-1.1.6.tar.gz +cba4b6de7241d339de4aa08cdcf55c2bd2293a66 SOURCES/nvme-stas-2.1.1.tar.gz diff --git a/SOURCES/0001-sync-with-1.1.6.patch b/SOURCES/0001-sync-with-1.1.6.patch deleted file mode 100644 index c28df7f..0000000 --- a/SOURCES/0001-sync-with-1.1.6.patch +++ /dev/null @@ -1,3307 +0,0 @@ -diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml -index 93604e8..4e1b6c5 100644 ---- a/.github/workflows/pylint.yml -+++ b/.github/workflows/pylint.yml -@@ -1,9 +1,19 @@ --name: Pylint -+name: Linters - - on: [push] - - jobs: -- build: -+ -+ docker-lint: -+ runs-on: ubuntu-latest -+ steps: -+ - uses: actions/checkout@v3 -+ - uses: hadolint/hadolint-action@v2.1.0 -+ with: -+ recursive: true -+ ignore: DL3041 -+ -+ python-lint: - runs-on: ubuntu-latest - - strategy: -diff --git a/Dockerfile b/Dockerfile -index ad6742e..0ab5138 100644 ---- a/Dockerfile -+++ b/Dockerfile -@@ -2,12 +2,12 @@ FROM fedora:36 - - WORKDIR /root - --# for nvme-stas --RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson --# for libnvme --RUN dnf install -y git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson -+# first line for nvme-stas -+# second line for libnvme -+RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson \ -+ git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson && dnf clean all - - COPY . . --RUN meson .build && ninja -C .build && cd .build && meson install -+RUN meson .build && ninja -C .build && meson install -C .build - - ENTRYPOINT ["python3"] -diff --git a/NEWS.md b/NEWS.md -index d1515cd..f56a7c9 100644 ---- a/NEWS.md -+++ b/NEWS.md -@@ -5,6 +5,7 @@ - - Fix issues with I/O controller connection audits - - Eliminate pcie devices from list of I/O controller connections to audit - - Add soaking timer to workaround race condition between kernel and user-space applications on "add" uevents. When the kernel adds a new nvme device (e.g. `/dev/nvme7`) and sends a "add" uevent to notify user-space applications, the attributes associated with that device (e.g. `/sys/class/nvme/nvme7/cntrltype`) may not be fully initialized which can lead `stacd` to dismiss a device that should get audited. -+- Make `sticky-connections=enabled` the default (see `stacd.conf`) - - ## Changes with release 1.1.5 - -@@ -32,7 +33,7 @@ stacd: Bug fix. Check that self._cfg_soak_tmr is not None before dereferencing i - - ## Changes with release 1.1.1 - --Make `sticky-connections-disabled` by default -+Make `sticky-connections=disabled` the default (see `stacd.conf`) - - ## Changes with release 1.1 - -diff --git a/coverage.sh.in b/coverage.sh.in -index 96b8c53..5ba2ebe 100755 ---- a/coverage.sh.in -+++ b/coverage.sh.in -@@ -38,14 +38,24 @@ PRIMARY_GRP=$( id -ng ) - PRIMARY_USR=$( id -nu ) - PYTHON_PATH=.:./subprojects/libnvme - -+log() { -+ msg="$1" -+ printf "%b[1;36m%s%b[0m\n" "\0033" "${msg}" "\0033" -+ sudo logger -i "@@@@@ COVERAGE -" -p 4 "${msg}" -+} -+ - sd_stop() { -- unit="$1"-cov.service -+ app="$1" -+ unit="${app}"-cov.service -+ log "Stop ${app}" - sudo systemctl stop "${unit}" >/dev/null 2>&1 - sudo systemctl reset-failed "${unit}" >/dev/null 2>&1 - } - - sd_restart() { -- unit="$1"-cov.service -+ app="$1" -+ unit="${app}"-cov.service -+ log "Restart ${app}" - sudo systemctl restart "${unit}" >/dev/null 2>&1 - } - -@@ -61,7 +71,7 @@ sd_start() { - cmd="${app} --syslog -f ${conf}" - fi - -- printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start ${app}" "\0033" -+ log "Start ${app}" - - RUNTIME_DIRECTORY=/tmp/${app} - rm -rf ${RUNTIME_DIRECTORY} -@@ -75,7 +85,7 @@ reload_cfg() { - app="$1" - unit="${app}"-cov.service - pid=$( systemctl show --property MainPID --value "${unit}" ) -- printf "%b[1;36m%s%b[0m\n" "\0033" "Reload config ${app}" "\0033" -+ log "Reload config ${app}" - sudo kill -HUP "${pid}" - } - -@@ -83,15 +93,24 @@ if [ ! -d coverage ]; then - mkdir coverage - fi - -+ -+log "START-START-START-START-START-START-START-START-START-START-START-START" -+ -+ -+ - ################################################################################ - # Load nvme kernel module -+log "modprobe nvme-tcp" - sudo /usr/sbin/modprobe nvme-tcp - -+log "nvme disconnect-all" - sudo nvme disconnect-all - - ################################################################################ - # Create a dummy config file for @STAFD_PROCNAME@ --stafd_conf_fname=$(mktemp /tmp/@STAFD_PROCNAME@.conf.XXXXXX) -+file=/tmp/@STAFD_PROCNAME@.conf.XXXXXX -+log "Create dummy config file $file" -+stafd_conf_fname=$(mktemp $file) - cat > "${stafd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -102,7 +121,9 @@ EOF - - ################################################################################ - # Create a dummy config file for @STACD_PROCNAME@ --stacd_conf_fname=$(mktemp /tmp/@STACD_PROCNAME@.conf.XXXXXX) -+file=/tmp/@STACD_PROCNAME@.conf.XXXXXX -+log "Create dummy config file $file" -+stacd_conf_fname=$(mktemp $file) - cat > "${stacd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -111,6 +132,7 @@ udev-rule=disabled - sticky-connections=enabled - EOF - -+log "Stop & Mask Avahi daemon" - sudo systemctl stop avahi-daemon.service - sudo systemctl stop avahi-daemon.socket - sudo systemctl mask avahi-daemon.service -@@ -118,11 +140,11 @@ sudo systemctl mask avahi-daemon.socket - sleep 1 - - --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" "\0033" -+log "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls >/dev/null 2>&1 - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ invalid-command >/dev/null 2>&1 - --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" "\0033" -+log "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls >/dev/null 2>&1 - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ invalid-command >/dev/null 2>&1 - -@@ -132,30 +154,33 @@ sd_start "@STAFD_PROCNAME@" "@STAFD_DBUS_NAME@" "${stafd_conf_fname}" - sd_start "@STACD_PROCNAME@" "@STACD_DBUS_NAME@" "${stacd_conf_fname}" - sleep 3 - --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" -+log "Invoking @STAFD_CTLNAME@ status" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 - - reload_cfg "@STAFD_PROCNAME@" - sleep 1 - -+log "Restart Avahi daemon" - sudo systemctl unmask avahi-daemon.socket - sudo systemctl unmask avahi-daemon.service - sudo systemctl start avahi-daemon.socket - sudo systemctl start avahi-daemon.service - sleep 2 - -+log "Change stafd config: tron=true, persistent-connections=false, zeroconf=enable" - cat > "${stafd_conf_fname}" <<'EOF' - [Global] - tron=true - persistent-connections=false - - [Service Discovery] --zeroconf=disabled -+zeroconf=enabled - EOF - reload_cfg "@STAFD_PROCNAME@" - - sleep 1 - -+log "Change stafd config: ip-family=ipv4, kato=10, adding multiple controllers" - cat > "${stafd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -172,11 +197,15 @@ controller=transport=tcp;traddr=abracadabra - controller= - controller=trsvcid - controller=transport=rdma;traddr=!@#$ -+controller=transport=fc;traddr=21:00:00:00:00:00:00:00;host-traddr=20:00:00:00:00:00:00:00 -+controller=transport=XM;traddr=2.2.2.2 - blacklist=transport=tcp;traddr=1.1.1.1 - blacklist=transport=tcp;traddr=1000.1000.1000.1000 - EOF - reload_cfg "@STAFD_PROCNAME@" - -+ -+log "Change stacd config: tron=true, udev-rule=disabled, sticky-connections=disabled" - cat > "${stacd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -186,12 +215,12 @@ EOF - reload_cfg "@STACD_PROCNAME@" - sleep 3 - --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" -+log "Invoking @STAFD_CTLNAME@ status" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 - - ################################################################################ - # Fake mDNS packets from a CDC --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start Avahi publisher" "\0033" -+log "Start Avahi publisher" - AVAHI_PUBLISHER=mdns_publisher.service - sudo systemctl stop ${AVAHI_PUBLISHER} >/dev/null 2>&1 - sudo systemctl reset-failed ${AVAHI_PUBLISHER} >/dev/null 2>&1 -@@ -200,7 +229,7 @@ sleep 1 - - ################################################################################ - # Start nvme target simulator --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start nvmet" "\0033" -+log "Start nvmet" - sudo ../utils/nvmet/nvmet.py clean - sudo ../utils/nvmet/nvmet.py create -f ../utils/nvmet/nvmet.conf - sleep 2 -@@ -210,76 +239,76 @@ reload_cfg "@STACD_PROCNAME@" - sleep 3 - - ################################################################################ --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --version" "\0033" -+log "Invoking @STAFD_PROCNAME@ --version" - coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --version --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --idl" "\0033" -+log "Invoking @STAFD_PROCNAME@ --idl" - coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --idl /tmp/@STAFD_PROCNAME@.idl - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --version" "\0033" -+log "Invoking @STACD_PROCNAME@ --version" - coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --version --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --idl" "\0033" -+log "Invoking @STACD_PROCNAME@ --idl" - coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --idl /tmp/@STACD_PROCNAME@.idl - - ################################################################################ - # Stimulate D-Bus activity --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ --version" "\0033" -+log "Invoking @STAFD_CTLNAME@ --version" - sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ --version --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ with a bad command" "\0033" -+log "Invoking @STAFD_CTLNAME@ with a bad command" - sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ blah --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ troff" "\0033" -+log "Invoking @STAFD_CTLNAME@ troff" - sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ troff --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" -+log "Invoking @STAFD_CTLNAME@ status" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ tron" "\0033" -+log "Invoking @STAFD_CTLNAME@ tron" - sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ tron --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ ls" "\0033" -+log "Invoking @STAFD_CTLNAME@ ls" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls -d >/dev/null 2>&1 --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ adlp" "\0033" -+log "Invoking @STAFD_CTLNAME@ adlp" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ adlp -d >/dev/null 2>&1 --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ dlp" "\0033" -+log "Invoking @STAFD_CTLNAME@ dlp" - coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ dlp -t tcp -a ::1 -s 8009 >/dev/null 2>&1 - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ --version" "\0033" -+log "Invoking @STACD_CTLNAME@ --version" - sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ --version --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ with a bad command" "\0033" -+log "Invoking @STACD_CTLNAME@ with a bad command" - sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ blah --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ troff" "\0033" -+log "Invoking @STACD_CTLNAME@ troff" - sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ troff --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status" "\0033" -+log "Invoking @STACD_CTLNAME@ status" - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ status >/dev/null 2>&1 --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ tron" "\0033" -+log "Invoking @STACD_CTLNAME@ tron" - sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ tron --printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" -+log "Invoking @STACD_CTLNAME@ ls" - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 - - ################################################################################ - # Stimulate AENs activity by removing/restoring namespaces --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove namespace: klingons" "\0033" -+log "Remove namespace: klingons" - sudo ../utils/nvmet/nvmet.py unlink -p 1 -s klingons - sleep 2 --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" -+log "Invoking @STACD_CTLNAME@ ls" - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restore namespace: klingons" "\0033" -+log "Restore namespace: klingons" - sudo ../utils/nvmet/nvmet.py link -p 1 -s klingons - sleep 2 --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" -+log "Invoking @STACD_CTLNAME@ ls" - coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 - - ################################################################################ - # Stop Avahi Publisher --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" -+log "Stop Avahi publisher" - sudo systemctl stop ${AVAHI_PUBLISHER} - sleep 1 - - ################################################################################ --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restart Avahi publisher" "\0033" -+log "Restart Avahi publisher" - sudo systemd-run --unit=${AVAHI_PUBLISHER} --working-directory=. avahi-publish -s SFSS _nvme-disc._tcp 8009 "p=tcp" - sleep 2 - - ################################################################################ - # Make config changes for @STAFD_PROCNAME@ --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" "\0033" -+log "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" - cat > "${stafd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -293,7 +322,7 @@ sleep 1 - - ################################################################################ - # Make more config changes for @STAFD_PROCNAME@ --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" "\0033" -+log "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" - cat > "${stafd_conf_fname}" <<'EOF' - [Global] - tron=true -@@ -307,24 +336,23 @@ sleep 2 - - ################################################################################ - # Stop Avahi Publisher --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" -+log "Stop Avahi publisher" - sudo systemctl stop ${AVAHI_PUBLISHER} - sleep 2 - - ################################################################################ - # Remove one of the NVMe device's --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove (disconnect) nvme1" "\0033" -+log "Remove (disconnect) nvme1" - sudo nvme disconnect -d nvme1 - sleep 2 - - - ################################################################################ --printf "%b[1;36m%s%b[0m\n" "\0033" "Restart @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" - sd_restart "@STAFD_PROCNAME@" - sd_restart "@STACD_PROCNAME@" - sleep 1 - --printf "%b[1;36m%s%b[0m\n" "\0033" "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" "\0033" -+log "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" - rm -rf "/tmp/@STAFD_PROCNAME@" - sd_stop "@STAFD_PROCNAME@" - sd_restart "@STACD_PROCNAME@" -@@ -334,7 +362,7 @@ sleep 2 - - ################################################################################ - # Stop everything and collect coverage stats --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" -+log "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" - sd_stop "@STAFD_PROCNAME@" - sd_stop "@STACD_PROCNAME@" - sleep 1 -@@ -345,33 +373,49 @@ sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" coverage >/dev/null 2>&1 - sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" staslib/__pycache__ >/dev/null 2>&1 - sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" subprojects/libnvme/libnvme/__pycache__ >/dev/null 2>&1 - -+log "nvme disconnect-all" - sudo nvme disconnect-all - -+log "Remove ${stafd_conf_fname} and ${stacd_conf_fname}" - rm "${stafd_conf_fname}" - rm "${stacd_conf_fname}" - -+log "Run unit test: test-udev" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-udev.py -+log "Run unit test: test-avahi" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-avahi.py -+log "Run unit test: test-gtimer" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-gtimer.py -+log "Run unit test: test-version" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-version.py -+log "Run unit test: test-transport_id" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-transport_id.py -+log "Run unit test: test-config" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-config.py -+log "Run unit test: test-controller" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-controller.py -+log "Run unit test: test-service" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-service.py -+log "Run unit test: test-log" - PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-log.py -+log "Run unit test: test-nvme_options" - sudo PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-nvme_options.py - - ################################################################################ - # Stop nvme target simulator --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop nvmet" "\0033" -+log "Stop nvmet" - sudo ../utils/nvmet/nvmet.py clean - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Collect all coverage data" "\0033" -+log "Collect all coverage data" - coverage combine --rcfile=.coveragerc - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report" "\0033" -+log "Generating coverage report" - coverage report -i --rcfile=.coveragerc - --printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report (HTML)" "\0033" -+log "Generating coverage report (HTML)" - coverage html -i --rcfile=.coveragerc - -+ -+log "All done!!!" -+ -+log "FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED" -diff --git a/doc/man/stacd.conf.xml b/doc/man/stacd.conf.xml -index 60622f6..65ee71a 100644 ---- a/doc/man/stacd.conf.xml -+++ b/doc/man/stacd.conf.xml -@@ -378,7 +378,7 @@ - entries in stacd.conf have been removed. - - -- With <code>sticky-connections=disabled</code> (default) -+ With <code>sticky-connections=disabled</code> - - stacd immediately disconnects from - a previously connected IOC if the response to a -@@ -411,7 +411,7 @@ - - - -- With <code>sticky-connections=enabled</code> -+ With <code>sticky-connections=enabled (default)</code> - - stacd does not disconnect from IOCs - when a DPLE is removed or a controller= -diff --git a/etc/stas/stacd.conf b/etc/stas/stacd.conf -index 02e7b3e..0434671 100644 ---- a/etc/stas/stacd.conf -+++ b/etc/stas/stacd.conf -@@ -202,8 +202,8 @@ - # - # Type: String - # Range: [disabled, enabled] --# Default: disabled --#sticky-connections=disabled -+# Default: enabled -+#sticky-connections=enabled - - [Controllers] - # controller: I/O Controllers (IOC) are specified with this keyword. -diff --git a/stacd.py b/stacd.py -index 708e372..28cefac 100755 ---- a/stacd.py -+++ b/stacd.py -@@ -10,14 +10,12 @@ - ''' STorage Appliance Connector Daemon - ''' - import sys --import logging - from argparse import ArgumentParser - from staslib import defs - --# pylint: disable=consider-using-f-string --DBUS_IDL = ''' -+DBUS_IDL = f''' - -- -+ - - - -@@ -34,19 +32,16 @@ DBUS_IDL = ''' - - - -- -+ - - -- -+ - - - --''' % ( -- defs.STACD_DBUS_NAME, -- defs.STACD_DBUS_NAME, --) -- -+''' - -+# ****************************************************************************** - def parse_args(conf_file: str): # pylint: disable=missing-function-docstring - parser = ArgumentParser( - description=f'{defs.STAC_DESCRIPTION} ({defs.STAC_ACRONYM}). Must be root to run this program.' -@@ -77,6 +72,12 @@ ARGS = parse_args(defs.STACD_CONFIG_FILE) - - if ARGS.version: - print(f'{defs.PROJECT_NAME} {defs.VERSION}') -+ try: -+ import libnvme -+ -+ print(f'libnvme {libnvme.__version__}') -+ except (AttributeError, ModuleNotFoundError): -+ pass - sys.exit(0) - - if ARGS.idl: -@@ -85,78 +86,14 @@ if ARGS.idl: - sys.exit(0) - - --# There is a reason for having this import here and not at the top of the file. --# We want to allow running stafd with the --version and --idl options and exit --# without having to import stas. --from staslib import stas # pylint: disable=wrong-import-position -- --# Before going any further, make sure the script is allowed to run. --stas.check_if_allowed_to_continue() -- -- --################################################################################ --# Preliminary checks have passed. Let her rip! --# pylint: disable=wrong-import-position --# pylint: disable=wrong-import-order --import json --import pathlib --import systemd.daemon --import dasbus.error --import dasbus.client.observer --import dasbus.client.proxy --from gi.repository import GLib --from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports -- --log.init(ARGS.syslog) -- --UDEV_RULE_SUPPRESS = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') -- -- --def udev_rule_ctrl(enable): -- '''@brief We add an empty udev rule to /run/udev/rules.d to suppress -- nvme-cli's udev rule that is used to tell udevd to automatically -- connect to I/O controller. This is to avoid race conditions between -- stacd and udevd. This is configurable. See "udev-rule" in stacd.conf -- for details. -- ''' -- if enable: -- try: -- UDEV_RULE_SUPPRESS.unlink() -- except FileNotFoundError: -- pass -- else: -- if not UDEV_RULE_SUPPRESS.exists(): -- pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) -- UDEV_RULE_SUPPRESS.symlink_to('/dev/null') -- -- - # ****************************************************************************** --class Ioc(ctrl.Controller): -- '''@brief This object establishes a connection to one I/O Controller.''' -- -- def __init__(self, root, host, tid: trid.TID): -- super().__init__(root, host, tid) -- -- def _on_udev_remove(self, udev_obj): -- '''Called when the associated nvme device (/dev/nvmeX) is removed -- from the system. -- ''' -- super()._on_udev_remove(udev_obj) -- -- # Defer removal of this object to the next main loop's idle period. -- GLib.idle_add(STAC.remove_controller, self) -- -- def _find_existing_connection(self): -- return self._udev.find_nvme_ioc_device(self.tid) -- -- --# ****************************************************************************** --class Stac(service.Service): -- '''STorage Appliance Connector (STAC)''' -+if __name__ == '__main__': -+ import json -+ import logging -+ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports - -- CONF_STABILITY_SOAK_TIME_SEC = 1.5 -- CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name -- ADD_EVENT_SOAK_TIME_SEC = 1 -+ # Before going any further, make sure the script is allowed to run. -+ stas.check_if_allowed_to_continue() - - class Dbus: - '''This is the DBus interface that external programs can use to -@@ -205,229 +142,8 @@ class Stac(service.Service): - for controller in STAC.get_controllers() - ] - -- # ========================================================================== -- def __init__(self, args): -- super().__init__(args, self._reload_hdlr) -- -- # We don't want to apply configuration changes to nvme-cli right away. -- # Often, multiple changes will occur in a short amount of time (sub-second). -- # We want to wait until there are no more changes before applying them -- # to the system. The following timer acts as a "soak period". Changes -- # will be applied by calling self._on_config_ctrls() at the end of -- # the soak period. -- self._cfg_soak_tmr = gutil.GTimer(Stac.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) -- self._cfg_soak_tmr.start() -- -- self._add_event_soak_tmr = gutil.GTimer(Stac.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) -- -- self._config_connections_audit() -- -- # Create the D-Bus instance. -- self._config_dbus(Stac.Dbus(), defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) -- -- # Connect to STAF D-Bus interface -- self._staf = None -- self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) -- self._staf_watcher.service_available.connect(self._connect_to_staf) -- self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) -- self._staf_watcher.connect_once_available() -- -- # Suppress udev rule to auto-connect when AEN is received. -- udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) -- -- def _release_resources(self): -- logging.debug('Stac._release_resources()') -- -- if self._add_event_soak_tmr: -- self._add_event_soak_tmr.kill() -- -- udev_rule_ctrl(True) -- -- if self._udev: -- self._udev.unregister_for_action_events('add') -- -- self._destroy_staf_comlink(self._staf_watcher) -- if self._staf_watcher is not None: -- self._staf_watcher.disconnect() -- -- super()._release_resources() -- -- self._staf = None -- self._staf_watcher = None -- self._add_event_soak_tmr = None -- -- def _audit_connections(self, tids): -- '''A host should only connect to I/O controllers that have been zoned -- for that host or a manual "controller" entry exists in stcd.conf. -- A host should disconnect from an I/O controller when that I/O controller -- is removed from the zone or a manual "controller" entry is removed from -- stacd.conf. stacd will audit connections if "sticky-connections=disabled". -- stacd will delete any connection that is not supposed to exist. -- ''' -- logging.debug('Stac._audit_connections() - tids = %s', tids) -- num_controllers = len(self._controllers) -- for tid in tids: -- if tid not in self._controllers: -- self._controllers[tid] = Ioc(self._root, self._host, tid) -- -- if num_controllers != len(self._controllers): -- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) -- -- def _on_add_event(self, udev_obj): # pylint: disable=unused-argument -- '''@brief This function is called when a "add" event is received from -- the kernel for an NVMe device. This is used to trigger an audit and make -- sure that the connection to an I/O controller is allowed. -- -- WARNING: There is a race condition with the "add" event from the kernel. -- The kernel sends the "add" event a bit early and the sysfs attributes -- associated with the nvme object are not always fully initialized. -- To workaround this problem we use a soaking timer to give time for the -- sysfs attributes to stabilize. -- ''' -- self._add_event_soak_tmr.start() -- -- def _on_add_event_soaked(self): -- '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC -- seconds, we can audit the connections. -- ''' -- if not conf.SvcConf().sticky_connections: -- self._audit_connections(self._udev.get_nvme_ioc_tids()) -- return GLib.SOURCE_REMOVE -- -- def _config_connections_audit(self): -- '''This function checks the "sticky_connections" parameter to determine -- whether audits should be performed. Audits are enabled when -- "sticky_connections" is disabled. -- ''' -- if not conf.SvcConf().sticky_connections: -- if self._udev.get_registered_action_cback('add') is None: -- self._udev.register_for_action_events('add', self._on_add_event) -- self._audit_connections(self._udev.get_nvme_ioc_tids()) -- else: -- self._udev.unregister_for_action_events('add') -- -- def _keep_connections_on_exit(self): -- '''@brief Determine whether connections should remain when the -- process exits. -- ''' -- return True -- -- def _reload_hdlr(self): -- '''@brief Reload configuration file. This is triggered by the SIGHUP -- signal, which can be sent with "systemctl reload stacd". -- ''' -- systemd.daemon.notify('RELOADING=1') -- service_cnf = conf.SvcConf() -- service_cnf.reload() -- self.tron = service_cnf.tron -- self._config_connections_audit() -- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) -- udev_rule_ctrl(service_cnf.udev_rule_enabled) -- systemd.daemon.notify('READY=1') -- return GLib.SOURCE_CONTINUE -- -- def _get_log_pages_from_stafd(self): -- if self._staf: -- try: -- return json.loads(self._staf.get_all_log_pages(True)) -- except dasbus.error.DBusError: -- pass -- -- return list() -- -- def _config_ctrls_finish(self, configured_ctrl_list): -- configured_ctrl_list = [ -- ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict -- ] -- logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) -- -- discovered_ctrl_list = list() -- for staf_data in self._get_log_pages_from_stafd(): -- host_traddr = staf_data['discovery-controller']['host-traddr'] -- host_iface = staf_data['discovery-controller']['host-iface'] -- for dlpe in staf_data['log-pages']: -- if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers -- discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) -- -- logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) -- -- controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) -- controllers = stas.remove_invalid_addresses(controllers) -- -- new_controller_ids = {trid.TID(controller) for controller in controllers} -- cur_controller_ids = set(self._controllers.keys()) -- controllers_to_add = new_controller_ids - cur_controller_ids -- controllers_to_del = cur_controller_ids - new_controller_ids -- -- logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) -- logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) -- -- for tid in controllers_to_del: -- controller = self._controllers.pop(tid, None) -- if controller is not None: -- controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) -- -- for tid in controllers_to_add: -- self._controllers[tid] = Ioc(self._root, self._host, tid) -- -- def _connect_to_staf(self, _): -- '''@brief Hook up DBus signal handlers for signals from stafd.''' -- try: -- self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) -- self._staf.log_pages_changed.connect(self._log_pages_changed) -- self._cfg_soak_tmr.start() -- -- # Make sure timer is set back to its normal value. -- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_SOAK_TIME_SEC) -- logging.debug('Stac._connect_to_staf() - Connected to staf') -- except dasbus.error.DBusError: -- logging.error('Failed to connect to staf') -- -- def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument -- if self._staf: -- self._staf.log_pages_changed.disconnect(self._log_pages_changed) -- dasbus.client.proxy.disconnect_proxy(self._staf) -- self._staf = None -- -- def _disconnect_from_staf(self, watcher): -- self._destroy_staf_comlink(watcher) -- -- # When we lose connectivity with stafd, the most logical explanation -- # is that stafd restarted. In that case, it may take some time for stafd -- # to re-populate its log pages cache. So let's give stafd plenty of time -- # to update its log pages cache and send log pages change notifications -- # before triggering a stacd re-config. We do this by momentarily -- # increasing the config soak timer to a longer period. -- if self._cfg_soak_tmr: -- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_LONG_SOAK_TIME_SEC) -- -- logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') -- -- def _log_pages_changed( # pylint: disable=too-many-arguments -- self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device -- ): -- logging.debug( -- 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', -- transport, -- traddr, -- trsvcid, -- host_traddr, -- host_iface, -- subsysnqn, -- device, -- ) -- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) -- -- def _load_last_known_config(self): -- return dict() -- -- def _dump_last_known_config(self, controllers): -- pass -- -- --# ****************************************************************************** --if __name__ == '__main__': -- STAC = Stac(ARGS) -+ log.init(ARGS.syslog) -+ STAC = service.Stac(ARGS, Dbus()) - STAC.run() - - STAC = None -diff --git a/stafd.py b/stafd.py -index aff64fd..8a77c51 100755 ---- a/stafd.py -+++ b/stafd.py -@@ -10,14 +10,12 @@ - ''' STorage Appliance Finder Daemon - ''' - import sys --import logging - from argparse import ArgumentParser - from staslib import defs - --# pylint: disable=consider-using-f-string --DBUS_IDL = ''' -+DBUS_IDL = f''' - -- -+ - - - -@@ -34,10 +32,10 @@ DBUS_IDL = ''' - - - -- -+ - - -- -+ - - - -@@ -46,7 +44,7 @@ DBUS_IDL = ''' - - - -- -+ - - - -@@ -63,12 +61,10 @@ DBUS_IDL = ''' - - - --''' % ( -- defs.STAFD_DBUS_NAME, -- defs.STAFD_DBUS_NAME, --) -+''' - - -+# ****************************************************************************** - def parse_args(conf_file: str): # pylint: disable=missing-function-docstring - parser = ArgumentParser( - description=f'{defs.STAF_DESCRIPTION} ({defs.STAF_ACRONYM}). Must be root to run this program.' -@@ -99,6 +95,12 @@ ARGS = parse_args(defs.STAFD_CONFIG_FILE) - - if ARGS.version: - print(f'{defs.PROJECT_NAME} {defs.VERSION}') -+ try: -+ import libnvme -+ -+ print(f'libnvme {libnvme.__version__}') -+ except (AttributeError, ModuleNotFoundError): -+ pass - sys.exit(0) - - if ARGS.idl: -@@ -107,250 +109,15 @@ if ARGS.idl: - sys.exit(0) - - --# There is a reason for having this import here and not at the top of the file. --# We want to allow running stafd with the --version and --idl options and exit --# without having to import stas and avahi. --from staslib import stas, avahi # pylint: disable=wrong-import-position -- --# Before going any further, make sure the script is allowed to run. --stas.check_if_allowed_to_continue() -- -- --################################################################################ --# Preliminary checks have passed. Let her rip! --# pylint: disable=wrong-import-position --# pylint: disable=wrong-import-order --import json --import pickle --import dasbus.server.interface --import systemd.daemon --from libnvme import nvme --from gi.repository import GLib --from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports -- --log.init(ARGS.syslog) -- --DLP_CHANGED = ( -- (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE --) # 0x70f002 -- -- - # ****************************************************************************** --class Dc(ctrl.Controller): -- '''@brief This object establishes a connection to one Discover Controller (DC). -- It retrieves the discovery log pages and caches them. -- It also monitors udev events associated with that DC and updates -- the cached discovery log pages accordingly. -- ''' -- -- GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 -- REGISTRATION_RETRY_RERIOD_SEC = 10 -- -- def __init__(self, root, host, tid: trid.TID, log_pages=None): -- super().__init__(root, host, tid, discovery_ctrl=True) -- self._register_op = None -- self._get_log_op = None -- self._log_pages = log_pages if log_pages else list() # Log pages cache -- -- def _release_resources(self): -- logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) -- super()._release_resources() -- self._log_pages = list() -- -- def _kill_ops(self): -- super()._kill_ops() -- if self._get_log_op: -- self._get_log_op.kill() -- self._get_log_op = None -- if self._register_op: -- self._register_op.kill() -- self._register_op = None -- -- def info(self) -> dict: -- '''@brief Get the controller info for this object''' -- info = super().info() -- if self._get_log_op: -- info['get log page operation'] = self._get_log_op.as_dict() -- if self._register_op: -- info['register operation'] = self._register_op.as_dict() -- return info -- -- def cancel(self): -- '''@brief Used to cancel pending operations.''' -- super().cancel() -- if self._get_log_op: -- self._get_log_op.cancel() -- if self._register_op: -- self._register_op.cancel() -- -- def log_pages(self) -> list: -- '''@brief Get the cached log pages for this object''' -- return self._log_pages -- -- def referrals(self) -> list: -- '''@brief Return the list of referrals''' -- return [page for page in self._log_pages if page['subtype'] == 'referral'] -- -- def _on_aen(self, aen: int): -- super()._on_aen(aen) -- if aen == DLP_CHANGED and self._get_log_op: -- self._get_log_op.run_async() -- -- def _on_nvme_event(self, nvme_event: str): -- super()._on_nvme_event(nvme_event) -- if nvme_event == 'connected' and self._register_op: -- self._register_op.run_async() -- -- def _on_udev_remove(self, udev_obj): -- super()._on_udev_remove(udev_obj) -- if self._try_to_connect_deferred: -- self._try_to_connect_deferred.schedule() -- -- def _find_existing_connection(self): -- return self._udev.find_nvme_dc_device(self.tid) -- -- # -------------------------------------------------------------------------- -- def _on_connect_success(self, op_obj, data): -- '''@brief Function called when we successfully connect to the -- Discovery Controller. -- ''' -- super()._on_connect_success(op_obj, data) -- -- if self._alive(): -- if self._ctrl.is_registration_supported(): -- self._register_op = gutil.AsyncOperationWithRetry( -- self._on_registration_success, -- self._on_registration_fail, -- self._ctrl.registration_ctlr, -- nvme.NVMF_DIM_TAS_REGISTER, -- ) -- self._register_op.run_async() -- else: -- self._get_log_op = gutil.AsyncOperationWithRetry( -- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover -- ) -- self._get_log_op.run_async() -- -- # -------------------------------------------------------------------------- -- def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument -- '''@brief Function called when we successfully register with the -- Discovery Controller. See self._register_op object -- for details. -- ''' -- if self._alive(): -- if data is not None: -- logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) -- else: -- logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) -- self._get_log_op = gutil.AsyncOperationWithRetry( -- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover -- ) -- self._get_log_op.run_async() -- else: -- logging.debug( -- 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device -- ) -- -- def _on_registration_fail(self, op_obj, err, fail_cnt): -- '''@brief Function called when we fail to register with the -- Discovery Controller. See self._register_op object -- for details. -- ''' -- if self._alive(): -- logging.debug( -- 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', -- self.id, -- self.device, -- err, -- Dc.REGISTRATION_RETRY_RERIOD_SEC, -- ) -- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect -- logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) -- # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) -- else: -- logging.debug( -- 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', -- self.id, -- self.device, -- err, -- ) -- op_obj.kill() -- -- # -------------------------------------------------------------------------- -- def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument -- '''@brief Function called when we successfully retrieve the log pages -- from the Discovery Controller. See self._get_log_op object -- for details. -- ''' -- if self._alive(): -- # Note that for historical reasons too long to explain, the CDC may -- # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be -- # filtered out. -- referrals_before = self.referrals() -- self._log_pages = ( -- [ -- {k: str(v) for k, v in dictionary.items()} -- for dictionary in data -- if dictionary.get('traddr') not in ('0.0.0.0', '::', '') -- ] -- if data -- else list() -- ) -- logging.info( -- '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) -- ) -- referrals_after = self.referrals() -- STAF.log_pages_changed(self, self.device) -- if referrals_after != referrals_before: -- logging.debug( -- 'Dc._on_get_log_success() - %s | %s Referrals before = %s', -- self.id, -- self.device, -- referrals_before, -- ) -- logging.debug( -- 'Dc._on_get_log_success() - %s | %s Referrals after = %s', -- self.id, -- self.device, -- referrals_after, -- ) -- STAF.referrals_changed() -- else: -- logging.debug( -- 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device -- ) -- -- def _on_get_log_fail(self, op_obj, err, fail_cnt): -- '''@brief Function called when we fail to retrieve the log pages -- from the Discovery Controller. See self._get_log_op object -- for details. -- ''' -- if self._alive(): -- logging.debug( -- 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', -- self.id, -- self.device, -- err, -- Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, -- ) -- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect -- logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) -- op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) -- else: -- logging.debug( -- 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s', -- self.id, -- self.device, -- err, -- ) -- op_obj.kill() -- -- --# ****************************************************************************** --class Staf(service.Service): -- '''STorage Appliance Finder (STAF)''' -+if __name__ == '__main__': -+ import json -+ import logging -+ import dasbus.server.interface -+ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports - -- CONF_STABILITY_SOAK_TIME_SEC = 1.5 -+ # Before going any further, make sure the script is allowed to run. -+ stas.check_if_allowed_to_continue() - - class Dbus: - '''This is the DBus interface that external programs can use to -@@ -431,148 +198,8 @@ class Staf(service.Service): - for controller in STAF.get_controllers() - ] - -- # ========================================================================== -- def __init__(self, args): -- super().__init__(args, self._reload_hdlr) -- -- self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) -- self._avahi.config_stypes(conf.SvcConf().get_stypes()) -- -- # We don't want to apply configuration changes to nvme-cli right away. -- # Often, multiple changes will occur in a short amount of time (sub-second). -- # We want to wait until there are no more changes before applying them -- # to the system. The following timer acts as a "soak period". Changes -- # will be applied by calling self._on_config_ctrls() at the end of -- # the soak period. -- self._cfg_soak_tmr = gutil.GTimer(Staf.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) -- self._cfg_soak_tmr.start() -- -- # Create the D-Bus instance. -- self._config_dbus(Staf.Dbus(), defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) -- -- def info(self) -> dict: -- '''@brief Get the status info for this object (used for debug)''' -- info = super().info() -- info['avahi'] = self._avahi.info() -- return info -- -- def _release_resources(self): -- logging.debug('Staf._release_resources()') -- super()._release_resources() -- if self._avahi: -- self._avahi.kill() -- self._avahi = None -- -- def _load_last_known_config(self): -- try: -- with open(self._lkc_file, 'rb') as file: -- config = pickle.load(file) -- except (FileNotFoundError, AttributeError): -- return dict() -- -- logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) -- return {tid: Dc(self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} -- -- def _dump_last_known_config(self, controllers): -- try: -- with open(self._lkc_file, 'wb') as file: -- config = {tid: dc.log_pages() for tid, dc in controllers.items()} -- logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) -- pickle.dump(config, file) -- except FileNotFoundError as ex: -- logging.error('Unable to save last known config: %s', ex) -- -- def _keep_connections_on_exit(self): -- '''@brief Determine whether connections should remain when the -- process exits. -- ''' -- return conf.SvcConf().persistent_connections -- -- def _reload_hdlr(self): -- '''@brief Reload configuration file. This is triggered by the SIGHUP -- signal, which can be sent with "systemctl reload stafd". -- ''' -- systemd.daemon.notify('RELOADING=1') -- service_cnf = conf.SvcConf() -- service_cnf.reload() -- self.tron = service_cnf.tron -- self._avahi.kick_start() # Make sure Avahi is running -- self._avahi.config_stypes(service_cnf.get_stypes()) -- self._cfg_soak_tmr.start() -- systemd.daemon.notify('READY=1') -- return GLib.SOURCE_CONTINUE -- -- def log_pages_changed(self, controller, device): -- '''@brief Function invoked when a controller's cached log pages -- have changed. This will emit a D-Bus signal to inform -- other applications that the cached log pages have changed. -- ''' -- self._dbus_iface.log_pages_changed.emit( -- controller.tid.transport, -- controller.tid.traddr, -- controller.tid.trsvcid, -- controller.tid.host_traddr, -- controller.tid.host_iface, -- controller.tid.subsysnqn, -- device, -- ) -- -- def referrals_changed(self): -- '''@brief Function invoked when a controller's cached referrals -- have changed. -- ''' -- logging.debug('Staf.referrals_changed()') -- self._cfg_soak_tmr.start() -- -- def _referrals(self) -> list: -- return [ -- stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) -- for controller in self.get_controllers() -- for dlpe in controller.referrals() -- ] -- -- def _config_ctrls_finish(self, configured_ctrl_list): -- '''@brief Finish discovery controllers configuration after -- hostnames (if any) have been resolved. -- ''' -- configured_ctrl_list = [ -- ctrl_dict -- for ctrl_dict in configured_ctrl_list -- if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) -- ] -- -- discovered_ctrl_list = self._avahi.get_controllers() -- referral_ctrl_list = self._referrals() -- logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) -- logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) -- logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) -- -- controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) -- controllers = stas.remove_invalid_addresses(controllers) -- -- new_controller_ids = {trid.TID(controller) for controller in controllers} -- cur_controller_ids = set(self._controllers.keys()) -- controllers_to_add = new_controller_ids - cur_controller_ids -- controllers_to_del = cur_controller_ids - new_controller_ids -- -- logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) -- logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) -- -- for tid in controllers_to_del: -- controller = self._controllers.pop(tid, None) -- if controller is not None: -- controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) -- -- for tid in controllers_to_add: -- self._controllers[tid] = Dc(self._root, self._host, tid) -- -- def _avahi_change(self): -- self._cfg_soak_tmr.start() -- -- --# ****************************************************************************** --if __name__ == '__main__': -- STAF = Staf(ARGS) -+ log.init(ARGS.syslog) -+ STAF = service.Staf(ARGS, Dbus()) - STAF.run() - - STAF = None -diff --git a/staslib/avahi.py b/staslib/avahi.py -index 768bbf4..90a67c8 100644 ---- a/staslib/avahi.py -+++ b/staslib/avahi.py -@@ -172,9 +172,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes - services = dict() - for service, obj in self._services.items(): - interface, protocol, name, stype, domain = service -- key = '({}, {}, {}.{}, {})'.format( # pylint: disable=consider-using-f-string -- socket.if_indextoname(interface), Avahi.protos.get(protocol, 'unknown'), name, domain, stype -- ) -+ key = f'({socket.if_indextoname(interface)}, {Avahi.protos.get(protocol, "unknown")}, {name}.{domain}, {stype})' - services[key] = obj.get('data', {}) - - info = { -@@ -316,7 +314,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes - _interface_name: str, - _signal_name: str, - args: typing.Tuple[int, int, str, str, str, int], -- *_user_data -+ *_user_data, - ): - (interface, protocol, name, stype, domain, flags) = args - logging.debug( -@@ -352,7 +350,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes - _interface_name: str, - _signal_name: str, - args: typing.Tuple[int, int, str, str, str, int], -- *_user_data -+ *_user_data, - ): - (interface, protocol, name, stype, domain, flags) = args - logging.debug( -@@ -386,7 +384,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes - _interface_name: str, - _signal_name: str, - args: typing.Tuple[int, int, str, str, str, str, int, str, int, list, int], -- *_user_data -+ *_user_data, - ): - (interface, protocol, name, stype, domain, host, aprotocol, address, port, txt, flags) = args - txt = _txt2dict(txt) -@@ -428,7 +426,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes - interface_name: str, - _signal_name: str, - args: typing.Tuple[str], -- *_user_data -+ *_user_data, - ): - (error,) = args - if 'ServiceResolver' not in interface_name or 'TimeoutError' not in error: -diff --git a/staslib/conf.py b/staslib/conf.py -index 3f52e4f..c314a9e 100644 ---- a/staslib/conf.py -+++ b/staslib/conf.py -@@ -74,7 +74,7 @@ class SvcConf(metaclass=singleton.Singleton): - ('Global', 'ignore-iface'): 'false', - ('Global', 'ip-family'): 'ipv4+ipv6', - ('Global', 'udev-rule'): 'enabled', -- ('Global', 'sticky-connections'): 'disabled', -+ ('Global', 'sticky-connections'): 'enabled', - ('Service Discovery', 'zeroconf'): 'enabled', - ('Controllers', 'controller'): list(), - ('Controllers', 'blacklist'): list(), -diff --git a/staslib/ctrl.py b/staslib/ctrl.py -index 5504baa..dbc1973 100644 ---- a/staslib/ctrl.py -+++ b/staslib/ctrl.py -@@ -10,69 +10,76 @@ - Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.''' - - import logging --from gi.repository import Gio, GLib -+from gi.repository import GLib - from libnvme import nvme --from staslib import conf, gutil, trid, udev -+from staslib import conf, gutil, trid, udev, stas - - - DC_KATO_DEFAULT = 30 # seconds - - - # ****************************************************************************** --class Controller: # pylint: disable=too-many-instance-attributes -+class Controller(stas.ControllerABC): - '''@brief Base class used to manage the connection to a controller.''' - -- CONNECT_RETRY_PERIOD_SEC = 60 -- FAST_CONNECT_RETRY_PERIOD_SEC = 3 -- - def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): -- self._root = root -- self._host = host -- self._udev = udev.UDEV -- self._tid = tid -- self._cancellable = Gio.Cancellable() -- self._connect_op = None -- self._connect_attempts = 0 -- self._retry_connect_tmr = gutil.GTimer(Controller.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) -- self._device = None -- self._ctrl = None -- self._discovery_ctrl = discovery_ctrl -- self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) -- self._try_to_connect_deferred.schedule() -+ self._udev = udev.UDEV -+ self._device = None # Refers to the nvme device (e.g. /dev/nvme[n]) -+ self._ctrl = None # libnvme's nvme.ctrl object -+ self._connect_op = None -+ -+ super().__init__(root, host, tid, discovery_ctrl) - - def _release_resources(self): - logging.debug('Controller._release_resources() - %s', self.id) - -- # Remove pending deferred from main loop -- if self._try_to_connect_deferred: -- self._try_to_connect_deferred.cancel() -- self._try_to_connect_deferred = None -- - if self._udev: - self._udev.unregister_for_device_events(self._on_udev_notification) - -- if self._retry_connect_tmr is not None: -- self._retry_connect_tmr.kill() -- -- if self._cancellable and not self._cancellable.is_cancelled(): -- self._cancellable.cancel() -- - self._kill_ops() - -- self._tid = None -+ super()._release_resources() -+ - self._ctrl = None -- self._device = None -- self._retry_connect_tmr = None -- self._cancellable = None - self._udev = None - -- def _alive(self): -- '''There may be race condition where a queued event gets processed -- after the object is no longer configured (i.e. alive). This method -- can be used by callback functions to make sure the object is still -- alive before processing further. -- ''' -- return self._cancellable and not self._cancellable.is_cancelled() -+ @property -+ def device(self) -> str: -+ '''@brief return the Linux nvme device id (e.g. nvme3) or empty -+ string if no device is associated with this controller''' -+ if not self._device and self._ctrl and self._ctrl.name: -+ self._device = self._ctrl.name -+ -+ return self._device or 'nvme?' -+ -+ def controller_id_dict(self) -> dict: -+ '''@brief return the controller ID as a dict.''' -+ cid = super().controller_id_dict() -+ cid['device'] = self.device -+ return cid -+ -+ def details(self) -> dict: -+ '''@brief return detailed debug info about this controller''' -+ details = super().details() -+ details.update( -+ self._udev.get_attributes(self.device, -+ ('hostid', 'hostnqn', 'model', -+ 'serial', 'dctype', 'cntrltype')) -+ ) -+ return details -+ -+ def info(self) -> dict: -+ '''@brief Get the controller info for this object''' -+ info = super().info() -+ if self._connect_op: -+ info['connect operation'] = self._connect_op.as_dict() -+ return info -+ -+ def cancel(self): -+ '''@brief Used to cancel pending operations.''' -+ super().cancel() -+ if self._connect_op: -+ self._connect_op.cancel() - - def _kill_ops(self): - if self._connect_op: -@@ -91,7 +98,7 @@ class Controller: # pylint: disable=too-many-instance-attributes - self._on_nvme_event(nvme_event) - elif udev_obj.action == 'remove': - logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name) -- self._on_udev_remove(udev_obj) -+ self._on_ctrl_removed(udev_obj) - else: - logging.debug( - 'Controller._on_udev_notification() - %s | %s - Received "%s" notification.', -@@ -108,33 +115,12 @@ class Controller: # pylint: disable=too-many-instance-attributes - udev_obj.sys_name, - ) - -- def _on_aen(self, aen: int): -- pass -- -- def _on_nvme_event(self, nvme_event): -- pass -- -- def _on_udev_remove(self, udev_obj): # pylint: disable=unused-argument -+ def _on_ctrl_removed(self, obj): # pylint: disable=unused-argument - self._udev.unregister_for_device_events(self._on_udev_notification) - self._kill_ops() # Kill all pending operations - self._ctrl = None - -- def _find_existing_connection(self): -- raise NotImplementedError() -- -- def _on_try_to_connect(self): -- self._try_to_connect_deferred.schedule() -- return GLib.SOURCE_REMOVE -- -- def _try_to_connect(self): -- # This is a deferred function call. Make sure -- # the source of the deferred is still good. -- source = GLib.main_current_source() -- if source and source.is_destroyed(): -- return -- -- self._connect_attempts += 1 -- -+ def _do_connect(self): - host_iface = ( - self.tid.host_iface - if (self.tid.host_iface and not conf.SvcConf().ignore_iface and conf.NvmeOptions().host_iface_supp) -@@ -164,7 +150,6 @@ class Controller: # pylint: disable=too-many-instance-attributes - self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number) - ) - else: -- self._device = None - service_conf = conf.SvcConf() - cfg = { 'hdr_digest': service_conf.hdr_digest, - 'data_digest': service_conf.data_digest } -@@ -198,11 +183,10 @@ class Controller: # pylint: disable=too-many-instance-attributes - self._connect_op = None - - if self._alive(): -- if not self._device: -- self._device = self._ctrl.name -+ self._device = self._ctrl.name - logging.info('%s | %s - Connection established!', self.id, self.device) - self._connect_attempts = 0 -- self._udev.register_for_device_events(self.device, self._on_udev_notification) -+ self._udev.register_for_device_events(self._device, self._on_udev_notification) - else: - logging.debug( - 'Controller._on_connect_success() - %s | %s Received event on dead object. data=%s', -@@ -227,11 +211,11 @@ class Controller: # pylint: disable=too-many-instance-attributes - # the same time. This is perfectly fine, except that we may get a bogus - # failed to connect error. By doing a fast re-try, stacd can quickly - # verify that the connection was actually successful. -- self._retry_connect_tmr.set_timeout(Controller.FAST_CONNECT_RETRY_PERIOD_SEC) -+ self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC) - elif self._connect_attempts == 2: - # If the fast connect re-try fails, then we can print a message to - # indicate the failure, and start a slow re-try period. -- self._retry_connect_tmr.set_timeout(Controller.CONNECT_RETRY_PERIOD_SEC) -+ self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC) - logging.error('%s Failed to connect to controller. %s', self.id, getattr(err, 'message', err)) - - logging.debug( -@@ -248,53 +232,6 @@ class Controller: # pylint: disable=too-many-instance-attributes - getattr(err, 'message', err), - ) - -- @property -- def id(self) -> str: # pylint: disable=missing-function-docstring -- return str(self.tid) -- -- @property -- def tid(self): # pylint: disable=missing-function-docstring -- return self._tid -- -- @property -- def device(self) -> str: # pylint: disable=missing-function-docstring -- return self._device if self._device else '' -- -- def controller_id_dict(self) -> dict: -- '''@brief return the controller ID as a dict.''' -- cid = self.tid.as_dict() -- cid['device'] = self.device -- return cid -- -- def details(self) -> dict: -- '''@brief return detailed debug info about this controller''' -- details = self.controller_id_dict() -- details.update(self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial'))) -- details['connect attempts'] = str(self._connect_attempts) -- details['retry connect timer'] = str(self._retry_connect_tmr) -- return details -- -- def info(self) -> dict: -- '''@brief Get the controller info for this object''' -- info = self.details() -- if self._connect_op: -- info['connect operation'] = self._connect_op.as_dict() -- return info -- -- def cancel(self): -- '''@brief Used to cancel pending operations.''' -- if self._cancellable and not self._cancellable.is_cancelled(): -- logging.debug('Controller.cancel() - %s', self.id) -- self._cancellable.cancel() -- -- if self._connect_op: -- self._connect_op.cancel() -- -- def kill(self): -- '''@brief Used to release all resources associated with this object.''' -- logging.debug('Controller.kill() - %s', self.id) -- self._release_resources() -- - def disconnect(self, disconnected_cb, keep_connection): - '''@brief Issue an asynchronous disconnect command to a Controller. - Once the async command has completed, the callback 'disconnected_cb' -@@ -313,7 +250,7 @@ class Controller: # pylint: disable=too-many-instance-attributes - # cannot be called directly as the current Controller object is in the - # process of being disconnected and the callback will in fact delete - # the object. This would invariably lead to unpredictable outcome. -- GLib.idle_add(disconnected_cb, self) -+ GLib.idle_add(disconnected_cb, self, True) - - def _on_disconn_success(self, op_obj, data, disconnected_cb): # pylint: disable=unused-argument - logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device) -@@ -322,7 +259,7 @@ class Controller: # pylint: disable=too-many-instance-attributes - # cannot be called directly as the current Controller object is in the - # process of being disconnected and the callback will in fact delete - # the object. This would invariably lead to unpredictable outcome. -- GLib.idle_add(disconnected_cb, self) -+ GLib.idle_add(disconnected_cb, self, True) - - def _on_disconn_fail(self, op_obj, err, fail_cnt, disconnected_cb): # pylint: disable=unused-argument - logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err) -@@ -331,4 +268,249 @@ class Controller: # pylint: disable=too-many-instance-attributes - # cannot be called directly as the current Controller object is in the - # process of being disconnected and the callback will in fact delete - # the object. This would invariably lead to unpredictable outcome. -- GLib.idle_add(disconnected_cb, self) -+ GLib.idle_add(disconnected_cb, self, False) -+ -+ -+# ****************************************************************************** -+class Dc(Controller): -+ '''@brief This object establishes a connection to one Discover Controller (DC). -+ It retrieves the discovery log pages and caches them. -+ It also monitors udev events associated with that DC and updates -+ the cached discovery log pages accordingly. -+ ''' -+ -+ DLP_CHANGED = ( -+ (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE -+ ) # 0x70f002 -+ GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 -+ REGISTRATION_RETRY_RERIOD_SEC = 10 -+ -+ def __init__(self, staf, root, host, tid: trid.TID, log_pages=None): # pylint: disable=too-many-arguments -+ super().__init__(root, host, tid, discovery_ctrl=True) -+ self._staf = staf -+ self._register_op = None -+ self._get_log_op = None -+ self._log_pages = log_pages if log_pages else list() # Log pages cache -+ -+ def _release_resources(self): -+ logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) -+ super()._release_resources() -+ self._log_pages = list() -+ self._staf = None -+ -+ def _kill_ops(self): -+ super()._kill_ops() -+ if self._get_log_op: -+ self._get_log_op.kill() -+ self._get_log_op = None -+ if self._register_op: -+ self._register_op.kill() -+ self._register_op = None -+ -+ def info(self) -> dict: -+ '''@brief Get the controller info for this object''' -+ info = super().info() -+ if self._get_log_op: -+ info['get log page operation'] = self._get_log_op.as_dict() -+ if self._register_op: -+ info['register operation'] = self._register_op.as_dict() -+ return info -+ -+ def cancel(self): -+ '''@brief Used to cancel pending operations.''' -+ super().cancel() -+ if self._get_log_op: -+ self._get_log_op.cancel() -+ if self._register_op: -+ self._register_op.cancel() -+ -+ def log_pages(self) -> list: -+ '''@brief Get the cached log pages for this object''' -+ return self._log_pages -+ -+ def referrals(self) -> list: -+ '''@brief Return the list of referrals''' -+ return [page for page in self._log_pages if page['subtype'] == 'referral'] -+ -+ def _on_aen(self, aen: int): -+ if aen == self.DLP_CHANGED and self._get_log_op: -+ self._get_log_op.run_async() -+ -+ def _on_nvme_event(self, nvme_event: str): -+ if nvme_event == 'connected' and self._register_op: -+ self._register_op.run_async() -+ -+ def _on_ctrl_removed(self, obj): -+ super()._on_ctrl_removed(obj) -+ if self._try_to_connect_deferred: -+ self._try_to_connect_deferred.schedule() -+ -+ def _find_existing_connection(self): -+ return self._udev.find_nvme_dc_device(self.tid) -+ -+ # -------------------------------------------------------------------------- -+ def _on_connect_success(self, op_obj, data): -+ '''@brief Function called when we successfully connect to the -+ Discovery Controller. -+ ''' -+ super()._on_connect_success(op_obj, data) -+ -+ if self._alive(): -+ if self._ctrl.is_registration_supported(): -+ self._register_op = gutil.AsyncOperationWithRetry( -+ self._on_registration_success, -+ self._on_registration_fail, -+ self._ctrl.registration_ctlr, -+ nvme.NVMF_DIM_TAS_REGISTER, -+ ) -+ self._register_op.run_async() -+ else: -+ self._get_log_op = gutil.AsyncOperationWithRetry( -+ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover -+ ) -+ self._get_log_op.run_async() -+ -+ # -------------------------------------------------------------------------- -+ def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument -+ '''@brief Function called when we successfully register with the -+ Discovery Controller. See self._register_op object -+ for details. -+ ''' -+ if self._alive(): -+ if data is not None: -+ logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) -+ else: -+ logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) -+ self._get_log_op = gutil.AsyncOperationWithRetry( -+ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover -+ ) -+ self._get_log_op.run_async() -+ else: -+ logging.debug( -+ 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device -+ ) -+ -+ def _on_registration_fail(self, op_obj, err, fail_cnt): -+ '''@brief Function called when we fail to register with the -+ Discovery Controller. See self._register_op object -+ for details. -+ ''' -+ if self._alive(): -+ logging.debug( -+ 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', -+ self.id, -+ self.device, -+ err, -+ Dc.REGISTRATION_RETRY_RERIOD_SEC, -+ ) -+ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect -+ logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) -+ # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) -+ else: -+ logging.debug( -+ 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', -+ self.id, -+ self.device, -+ err, -+ ) -+ op_obj.kill() -+ -+ # -------------------------------------------------------------------------- -+ def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument -+ '''@brief Function called when we successfully retrieve the log pages -+ from the Discovery Controller. See self._get_log_op object -+ for details. -+ ''' -+ if self._alive(): -+ # Note that for historical reasons too long to explain, the CDC may -+ # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be -+ # filtered out. -+ referrals_before = self.referrals() -+ self._log_pages = ( -+ [ -+ {k: str(v) for k, v in dictionary.items()} -+ for dictionary in data -+ if dictionary.get('traddr') not in ('0.0.0.0', '::', '') -+ ] -+ if data -+ else list() -+ ) -+ logging.info( -+ '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) -+ ) -+ referrals_after = self.referrals() -+ self._staf.log_pages_changed(self, self.device) -+ if referrals_after != referrals_before: -+ logging.debug( -+ 'Dc._on_get_log_success() - %s | %s Referrals before = %s', -+ self.id, -+ self.device, -+ referrals_before, -+ ) -+ logging.debug( -+ 'Dc._on_get_log_success() - %s | %s Referrals after = %s', -+ self.id, -+ self.device, -+ referrals_after, -+ ) -+ self._staf.referrals_changed() -+ else: -+ logging.debug( -+ 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device -+ ) -+ -+ def _on_get_log_fail(self, op_obj, err, fail_cnt): -+ '''@brief Function called when we fail to retrieve the log pages -+ from the Discovery Controller. See self._get_log_op object -+ for details. -+ ''' -+ if self._alive(): -+ logging.debug( -+ 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', -+ self.id, -+ self.device, -+ err, -+ Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, -+ ) -+ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect -+ logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) -+ op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) -+ else: -+ logging.debug( -+ 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s', -+ self.id, -+ self.device, -+ err, -+ ) -+ op_obj.kill() -+ -+ -+# ****************************************************************************** -+class Ioc(Controller): -+ '''@brief This object establishes a connection to one I/O Controller.''' -+ -+ def __init__(self, stac, root, host, tid: trid.TID): -+ self._stac = stac -+ super().__init__(root, host, tid) -+ -+ def _release_resources(self): -+ super()._release_resources() -+ self._stac = None -+ -+ def _on_ctrl_removed(self, obj): -+ '''Called when the associated nvme device (/dev/nvmeX) is removed -+ from the system. -+ ''' -+ super()._on_ctrl_removed(obj) -+ -+ # Defer removal of this object to the next main loop's idle period. -+ GLib.idle_add(self._stac.remove_controller, self, True) -+ -+ def _find_existing_connection(self): -+ return self._udev.find_nvme_ioc_device(self.tid) -+ -+ def _on_aen(self, aen: int): -+ pass -+ -+ def _on_nvme_event(self, nvme_event): -+ pass -diff --git a/staslib/gutil.py b/staslib/gutil.py -index b302f3a..36ce2c7 100644 ---- a/staslib/gutil.py -+++ b/staslib/gutil.py -@@ -104,8 +104,7 @@ class GTimer: - - - # ****************************************************************************** --class NameResolver: -- # pylint: disable=too-few-public-methods -+class NameResolver: # pylint: disable=too-few-public-methods - '''@brief DNS resolver to convert host names to IP addresses.''' - - def __init__(self): -@@ -133,8 +132,10 @@ class NameResolver: - else: - logging.error('Cannot resolve traddr: %s', hostname) - -- except GLib.GError: -- logging.error('Cannot resolve traddr: %s', hostname) -+ except GLib.GError as err: -+ # We don't need to report "cancellation" errors. -+ if not err.matches(Gio.io_error_quark(), Gio.IOErrorEnum.CANCELLED): -+ logging.error('Cannot resolve traddr: %s. %s', hostname, err.message) # pylint: disable=no-member - - logging.debug('NameResolver.resolve_ctrl_async() - resolved \'%s\' -> %s', hostname, traddr) - controllers[indx]['traddr'] = traddr -diff --git a/staslib/log.py b/staslib/log.py -index c624978..9622e98 100644 ---- a/staslib/log.py -+++ b/staslib/log.py -@@ -24,7 +24,7 @@ def init(syslog: bool): - if syslog: - try: - # Try journal logger first -- import systemd.journal # pylint: disable=redefined-outer-name,import-outside-toplevel -+ import systemd.journal # pylint: disable=import-outside-toplevel - - handler = systemd.journal.JournalHandler(SYSLOG_IDENTIFIER=defs.PROG_NAME) - except ModuleNotFoundError: -@@ -32,9 +32,7 @@ def init(syslog: bool): - from logging.handlers import SysLogHandler # pylint: disable=import-outside-toplevel - - handler = SysLogHandler(address="/dev/log") -- handler.setFormatter( -- logging.Formatter('{}: %(message)s'.format(defs.PROG_NAME)) # pylint: disable=consider-using-f-string -- ) -+ handler.setFormatter(logging.Formatter(f'{defs.PROG_NAME}: %(message)s')) - else: - # Log to stdout - handler = logging.StreamHandler(stream=sys.stdout) -diff --git a/staslib/service.py b/staslib/service.py -index 556a9f9..a48e66d 100644 ---- a/staslib/service.py -+++ b/staslib/service.py -@@ -9,248 +9,416 @@ - '''This module defines the base Service object from - which the Staf and the Stac objects are derived.''' - --import os --import signal -+import json -+import pickle - import logging -+import pathlib - import systemd.daemon --import dasbus.connection -+import dasbus.error -+import dasbus.client.observer -+import dasbus.client.proxy - --from gi.repository import Gio, GLib -+from gi.repository import GLib - from libnvme import nvme --from staslib import conf, ctrl, defs, gutil, log, stas, trid, udev -+from staslib import avahi, conf, ctrl, defs, gutil, stas, trid, udev - - - # ****************************************************************************** --class Service: # pylint: disable=too-many-instance-attributes -+class Service(stas.ServiceABC): - '''@brief Base class used to manage a STorage Appliance Service''' - - def __init__(self, args, reload_hdlr): -- - sysconf = conf.SysConf() - self._root = nvme.root() - self._host = nvme.host(self._root, sysconf.hostnqn, sysconf.hostid, sysconf.hostsymname) - -- service_conf = conf.SvcConf() -- service_conf.set_conf_file(args.conf_file) # reload configuration -- self._tron = args.tron or service_conf.tron -- log.set_level_from_tron(self._tron) -- self._root.log_level("debug" if self._tron else "err") -+ super().__init__(args, reload_hdlr) - -- self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') -- self._loop = GLib.MainLoop() -- self._udev = udev.UDEV -- self._cancellable = Gio.Cancellable() -- self._resolver = gutil.NameResolver() -- self._controllers = self._load_last_known_config() -- self._dbus_iface = None -- self._cfg_soak_tmr = None -- self._sysbus = dasbus.connection.SystemMessageBus() -- -- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C -- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd -- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd -- -- nvme_options = conf.NvmeOptions() -- if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: -- logging.warning( -- 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' -- ) -+ self._root.log_level("debug" if self._tron else "err") - - def _release_resources(self): - logging.debug('Service._release_resources()') -+ super()._release_resources() - -- if self._cancellable and not self._cancellable.is_cancelled(): -- self._cancellable.cancel() -+ self._host = None -+ self._root = None - -- if self._cfg_soak_tmr is not None: -- self._cfg_soak_tmr.kill() -+ @stas.ServiceABC.tron.setter -+ def tron(self, value): -+ '''@brief Set Trace ON property''' -+ super(__class__, self.__class__).tron.__set__(self, value) -+ self._root.log_level("debug" if self._tron else "err") - -- self._controllers.clear() - -- if self._sysbus: -- self._sysbus.disconnect() -+# ****************************************************************************** -+def udev_rule_ctrl(enable): -+ '''@brief We add an empty udev rule to /run/udev/rules.d to suppress -+ nvme-cli's udev rule that is used to tell udevd to automatically -+ connect to I/O controller. This is to avoid race conditions between -+ stacd and udevd. This is configurable. See "udev-rule" in stacd.conf -+ for details. -+ ''' -+ udev_rule_suppress = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') -+ if enable: -+ try: -+ udev_rule_suppress.unlink() -+ except FileNotFoundError: -+ pass -+ else: -+ if not udev_rule_suppress.exists(): -+ pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) -+ udev_rule_suppress.symlink_to('/dev/null') - -- self._cfg_soak_tmr = None -- self._cancellable = None -- self._resolver = None -- self._lkc_file = None -- self._sysbus = None -- self._udev = None - -- def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): -- self._dbus_iface = iface_obj -- self._sysbus.publish_object(obj_name, iface_obj) -- self._sysbus.register_service(bus_name) -+# ****************************************************************************** -+class Stac(Service): -+ '''STorage Appliance Connector (STAC)''' - -- @property -- def tron(self): -- '''@brief Get Trace ON property''' -- return self._tron -+ CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name -+ ADD_EVENT_SOAK_TIME_SEC = 1 - -- @tron.setter -- def tron(self, value): # pylint: disable=no-self-use -- '''@brief Set Trace ON property''' -- self._tron = value -- log.set_level_from_tron(self._tron) -- self._root.log_level("debug" if self._tron else "err") -+ def __init__(self, args, dbus): -+ super().__init__(args, self._reload_hdlr) - -- def run(self): -- '''@brief Start the main loop execution''' -- try: -- self._loop.run() -- except Exception as ex: # pylint: disable=broad-except -- logging.critical('exception: %s', ex) -+ self._udev = udev.UDEV - -- self._loop = None -+ self._add_event_soak_tmr = gutil.GTimer(self.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) - -- def info(self) -> dict: -- '''@brief Get the status info for this object (used for debug)''' -- nvme_options = conf.NvmeOptions() -- return { -- 'last known config file': self._lkc_file, -- 'config soak timer': str(self._cfg_soak_tmr), -- 'kernel support': { -- 'TP8013': nvme_options.discovery_supp, -- 'host_iface': nvme_options.host_iface_supp, -- }, -- 'system config': conf.SysConf().as_dict(), -- } -- -- def get_controllers(self): -- '''@brief return the list of controller objects''' -- return self._controllers.values() -- -- def get_controller( -- self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str -- ): # pylint: disable=too-many-arguments -- '''@brief get the specified controller object from the list of controllers''' -- cid = { -- 'transport': transport, -- 'traddr': traddr, -- 'trsvcid': trsvcid, -- 'host-traddr': host_traddr, -- 'host-iface': host_iface, -- 'subsysnqn': subsysnqn, -- } -- return self._controllers.get(trid.TID(cid)) -- -- def _remove_ctrl_from_dict(self, controller): -- tid_to_pop = controller.tid -- if not tid_to_pop: -- # Being paranoid. This should not happen, but let's say the -- # controller object has been purged, but it is somehow still -- # listed in self._controllers. -- for tid, _controller in self._controllers.items(): -- if _controller is controller: -- tid_to_pop = tid -- break -- -- if tid_to_pop: -- logging.debug('Service._remove_ctrl_from_dict() - %s | %s', tid_to_pop, controller.device) -- self._controllers.pop(tid_to_pop, None) -- else: -- logging.debug('Service._remove_ctrl_from_dict() - already removed') -+ self._config_connections_audit() - -- def remove_controller(self, controller): -- '''@brief remove the specified controller object from the list of controllers''' -- logging.debug('Service.remove_controller()') -- if isinstance(controller, ctrl.Controller): -- self._remove_ctrl_from_dict(controller) -+ # Create the D-Bus instance. -+ self._config_dbus(dbus, defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) - -- controller.kill() -+ # Connect to STAF D-Bus interface -+ self._staf = None -+ self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) -+ self._staf_watcher.service_available.connect(self._connect_to_staf) -+ self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) -+ self._staf_watcher.connect_once_available() - -- if self._cfg_soak_tmr: -- self._cfg_soak_tmr.start() -+ # Suppress udev rule to auto-connect when AEN is received. -+ udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) - -- def _cancel(self): -- logging.debug('Service._cancel()') -- if not self._cancellable.is_cancelled(): -- self._cancellable.cancel() -+ def _release_resources(self): -+ logging.debug('Stac._release_resources()') -+ -+ if self._add_event_soak_tmr: -+ self._add_event_soak_tmr.kill() -+ -+ udev_rule_ctrl(True) -+ -+ if self._udev: -+ self._udev.unregister_for_action_events('add') -+ -+ self._destroy_staf_comlink(self._staf_watcher) -+ if self._staf_watcher is not None: -+ self._staf_watcher.disconnect() - -- for controller in self._controllers.values(): -- controller.cancel() -+ super()._release_resources() -+ -+ self._udev = None -+ self._staf = None -+ self._staf_watcher = None -+ self._add_event_soak_tmr = None -+ -+ def _audit_connections(self, tids): -+ '''A host should only connect to I/O controllers that have been zoned -+ for that host or a manual "controller" entry exists in stcd.conf. -+ A host should disconnect from an I/O controller when that I/O controller -+ is removed from the zone or a manual "controller" entry is removed from -+ stacd.conf. stacd will audit connections if "sticky-connections=disabled". -+ stacd will delete any connection that is not supposed to exist. -+ ''' -+ logging.debug('Stac._audit_connections() - tids = %s', tids) -+ num_controllers = len(self._controllers) -+ for tid in tids: -+ if tid not in self._controllers: -+ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) -+ -+ if num_controllers != len(self._controllers): -+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) -+ -+ def _on_add_event(self, udev_obj): # pylint: disable=unused-argument -+ '''@brief This function is called when a "add" event is received from -+ the kernel for an NVMe device. This is used to trigger an audit and make -+ sure that the connection to an I/O controller is allowed. -+ -+ WARNING: There is a race condition with the "add" event from the kernel. -+ The kernel sends the "add" event a bit early and the sysfs attributes -+ associated with the nvme object are not always fully initialized. -+ To workaround this problem we use a soaking timer to give time for the -+ sysfs attributes to stabilize. -+ ''' -+ self._add_event_soak_tmr.start() -+ -+ def _on_add_event_soaked(self): -+ '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC -+ seconds, we can audit the connections. -+ ''' -+ if not conf.SvcConf().sticky_connections: -+ self._audit_connections(self._udev.get_nvme_ioc_tids()) -+ return GLib.SOURCE_REMOVE -+ -+ def _config_connections_audit(self): -+ '''This function checks the "sticky_connections" parameter to determine -+ whether audits should be performed. Audits are enabled when -+ "sticky_connections" is disabled. -+ ''' -+ if not conf.SvcConf().sticky_connections: -+ if self._udev.get_registered_action_cback('add') is None: -+ self._udev.register_for_action_events('add', self._on_add_event) -+ self._audit_connections(self._udev.get_nvme_ioc_tids()) -+ else: -+ self._udev.unregister_for_action_events('add') - - def _keep_connections_on_exit(self): - '''@brief Determine whether connections should remain when the - process exits. -- -- NOTE) This is the base class method used to define the interface. -- It must be overloaded by a child class. - ''' -- raise NotImplementedError() -+ return True - -- def _stop_hdlr(self): -- systemd.daemon.notify('STOPPING=1') -+ def _reload_hdlr(self): -+ '''@brief Reload configuration file. This is triggered by the SIGHUP -+ signal, which can be sent with "systemctl reload stacd". -+ ''' -+ systemd.daemon.notify('RELOADING=1') -+ service_cnf = conf.SvcConf() -+ service_cnf.reload() -+ self.tron = service_cnf.tron -+ self._config_connections_audit() -+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) -+ udev_rule_ctrl(service_cnf.udev_rule_enabled) -+ systemd.daemon.notify('READY=1') -+ return GLib.SOURCE_CONTINUE -+ -+ def _get_log_pages_from_stafd(self): -+ if self._staf: -+ try: -+ return json.loads(self._staf.get_all_log_pages(True)) -+ except dasbus.error.DBusError: -+ pass -+ -+ return list() - -- self._cancel() # Cancel pending operations -+ def _config_ctrls_finish(self, configured_ctrl_list): -+ configured_ctrl_list = [ -+ ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict -+ ] -+ logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) -+ -+ discovered_ctrl_list = list() -+ for staf_data in self._get_log_pages_from_stafd(): -+ host_traddr = staf_data['discovery-controller']['host-traddr'] -+ host_iface = staf_data['discovery-controller']['host-iface'] -+ for dlpe in staf_data['log-pages']: -+ if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers -+ discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) -+ -+ logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) -+ -+ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) -+ controllers = stas.remove_invalid_addresses(controllers) -+ -+ new_controller_ids = {trid.TID(controller) for controller in controllers} -+ cur_controller_ids = set(self._controllers.keys()) -+ controllers_to_add = new_controller_ids - cur_controller_ids -+ controllers_to_del = cur_controller_ids - new_controller_ids -+ -+ logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) -+ logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) -+ -+ for tid in controllers_to_del: -+ controller = self._controllers.pop(tid, None) -+ if controller is not None: -+ controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) -+ -+ for tid in controllers_to_add: -+ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) -+ -+ def _connect_to_staf(self, _): -+ '''@brief Hook up DBus signal handlers for signals from stafd.''' -+ try: -+ self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) -+ self._staf.log_pages_changed.connect(self._log_pages_changed) -+ self._cfg_soak_tmr.start() - -- self._dump_last_known_config(self._controllers) -+ # Make sure timer is set back to its normal value. -+ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_SOAK_TIME_SEC) -+ logging.debug('Stac._connect_to_staf() - Connected to staf') -+ except dasbus.error.DBusError: -+ logging.error('Failed to connect to staf') -+ -+ def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument -+ if self._staf: -+ self._staf.log_pages_changed.disconnect(self._log_pages_changed) -+ dasbus.client.proxy.disconnect_proxy(self._staf) -+ self._staf = None -+ -+ def _disconnect_from_staf(self, watcher): -+ self._destroy_staf_comlink(watcher) -+ -+ # When we lose connectivity with stafd, the most logical explanation -+ # is that stafd restarted. In that case, it may take some time for stafd -+ # to re-populate its log pages cache. So let's give stafd plenty of time -+ # to update its log pages cache and send log pages change notifications -+ # before triggering a stacd re-config. We do this by momentarily -+ # increasing the config soak timer to a longer period. -+ if self._cfg_soak_tmr: -+ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_LONG_SOAK_TIME_SEC) -+ -+ logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') -+ -+ def _log_pages_changed( # pylint: disable=too-many-arguments -+ self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device -+ ): -+ logging.debug( -+ 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', -+ transport, -+ traddr, -+ trsvcid, -+ host_traddr, -+ host_iface, -+ subsysnqn, -+ device, -+ ) -+ if self._cfg_soak_tmr: -+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) - -- if len(self._controllers) == 0: -- GLib.idle_add(self._exit) -- else: -- # Tell all controller objects to disconnect -- keep_connections = self._keep_connections_on_exit() -- controllers = self._controllers.values() -- for controller in controllers: -- controller.disconnect(self._on_final_disconnect, keep_connections) -+ def _load_last_known_config(self): -+ return dict() - -- return GLib.SOURCE_REMOVE -+ def _dump_last_known_config(self, controllers): -+ pass - -- def _on_final_disconnect(self, controller): -- '''Callback invoked after a controller is disconnected. -- THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE -- DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! -- ''' -- logging.debug('Service._on_final_disconnect()') -- self._remove_ctrl_from_dict(controller) - -- controller.kill() -+# ****************************************************************************** -+class Staf(Service): -+ '''STorage Appliance Finder (STAF)''' - -- # When all controllers have disconnected, we can finish the clean up -- if len(self._controllers) == 0: -- # Defer exit to the next main loop's idle period. -- GLib.idle_add(self._exit) -+ def __init__(self, args, dbus): -+ super().__init__(args, self._reload_hdlr) - -- def _exit(self): -- logging.debug('Service._exit()') -- self._release_resources() -- self._loop.quit() -+ self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) -+ self._avahi.config_stypes(conf.SvcConf().get_stypes()) - -- def _on_config_ctrls(self, *_user_data): -- self._config_ctrls() -- return GLib.SOURCE_REMOVE -+ # Create the D-Bus instance. -+ self._config_dbus(dbus, defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) - -- def _config_ctrls(self): -- '''@brief Start controllers configuration.''' -- # The configuration file may contain controllers and/or blacklist -- # elements with traddr specified as hostname instead of IP address. -- # Because of this, we need to remove those blacklisted elements before -- # running name resolution. And we will need to remove blacklisted -- # elements after name resolution is complete (i.e. in the calback -- # function _config_ctrls_finish) -- logging.debug('Service._config_ctrls()') -- configured_controllers = stas.remove_blacklisted(conf.SvcConf().get_controllers()) -- self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) -+ def info(self) -> dict: -+ '''@brief Get the status info for this object (used for debug)''' -+ info = super().info() -+ info['avahi'] = self._avahi.info() -+ return info - -- def _config_ctrls_finish(self, configured_ctrl_list): -- '''@brief Finish controllers configuration after hostnames (if any) -- have been resolved. -- -- Configuring controllers must be done asynchronously in 2 steps. -- In the first step, host names get resolved to find their IP addresses. -- Name resolution can take a while, especially when an external name -- resolution server is used. Once that step completed, the callback -- method _config_ctrls_finish() (i.e. this method), gets invoked to -- complete the controller configuration. -- -- NOTE) This is the base class method used to define the interface. -- It must be overloaded by a child class. -- ''' -- raise NotImplementedError() -+ def _release_resources(self): -+ logging.debug('Staf._release_resources()') -+ super()._release_resources() -+ if self._avahi: -+ self._avahi.kill() -+ self._avahi = None - - def _load_last_known_config(self): -- raise NotImplementedError() -+ try: -+ with open(self._lkc_file, 'rb') as file: -+ config = pickle.load(file) -+ except (FileNotFoundError, AttributeError): -+ return dict() -+ -+ logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) -+ return {tid: ctrl.Dc(self, self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} - - def _dump_last_known_config(self, controllers): -- raise NotImplementedError() -+ try: -+ with open(self._lkc_file, 'wb') as file: -+ config = {tid: dc.log_pages() for tid, dc in controllers.items()} -+ logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) -+ pickle.dump(config, file) -+ except FileNotFoundError as ex: -+ logging.error('Unable to save last known config: %s', ex) -+ -+ def _keep_connections_on_exit(self): -+ '''@brief Determine whether connections should remain when the -+ process exits. -+ ''' -+ return conf.SvcConf().persistent_connections -+ -+ def _reload_hdlr(self): -+ '''@brief Reload configuration file. This is triggered by the SIGHUP -+ signal, which can be sent with "systemctl reload stafd". -+ ''' -+ systemd.daemon.notify('RELOADING=1') -+ service_cnf = conf.SvcConf() -+ service_cnf.reload() -+ self.tron = service_cnf.tron -+ self._avahi.kick_start() # Make sure Avahi is running -+ self._avahi.config_stypes(service_cnf.get_stypes()) -+ self._cfg_soak_tmr.start() -+ systemd.daemon.notify('READY=1') -+ return GLib.SOURCE_CONTINUE -+ -+ def log_pages_changed(self, controller, device): -+ '''@brief Function invoked when a controller's cached log pages -+ have changed. This will emit a D-Bus signal to inform -+ other applications that the cached log pages have changed. -+ ''' -+ self._dbus_iface.log_pages_changed.emit( -+ controller.tid.transport, -+ controller.tid.traddr, -+ controller.tid.trsvcid, -+ controller.tid.host_traddr, -+ controller.tid.host_iface, -+ controller.tid.subsysnqn, -+ device, -+ ) -+ -+ def referrals_changed(self): -+ '''@brief Function invoked when a controller's cached referrals -+ have changed. -+ ''' -+ logging.debug('Staf.referrals_changed()') -+ self._cfg_soak_tmr.start() -+ -+ def _referrals(self) -> list: -+ return [ -+ stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) -+ for controller in self.get_controllers() -+ for dlpe in controller.referrals() -+ ] -+ -+ def _config_ctrls_finish(self, configured_ctrl_list): -+ '''@brief Finish discovery controllers configuration after -+ hostnames (if any) have been resolved. -+ ''' -+ configured_ctrl_list = [ -+ ctrl_dict -+ for ctrl_dict in configured_ctrl_list -+ if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) -+ ] -+ -+ discovered_ctrl_list = self._avahi.get_controllers() -+ referral_ctrl_list = self._referrals() -+ logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) -+ logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) -+ logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) -+ -+ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) -+ controllers = stas.remove_invalid_addresses(controllers) -+ -+ new_controller_ids = {trid.TID(controller) for controller in controllers} -+ cur_controller_ids = set(self._controllers.keys()) -+ controllers_to_add = new_controller_ids - cur_controller_ids -+ controllers_to_del = cur_controller_ids - new_controller_ids -+ -+ logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) -+ logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) -+ -+ for tid in controllers_to_del: -+ controller = self._controllers.pop(tid, None) -+ if controller is not None: -+ controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) -+ -+ for tid in controllers_to_add: -+ self._controllers[tid] = ctrl.Dc(self, self._root, self._host, tid) -+ -+ def _avahi_change(self): -+ self._cfg_soak_tmr.start() -diff --git a/staslib/stas.py b/staslib/stas.py -index 7bf91e0..496f063 100644 ---- a/staslib/stas.py -+++ b/staslib/stas.py -@@ -6,14 +6,19 @@ - # - # Authors: Martin Belanger - # --'''Library for staf/stac''' -+'''Library for staf/stac. You will find here common code for stafd and stacd -+including the Abstract Base Classes (ABC) for Controllers and Services''' - - import os - import sys --import ipaddress -+import abc -+import signal - import logging -- --from staslib import conf, defs, trid -+import ipaddress -+import systemd.daemon -+import dasbus.connection -+from gi.repository import Gio, GLib -+from staslib import conf, defs, gutil, log, trid - - - # ****************************************************************************** -@@ -108,3 +113,379 @@ def remove_invalid_addresses(controllers: list): - logging.warning('Invalid transport %s', transport) - - return valid_controllers -+ -+ -+# ****************************************************************************** -+class ControllerABC(abc.ABC): # pylint: disable=too-many-instance-attributes -+ '''@brief Base class used to manage the connection to a controller.''' -+ -+ CONNECT_RETRY_PERIOD_SEC = 60 -+ FAST_CONNECT_RETRY_PERIOD_SEC = 3 -+ -+ def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): -+ self._root = root -+ self._host = host -+ self._tid = tid -+ self._cancellable = Gio.Cancellable() -+ self._connect_attempts = 0 -+ self._retry_connect_tmr = gutil.GTimer(self.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) -+ self._discovery_ctrl = discovery_ctrl -+ self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) -+ self._try_to_connect_deferred.schedule() -+ -+ def _release_resources(self): -+ # Remove pending deferred from main loop -+ if self._try_to_connect_deferred: -+ self._try_to_connect_deferred.cancel() -+ -+ if self._retry_connect_tmr is not None: -+ self._retry_connect_tmr.kill() -+ -+ if self._cancellable and not self._cancellable.is_cancelled(): -+ self._cancellable.cancel() -+ -+ self._tid = None -+ self._cancellable = None -+ self._retry_connect_tmr = None -+ self._try_to_connect_deferred = None -+ -+ @property -+ def id(self) -> str: -+ '''@brief Return the Transport ID as a printable string''' -+ return str(self.tid) -+ -+ @property -+ def tid(self): -+ '''@brief Return the Transport ID object''' -+ return self._tid -+ -+ def controller_id_dict(self) -> dict: -+ '''@brief return the controller ID as a dict.''' -+ return self.tid.as_dict() -+ -+ def details(self) -> dict: -+ '''@brief return detailed debug info about this controller''' -+ details = self.controller_id_dict() -+ details['connect attempts'] = str(self._connect_attempts) -+ details['retry connect timer'] = str(self._retry_connect_tmr) -+ return details -+ -+ def info(self) -> dict: -+ '''@brief Get the controller info for this object''' -+ return self.details() -+ -+ def cancel(self): -+ '''@brief Used to cancel pending operations.''' -+ if self._cancellable and not self._cancellable.is_cancelled(): -+ logging.debug('ControllerABC.cancel() - %s', self.id) -+ self._cancellable.cancel() -+ -+ def kill(self): -+ '''@brief Used to release all resources associated with this object.''' -+ logging.debug('ControllerABC.kill() - %s', self.id) -+ self._release_resources() -+ -+ def _alive(self): -+ '''There may be race condition where a queued event gets processed -+ after the object is no longer configured (i.e. alive). This method -+ can be used by callback functions to make sure the object is still -+ alive before processing further. -+ ''' -+ return self._cancellable and not self._cancellable.is_cancelled() -+ -+ def _on_try_to_connect(self): -+ self._try_to_connect_deferred.schedule() -+ return GLib.SOURCE_REMOVE -+ -+ def _try_to_connect(self): -+ # This is a deferred function call. Make sure -+ # the source of the deferred is still good. -+ source = GLib.main_current_source() -+ if source and source.is_destroyed(): -+ return -+ -+ self._connect_attempts += 1 -+ -+ self._do_connect() -+ -+ @abc.abstractmethod -+ def _do_connect(self): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _on_aen(self, aen: int): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _on_nvme_event(self, nvme_event): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _on_ctrl_removed(self, obj): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _find_existing_connection(self): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def disconnect(self, disconnected_cb, keep_connection): -+ '''@brief Issue an asynchronous disconnect command to a Controller. -+ Once the async command has completed, the callback 'disconnected_cb' -+ will be invoked. If a controller is already disconnected, then the -+ callback will be added to the main loop's next idle slot to be executed -+ ASAP. -+ ''' -+ raise NotImplementedError() -+ -+ -+# ****************************************************************************** -+class ServiceABC(abc.ABC): # pylint: disable=too-many-instance-attributes -+ '''@brief Base class used to manage a STorage Appliance Service''' -+ -+ CONF_STABILITY_SOAK_TIME_SEC = 1.5 -+ -+ def __init__(self, args, reload_hdlr): -+ -+ service_conf = conf.SvcConf() -+ service_conf.set_conf_file(args.conf_file) # reload configuration -+ self._tron = args.tron or service_conf.tron -+ log.set_level_from_tron(self._tron) -+ -+ self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') -+ self._loop = GLib.MainLoop() -+ self._cancellable = Gio.Cancellable() -+ self._resolver = gutil.NameResolver() -+ self._controllers = self._load_last_known_config() -+ self._dbus_iface = None -+ self._cfg_soak_tmr = gutil.GTimer(self.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) -+ self._sysbus = dasbus.connection.SystemMessageBus() -+ -+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C -+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd -+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd -+ -+ nvme_options = conf.NvmeOptions() -+ if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: -+ logging.warning( -+ 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' -+ ) -+ -+ # We don't want to apply configuration changes to nvme-cli right away. -+ # Often, multiple changes will occur in a short amount of time (sub-second). -+ # We want to wait until there are no more changes before applying them -+ # to the system. The following timer acts as a "soak period". Changes -+ # will be applied by calling self._on_config_ctrls() at the end of -+ # the soak period. -+ self._cfg_soak_tmr.start() -+ -+ def _release_resources(self): -+ logging.debug('ServiceABC._release_resources()') -+ -+ if self._cancellable and not self._cancellable.is_cancelled(): -+ self._cancellable.cancel() -+ -+ if self._cfg_soak_tmr is not None: -+ self._cfg_soak_tmr.kill() -+ -+ self._controllers.clear() -+ -+ if self._sysbus: -+ self._sysbus.disconnect() -+ -+ self._cfg_soak_tmr = None -+ self._cancellable = None -+ self._resolver = None -+ self._lkc_file = None -+ self._sysbus = None -+ -+ def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): -+ self._dbus_iface = iface_obj -+ self._sysbus.publish_object(obj_name, iface_obj) -+ self._sysbus.register_service(bus_name) -+ -+ @property -+ def tron(self): -+ '''@brief Get Trace ON property''' -+ return self._tron -+ -+ @tron.setter -+ def tron(self, value): -+ '''@brief Set Trace ON property''' -+ self._tron = value -+ log.set_level_from_tron(self._tron) -+ -+ def run(self): -+ '''@brief Start the main loop execution''' -+ try: -+ self._loop.run() -+ except Exception as ex: # pylint: disable=broad-except -+ logging.critical('exception: %s', ex) -+ -+ self._loop = None -+ -+ def info(self) -> dict: -+ '''@brief Get the status info for this object (used for debug)''' -+ nvme_options = conf.NvmeOptions() -+ return { -+ 'last known config file': self._lkc_file, -+ 'config soak timer': str(self._cfg_soak_tmr), -+ 'kernel support': { -+ 'TP8013': nvme_options.discovery_supp, -+ 'host_iface': nvme_options.host_iface_supp, -+ }, -+ 'system config': conf.SysConf().as_dict(), -+ } -+ -+ def get_controllers(self) -> dict: -+ '''@brief return the list of controller objects''' -+ return self._controllers.values() -+ -+ def get_controller( -+ self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str -+ ): # pylint: disable=too-many-arguments -+ '''@brief get the specified controller object from the list of controllers''' -+ cid = { -+ 'transport': transport, -+ 'traddr': traddr, -+ 'trsvcid': trsvcid, -+ 'host-traddr': host_traddr, -+ 'host-iface': host_iface, -+ 'subsysnqn': subsysnqn, -+ } -+ return self._controllers.get(trid.TID(cid)) -+ -+ def _remove_ctrl_from_dict(self, controller): -+ tid_to_pop = controller.tid -+ if not tid_to_pop: -+ # Being paranoid. This should not happen, but let's say the -+ # controller object has been purged, but it is somehow still -+ # listed in self._controllers. -+ for tid, _controller in self._controllers.items(): -+ if _controller is controller: -+ tid_to_pop = tid -+ break -+ -+ if tid_to_pop: -+ logging.debug('ServiceABC._remove_ctrl_from_dict()- %s | %s', tid_to_pop, controller.device) -+ self._controllers.pop(tid_to_pop, None) -+ else: -+ logging.debug('ServiceABC._remove_ctrl_from_dict()- already removed') -+ -+ def remove_controller(self, controller, success): # pylint: disable=unused-argument -+ '''@brief remove the specified controller object from the list of controllers -+ @param controller: the controller object -+ @param success: whether the disconnect was successful''' -+ logging.debug('ServiceABC.remove_controller()') -+ if isinstance(controller, ControllerABC): -+ self._remove_ctrl_from_dict(controller) -+ -+ controller.kill() -+ -+ if self._cfg_soak_tmr: -+ self._cfg_soak_tmr.start() -+ -+ def _cancel(self): -+ logging.debug('ServiceABC._cancel()') -+ if not self._cancellable.is_cancelled(): -+ self._cancellable.cancel() -+ -+ for controller in self._controllers.values(): -+ controller.cancel() -+ -+ def _stop_hdlr(self): -+ logging.debug('ServiceABC._stop_hdlr()') -+ systemd.daemon.notify('STOPPING=1') -+ -+ self._cancel() # Cancel pending operations -+ -+ self._dump_last_known_config(self._controllers) -+ -+ if len(self._controllers) == 0: -+ GLib.idle_add(self._exit) -+ else: -+ # Tell all controller objects to disconnect -+ keep_connections = self._keep_connections_on_exit() -+ controllers = self._controllers.values() -+ logging.debug( -+ 'ServiceABC._stop_hdlr() - Controller count = %s, keep_connections = %s', -+ len(controllers), keep_connections -+ ) -+ for controller in controllers: -+ controller.disconnect(self._on_final_disconnect, keep_connections) -+ -+ return GLib.SOURCE_REMOVE -+ -+ def _on_final_disconnect(self, controller, success): -+ '''Callback invoked after a controller is disconnected. -+ THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE -+ DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! -+ @param controller: the controller object -+ @param success: whether the disconnect operation was successful -+ ''' -+ logging.debug('ServiceABC._on_final_disconnect() - %s | %s disconnect %s', -+ controller.id, controller.device, 'succeeded' if success else 'failed') -+ self._remove_ctrl_from_dict(controller) -+ -+ controller.kill() -+ -+ # When all controllers have disconnected, we can finish the clean up -+ if len(self._controllers) == 0: -+ # Defer exit to the next main loop's idle period. -+ GLib.idle_add(self._exit) -+ -+ def _exit(self): -+ logging.debug('ServiceABC._exit()') -+ self._release_resources() -+ self._loop.quit() -+ -+ def _on_config_ctrls(self, *_user_data): -+ self._config_ctrls() -+ return GLib.SOURCE_REMOVE -+ -+ def _config_ctrls(self): -+ '''@brief Start controllers configuration.''' -+ # The configuration file may contain controllers and/or blacklist -+ # elements with traddr specified as hostname instead of IP address. -+ # Because of this, we need to remove those blacklisted elements before -+ # running name resolution. And we will need to remove blacklisted -+ # elements after name resolution is complete (i.e. in the calback -+ # function _config_ctrls_finish) -+ logging.debug('ServiceABC._config_ctrls()') -+ configured_controllers = remove_blacklisted(conf.SvcConf().get_controllers()) -+ self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) -+ -+ @abc.abstractmethod -+ def _keep_connections_on_exit(self): -+ '''@brief Determine whether connections should remain when the -+ process exits. -+ -+ NOTE) This is the base class method used to define the interface. -+ It must be overloaded by a child class. -+ ''' -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _config_ctrls_finish(self, configured_ctrl_list): -+ '''@brief Finish controllers configuration after hostnames (if any) -+ have been resolved. -+ -+ Configuring controllers must be done asynchronously in 2 steps. -+ In the first step, host names get resolved to find their IP addresses. -+ Name resolution can take a while, especially when an external name -+ resolution server is used. Once that step completed, the callback -+ method _config_ctrls_finish() (i.e. this method), gets invoked to -+ complete the controller configuration. -+ -+ NOTE) This is the base class method used to define the interface. -+ It must be overloaded by a child class. -+ ''' -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _load_last_known_config(self): -+ raise NotImplementedError() -+ -+ @abc.abstractmethod -+ def _dump_last_known_config(self, controllers): -+ raise NotImplementedError() -diff --git a/staslib/trid.py b/staslib/trid.py -index def6ab2..38619e7 100644 ---- a/staslib/trid.py -+++ b/staslib/trid.py -@@ -12,8 +12,7 @@ throughout nvme-stas to uniquely identify a Controller''' - import hashlib - from staslib import conf - --class TID: -- # pylint: disable=too-many-instance-attributes -+class TID: # pylint: disable=too-many-instance-attributes - '''Transport Identifier''' - RDMA_IP_PORT = '4420' - DISC_IP_PORT = '8009' -diff --git a/staslib/udev.py b/staslib/udev.py -index 29370b8..37b63cc 100644 ---- a/staslib/udev.py -+++ b/staslib/udev.py -@@ -16,7 +16,7 @@ from staslib import defs, trid - try: - from pyudev.glib import MonitorObserver - except (ModuleNotFoundError, AttributeError): -- from staslib.glibudev import MonitorObserver # pylint: disable=relative-beyond-top-level,ungrouped-imports -+ from staslib.glibudev import MonitorObserver # pylint: disable=ungrouped-imports - - # ****************************************************************************** - class Udev: -@@ -99,7 +99,7 @@ class Udev: - def get_attributes(self, sys_name: str, attr_ids) -> dict: - '''@brief Get all the attributes associated with device @sys_name''' - attrs = {attr_id: '' for attr_id in attr_ids} -- if sys_name: -+ if sys_name and sys_name != 'nvme?': - udev = self.get_nvme_device(sys_name) - if udev is not None: - for attr_id in attr_ids: -diff --git a/test/test-config.py b/test/test-config.py -index dad0ebd..db58883 100755 ---- a/test/test-config.py -+++ b/test/test-config.py -@@ -40,7 +40,7 @@ class StasProcessConfUnitTest(unittest.TestCase): - self.assertFalse(service_conf.data_digest) - self.assertTrue(service_conf.persistent_connections) - self.assertTrue(service_conf.udev_rule_enabled) -- self.assertFalse(service_conf.sticky_connections) -+ self.assertTrue(service_conf.sticky_connections) - self.assertFalse(service_conf.ignore_iface) - self.assertIn(6, service_conf.ip_family) - self.assertNotIn(4, service_conf.ip_family) -diff --git a/test/test-controller.py b/test/test-controller.py -index f23125e..f55781a 100755 ---- a/test/test-controller.py -+++ b/test/test-controller.py -@@ -8,24 +8,43 @@ from pyfakefs.fake_filesystem_unittest import TestCase - - LOOP = GLib.MainLoop() - -+ -+class TestController(ctrl.Controller): -+ def _find_existing_connection(self): -+ pass -+ -+ def _on_aen(self, aen: int): -+ pass -+ -+ def _on_nvme_event(self, nvme_event): -+ pass -+ -+ - class Test(TestCase): - '''Unit tests for class Controller''' - - def setUp(self): - self.setUpPyfakefs() - -- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') -- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') -- self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') -+ self.fs.create_file( -+ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' -+ ) -+ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') -+ self.fs.create_file( -+ '/dev/nvme-fabrics', -+ contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', -+ ) - -- self.NVME_TID = trid.TID({ -- 'transport': 'tcp', -- 'traddr': '10.10.10.10', -- 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', -- 'trsvcid': '8009', -- 'host-traddr': '1.2.3.4', -- 'host-iface': 'wlp0s20f3', -- }) -+ self.NVME_TID = trid.TID( -+ { -+ 'transport': 'tcp', -+ 'traddr': '10.10.10.10', -+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', -+ 'trsvcid': '8009', -+ 'host-traddr': '1.2.3.4', -+ 'host-iface': 'wlp0s20f3', -+ } -+ ) - - sysconf = conf.SysConf() - self.root = nvme.root() -@@ -34,32 +53,92 @@ class Test(TestCase): - def tearDown(self): - LOOP.quit() - -+ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): -+ # Make sure we can't instantiate the ABC directly (Abstract Base Class). -+ class Controller(ctrl.Controller): -+ pass -+ -+ self.assertRaises(TypeError, lambda: ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID)) -+ - def test_get_device(self): -- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) -+ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) - self.assertEqual(controller._connect_attempts, 0) -- self.assertRaises(NotImplementedError, controller._try_to_connect) -+ controller._try_to_connect() - self.assertEqual(controller._connect_attempts, 1) -- self.assertRaises(NotImplementedError, controller._find_existing_connection) -- self.assertEqual(controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") -+ self.assertEqual( -+ controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)" -+ ) - # raise Exception(controller._connect_op) -- self.assertEqual(str(controller.tid), "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") -- self.assertEqual(controller.device, '') -- self.assertEqual(str(controller.controller_id_dict()), "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': ''}") -- # self.assertEqual(controller.details(), "{'transport': 'tcp', 'traddr': '10.10.10.[265 chars]ff]'}") -- self.assertEqual(controller.info(), {'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': '', 'hostid': '', 'hostnqn': '', 'model': '', 'serial': '', 'connect attempts': '1', 'retry connect timer': '60.0s [off]'}) -+ self.assertEqual( -+ str(controller.tid), -+ "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)", -+ ) -+ self.assertEqual(controller.device, 'nvme?') -+ self.assertEqual( -+ str(controller.controller_id_dict()), -+ "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': 'nvme?'}", -+ ) -+ self.assertEqual( -+ controller.details(), -+ { -+ 'dctype': '', -+ 'cntrltype': '', -+ 'transport': 'tcp', -+ 'traddr': '10.10.10.10', -+ 'trsvcid': '8009', -+ 'host-traddr': '1.2.3.4', -+ 'host-iface': 'wlp0s20f3', -+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', -+ 'device': 'nvme?', -+ 'connect attempts': '1', -+ 'retry connect timer': '60.0s [off]', -+ 'hostid': '', -+ 'hostnqn': '', -+ 'model': '', -+ 'serial': '', -+ }, -+ ) -+ self.assertEqual( -+ controller.info(), -+ { -+ 'dctype': '', -+ 'cntrltype': '', -+ 'transport': 'tcp', -+ 'traddr': '10.10.10.10', -+ 'trsvcid': '8009', -+ 'host-traddr': '1.2.3.4', -+ 'host-iface': 'wlp0s20f3', -+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', -+ 'device': 'nvme?', -+ 'connect attempts': '1', -+ 'retry connect timer': '60.0s [off]', -+ 'hostid': '', -+ 'hostnqn': '', -+ 'model': '', -+ 'serial': '', -+ 'connect operation': {'fail count': 0}, -+ }, -+ ) -+ - # print(controller._connect_op) - self.assertEqual(controller.cancel(), None) - self.assertEqual(controller.kill(), None) - # self.assertEqual(controller.disconnect(), 0) - - def test_connect(self): -- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) -+ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) - self.assertEqual(controller._connect_attempts, 0) -- controller._find_existing_connection = lambda : None -+ controller._find_existing_connection = lambda: None - with self.assertLogs(logger=logging.getLogger(), level='DEBUG') as captured: - controller._try_to_connect() - self.assertEqual(len(captured.records), 1) -- self.assertTrue(captured.records[0].getMessage().startswith("Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False")) -+ self.assertTrue( -+ captured.records[0] -+ .getMessage() -+ .startswith( -+ "Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False" -+ ) -+ ) - self.assertEqual(controller._connect_attempts, 1) - - -diff --git a/test/test-service.py b/test/test-service.py -index 19f9b0c..4ce37be 100755 ---- a/test/test-service.py -+++ b/test/test-service.py -@@ -4,6 +4,7 @@ import unittest - from staslib import service - from pyfakefs.fake_filesystem_unittest import TestCase - -+ - class Args: - def __init__(self): - self.tron = True -@@ -11,6 +12,20 @@ class Args: - self.conf_file = '/dev/null' - - -+class TestService(service.Service): -+ def _config_ctrls_finish(self, configured_ctrl_list): -+ pass -+ -+ def _dump_last_known_config(self, controllers): -+ pass -+ -+ def _keep_connections_on_exit(self): -+ pass -+ -+ def _load_last_known_config(self): -+ return dict() -+ -+ - class Test(TestCase): - '''Unit tests for class Service''' - -@@ -18,22 +33,39 @@ class Test(TestCase): - self.setUpPyfakefs() - - os.environ['RUNTIME_DIRECTORY'] = "/run" -- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') -- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') -- self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') -+ self.fs.create_file( -+ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' -+ ) -+ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') -+ self.fs.create_file( -+ '/dev/nvme-fabrics', -+ contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', -+ ) -+ -+ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): -+ # Make sure we can't instantiate the ABC directly (Abstract Base Class). -+ class Service(service.Service): -+ pass -+ -+ self.assertRaises(TypeError, lambda: Service(Args(), reload_hdlr=lambda x: x)) - - def test_get_controller(self): -- # FIXME: this is hack, fix it later -- service.Service._load_last_known_config = lambda x : dict() -- # start the test -- -- srv = service.Service(Args(), reload_hdlr=lambda x : x) -- self.assertRaises(NotImplementedError, srv._keep_connections_on_exit) -- self.assertRaises(NotImplementedError, srv._dump_last_known_config, []) -- self.assertRaises(NotImplementedError, srv._on_config_ctrls) -- #self.assertEqual(srv.get_controllers(), dict()) -- self.assertEqual(srv.get_controller(transport='tcp', traddr='10.10.10.10', trsvcid='8009', host_traddr='1.2.3.4', host_iface='wlp0s20f3', subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8'), None) -- self.assertEqual(srv.remove_controller(controller=None), None) -+ srv = TestService(Args(), reload_hdlr=lambda x: x) -+ -+ self.assertEqual(list(srv.get_controllers()), list()) -+ self.assertEqual( -+ srv.get_controller( -+ transport='tcp', -+ traddr='10.10.10.10', -+ trsvcid='8009', -+ host_traddr='1.2.3.4', -+ host_iface='wlp0s20f3', -+ subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8', -+ ), -+ None, -+ ) -+ self.assertEqual(srv.remove_controller(controller=None, success=True), None) -+ - - if __name__ == '__main__': - unittest.main() diff --git a/SPECS/nvme-stas.spec b/SPECS/nvme-stas.spec index f8175ea..ace9b74 100644 --- a/SPECS/nvme-stas.spec +++ b/SPECS/nvme-stas.spec @@ -3,18 +3,17 @@ Name: nvme-stas Summary: NVMe STorage Appliance Services -Version: 1.1.6 -Release: 3%{?dist} +Version: 2.1.1 +Release: 1%{?dist} License: ASL 2.0 URL: https://github.com/linux-nvme/nvme-stas Source0: %{url}/archive/v%{version_no_tilde}/%{name}-%{version_no_tilde}.tar.gz -Patch0: 0001-sync-with-1.1.6.patch BuildArch: noarch BuildRequires: meson >= 0.57.0 BuildRequires: glib2-devel -BuildRequires: libnvme-devel +BuildRequires: libnvme-devel >= 1.2 BuildRequires: libxslt BuildRequires: docbook-style-xsl BuildRequires: systemd-devel @@ -32,7 +31,7 @@ BuildRequires: python3-gobject-devel BuildRequires: python3-lxml Requires: avahi -Requires: python3-libnvme +Requires: python3-libnvme >= 1.2 Requires: python3-dasbus Requires: python3-pyudev Requires: python3-systemd @@ -48,6 +47,7 @@ stafd (STorage Appliance Finder) and stacd (STorage Appliance Connector). %prep %autosetup -p1 -n %{name}-%{version_no_tilde} +sed -i meson.build -e "s/subdir('test')//" %build %meson -Dman=true -Dhtml=true @@ -94,8 +94,13 @@ mv %{buildroot}/%{_sysconfdir}/stas/sys.conf.doc %{buildroot}/%{_sysconfdir}/sta %{_mandir}/man7/nvme*.7* %{_mandir}/man8/sta*.8* - %changelog +* Fri Jan 13 2023 John Meneghini - 2.1.1-1 + - Update to the v2.1.1 package + +* Tue Nov 08 2022 Maurizio Lombardi - 2.0-1 +- Update to the latest v2.0 package + * Thu Aug 04 2022 Maurizio Lombardi - 1.1.6-3 - Sync with the official 1.1.6 version