Blob Blame History Raw
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 3 Nov 2016 19:30:41 +0800
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large
 memory consumption

The current method for memory debug is to use "rd.memdebug=[0-3]",
it is not enough for debugging kernel modules. For example, when we
want to find out which kernel module consumes a large amount of memory,
"rd.memdebug=[0-3]" won't help too much.

A better way is needed to achieve this requirement, this is useful for
kdump OOM debugging.

The principle of this patch is to use kernel trace to track slab and
buddy allocation calls during kernel module loading(module_init), thus
we can analyze all the trace data and get the total memory consumption.
As for large slab allocation, it will probably fall into buddy allocation,
thus tracing "mm_page_alloc" alone should be enough for the purpose(this
saves quite some trace buffer memory, also large free is quite unlikey
during module loading, we neglect those memory free events).

The trace events include memory calls under "tracing/events/":
  kmem/mm_page_alloc

We also inpect the following events to detect the module loading:
  module/module_load
  module/module_put

Since we use filters to trace events, the final trace data size won't
be too big. Users can adjust the trace buffer size via "trace_buf_size"
kernel boot command line as needed.

We can get the module name and task pid from "module_load" event which
also mark the beginning of the loading, and module_put called by the
same task pid implies the end of the loading. So the memory events
recorded in between by the same task pid are consumed by this module
during loading(i.e. modprobe or module_init()).

With these information, we can record the rough total memory(the larger,
the more precise the result will be) consumption involved by each kernel
module loading.

Thus we introduce this shell script to find out which kernel module
consumes a large amount of memory during loading. Use "rd.memdebug=4"
as the tigger.

After applying this patch and specifying "rd.memdebug=4", during booting
it will print out something extra like below:
0 pages consumed by "pata_acpi"
0 pages consumed by "ata_generic"
1 pages consumed by "drm"
0 pages consumed by "ttm"
0 pages consumed by "drm_kms_helper"
835 pages consumed by "qxl"
0 pages consumed by "mii"
6 pages consumed by "8139cp"
0 pages consumed by "virtio"
0 pages consumed by "virtio_ring"
9 pages consumed by "virtio_pci"
1 pages consumed by "8139too"
0 pages consumed by "serio_raw"
0 pages consumed by "crc32c_intel"
199 pages consumed by "virtio_console"
0 pages consumed by "libcrc32c"
9 pages consumed by "xfs"

From the print, we see clearly that "qxl" consumed the most memory.

This file will be installed as a separate executable named "tracekomem"
in the following patch.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100755 modules.d/99base/memtrace-ko.sh

diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh
new file mode 100755
index 00000000..ee035e15
--- /dev/null
+++ b/modules.d/99base/memtrace-ko.sh
@@ -0,0 +1,191 @@
+#!/bin/sh
+
+# Try to find out kernel modules with large total memory allocation during loading.
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
+# considering large free is quite rare for module_init, thus saving tons of events
+# to avoid trace data overwritten.
+#
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
+
+# "sys/kernel/tracing" has the priority if exists.
+get_trace_base() {
+    # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
+    if [ -d "/sys/kernel/tracing" ]; then
+        echo "/sys/kernel"
+    else
+        echo "/sys/kernel/debug"
+    fi
+}
+
+# We want to enable these trace events.
+get_want_events() {
+    echo "module:module_put module:module_load kmem:mm_page_alloc"
+}
+
+get_event_filter() {
+    echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
+}
+
+is_trace_ready() {
+    local trace_base want_events current_events
+
+    trace_base=$(get_trace_base)
+    ! [ -f "$trace_base/tracing/trace" ] && return 1
+
+    [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
+
+    # Also check if trace events were properly setup.
+    want_events=$(get_want_events)
+    current_events=$(echo $(cat $trace_base/tracing/set_event))
+    [ "$current_events" != "$want_events" ] && return 1
+
+    return 0
+}
+
+prepare_trace() {
+    local trace_base
+
+    trace_base=$(get_trace_base)
+    # old debugfs interface case.
+    if ! [ -d "$trace_base/tracing" ]; then
+        mount none -t debugfs $trace_base
+    # new tracefs interface case.
+    elif ! [ -f "$trace_base/tracing/trace" ]; then
+        mount none -t tracefs "$trace_base/tracing"
+    fi
+
+    if ! [ -f "$trace_base/tracing/trace" ]; then
+        echo "WARN: Mount trace failed for kernel module memory analyzing."
+        return 1
+    fi
+
+    # Active all the wanted trace events.
+    echo "$(get_want_events)" > $trace_base/tracing/set_event
+
+    # There are three kinds of known applications for module loading:
+    # "systemd-udevd", "modprobe" and "insmod".
+    # Set them as the global events filter.
+    # NOTE: Some kernel may not support this format of filter, anyway
+    #       the operation will fail and it doesn't matter.
+    echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
+    echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
+
+    # Set the number of comm-pid if supported.
+    if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
+        # Thanks to filters, 4096 is big enough(also well supported).
+        echo 4096 > $trace_base/tracing/saved_cmdlines_size
+    fi
+
+    # Enable and clear trace data for the first time.
+    echo 1 > $trace_base/tracing/tracing_on
+    echo > $trace_base/tracing/trace
+    echo "Prepare trace success."
+    return 0
+}
+
+order_to_pages()
+{
+    local pages=1
+    local order=$1
+
+    while [ "$order" != 0 ]; do
+        order=$((order-1))
+        pages=$(($pages*2))
+	done
+
+    echo $pages
+}
+
+parse_trace_data() {
+    local module_name tmp_eval pages
+
+    cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
+    do
+        # Skip comment lines
+        if [ "$pid" = "#" ]; then
+            continue
+        fi
+
+        pid=${pid##*-}
+        function=${function%:}
+        if [ "$function" = "module_load" ]; then
+            # One module is being loaded, save the task pid for tracking.
+            # Remove the trailing after whitespace, there may be the module flags.
+            module_name=${args%% *}
+            # Mark current_module to track the task.
+            eval current_module_$pid="$module_name"
+            tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
+            if [ -n "$tmp_eval" ]; then
+                echo "WARN: \"$module_name\" was loaded multiple times!"
+            fi
+            eval unset module_loaded_$module_name
+            eval nr_alloc_pages_$module_name=0
+            continue
+        fi
+
+        module_name=$(eval echo '${current_module_'${pid}'}')
+        if [ -z "$module_name" ]; then
+            continue
+        fi
+
+        # Once we get here, the task is being tracked(is loading a module).
+        if [ "$function" = "module_put" ]; then
+            # Mark the module as loaded when the first module_put event happens after module_load.
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
+            echo "$tmp_eval pages consumed by \"$module_name\""
+            eval module_loaded_$module_name=1
+            # Module loading finished, so untrack the task.
+            eval unset current_module_$pid
+            eval unset nr_alloc_pages_$module_name
+            continue
+        fi
+
+        if [ "$function" = "mm_page_alloc" ]; then
+            # Get order first, then convert to actual pages.
+            pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
+            pages=$(order_to_pages "$pages")
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
+            eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
+        fi
+    done
+}
+
+cleanup_trace() {
+    local trace_base
+
+    if is_trace_ready; then
+        trace_base=$(get_trace_base)
+        echo 0 > $trace_base/tracing/tracing_on
+        echo > $trace_base/tracing/trace
+        echo > $trace_base/tracing/set_event
+        echo 0 > $trace_base/tracing/events/kmem/filter
+        echo 0 > $trace_base/tracing/events/module/filter
+    fi
+}
+
+show_usage() {
+    echo "Find out kernel modules with large memory consumption during loading based on trace."
+    echo "Usage:"
+    echo "1) run it first to setup trace."
+    echo "2) run again to parse the trace data if any."
+    echo "3) run with \"--cleanup\" option to cleanup trace after use."
+}
+
+if [ "$1" = "--help" ]; then
+    show_usage
+    exit 0
+fi
+
+if [ "$1" = "--cleanup" ]; then
+    cleanup_trace
+    exit 0
+fi
+
+if is_trace_ready ; then
+    echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
+    parse_trace_data
+else
+    prepare_trace
+fi
+
+exit $?