18971c
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001
18971c
From: Xunlei Pang <xlpang@redhat.com>
18971c
Date: Thu, 3 Nov 2016 19:30:41 +0800
18971c
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large
18971c
 memory consumption
18971c
18971c
The current method for memory debug is to use "rd.memdebug=[0-3]",
18971c
it is not enough for debugging kernel modules. For example, when we
18971c
want to find out which kernel module consumes a large amount of memory,
18971c
"rd.memdebug=[0-3]" won't help too much.
18971c
18971c
A better way is needed to achieve this requirement, this is useful for
18971c
kdump OOM debugging.
18971c
18971c
The principle of this patch is to use kernel trace to track slab and
18971c
buddy allocation calls during kernel module loading(module_init), thus
18971c
we can analyze all the trace data and get the total memory consumption.
18971c
As for large slab allocation, it will probably fall into buddy allocation,
18971c
thus tracing "mm_page_alloc" alone should be enough for the purpose(this
18971c
saves quite some trace buffer memory, also large free is quite unlikey
18971c
during module loading, we neglect those memory free events).
18971c
18971c
The trace events include memory calls under "tracing/events/":
18971c
  kmem/mm_page_alloc
18971c
18971c
We also inpect the following events to detect the module loading:
18971c
  module/module_load
18971c
  module/module_put
18971c
18971c
Since we use filters to trace events, the final trace data size won't
18971c
be too big. Users can adjust the trace buffer size via "trace_buf_size"
18971c
kernel boot command line as needed.
18971c
18971c
We can get the module name and task pid from "module_load" event which
18971c
also mark the beginning of the loading, and module_put called by the
18971c
same task pid implies the end of the loading. So the memory events
18971c
recorded in between by the same task pid are consumed by this module
18971c
during loading(i.e. modprobe or module_init()).
18971c
18971c
With these information, we can record the rough total memory(the larger,
18971c
the more precise the result will be) consumption involved by each kernel
18971c
module loading.
18971c
18971c
Thus we introduce this shell script to find out which kernel module
18971c
consumes a large amount of memory during loading. Use "rd.memdebug=4"
18971c
as the tigger.
18971c
18971c
After applying this patch and specifying "rd.memdebug=4", during booting
18971c
it will print out something extra like below:
18971c
0 pages consumed by "pata_acpi"
18971c
0 pages consumed by "ata_generic"
18971c
1 pages consumed by "drm"
18971c
0 pages consumed by "ttm"
18971c
0 pages consumed by "drm_kms_helper"
18971c
835 pages consumed by "qxl"
18971c
0 pages consumed by "mii"
18971c
6 pages consumed by "8139cp"
18971c
0 pages consumed by "virtio"
18971c
0 pages consumed by "virtio_ring"
18971c
9 pages consumed by "virtio_pci"
18971c
1 pages consumed by "8139too"
18971c
0 pages consumed by "serio_raw"
18971c
0 pages consumed by "crc32c_intel"
18971c
199 pages consumed by "virtio_console"
18971c
0 pages consumed by "libcrc32c"
18971c
9 pages consumed by "xfs"
18971c
18971c
From the print, we see clearly that "qxl" consumed the most memory.
18971c
18971c
This file will be installed as a separate executable named "tracekomem"
18971c
in the following patch.
18971c
18971c
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
18971c
---
18971c
 modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++
18971c
 1 file changed, 191 insertions(+)
18971c
 create mode 100755 modules.d/99base/memtrace-ko.sh
18971c
18971c
diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh
18971c
new file mode 100755
18971c
index 00000000..ee035e15
18971c
--- /dev/null
18971c
+++ b/modules.d/99base/memtrace-ko.sh
18971c
@@ -0,0 +1,191 @@
18971c
+#!/bin/sh
18971c
+
18971c
+# Try to find out kernel modules with large total memory allocation during loading.
18971c
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free"
18971c
+# considering large free is quite rare for module_init, thus saving tons of events
18971c
+# to avoid trace data overwritten.
18971c
+#
18971c
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose.
18971c
+
18971c
+# "sys/kernel/tracing" has the priority if exists.
18971c
+get_trace_base() {
18971c
+    # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available.
18971c
+    if [ -d "/sys/kernel/tracing" ]; then
18971c
+        echo "/sys/kernel"
18971c
+    else
18971c
+        echo "/sys/kernel/debug"
18971c
+    fi
18971c
+}
18971c
+
18971c
+# We want to enable these trace events.
18971c
+get_want_events() {
18971c
+    echo "module:module_put module:module_load kmem:mm_page_alloc"
18971c
+}
18971c
+
18971c
+get_event_filter() {
18971c
+    echo "comm == systemd-udevd || comm == modprobe || comm == insmod"
18971c
+}
18971c
+
18971c
+is_trace_ready() {
18971c
+    local trace_base want_events current_events
18971c
+
18971c
+    trace_base=$(get_trace_base)
18971c
+    ! [ -f "$trace_base/tracing/trace" ] && return 1
18971c
+
18971c
+    [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1
18971c
+
18971c
+    # Also check if trace events were properly setup.
18971c
+    want_events=$(get_want_events)
18971c
+    current_events=$(echo $(cat $trace_base/tracing/set_event))
18971c
+    [ "$current_events" != "$want_events" ] && return 1
18971c
+
18971c
+    return 0
18971c
+}
18971c
+
18971c
+prepare_trace() {
18971c
+    local trace_base
18971c
+
18971c
+    trace_base=$(get_trace_base)
18971c
+    # old debugfs interface case.
18971c
+    if ! [ -d "$trace_base/tracing" ]; then
18971c
+        mount none -t debugfs $trace_base
18971c
+    # new tracefs interface case.
18971c
+    elif ! [ -f "$trace_base/tracing/trace" ]; then
18971c
+        mount none -t tracefs "$trace_base/tracing"
18971c
+    fi
18971c
+
18971c
+    if ! [ -f "$trace_base/tracing/trace" ]; then
18971c
+        echo "WARN: Mount trace failed for kernel module memory analyzing."
18971c
+        return 1
18971c
+    fi
18971c
+
18971c
+    # Active all the wanted trace events.
18971c
+    echo "$(get_want_events)" > $trace_base/tracing/set_event
18971c
+
18971c
+    # There are three kinds of known applications for module loading:
18971c
+    # "systemd-udevd", "modprobe" and "insmod".
18971c
+    # Set them as the global events filter.
18971c
+    # NOTE: Some kernel may not support this format of filter, anyway
18971c
+    #       the operation will fail and it doesn't matter.
18971c
+    echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1
18971c
+    echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1
18971c
+
18971c
+    # Set the number of comm-pid if supported.
18971c
+    if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then
18971c
+        # Thanks to filters, 4096 is big enough(also well supported).
18971c
+        echo 4096 > $trace_base/tracing/saved_cmdlines_size
18971c
+    fi
18971c
+
18971c
+    # Enable and clear trace data for the first time.
18971c
+    echo 1 > $trace_base/tracing/tracing_on
18971c
+    echo > $trace_base/tracing/trace
18971c
+    echo "Prepare trace success."
18971c
+    return 0
18971c
+}
18971c
+
18971c
+order_to_pages()
18971c
+{
18971c
+    local pages=1
18971c
+    local order=$1
18971c
+
18971c
+    while [ "$order" != 0 ]; do
18971c
+        order=$((order-1))
18971c
+        pages=$(($pages*2))
18971c
+	done
18971c
+
18971c
+    echo $pages
18971c
+}
18971c
+
18971c
+parse_trace_data() {
18971c
+    local module_name tmp_eval pages
18971c
+
18971c
+    cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args
18971c
+    do
18971c
+        # Skip comment lines
18971c
+        if [ "$pid" = "#" ]; then
18971c
+            continue
18971c
+        fi
18971c
+
18971c
+        pid=${pid##*-}
18971c
+        function=${function%:}
18971c
+        if [ "$function" = "module_load" ]; then
18971c
+            # One module is being loaded, save the task pid for tracking.
18971c
+            # Remove the trailing after whitespace, there may be the module flags.
18971c
+            module_name=${args%% *}
18971c
+            # Mark current_module to track the task.
18971c
+            eval current_module_$pid="$module_name"
18971c
+            tmp_eval=$(eval echo '${module_loaded_'${module_name}'}')
18971c
+            if [ -n "$tmp_eval" ]; then
18971c
+                echo "WARN: \"$module_name\" was loaded multiple times!"
18971c
+            fi
18971c
+            eval unset module_loaded_$module_name
18971c
+            eval nr_alloc_pages_$module_name=0
18971c
+            continue
18971c
+        fi
18971c
+
18971c
+        module_name=$(eval echo '${current_module_'${pid}'}')
18971c
+        if [ -z "$module_name" ]; then
18971c
+            continue
18971c
+        fi
18971c
+
18971c
+        # Once we get here, the task is being tracked(is loading a module).
18971c
+        if [ "$function" = "module_put" ]; then
18971c
+            # Mark the module as loaded when the first module_put event happens after module_load.
18971c
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
18971c
+            echo "$tmp_eval pages consumed by \"$module_name\""
18971c
+            eval module_loaded_$module_name=1
18971c
+            # Module loading finished, so untrack the task.
18971c
+            eval unset current_module_$pid
18971c
+            eval unset nr_alloc_pages_$module_name
18971c
+            continue
18971c
+        fi
18971c
+
18971c
+        if [ "$function" = "mm_page_alloc" ]; then
18971c
+            # Get order first, then convert to actual pages.
18971c
+            pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/')
18971c
+            pages=$(order_to_pages "$pages")
18971c
+            tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}')
18971c
+            eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))"
18971c
+        fi
18971c
+    done
18971c
+}
18971c
+
18971c
+cleanup_trace() {
18971c
+    local trace_base
18971c
+
18971c
+    if is_trace_ready; then
18971c
+        trace_base=$(get_trace_base)
18971c
+        echo 0 > $trace_base/tracing/tracing_on
18971c
+        echo > $trace_base/tracing/trace
18971c
+        echo > $trace_base/tracing/set_event
18971c
+        echo 0 > $trace_base/tracing/events/kmem/filter
18971c
+        echo 0 > $trace_base/tracing/events/module/filter
18971c
+    fi
18971c
+}
18971c
+
18971c
+show_usage() {
18971c
+    echo "Find out kernel modules with large memory consumption during loading based on trace."
18971c
+    echo "Usage:"
18971c
+    echo "1) run it first to setup trace."
18971c
+    echo "2) run again to parse the trace data if any."
18971c
+    echo "3) run with \"--cleanup\" option to cleanup trace after use."
18971c
+}
18971c
+
18971c
+if [ "$1" = "--help" ]; then
18971c
+    show_usage
18971c
+    exit 0
18971c
+fi
18971c
+
18971c
+if [ "$1" = "--cleanup" ]; then
18971c
+    cleanup_trace
18971c
+    exit 0
18971c
+fi
18971c
+
18971c
+if is_trace_ready ; then
18971c
+    echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)"
18971c
+    parse_trace_data
18971c
+else
18971c
+    prepare_trace
18971c
+fi
18971c
+
18971c
+exit $?