From c1a8000e8122d615d6e4d0569e8bbb4a76efc79d Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 12 Feb 2026 15:53:33 -0600 Subject: [PATCH] numa checks --- .../plugins/inband/kernel/analyzer_args.py | 8 +- .../plugins/inband/kernel/kernel_analyzer.py | 55 ++++++++---- .../plugins/inband/kernel/kernel_collector.py | 11 ++- .../plugins/inband/kernel/kerneldata.py | 3 + .../fixtures/kernel_plugin_config.json | 1 + test/unit/plugin/test_kernel_analyzer.py | 85 ++++++++++++++++--- test/unit/plugin/test_kernel_collector.py | 80 +++++++++++++++-- 7 files changed, 203 insertions(+), 40 deletions(-) diff --git a/nodescraper/plugins/inband/kernel/analyzer_args.py b/nodescraper/plugins/inband/kernel/analyzer_args.py index e8f4cd61..d7be40b6 100644 --- a/nodescraper/plugins/inband/kernel/analyzer_args.py +++ b/nodescraper/plugins/inband/kernel/analyzer_args.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -from typing import Union +from typing import Optional, Union from pydantic import Field, field_validator @@ -33,6 +33,7 @@ class KernelAnalyzerArgs(AnalyzerArgs): exp_kernel: Union[str, list] = Field(default_factory=list) + exp_numa: Optional[int] = None regex_match: bool = False @field_validator("exp_kernel", mode="before") @@ -61,4 +62,7 @@ def build_from_model(cls, datamodel: KernelDataModel) -> "KernelAnalyzerArgs": Returns: KernelAnalyzerArgs: instance of analyzer args class """ - return cls(exp_kernel=datamodel.kernel_version) + return cls( + exp_kernel=datamodel.kernel_version, + exp_numa=datamodel.numa_balancing, + ) diff --git a/nodescraper/plugins/inband/kernel/kernel_analyzer.py b/nodescraper/plugins/inband/kernel/kernel_analyzer.py index 4657b842..547179f7 100644 --- a/nodescraper/plugins/inband/kernel/kernel_analyzer.py +++ b/nodescraper/plugins/inband/kernel/kernel_analyzer.py @@ -51,11 +51,21 @@ def analyze_data( Returns: TaskResult: Result of the analysis containing status and message. """ - if not args: + correct_kernel_version = False + correct_numa_setting = False + # skip check if data not provided in config + if not args or not args.exp_kernel: self.result.message = "Expected kernel not provided" self.result.status = ExecutionStatus.NOT_RAN return self.result + if ( + args.exp_numa is None + or data.numa_balancing is None + or data.numa_balancing == args.exp_numa + ): + correct_numa_setting = True + for kernel in args.exp_kernel: if args.regex_match: try: @@ -69,21 +79,34 @@ def analyze_data( ) continue if regex_data.match(data.kernel_version): - self.result.message = "Kernel matches expected" - self.result.status = ExecutionStatus.OK - return self.result + correct_kernel_version = True + break elif data.kernel_version == kernel: - self.result.message = "Kernel matches expected" - self.result.status = ExecutionStatus.OK - return self.result + correct_kernel_version = True + break + + if not correct_kernel_version: + self.result.message = "unexpected kernel data!" + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.OS, + description="unexpected kernel version!", + data={"expected": args.exp_kernel, "actual": data.kernel_version}, + priority=EventPriority.CRITICAL, + console_log=True, + ) + elif not correct_numa_setting: + self.result.message = "unexpected kernel data!" + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.OS, + description="unexpected numa_balancing setting!", + data={"expected": args.exp_numa, "actual": data.numa_balancing}, + priority=EventPriority.CRITICAL, + console_log=True, + ) + else: + self.result.message = "Kernel matches expected" + self.result.status = ExecutionStatus.OK - self.result.message = "Kernel mismatch!" - self.result.status = ExecutionStatus.ERROR - self._log_event( - category=EventCategory.OS, - description=f"Kernel mismatch! Expected: {args.exp_kernel}, actual: {data.kernel_version}", - data={"expected": args.exp_kernel, "actual": data.kernel_version}, - priority=EventPriority.CRITICAL, - console_log=True, - ) return self.result diff --git a/nodescraper/plugins/inband/kernel/kernel_collector.py b/nodescraper/plugins/inband/kernel/kernel_collector.py index a9ac81ad..6b188940 100644 --- a/nodescraper/plugins/inband/kernel/kernel_collector.py +++ b/nodescraper/plugins/inband/kernel/kernel_collector.py @@ -39,6 +39,7 @@ class KernelCollector(InBandDataCollector[KernelDataModel, None]): DATA_MODEL = KernelDataModel CMD_WINDOWS = "wmic os get Version /Value" CMD = "sh -c 'uname -a'" + CMD_NUMA_BALANCING = "sh -c 'cat /proc/sys/kernel/numa_balancing'" def _parse_kernel_version(self, uname_a: str) -> Optional[str]: """Extract the kernel release from `uname -a` output. @@ -77,6 +78,7 @@ def collect_data( kernel = None kernel_info = None + numa_balancing = None if self.system_info.os_family == OSFamily.WINDOWS: res = self._run_sut_cmd(self.CMD_WINDOWS) @@ -90,6 +92,9 @@ def collect_data( if res.exit_code == 0: kernel_info = res.stdout kernel = self._parse_kernel_version(kernel_info) + numa_res = self._run_sut_cmd(self.CMD_NUMA_BALANCING) + if numa_res.exit_code == 0 and numa_res.stdout.strip().isdigit(): + numa_balancing = int(numa_res.stdout.strip()) if not kernel: self._log_event( category=EventCategory.OS, @@ -110,7 +115,11 @@ def collect_data( if kernel_info and kernel: - kernel_data = KernelDataModel(kernel_info=kernel_info, kernel_version=kernel) + kernel_data = KernelDataModel( + kernel_info=kernel_info, + kernel_version=kernel, + numa_balancing=numa_balancing, + ) self._log_event( category="KERNEL_READ", description="Kernel version read", diff --git a/nodescraper/plugins/inband/kernel/kerneldata.py b/nodescraper/plugins/inband/kernel/kerneldata.py index 45f521eb..f90832f7 100644 --- a/nodescraper/plugins/inband/kernel/kerneldata.py +++ b/nodescraper/plugins/inband/kernel/kerneldata.py @@ -24,9 +24,12 @@ # ############################################################################### +from typing import Optional + from nodescraper.models import DataModel class KernelDataModel(DataModel): kernel_info: str kernel_version: str + numa_balancing: Optional[int] = None diff --git a/test/functional/fixtures/kernel_plugin_config.json b/test/functional/fixtures/kernel_plugin_config.json index 7c2cec92..291bb0ee 100644 --- a/test/functional/fixtures/kernel_plugin_config.json +++ b/test/functional/fixtures/kernel_plugin_config.json @@ -4,6 +4,7 @@ "KernelPlugin": { "analysis_args": { "exp_kernel": "5.11-generic", + "exp_numa": 0, "regex_match": false } } diff --git a/test/unit/plugin/test_kernel_analyzer.py b/test/unit/plugin/test_kernel_analyzer.py index 81785abd..c8fa3b2b 100644 --- a/test/unit/plugin/test_kernel_analyzer.py +++ b/test/unit/plugin/test_kernel_analyzer.py @@ -38,6 +38,7 @@ def model_obj(): return KernelDataModel( kernel_info="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", kernel_version="5.13.0-30-generic", + numa_balancing=0, ) @@ -54,7 +55,7 @@ def config(): def test_all_good_data(system_info, model_obj, config): - args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"]) + args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"], exp_numa=0) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) @@ -64,7 +65,7 @@ def test_all_good_data(system_info, model_obj, config): def test_all_good_data_strings(system_info, model_obj, config): - args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"][0]) + args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"][0], exp_numa=0) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) @@ -85,27 +86,29 @@ def test_no_config_data(system_info, model_obj): def test_invalid_kernel(system_info, model_obj, config): - args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"]) + args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"], exp_numa=0) model_obj.kernel_version = "some_invalid" analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args=args) assert result.status == ExecutionStatus.ERROR - assert "Kernel mismatch" in result.message + assert "unexpected kernel data!" in result.message assert any( - event.priority == EventPriority.CRITICAL and event.category == EventCategory.OS.value + event.priority == EventPriority.CRITICAL + and event.category == EventCategory.OS.value + and "unexpected kernel version!" in event.description for event in result.events ) def test_unexpected_kernel(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=["5.18.2-mi300-build"]) + args = KernelAnalyzerArgs(exp_kernel=["5.18.2-mi300-build"], exp_numa=0) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) assert result.status == ExecutionStatus.ERROR - assert "Kernel mismatch!" in result.message + assert "unexpected kernel data!" in result.message assert any( event.priority == EventPriority.CRITICAL and event.category == EventCategory.OS.value for event in result.events @@ -113,7 +116,7 @@ def test_unexpected_kernel(system_info, model_obj): def test_invalid_kernel_config(system_info, model_obj, config): - args = KernelAnalyzerArgs(exp_kernel=config["invalid"]) + args = KernelAnalyzerArgs(exp_kernel=config["invalid"], exp_numa=0) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) @@ -121,14 +124,18 @@ def test_invalid_kernel_config(system_info, model_obj, config): def test_match_regex(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=[r".*5\.13\.\d+-\d+-[\w-]+.*"], regex_match=True) + args = KernelAnalyzerArgs( + exp_kernel=[r".*5\.13\.\d+-\d+-[\w-]+.*"], regex_match=True, exp_numa=0 + ) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) assert result.status == ExecutionStatus.OK def test_mismatch_regex(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=[r".*4\.13\.\d+-\d+-[\w-]+.*"], regex_match=True) + args = KernelAnalyzerArgs( + exp_kernel=[r".*4\.13\.\d+-\d+-[\w-]+.*"], regex_match=True, exp_numa=0 + ) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) @@ -136,11 +143,11 @@ def test_mismatch_regex(system_info, model_obj): assert len(result.events) == 1 assert result.events[0].priority == EventPriority.CRITICAL assert result.events[0].category == EventCategory.OS.value - assert "Kernel mismatch!" in result.events[0].description + assert "unexpected kernel version!" in result.events[0].description def test_bad_regex(system_info, model_obj): - args = KernelAnalyzerArgs(exp_kernel=[r"4.[3.\d-\d+-[\w]+"], regex_match=True) + args = KernelAnalyzerArgs(exp_kernel=[r"4.[3.\d-\d+-[\w]+"], regex_match=True, exp_numa=0) analyzer = KernelAnalyzer(system_info) result = analyzer.analyze_data(model_obj, args) @@ -151,4 +158,56 @@ def test_bad_regex(system_info, model_obj): assert result.events[0].description == "Kernel regex is invalid" assert result.events[1].priority == EventPriority.CRITICAL assert result.events[1].category == EventCategory.OS.value - assert "Kernel mismatch!" in result.events[1].description + assert "unexpected kernel version!" in result.events[1].description + + +def test_unexpected_numa(system_info, model_obj, config): + """Test with config specifying a different numa value than actual.""" + args = KernelAnalyzerArgs( + exp_kernel=config["kernel_name"][0], + exp_numa=1, + ) + analyzer = KernelAnalyzer(system_info) + result = analyzer.analyze_data(model_obj, args) + + assert result.status == ExecutionStatus.ERROR + assert "unexpected kernel data!" in result.message + assert any( + event.priority == EventPriority.CRITICAL + and event.category == EventCategory.OS.value + and "unexpected numa_balancing setting!" in event.description + for event in result.events + ) + + +def test_no_expected_numa(system_info, model_obj, config): + """Test with no expected numa provided to analyzer (NUMA check skipped).""" + args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"][0], exp_numa=None) + analyzer = KernelAnalyzer(system_info) + result = analyzer.analyze_data(model_obj, args) + + assert result.status == ExecutionStatus.OK + assert "Kernel matches expected" in result.message + assert all( + event.priority not in [EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL] + for event in result.events + ) + + +def test_no_numa_balancing(system_info, model_obj, config): + """Test when data has no numa_balancing (e.g. not collected); NUMA check passes.""" + data_no_numa = KernelDataModel( + kernel_info=model_obj.kernel_info, + kernel_version=config["kernel_name"][0], + numa_balancing=None, + ) + args = KernelAnalyzerArgs(exp_kernel=config["kernel_name"][0], exp_numa=0) + analyzer = KernelAnalyzer(system_info) + result = analyzer.analyze_data(data_no_numa, args) + + assert result.status == ExecutionStatus.OK + assert "Kernel matches expected" in result.message + assert all( + event.priority not in [EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL] + for event in result.events + ) diff --git a/test/unit/plugin/test_kernel_collector.py b/test/unit/plugin/test_kernel_collector.py index b1f26257..3b370783 100644 --- a/test/unit/plugin/test_kernel_collector.py +++ b/test/unit/plugin/test_kernel_collector.py @@ -54,25 +54,36 @@ def test_run_windows(collector, conn_mock): result, data = collector.collect_data() assert data == KernelDataModel( - kernel_info="Version=10.0.19041.1237", kernel_version="10.0.19041.1237" + kernel_info="Version=10.0.19041.1237", + kernel_version="10.0.19041.1237", + numa_balancing=None, ) assert result.status == ExecutionStatus.OK def test_run_linux(collector, conn_mock): collector.system_info.os_family = OSFamily.LINUX - conn_mock.run_command.return_value = CommandArtifact( - exit_code=0, - stdout="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", - stderr="", - command="sh -c 'uname -a'", - ) + conn_mock.run_command.side_effect = [ + CommandArtifact( + exit_code=0, + stdout="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", + stderr="", + command="sh -c 'uname -a'", + ), + CommandArtifact( + exit_code=0, + stdout="0", + stderr="", + command="sh -c 'cat /proc/sys/kernel/numa_balancing'", + ), + ] result, data = collector.collect_data() assert data == KernelDataModel( kernel_info="Linux MockSystem 5.13.0-30-generic #1 XYZ Day Month 10 15:19:13 EDT 2024 x86_64 x86_64 x86_64 GNU/Linux", kernel_version="5.13.0-30-generic", + numa_balancing=0, ) assert result.status == ExecutionStatus.OK @@ -83,7 +94,7 @@ def test_run_error(collector, conn_mock): exit_code=1, stdout="", stderr="Error occurred", - command="sh -c 'uname -r'", + command="sh -c 'uname -a'", ) result, data = collector.collect_data() @@ -91,3 +102,56 @@ def test_run_error(collector, conn_mock): assert result.status == ExecutionStatus.ERROR assert data is None assert len(collector.result.events) == 1 + + +def test_run_linux_numa_fails(collector, conn_mock): + """Linux: uname succeeds but numa_balancing command fails; numa_balancing remains None.""" + collector.system_info.os_family = OSFamily.LINUX + conn_mock.run_command.side_effect = [ + CommandArtifact( + exit_code=0, + stdout="Linux MockSystem 5.4.0-88-generic #1 SMP x86_64 GNU/Linux", + stderr="", + command="sh -c 'uname -a'", + ), + CommandArtifact( + exit_code=1, + stdout="", + stderr="Permission denied", + command="sh -c 'cat /proc/sys/kernel/numa_balancing'", + ), + ] + + result, data = collector.collect_data() + + assert data == KernelDataModel( + kernel_info="Linux MockSystem 5.4.0-88-generic #1 SMP x86_64 GNU/Linux", + kernel_version="5.4.0-88-generic", + numa_balancing=None, + ) + assert result.status == ExecutionStatus.OK + + +def test_run_linux_numa_non_digit(collector, conn_mock): + """Linux: numa_balancing file returns non-digit; numa_balancing remains None.""" + collector.system_info.os_family = OSFamily.LINUX + conn_mock.run_command.side_effect = [ + CommandArtifact( + exit_code=0, + stdout="Linux host 5.4.0-88-generic #1 SMP x86_64 GNU/Linux", + stderr="", + command="sh -c 'uname -a'", + ), + CommandArtifact( + exit_code=0, + stdout="off\n", + stderr="", + command="sh -c 'cat /proc/sys/kernel/numa_balancing'", + ), + ] + + result, data = collector.collect_data() + + assert data.kernel_version == "5.4.0-88-generic" + assert data.numa_balancing is None + assert result.status == ExecutionStatus.OK