diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e0f377efeb0..3235de12d37 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -933,15 +933,17 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + + # Install test requirements + pip install -r backends/nxp/requirements-eiq.txt # Build and install Executorch PYTHON_EXECUTABLE=python \ - CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \ + CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON -DEXECUTORCH_BUILD_NXP_NEUTRON_RUNNER=ON " \ .ci/scripts/setup-linux.sh --build-tool "cmake" # Install test requirements pip install -r backends/nxp/requirements-tests-pypi.txt - pip install -r backends/nxp/requirements-tests-eiq.txt PYTHON_EXECUTABLE=python bash examples/nxp/setup.sh # Run pytest @@ -950,6 +952,13 @@ jobs: # Run aot examples: PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh cifar10 PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh mobilenetv2 + + # Run e2e example with Simulator: + PYTHON_EXECUTABLE=python bash examples/nxp/run.sh cifar10 + + # Run lightweight model tests: + PYTHON_EXECUTABLE=python pytest -c /dev/null backends/nxp/tests_models/ \ + --nxp_runner_path "./examples/nxp/executor_runner/build/nxp_executor_runner" test-samsung-quantmodels-linux: name: test-samsung-quantmodels-linux diff --git a/CMakeLists.txt b/CMakeLists.txt index 09eeecd6b28..bd26fc52699 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -628,6 +628,10 @@ if(EXECUTORCH_BUILD_NXP_NEUTRON) list(APPEND _executorch_backends executorch_delegate_neutron) endif() +if(EXECUTORCH_BUILD_NXP_NEUTRON_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/nxp/executor_runner) +endif() + if(EXECUTORCH_BUILD_COREML) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) list(APPEND _executorch_backends coremldelegate) diff --git 
a/backends/nxp/requirements-tests-eiq.txt b/backends/nxp/requirements-eiq.txt similarity index 68% rename from backends/nxp/requirements-tests-eiq.txt rename to backends/nxp/requirements-eiq.txt index 2bd97480ab6..1ca35d4160f 100644 --- a/backends/nxp/requirements-tests-eiq.txt +++ b/backends/nxp/requirements-eiq.txt @@ -1,2 +1,4 @@ --index-url https://eiq.nxp.com/repository neutron_converter_SDK_25_12 +eiq_neutron_sdk==2.2.2 +eiq_nsys diff --git a/backends/nxp/tests_models/__init__.py b/backends/nxp/tests_models/__init__.py new file mode 100644 index 00000000000..55dc5fccf45 --- /dev/null +++ b/backends/nxp/tests_models/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/nxp/tests_models/config.py b/backends/nxp/tests_models/config.py new file mode 100644 index 00000000000..7a5f108c9b7 --- /dev/null +++ b/backends/nxp/tests_models/config.py @@ -0,0 +1,37 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +import pathlib +import shutil + +import eiq_neutron_sdk + +# The PROJECT_DIR env variable is set by the conftest.py in backends.nxp.tests_models.conftest. +# It is supposed to point at ExecuTorch Project directory (not install folder) to derive path to artefacts (config files, +# dataset, model weight) located in the project directory structure, but not installed. 
+# TODO(Robert Kalmar) In accordance with the "TODO(dbort): Prune /test[s]/ dirs, /third-party/ dirs" in pyproject.toml, +# once the test folders are not installed we can derive the path from current file location: `pathlib.Path(__file__)` +PROJECT_DIR = os.environ.get("PROJECT_DIR") +assert PROJECT_DIR and os.path.exists(PROJECT_DIR) + +OUTPUTS_DIR = pathlib.Path(os.getcwd()) / ".outputs" + +NSYS_PATH = pathlib.Path(shutil.which("nsys")) +NSYS_CONFIG_PATH = os.path.join( + PROJECT_DIR, "backends", "nxp", "tests_models", "neutron-imxrt700.ini" +) +NSYS_FIRMWARE_PATH = os.path.join( + os.path.dirname(eiq_neutron_sdk.__file__), + "target", + "imxrt700", + "cmodel", + "NeutronFirmware.elf", +) + +# The NXP_RUNNER_PATH env variable is either defined by pytest when using the CLI argument --nxp_executor_path or +# a standard environment variable. +NEUTRON_TEST_PATH = os.environ.get("NXP_RUNNER_PATH") +assert NEUTRON_TEST_PATH and os.path.exists(NEUTRON_TEST_PATH) diff --git a/backends/nxp/tests_models/config_importer.py b/backends/nxp/tests_models/config_importer.py new file mode 100644 index 00000000000..751ed9bd751 --- /dev/null +++ b/backends/nxp/tests_models/config_importer.py @@ -0,0 +1,17 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging + +logger = logging.getLogger(__name__) + +try: + import test.python.config as test_config # noqa F401 + + logger.debug("Importing from executorch-integration") +except ImportError: + import executorch.backends.nxp.tests_models.config as test_config # noqa F401 + + logger.debug("Importing from executorch") diff --git a/backends/nxp/tests_models/conftest.py b/backends/nxp/tests_models/conftest.py new file mode 100644 index 00000000000..f4c7a36092a --- /dev/null +++ b/backends/nxp/tests_models/conftest.py @@ -0,0 +1,38 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import os +import pathlib +import shutil + +from executorch.backends.nxp.tests_models.outputs_dir_importer import outputs_dir + + +def pytest_addoption(parser): + parser.addoption( + "--nxp_runner_path", + action="store", + default=None, + help="Path to the nxp_executor_runner executable", + ) + + +def pytest_configure(config): + nxp_runner_path = config.getoption("--nxp_runner_path") + if nxp_runner_path: + os.environ["NXP_RUNNER_PATH"] = nxp_runner_path + + os.environ["PROJECT_DIR"] = str(pathlib.Path(__file__).parent.parent.parent.parent) + + +# noinspection SpellCheckingInspection +def pytest_sessionstart(session): + import executorch.extension.pybindings.portable_lib + import executorch.kernels.quantized # noqa F401 + + # Remove all cached test files + shutil.rmtree(outputs_dir.OUTPUTS_DIR, ignore_errors=True) + os.mkdir(outputs_dir.OUTPUTS_DIR) diff --git a/backends/nxp/tests_models/dataset_creator.py b/backends/nxp/tests_models/dataset_creator.py new file mode 100644 index 00000000000..4af87a38f8d --- /dev/null +++ b/backends/nxp/tests_models/dataset_creator.py @@ -0,0 +1,136 @@ +# Copyright 2025-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import abc +import os.path +import shutil +from collections import OrderedDict +from os import mkdir +from random import sample, seed + +import numpy as np +import torch +from executorch.backends.nxp.backend.ir.converter.conversion import translator + +from executorch.backends.nxp.tests_models.model_input_spec import ModelInputSpec +from executorch.examples.nxp.models.calibration_dataset import CalibrationDataset +from torch import Tensor + + +class DatasetCreator(abc.ABC): + + @abc.abstractmethod + def generate_samples(self, dataset_dir, input_spec: list[ModelInputSpec]): + pass + + +class RandomDatasetCreator(DatasetCreator): + """Dataset creator that generates random input samples.""" + + def __init__(self, num_samples=2): + self._num_samples = num_samples + + def generate_samples(self, dataset_dir, input_spec): + assert isinstance(input_spec, list) and all( + isinstance(spec, ModelInputSpec) for spec in input_spec + ), "Input_spec must be a list of ModelInputSpec." + rng = np.random.default_rng(42) + + for idx in range(self._num_samples): + sample_dir = dataset_dir + + # Multi-input, use a subdirectory containing the inputs for each sample + if len(input_spec) > 1: + sample_dir = os.path.join(dataset_dir, f"{str(idx).zfill(4)}") + mkdir(sample_dir) + + for spec_idx, spec in enumerate(input_spec): + match spec.dim_order: + case torch.contiguous_format: + shape = spec.shape + case torch.channels_last: + shape = tuple( + translator.dims_to_channels_last(list(spec.shape)) + ) + case _: + raise ValueError(f"Unsupported dim_order: {spec.dim_order}") + + sample_vector = rng.random(np.prod(shape), spec.type).reshape(shape) + sample_vector.tofile( + os.path.join(sample_dir, f"{str(spec_idx).zfill(2)}.bin") + ) + + +class CopyDatasetCreator(DatasetCreator): + """Creator that just copies data from other directory.""" + + def __init__(self, source_dir: str): + self._source_dir = source_dir + + def generate_samples(self, dataset_dir, input_spec): + for sample_name in 
os.listdir(self._source_dir): + sample_path = os.path.join(self._source_dir, sample_name) + shutil.copy(sample_path, dataset_dir) + + +class FromCalibrationDataDatasetCreator(DatasetCreator): + """Creator that uses CalibrationDataset archive file.""" + + def __init__( + self, + dataset: CalibrationDataset, + num_examples: int, + idx_to_label: dict[int, str], + ): + self._dataset = dataset + self._num_examples = num_examples + self._idx_to_label = idx_to_label + seed(42) + + @staticmethod + def _get_example_np_data(example): + if isinstance(example, tuple): + if len(example) == 2: + data, _ = example + elif len(example) == 1: + data = example[0] + else: + raise ValueError(f"Unexpected number of elements in {example}.") + else: + raise NotImplementedError("Examples other than tuple are not supported.") + + if isinstance(data, Tensor): + return [data.unsqueeze(0).numpy()] + elif isinstance(data, list) and all(isinstance(dt, Tensor) for dt in data): + return [dt.unsqueeze(0).numpy() for dt in data] + else: + raise TypeError("Data must be a single Tensor or a list of Tensors.") + + def generate_samples(self, dataset_dir, input_spec): + os.makedirs(dataset_dir, exist_ok=True) + assert ( + type(self._dataset[0]) is tuple and len(self._dataset[0]) == 2 + ), "Provide calibration data with examples and labels" + + # We need to use ordered collection for deterministic selection of samples + classes = OrderedDict([(cl, None) for _, cl in self._dataset]) + examples_per_class = self._num_examples // len(classes) + idx_list = [] + for cl in classes.keys(): + cl_idx_list = [ + idx for idx in range(len(self._dataset)) if self._dataset[idx][1] == cl + ] + class_indices = list( + zip(sample(cl_idx_list, examples_per_class), [cl] * examples_per_class) + ) + idx_list.extend(class_indices) + + for i, (idx, cl) in enumerate(idx_list): + label = self._idx_to_label[cl] + example = self._dataset[idx] + data = self._get_example_np_data(example) + for inp_idx, dt in enumerate(data): + 
bin_file_name = f"{dataset_dir}/example_{label}_{cl}_{i}_i{str(inp_idx).zfill(2)}.bin" + dt.tofile(bin_file_name) diff --git a/backends/nxp/tests_models/executors.py b/backends/nxp/tests_models/executors.py new file mode 100644 index 00000000000..74689f02c4e --- /dev/null +++ b/backends/nxp/tests_models/executors.py @@ -0,0 +1,456 @@ +# Copyright 2024-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import inspect +import logging +import os.path +import shutil +import subprocess +from os import mkdir + +import numpy as np +import torch +from executorch.backends.nxp.backend.edge_helper import is_channels_last_dim_order +from executorch.backends.nxp.backend.ir.converter.conversion import translator +from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner + +from executorch.backends.nxp.tests_models.config_importer import test_config + +from executorch.backends.nxp.tests_models.dataset_creator import RandomDatasetCreator +from executorch.backends.nxp.tests_models.graph_verifier import GraphVerifier +from executorch.backends.nxp.tests_models.model_input_spec import ModelInputSpec +from executorch.backends.nxp.tests_models.model_output_comparator import ( + AllCloseOutputComparator, +) +from executorch.backends.nxp.tests_models.outputs_dir_importer import outputs_dir +from executorch.backends.nxp.tests_models.utils import ( + save_pte_program, + to_quantized_executorch_program, +) +from executorch.devtools.visualization.visualization_utils import ( + visualize_with_clusters, +) +from pytest_mock import MockerFixture +from torch.export import ExportedProgram + +logger = logging.getLogger(__name__) + +OUTPUTS_DIR = outputs_dir.OUTPUTS_DIR +NSYS_PATH = test_config.NSYS_PATH +NSYS_CONFIG_PATH = test_config.NSYS_CONFIG_PATH +NSYS_FIRMWARE_PATH = test_config.NSYS_FIRMWARE_PATH +NEUTRON_TEST_PATH = test_config.NEUTRON_TEST_PATH + + +def 
_run_delegated_executorch_program( + model, + test_dir, + test_name, + dataset_dir, + input_spec, + dlg_model_verifier, + npu_results_dir, + mocker, + use_qat: bool = False, +) -> ExportedProgram: + if len(input_spec) == 1: + # Single input, use --dataset + dataset_cli = "--dataset" + dataset_or_inputs = dataset_dir + else: + # Multiple input, use --inputs with subdirectories + dataset_cli = "--inputs" + dataset_or_inputs = ",".join( + sorted([os.path.join(dataset_dir, d) for d in os.listdir(dataset_dir)]) + ) + + # Run nxp_executor_runner with program delegated to NPU + delegated_model_path = os.path.abspath( + os.path.join(test_dir, f"{test_name}_delegated.pte") + ) + + delegated_cmd = f"{NEUTRON_TEST_PATH} --model {delegated_model_path} {dataset_cli} {dataset_or_inputs} \ + --output {npu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" + try: + if mocker: + method = getattr(NeutronPartitioner, "partition") # noqa B009 + + def wrapper(*args, **kwargs): + result = method(*args, **kwargs) + visualize_with_clusters( + result.tagged_exported_program, + os.path.join(test_dir, test_name + "_partitioned.json"), + False, + ) + return result + + wrapped = functools.update_wrapper(wrapper, method) + mocker.patch.object( + NeutronPartitioner, "partition", side_effect=wrapped, autospec=True + ) + delegated_program = to_quantized_executorch_program( + model, input_spec, dataset_dir, delegate_to_npu=True, use_qat=use_qat + ) + except RuntimeError as e: + if "Model converted with neutron-converter has" in str(e): + dlg_model_verifier.check_num_delegated_nodes(e.args[1]) + raise + + exported_program = delegated_program.exported_program() + nodes = list(exported_program.graph.nodes) + assert any( + node.name.startswith("executorch_call_delegate") for node in nodes + ), "No delegated parts found in program delegated to NPU!" 
+ dlg_model_verifier.verify_graph(exported_program.graph) + + save_pte_program(delegated_program, test_name + "_delegated", test_dir) + execute_cmd(delegated_cmd) + + return exported_program + + +def _run_non_delegated_executorch_program( + model, test_dir, test_name, dataset_dir, input_spec, cpu_results_dir +) -> ExportedProgram: + if len(input_spec) == 1: + # Single input, use --dataset + dataset_cli = "--dataset" + dataset_or_inputs = dataset_dir + else: + # Multiple input, use --inputs with subdirectories + dataset_cli = "--inputs" + dataset_or_inputs = ",".join( + sorted([os.path.join(dataset_dir, d) for d in os.listdir(dataset_dir)]) + ) + + # Run program via nxp_executor_runner on CPU + non_delegated_model_path = os.path.abspath( + os.path.join(test_dir, f"{test_name}_non_delegated.pte") + ) + + non_delegated_cmd = f"{NEUTRON_TEST_PATH} --model {non_delegated_model_path} {dataset_cli} {dataset_or_inputs} \ + --output {cpu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" + + non_delegated_program = to_quantized_executorch_program( + model, input_spec, dataset_dir, delegate_to_npu=False + ) + + nodes = list(non_delegated_program.exported_program().graph.nodes) + assert all( + not node.name.startswith("executorch_call_delegate") for node in nodes + ), "Delegated parts found in program executed on CPU!" + + save_pte_program(non_delegated_program, test_name + "_non_delegated", test_dir) + execute_cmd(non_delegated_cmd) + + return non_delegated_program.exported_program() + + +def read_prepared_samples( + dataset_dir: str, input_spec: list[ModelInputSpec] +) -> list[tuple[np.ndarray, ...]]: + """Read numpy arrays generated by a `DatasetCreator`. 
+ + :param dataset_dir: Directory containing the generated samples + :param input_spec: List of ModelInputSpec defining the shape and type of each input + + :return: List of tuples, where each tuple contains numpy arrays for one sample + """ + all_samples = [] + + # Multi-input: samples are in numbered subdirectories + if len(input_spec) > 1: + sample_dirs = sorted( + [ + d + for d in os.listdir(dataset_dir) + if os.path.isdir(os.path.join(dataset_dir, d)) + ] + ) + + for sample_name in sample_dirs: + sample_dir = os.path.join(dataset_dir, sample_name) + current_samples = [] + + for spec_idx, spec in enumerate(input_spec): + bin_file_path = os.path.join( + sample_dir, f"{str(spec_idx).zfill(2)}.bin" + ) + sample_vector = np.fromfile(bin_file_path, dtype=spec.type).reshape( + spec.shape + ) + current_samples.append(sample_vector) + + all_samples.append(tuple(current_samples)) + + # Single-input: binary files are directly in dataset_dir + else: + bin_files = sorted([f for f in os.listdir(dataset_dir) if f.endswith(".bin")]) + + for bin_file in bin_files: + bin_file_path = os.path.join(dataset_dir, bin_file) + sample_vector = np.fromfile( + bin_file_path, dtype=input_spec[0].type + ).reshape(input_spec[0].shape) + all_samples.append((sample_vector,)) + + return all_samples + + +def store_results( + results: list[tuple[np.ndarray, ...]], output_dir: str, reference_dir: str +): + """Store a list of output arrays in the directory structure matching the reference directory. 
+ + :param results: List of tuples, where each tuple contains numpy arrays (outputs for one sample) + :param output_dir: Directory where results will be stored + + Directory structure created matches reference_dir: + output_dir/ + ├── sample_0/ + │ ├── 0000.bin + │ └── 0001.bin + ├── some_other_sample/ + │ ├── 0000.bin + │ └── 0001.bin + """ + os.makedirs(output_dir, exist_ok=True) + + # Get subdirectories from reference directory + sample_dirs = sorted( + [ + d + for d in os.listdir(reference_dir) + if os.path.isdir(os.path.join(reference_dir, d)) + ] + ) + + assert len(sample_dirs) == len( + results + ), f"Number of samples ({len(results)}) must match number of subdirectories in reference_dir ({len(sample_dirs)})" + + for _sample_idx, (sample_name, sample_outputs) in enumerate( + zip(sample_dirs, results) + ): + sample_dir = os.path.join(output_dir, sample_name) + os.makedirs(sample_dir, exist_ok=True) + + # Store each output tensor + for output_idx, output_array in enumerate(sample_outputs): + bin_file_name = f"{str(output_idx).zfill(4)}.bin" + bin_file_path = os.path.join(sample_dir, bin_file_name) + output_array.tofile(bin_file_path) + + +def _run_pytorch_program( + model, + dataset_dir, + input_spec: list[ModelInputSpec], + output_spec: list[torch.Tensor], + cpu_results_dir, + npu_results_dir, +): + all_outputs = [] + + for input_samples in read_prepared_samples(dataset_dir, input_spec): + current_input_samples = [] + for spec, sample in zip(input_spec, input_samples, strict=True): + match spec.dim_order: + case torch.contiguous_format: + # Use the data as is, just turn it into a PyTorch tensor. + sample = torch.tensor(sample) + + case torch.channels_last: + # The tensor data was stored by the DatasetCreator as channels last (NHWC), but it was now + # incorrectly parsed as contiguous/channels first (NCHW). Transpose it to channels last to preserve + # the semantics. 
+ channels_last_shape = translator.dims_to_channels_last( + list(spec.shape) + ) + sample = np.moveaxis(sample.reshape(channels_last_shape), -1, 1) + sample = torch.tensor(sample).to(memory_format=torch.channels_last) + + case _: + raise ValueError(f"Unsupported dim_order: {spec.dim_order}") + + current_input_samples.append(sample) + + # Run the model. + output = model(*current_input_samples) + if isinstance(output, torch.Tensor): + output = (output,) + + current_outputs = [] + + for o, o_spec in zip(output, output_spec, strict=True): + dim_order = list(o_spec.dim_order()) # ExecuTorch dim order. + rank = len(o_spec.shape) + if dim_order == list(range(rank)): # Contiguous dim order. + current_outputs.append(o.detach().numpy()) + + elif is_channels_last_dim_order(dim_order): # Channels last dim order. + # The NPU variant outputs channels last (NHWC). We need to convert the CPU output to match. + o = o.detach().numpy().reshape(o_spec.shape) + current_outputs.append(np.moveaxis(o, 1, -1)) + + else: + raise ValueError(f"Unsupported dim_order: {o_spec.dim_order}") + + all_outputs.append(current_outputs) + + # Store all the results. + store_results(all_outputs, cpu_results_dir, npu_results_dir) + + +def convert_run_compare( + model: torch.nn.Module, + input_spec: list[ModelInputSpec] | tuple, + dlg_model_verifier: GraphVerifier, + dataset_creator=None, + output_comparator=None, + mocker: MockerFixture = None, + run_cpu_version_in_pytorch: bool = False, + use_qat: bool = False, +): + """ + Run provided program twice with neutron-test and check if results correspond. At first, + non-delegated program is executed (all nodes run on CPU), followed by delegated one + (some nodes run on Neutron NPU). + + :param model: Executed PyTorch model. + :param input_spec: Model input specification. Can be either tuple - single float32 input model - or list + of ModelInputSpec. + :param dataset_creator: Creator that should fill provided `dataset_dir` with model input samples. 
+ :param output_comparator: Comparator of results produced by NPU and CPU runs of the program. + :param dlg_model_verifier: Graph verifier instance. + :param run_cpu_version_in_pytorch: If True, runs CPU version in float32 PyTorch instead of quantized ExecuTorch. + :param mocker: Mocker instance used by visualizer. + :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). + """ + assert os.path.exists(NSYS_PATH) + assert os.path.exists(NSYS_CONFIG_PATH) + assert os.path.exists(NSYS_FIRMWARE_PATH) + + if not dataset_creator: + dataset_creator = RandomDatasetCreator() + if not output_comparator: + output_comparator = AllCloseOutputComparator() + + test_name = _get_caller_name() + test_dir = os.path.join(OUTPUTS_DIR, test_name) + + shutil.rmtree(test_dir, ignore_errors=True) + mkdir(test_dir) + + dataset_dir = os.path.join(test_dir, "dataset") + mkdir(dataset_dir) + if isinstance(input_spec, tuple): + input_spec = [ModelInputSpec(input_spec)] + + dataset_creator.generate_samples(dataset_dir, input_spec) + + cpu_results_dir = os.path.join(test_dir, "results_cpu") + npu_results_dir = os.path.join(test_dir, "results_npu") + + delegated_program = _run_delegated_executorch_program( + model, + test_dir, + test_name, + dataset_dir, + input_spec, + dlg_model_verifier, + npu_results_dir, + mocker, + use_qat=use_qat, + ) + + output_spec = _get_program_output_spec(delegated_program) + + if run_cpu_version_in_pytorch: + _run_pytorch_program( + model, + dataset_dir, + input_spec, + output_spec, + cpu_results_dir, + npu_results_dir, + ) + else: + _run_non_delegated_executorch_program( + model, test_dir, test_name, dataset_dir, input_spec, cpu_results_dir + ) + + output_tensor_spec = _get_program_output_spec(delegated_program) + + npu_results_dir = os.path.join(test_dir, "results_npu") + cpu_results_dir = os.path.join(test_dir, "results_cpu") + output_comparator.compare_results( + cpu_results_dir, npu_results_dir, output_tensor_spec 
+ ) + + +def _get_caller_name(): + for idx, frame in enumerate(inspect.stack()): + if frame.function == "convert_run_compare": + # Look one index above to get caller + return inspect.stack()[idx + 1].function + + +def execute_cmd(cmd, cwd="."): + env = {"LD_LIBRARY_PATH": NSYS_PATH.parent} + + with subprocess.Popen( + cmd, + cwd=cwd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) as process: + cmd_out, cmd_err = process.communicate() + cmd_out_decoded = cmd_out.strip().decode("utf-8", errors="replace") + cmd_error_decoded = cmd_err.strip().decode("utf-8", errors="replace") + + for line in cmd_out_decoded.split("\n"): + logger.info(line) + + for line in cmd_error_decoded.split("\n"): + if line: + logger.warning(line) + + return_code = process.returncode + if return_code != 0: + logger.fatal(cmd_error_decoded) + raise Exception('Error running command: "%s", rc: %d' % (cmd, return_code)) + + return cmd_out_decoded, cmd_error_decoded, return_code + + +def _get_program_output_spec(exported_program) -> list[torch.Tensor]: + """ + Get output tensor specification for provided program. + + :param exported_program: Exported program. + :return: List of output PyTorch tensors. + """ + # nodes = list(exported_program.graph.nodes) + # TODO robert: since version 0.5 the user_outputs are not updated after delegation. + # Hence bellow code does not works + # Remove/update if the feature/bug if confirmed. 
+ + # program_outputs = exported_program.graph_signature.user_outputs + # + # output_tensors_spec = [] + # + # for node in nodes: + # if node.name in program_outputs: + # output_tensors_spec.append(node.meta["val"]) + # + # assert len(output_tensors_spec) == len(program_outputs) + + output_tensors_spec = list(exported_program.graph.output_node().meta["val"]) + + return output_tensors_spec diff --git a/backends/nxp/tests_models/graph_verifier.py b/backends/nxp/tests_models/graph_verifier.py new file mode 100644 index 00000000000..33dee7d3407 --- /dev/null +++ b/backends/nxp/tests_models/graph_verifier.py @@ -0,0 +1,86 @@ +# Copyright 2025-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import abc +import re +from dataclasses import dataclass +from typing import Union + +from torch.fx.graph import Graph + + +@dataclass +class NonDelegatedNode: + node_name: str + num_occurrences: Union[int, None] = None + + +class GraphVerifier(abc.ABC): + @abc.abstractmethod + def verify_graph(self, graph: Graph): + pass + + @abc.abstractmethod + def check_num_delegated_nodes(self, num_dlg_nodes: int): + pass + + +class BaseGraphVerifier(GraphVerifier): + """Graph verifier base class. Checks for number of delegated nodes and number of selected expected nodes.""" + + def __init__( + self, + exp_num_delegate_call_nodes: int, + exp_non_delegated_nodes: list[NonDelegatedNode] = None, + ): + self.exp_non_delegated_nodes = ( + exp_non_delegated_nodes if exp_non_delegated_nodes is not None else [] + ) + self.exp_num_delegate_call_nodes = exp_num_delegate_call_nodes + + def check_num_delegated_nodes(self, num_dlg_nodes): + assert not ( + num_dlg_nodes < self.exp_num_delegate_call_nodes + ), f"Number of delegated nodes decreased from {self.exp_num_delegate_call_nodes} to {num_dlg_nodes}." 
+ assert not ( + num_dlg_nodes > self.exp_num_delegate_call_nodes + ), f"Number of delegated nodes increased from {self.exp_num_delegate_call_nodes} to {num_dlg_nodes}." + + def verify_graph(self, graph): + nodes = list(graph.nodes) + + # Check for specific non delegated nodes + for exp_node in self.exp_non_delegated_nodes: + num_exp_nodes = len( + [node for node in nodes if exp_node.node_name in node.name] + ) + if exp_node.num_occurrences is None: + assert ( + num_exp_nodes + ), f"Graph contains no occurrences of {exp_node.node_name}." + else: + assert not ( + num_exp_nodes < exp_node.num_occurrences + ), f"Number of {exp_node.node_name} nodes decreased from {exp_node.num_occurrences} to {num_exp_nodes}." + assert not ( + num_exp_nodes > exp_node.num_occurrences + ), f"Number of {exp_node.node_name} nodes increased from {exp_node.num_occurrences} to {num_exp_nodes}." + + # Check for unexpected non delegated aten nodes + aten_fn_nodes = set( + [ + re.split(r"_\d", node.name)[0] + for node in nodes + if node.name.startswith("aten") + ] + ) + expected_aten_fn_nodes = set( + [exp_node.node_name for exp_node in self.exp_non_delegated_nodes] + ) + unexpected_aten_fn_nodes = aten_fn_nodes - expected_aten_fn_nodes + unexpected_aten_fn_nodes = "\n".join(unexpected_aten_fn_nodes) + assert ( + not unexpected_aten_fn_nodes + ), f"Graphs contains unexpected aten nodes:\n{unexpected_aten_fn_nodes}." diff --git a/backends/nxp/tests_models/model_input_spec.py b/backends/nxp/tests_models/model_input_spec.py new file mode 100644 index 00000000000..c96ca53b8da --- /dev/null +++ b/backends/nxp/tests_models/model_input_spec.py @@ -0,0 +1,18 @@ +# Copyright 2025-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass + +import numpy as np +import torch +from torch import memory_format + + +@dataclass +class ModelInputSpec: + shape: tuple[int, ...] 
+ type: np.dtype = np.float32 + dtype: torch.dtype = torch.float32 + dim_order: memory_format = torch.contiguous_format diff --git a/backends/nxp/tests_models/model_output_comparator.py b/backends/nxp/tests_models/model_output_comparator.py new file mode 100644 index 00000000000..bf4aece1003 --- /dev/null +++ b/backends/nxp/tests_models/model_output_comparator.py @@ -0,0 +1,172 @@ +# Copyright 2024-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import abc +import os +from abc import abstractmethod +from pathlib import Path + +import numpy as np +import polars as pl + +from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( + torch_type_to_numpy_type, +) + + +class BaseOutputComparator(abc.ABC): + + def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec): + """ + Check if tensors in result dirs corresponds. Directory with CPU results is taken as + a reference for compared binary files. Result dir should have the following hierarchy: + + result_dir + |-- sample_0 + |---- 0000.bin + |-- some_other_sample + |---- first_output.bin + |---- second_output.bin + + :param cpu_results_dir: Path to directory with CPU results. + :param npu_results_dir: Path to directory with NPU (delegated) results. + :param output_tensor_spec: List of output tensor specifications. + """ + sample_dirs = [ + os.path.join(cpu_results_dir, file) for file in os.listdir(cpu_results_dir) + ] + sample_dirs = [file for file in sample_dirs if os.path.isdir(file)] + + assert len(sample_dirs), "No samples to compare." 
+ + for sample_dir in sample_dirs: + npu_output_tensors = [] + cpu_output_tensors = [] + + for idx, output_tensor_name in enumerate(os.listdir(sample_dir)): + sample_dir = os.path.basename(sample_dir) + tensor_path = os.path.join(sample_dir, output_tensor_name) + + cpu_tensor_path = os.path.join(cpu_results_dir, tensor_path) + npu_tensor_path = os.path.join(npu_results_dir, tensor_path) + + tensor_spec = output_tensor_spec[idx] + + cpu_tensor = np.fromfile( + cpu_tensor_path, dtype=torch_type_to_numpy_type(tensor_spec.dtype) + ) + np.reshape(cpu_tensor, tensor_spec.shape) + cpu_output_tensors.append((output_tensor_name, cpu_tensor)) + + npu_tensor = np.fromfile( + npu_tensor_path, dtype=torch_type_to_numpy_type(tensor_spec.dtype) + ) + np.reshape(npu_tensor, tensor_spec.shape) + npu_output_tensors.append((output_tensor_name, npu_tensor)) + + self.compare_sample(sample_dir, cpu_output_tensors, npu_output_tensors) + + @abstractmethod + def compare_sample( + self, + sample_dir, + cpu_output_tensors: list[tuple[str, np.ndarray]], + npu_output_tensors: list[tuple[str, np.ndarray]], + ): + raise NotImplementedError + + +class AllCloseOutputComparator(BaseOutputComparator): + + def __init__(self, atol=1e-7): + self.atol = atol + + def compare_sample(self, sample_dir, cpu_output_tensors, npu_output_tensors): + for idx in range(len(cpu_output_tensors)): + (cpu_output_name, cpu_tensor) = cpu_output_tensors[idx] + (npu_output_name, npu_tensor) = npu_output_tensors[idx] + + assert cpu_output_name == npu_output_name + assert np.any( + cpu_tensor + ), "Output tensor contains only zeros. This is suspicious." 
+ assert np.allclose(cpu_tensor, npu_tensor, atol=self.atol) + + +class NumericalStatsOutputComparator(BaseOutputComparator): + + def __init__( + self, + max_mse_error=3.5e-4, + fail_if_not_close=True, + output_filename: None | str = "numerical_stats.csv", + is_classification_task=False, + ): + self._max_mse_error = max_mse_error + self._fail_if_not_close = fail_if_not_close + self._output_filename = output_filename + self._stats_data = None + self._is_classification_task = is_classification_task + + def compare_results(self, cpu_results_dir, npu_results_dir, output_tensor_spec): + self._stats_data = [] + super().compare_results(cpu_results_dir, npu_results_dir, output_tensor_spec) + + stats = pl.from_dicts(self._stats_data) + print(stats.sort("name")) + name_contains_class = stats.select( + pl.col("name").str.extract(r"example_(\w+)_", group_index=1) + ).item(0, 0) + if name_contains_class is not None: + print( + "Stats per label class:\n", + stats.group_by( + pl.col("name") + .str.extract(r"example_(\w+)_", group_index=1) + .alias("label") + ) + .agg( + pl.col("mse").mean().alias("mean_mse"), + pl.col("max_nominal_error").mean().alias("mean_max_nominal_error"), + ) + .sort("label"), + ) + + if self._output_filename: + test_results_dir = Path(cpu_results_dir).resolve().parent + stats.write_csv(os.path.join(test_results_dir, self._output_filename)) + + if self._fail_if_not_close: + error_samples = stats.filter(pl.col("mse") > self._max_mse_error) + + if len(error_samples) > 0: + error_msg = f"Some samples didn't match max MSE error threshold.\n{error_samples}" + raise AssertionError(error_msg) + + def compare_sample(self, sample_dir, cpu_output_tensors, npu_output_tensors): + for idx in range(len(cpu_output_tensors)): + (cpu_output_name, cpu_tensor) = cpu_output_tensors[idx] + (npu_output_name, npu_tensor) = npu_output_tensors[idx] + + assert cpu_output_name == npu_output_name + assert np.any( + cpu_tensor + ), "Output tensor contains only zeros. 
This is suspicious." + + mse = np.square(np.subtract(cpu_tensor, npu_tensor)).mean() + max_error = np.max(np.abs(cpu_tensor - npu_tensor)) + + stats = { + "name": f"{os.path.basename(sample_dir)}/{cpu_output_name}", + "shape": str(cpu_tensor.shape), + "mse": mse, + "max_nominal_error": max_error, + } + + if self._is_classification_task: + stats["argmax_cpu"] = np.argmax(cpu_tensor, axis=-1) + stats["argmax_npu"] = np.argmax(npu_tensor, axis=-1) + + self._stats_data.append(stats) diff --git a/backends/nxp/tests_models/neutron-imxrt700.ini b/backends/nxp/tests_models/neutron-imxrt700.ini new file mode 100644 index 00000000000..fb7d238b9ff --- /dev/null +++ b/backends/nxp/tests_models/neutron-imxrt700.ini @@ -0,0 +1,3 @@ +[neutron2] +num_pipelines = 8 +num_macs = 8 diff --git a/backends/nxp/tests_models/outputs_dir.py b/backends/nxp/tests_models/outputs_dir.py new file mode 100644 index 00000000000..2ea9bb9c599 --- /dev/null +++ b/backends/nxp/tests_models/outputs_dir.py @@ -0,0 +1,10 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import os +import pathlib + +OUTPUTS_DIR = pathlib.Path(os.getcwd()) / ".outputs" diff --git a/backends/nxp/tests_models/outputs_dir_importer.py b/backends/nxp/tests_models/outputs_dir_importer.py new file mode 100644 index 00000000000..2234dabbbeb --- /dev/null +++ b/backends/nxp/tests_models/outputs_dir_importer.py @@ -0,0 +1,17 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging + +logger = logging.getLogger(__name__) + +try: + import test.python.outputs_dir as outputs_dir # noqa F401 + + logger.debug("Importing from executorch-integration") +except ImportError: + import executorch.backends.nxp.tests_models.outputs_dir as outputs_dir # noqa F401 + + logger.debug("Importing from executorch") diff --git a/backends/nxp/tests_models/test_cifarnet.py b/backends/nxp/tests_models/test_cifarnet.py new file mode 100644 index 00000000000..6f4bde5384d --- /dev/null +++ b/backends/nxp/tests_models/test_cifarnet.py @@ -0,0 +1,84 @@ +# Copyright 2024-2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import os.path + +import pytest + +from executorch.backends.nxp.tests_models.config_importer import test_config +from executorch.backends.nxp.tests_models.dataset_creator import CopyDatasetCreator +from executorch.backends.nxp.tests_models.executors import convert_run_compare +from executorch.backends.nxp.tests_models.graph_verifier import ( + BaseGraphVerifier, + NonDelegatedNode, +) +from executorch.backends.nxp.tests_models.model_output_comparator import ( + NumericalStatsOutputComparator, +) +from executorch.examples.nxp.experimental.cifar_net.cifar_net import ( + CifarNet, + store_test_data, +) + + +@pytest.fixture(scope="module") +def cifar_test_files(tmp_path_factory): + dataset_dir = tmp_path_factory.mktemp("cifar10_dataset") + store_test_data(dataset_dir) + return dataset_dir + + +def test_cifarnet(mocker, cifar_test_files): + model = ( + CifarNet( + pth_file=os.path.join( + test_config.PROJECT_DIR, + "examples", + "nxp", + "experimental", + "cifar_net", + "cifar_net.pth", + ) + ) + .get_eager_model() + .eval() + ) + + input_shape = (1, 3, 32, 32) + non_dlg_nodes = [NonDelegatedNode("aten__softmax_default", 1)] + + comparator = NumericalStatsOutputComparator( + max_mse_error=1e-3, is_classification_task=True + ) + convert_run_compare( + 
model, + input_shape, + dataset_creator=CopyDatasetCreator(cifar_test_files), + output_comparator=comparator, + dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes), + mocker=mocker, + ) + + +def test_cifarnet_qat(mocker, cifar_test_files): + model = CifarNet().get_eager_model().eval() + + input_shape = (1, 3, 32, 32) + non_dlg_nodes = [NonDelegatedNode("aten__softmax_default", 1)] + + # The higher MSE threshold is due to using weaker "MovingAbs" observers instead of "MinMax" observers. + # The "MovingAbs" observers capture only limited number of past calibration samples compared to "MinMax", + # which uses statistics from the whole calibration set. + comparator = NumericalStatsOutputComparator( + max_mse_error=8e-2, is_classification_task=True + ) + convert_run_compare( + model, + input_shape, + dataset_creator=CopyDatasetCreator(cifar_test_files), + output_comparator=comparator, + dlg_model_verifier=BaseGraphVerifier(1, non_dlg_nodes), + mocker=mocker, + use_qat=True, + ) diff --git a/backends/nxp/tests_models/utils.py b/backends/nxp/tests_models/utils.py new file mode 100644 index 00000000000..8d174759d59 --- /dev/null +++ b/backends/nxp/tests_models/utils.py @@ -0,0 +1,209 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 - 2026 NXP +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import os +from typing import Any, Dict, Optional, Tuple, Union + +import executorch.exir as exir + +import numpy as np +import torch +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec +from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import ( + NeutronEdgePassManager, +) +from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner +from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from executorch.backends.nxp.tests_models.model_input_spec import ModelInputSpec +from executorch.devtools.visualization.visualization_utils import ( + visualize_with_clusters, +) +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchBackendConfig, + ExecutorchProgramManager, + to_edge_transform_and_lower, +) +from executorch.exir.tracer import Value +from torch.export import export, ExportedProgram +from torchao.quantization.pt2e import move_exported_model_to_eval +from torchao.quantization.pt2e.quantize_pt2e import ( + convert_pt2e, + prepare_pt2e, + prepare_qat_pt2e, +) + +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=True, + _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue +) + + +def to_quantized_edge_program( + model: torch.nn.Module, + input_spec: list[ModelInputSpec], + dataset_dir, + delegate_to_npu=True, + use_qat: bool = False, +) -> EdgeProgramManager: + assert isinstance(input_spec, list) and all( + isinstance(spec, ModelInputSpec) for spec in input_spec + ), "Input_spec must be a list of ModelInputSpec." 
+ + example_input = [] + for spec in input_spec: + match spec.dim_order: + case torch.contiguous_format: + sample = torch.ones(spec.shape, dtype=spec.dtype) + case torch.channels_last: + sample = torch.ones(spec.shape, dtype=spec.dtype).to( + memory_format=torch.channels_last + ) + case _: + raise ValueError(f"Unsupported dim_order: {spec.dim_order}") + # noinspection PyUnboundLocalVariable + example_input.append(sample) + + example_input = tuple(example_input) + + exir_program_aten = torch.export.export(model, example_input, strict=True) + module = exir_program_aten.module() + + neutron_target_spec = NeutronTargetSpec( + target="imxrt700", neutron_converter_flavor="SDK_25_12" + ) + + # Quantize model + quantizer = NeutronQuantizer( + neutron_target_spec=neutron_target_spec, is_qat=use_qat + ) + if use_qat: + m = prepare_qat_pt2e(module, quantizer) + m = move_exported_model_to_eval(m) + else: + m = prepare_pt2e(module, quantizer) + + data = sorted(os.listdir(dataset_dir)) + inputs_needed = len(input_spec) + + # If the model is single-input, the following directory structure is used: + # dataset_dir/data.bin (data.bin is *the* input) + # Else, if multi-input, the following directory structure is used: + # dataset_dir/data/{.+}.bin (each .bin file is an input) + + input_data = [] + for path in data: + path = os.path.join(dataset_dir, path) + files = [] + + if os.path.isdir(path): + files = [os.path.join(path, x) for x in sorted(os.listdir(path))] + else: + files.append(path) + + for idx, file in enumerate(files): + if len(input_data) == inputs_needed: + break + + tensor = np.fromfile(file, dtype=input_spec[idx].type).reshape( + input_spec[idx].shape + ) + input_data += (torch.from_numpy(tensor),) + continue + + if len(input_data) < inputs_needed: + continue + + m(*input_data) + input_data.clear() + + exir_program_aten_quant = convert_pt2e(m) + + # To ATen + core_aten_ep = _to_core_aten( + exir_program_aten_quant, example_input, None, verbose=True + ) + + partitioners 
= ( + ( + [ + NeutronPartitioner( + generate_neutron_compile_spec("imxrt700", "SDK_25_12"), + neutron_target_spec=neutron_target_spec, + ) + ] + ) + if delegate_to_npu + else [] + ) + + edge_program_manager = to_edge_transform_and_lower( + core_aten_ep, + transform_passes=NeutronEdgePassManager(), + partitioner=partitioners, + compile_config=EdgeCompileConfig(), + ) + + return edge_program_manager + + +def to_quantized_executorch_program( + model: torch.nn.Module, + input_spec, + dataset_dir: str, + delegate_to_npu=True, + use_qat: bool = False, +) -> ExecutorchProgramManager: + edge_program_manager = to_quantized_edge_program( + model, input_spec, dataset_dir, delegate_to_npu, use_qat=use_qat + ) + + return edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) + ) + + +def _to_core_aten( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + verbose=True, +) -> ExportedProgram: + # post autograd export. 
+To test the eIQ Neutron Backend — both the AoT flow for model preparation and the runtime for execution — refer to [Getting started with eIQ Neutron NPU ExecuTorch backend](tutorials/nxp-basic-tutorial.md).
+## Prerequisites
+This guide demonstrates the use of the ExecuTorch AoT flow to convert a PyTorch model
+to the ExecuTorch format and delegate the model computation to the eIQ Neutron NPU using the eIQ Neutron Backend.
+These correspond to the nxp_executor_runner CLI options `--firmware`, `--nsys` and `--nsys_config`, or can be provided via environment variables:
+```bash
+cd <executorch_root>
+mkdir build
+cd build
+cmake .. \
+  -DEXECUTORCH_BUILD_NXP_NEUTRON=ON \
+  -DEXECUTORCH_BUILD_NXP_NEUTRON_RUNNER=ON \
+  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON
+make nxp_executor_runner
+```
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py index b4e7d77534d..e4eb9cdb9d3 100644 --- a/examples/nxp/aot_neutron_compile.py +++ b/examples/nxp/aot_neutron_compile.py @@ -48,8 +48,8 @@ from .experimental.cifar_net.cifar_net import ( CifarNet, - test_cifarnet_model, train_cifarnet_model, + verify_cifarnet_model, ) from .models.mobilenet_v2 import MobilenetV2 @@ -282,7 +282,7 @@ def get_model_and_inputs_from_name(model_name: str): if args.test: match args.model_name: case "cifar10": - accuracy = test_cifarnet_model(module) + accuracy = verify_cifarnet_model(module) case _: raise NotImplementedError( diff --git a/examples/nxp/executor_runner/CMakeLists.txt b/examples/nxp/executor_runner/CMakeLists.txt new file mode 100644 index 00000000000..6a0c79f3402 --- /dev/null +++ b/examples/nxp/executor_runner/CMakeLists.txt @@ -0,0 +1,122 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +cmake_minimum_required(VERSION 3.16) +project(nxp_executor_runner C CXX) + +set(EIQ_NEUTRON_TARGET + "imxrt700" + CACHE STRING "Neutron Target to build the executor_runner" +) +set(EXECUTORCH_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../) + +find_package( + Python + COMPONENTS Interpreter + REQUIRED +) + +message(STATUS "Looking for Neutron Driver") +if(NOT TARGET nd) + message(STATUS "Looking for Neutron Driver -- not found") + + message(STATUS "Looking for eIQ Neutron SDK") + execute_process( + COMMAND + ${Python_EXECUTABLE} -c + "import eiq_neutron_sdk, os; print(os.path.dirname(eiq_neutron_sdk.__file__))" + OUTPUT_VARIABLE EIQ_NEUTRON_SDK_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + message( + STATUS "Looking for eIQ Neutron SDK -- found: ${EIQ_NEUTRON_SDK_PATH}" + ) + message(STATUS "Looking for Neutron target $CACHE{EIQ_NEUTRON_TARGET}") + set(EIQ_NEUTRON_TARGET_DIR + "${EIQ_NEUTRON_SDK_PATH}/target/$CACHE{EIQ_NEUTRON_TARGET}" + ) + if(NOT EXISTS ${EIQ_NEUTRON_TARGET_DIR}) + message( + FATAL_ERROR + "Looking for Neutron target $CACHE{EIQ_NEUTRON_TARGET} -- not found" + ) + else() + message( + STATUS + "Looking for Neutron target $CACHE{EIQ_NEUTRON_TARGET} -- found: ${EIQ_NEUTRON_TARGET_DIR}" + ) + endif() + + set(EIQ_NEUTRON_TARGET_INCLUDE_DIR "${EIQ_NEUTRON_TARGET_DIR}/common/include" + "${EIQ_NEUTRON_TARGET_DIR}/driver/include" + ) + set(EIQ_NEUTRON_TARGET_DRIVER + "${EIQ_NEUTRON_TARGET_DIR}/cmodel/libNeutronDriver.a" + ) + set(EIQ_NEUTRON_TARGET_FIRMWARE + "${EIQ_NEUTRON_TARGET_DIR}/cmodel/NeutronFirmware.elf" + ) + + add_library(nd STATIC IMPORTED GLOBAL) + set_target_properties( + nd PROPERTIES IMPORTED_LOCATION ${EIQ_NEUTRON_TARGET_DRIVER} + ) + set_target_properties( + nd PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${EIQ_NEUTRON_TARGET_INCLUDE_DIR}" + ) + + # Add a virtual target to dump the path to corresponding neutron_firmware + add_custom_target(locate_neutron_firmware echo ${EIQ_NEUTRON_TARGET_FIRMWARE}) +else() + message(STATUS "Neutron Driver -- 
+# Check if the ExecuTorch target is already defined, which indicates that this
+# CMakeLists.txt is included as a subproject:
license found in the + * LICENSE file in the root directory of this source tree. + + * Example script to compile the model for the NXP Neutron NPU + */ + +/* + * This is an example ExecuTorch runner running on host CPU and Neutron + * simulator - NSYS. Example illustrates how to use the ExecuTorch with the + * Neutron Backend. + */ + +#include +#include +#include +#include + +using torch::executor::Error; +using torch::executor::Result; +using torch::executor::util::FileDataLoader; + +static uint8_t __attribute__(( + aligned(16))) method_allocator_pool[512 * 1024 * 1024U]; // 512 MB +static uint8_t __attribute__(( + aligned(16))) tmp_allocator_pool[512 * 1024 * 1024U]; // 512 MB + +#include +#include + +#ifdef NEUTRON_CMODEL +// The following header is needed only for NSYS backend. +#include "NeutronEnvConfig.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_string(model, "", "Path to serialized model."); +DEFINE_string(dataset, "", "Path to model dataset folder."); +DEFINE_string( + inputs, + "", + "Path to inputs. 
Usage: " + "/path/to/inputdata1,/path/to/inputdata2"); +DEFINE_string(output, "", "Path to folder where output tensors are saved."); + +#ifdef NEUTRON_CMODEL +DEFINE_string(firmware, "", "Relative path to firmware *.elf file."); +DEFINE_string(nsys, "", "Relative path to nsys."); +DEFINE_string(nsys_config, "", "Relative path to nsys config *.ini file"); +#endif + +#define DEFAULT_OUTPUT_DIR "results" + +void processInputs(std::vector& inputsData, std::string& inputs) { + std::string segment; + std::stringstream ss(inputs); + + while (std::getline(ss, segment, ',')) { + inputsData.push_back(segment); + } +} + +bool isDirectory(std::string path) { + struct stat sb; + if (stat(path.c_str(), &sb) == -1) { + fprintf(stderr, "Unable to determine stats of a path!\n"); + exit(-1); + } + return S_ISDIR(sb.st_mode); +} + +void setInputs( + torch::executor::Method& method, + std::vector& inputFiles) { + if (method.inputs_size() != inputFiles.size()) { + fprintf( + stderr, + "Mismatch: method has %ld inputs, whereas the loaded data contains %ld entries!\n", + method.inputs_size(), + inputFiles.size()); + exit(-1); + } + std::vector values(method.inputs_size()); + Error status = method.get_inputs(values.data(), values.size()); + if (status != Error::Ok) { + fprintf(stderr, "Failed to get_inputs...\n"); + exit(-1); + } + for (size_t i = 0; i < values.size(); i++) { + fprintf(stderr, "Loading file %s\n", inputFiles[i].c_str()); + FILE* datasetFile = fopen(inputFiles[i].c_str(), "r"); + fseek(datasetFile, 0, SEEK_END); + size_t inputSize = ftell(datasetFile); + fseek(datasetFile, 0, SEEK_SET); + if (inputSize == values[i].toTensor().nbytes()) { + // Input is in floats + fread(values[i].toTensor().mutable_data_ptr(), 1, inputSize, datasetFile); + } else if ( + (inputSize == values[i].toTensor().numel()) && + (values[i].toTensor().scalar_type() == + torch::executor::ScalarType::Float)) { + // Input is in bytes, convert to floats + printf("Converting inputs to floats...\n"); + 
uint8_t* ptr = (uint8_t*)malloc(inputSize); + fread(ptr, 1, inputSize, datasetFile); + for (size_t j = 0; j < inputSize; j++) { + values[i].toTensor().mutable_data_ptr()[j] = ptr[j]; + } + free(ptr); + } else { + // Input mismatch + fprintf( + stderr, + "Mismatch in the %ld-th input tensor: expected %ld elements x %ld bytes each, loaded %ld bytes!\n", + i, + values[i].toTensor().numel(), + values[i].toTensor().element_size(), + inputSize); + fclose(datasetFile); + exit(-1); + } + fclose(datasetFile); + } +} + +void saveOutputs( + torch::executor::Method& method, + std::string& outputPath, + const std::string& runPathPrefix = ".") { + struct stat st; + if (stat(outputPath.c_str(), &st) == -1) { + mkdir(outputPath.c_str(), 0700); + } + if (stat((outputPath + "/" + runPathPrefix).c_str(), &st) == -1) { + mkdir((outputPath + "/" + runPathPrefix).c_str(), 0700); + } + std::vector values(method.outputs_size()); + Error status = method.get_outputs(values.data(), values.size()); + if (status != Error::Ok) { + fprintf(stderr, "Failed to get_outputs...\n"); + exit(-1); + } + for (size_t i = 0; i < values.size(); i++) { + int precision = 4 - std::to_string(i).size(); + std::string fileName = outputPath + "/" + runPathPrefix + "/" + + std::to_string(i).insert(0, precision, '0') + ".bin"; + printf("Saving file %s\n", fileName.c_str()); + FILE* datasetFile = fopen(fileName.c_str(), "w"); + fwrite( + values[i].toTensor().data_ptr(), + 1, + values[i].toTensor().nbytes(), + datasetFile); + fclose(datasetFile); + } +} + +template +void printClassificationOutput( + const torch::executor::EValue& value, + std::string& outputPath, + const std::string& runPathPrefix) { + T maxVal = value.toTensor().mutable_data_ptr()[0]; + size_t maxIdx = 0; + for (size_t j = 1; j < value.toTensor().numel(); j++) { + T val = value.toTensor().mutable_data_ptr()[j]; + if (val > maxVal) { + maxVal = val; + maxIdx = j; + } + } + struct stat st; + std::string resultsFile{outputPath + "/results.txt"}; + if 
(stat(outputPath.c_str(), &st) == -1) { + mkdir(outputPath.c_str(), 0700); + } + FILE* results = fopen(resultsFile.c_str(), "a+"); + // Print classification results and save to results.txt. + std::cout << "Top1 class " << runPathPrefix << " = " << maxIdx << std::endl; + fprintf(results, "%s %d ", runPathPrefix.c_str(), maxIdx); + std::cout << "Confidence = " << static_cast(maxVal) << std::endl; + fprintf(results, "%f ", static_cast(maxVal)); + fprintf(results, "\n"); + fclose(results); +} + +void printOutput( + torch::executor::Method& method, + std::string& outputPath, + const std::string& runPathPrefix = ".") { + // The single tensor is considered to be a classification result. + if (method.outputs_size() == 1) { + std::vector values(method.outputs_size()); + Error status = method.get_outputs(values.data(), values.size()); + if (status != Error::Ok) { + fprintf(stderr, "Failed to get_outputs...\n"); + exit(-1); + } + switch (values[0].toTensor().scalar_type()) { + case torch::executor::ScalarType::Byte: + printClassificationOutput( + values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Char: + printClassificationOutput(values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Short: + printClassificationOutput( + values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Int: + printClassificationOutput( + values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Long: + printClassificationOutput( + values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Float: + printClassificationOutput(values[0], outputPath, runPathPrefix); + break; + case torch::executor::ScalarType::Double: + printClassificationOutput(values[0], outputPath, runPathPrefix); + break; + default: + fprintf( + stderr, + "Unsupported tensor data type: %d\n", + values[0].toTensor().scalar_type()); + exit(-1); + } + } +} + +int main(int argc, char* argv[]) { + DIR* 
datasetDir = nullptr; + struct dirent* dataset = nullptr; + + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Check that model name and inputs have been specified. + if (FLAGS_model.empty()) { + std::cout << "Please specify path to model using the --model option.\n"; + exit(-1); + } + if (FLAGS_dataset.empty() && FLAGS_inputs.empty()) { + std::cout << "Please specify path to dataset using the --dataset option or " + "inputs using --inputs option\n"; + exit(-1); + } + if (!FLAGS_dataset.empty() && !FLAGS_inputs.empty()) { + std::cout << "Cannot specify both inputs list and dataset directory\n"; + exit(-1); + } + if (!FLAGS_dataset.empty()) { + datasetDir = opendir(FLAGS_dataset.c_str()); + if (!datasetDir) { + std::cout << "Dataset path is not valid\n"; + exit(-1); + } + } + if (FLAGS_output.empty()) { + FLAGS_output = DEFAULT_OUTPUT_DIR; + } + +#ifdef NEUTRON_CMODEL + if (!FLAGS_nsys_config.empty()) { + storeNsysConfigPath(FLAGS_nsys_config.c_str()); + } else if (getenv("NSYS_CONFIG_PATH")) { + storeNsysConfigPath(getenv("NSYS_CONFIG_PATH")); + } else { + std::cout << "ERROR: missing --nsys_config argument\n"; + exit(-1); + } + + if (!FLAGS_firmware.empty()) { + storeFirmwarePath(FLAGS_firmware.c_str()); + } else if (getenv("NSYS_FIRMWARE_PATH")) { + storeFirmwarePath(getenv("NSYS_FIRMWARE_PATH")); + } else { + std::cout << "ERROR: missing --firmware argument\n"; + exit(-1); + } + + if (!FLAGS_nsys.empty()) { + storeNsysPath(FLAGS_nsys.c_str()); + } else if (getenv("NSYS_PATH")) { + storeNsysPath(getenv("NSYS_PATH")); + } else { + std::cout << "ERROR: missing --nsys argument\n"; + exit(-1); + } +#endif + + NeutronError error = ENONE; + error = neutronInit(); + if (error != ENONE) { + fprintf(stderr, "Internal Neutron NPU driver error %x in init!\n", error); + exit(-1); + } + + torch::executor::runtime_init(); + + printf("Started..\n"); + + Result loader = FileDataLoader::from(FLAGS_model.c_str()); + if (!loader.ok()) { + fprintf(stderr, "Model PTE loading 
failed\n"); + exit(-1); + } else { + printf("Model file %s loaded\n", FLAGS_model.c_str()); + } + + Result program = + torch::executor::Program::load(&loader.get()); + if (!program.ok()) { + fprintf(stderr, "Program loading failed\n"); + exit(-1); + } else { + printf("Program loaded\n"); + } + + const char* method_name = nullptr; + { + const auto method_name_result = program->get_method_name(0); + if (!method_name_result.ok()) { + fprintf(stderr, "Program has no methods...\n"); + exit(-1); + } + method_name = *method_name_result; + } + printf("Using method (%s)...\n", method_name); + + Result method_meta = + program->method_meta(method_name); + if (!method_meta.ok()) { + fprintf( + stderr, + "Failed to get method_meta for (%s): %" PRIu32, + method_name, + (unsigned int)method_meta.error()); + exit(-1); + } + + printf("Creating MemoryAllocator...\n"); + torch::executor::MemoryAllocator method_allocator{ + torch::executor::MemoryAllocator( + sizeof(method_allocator_pool), method_allocator_pool)}; + torch::executor::MemoryAllocator tmp_allocator{ + torch::executor::MemoryAllocator( + sizeof(tmp_allocator_pool), tmp_allocator_pool)}; + + std::vector> planned_buffers; // Owns the memory + std::vector> + planned_spans; // Passed to the allocator + size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); + + for (size_t id = 0; id < num_memory_planned_buffers; ++id) { + size_t buffer_size = + static_cast(method_meta->memory_planned_buffer_size(id).get()); + printf("Setting up planned buffer %lu, size %lu...\n", id, buffer_size); + + planned_buffers.push_back(std::make_unique(buffer_size)); + planned_spans.push_back({planned_buffers.back().get(), buffer_size}); + } + + printf("Creating HierarchicalAllocator....\n"); + torch::executor::HierarchicalAllocator planned_memory( + {planned_spans.data(), planned_spans.size()}); + + torch::executor::MemoryManager memory_manager( + &method_allocator, &planned_memory, &tmp_allocator); + + Result method = + 
program->load_method(method_name, &memory_manager); + if (!method.ok()) { + fprintf( + stderr, + "Loading of method (%s) failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)method.error()); + exit(-1); + } + printf("Method loaded...\n"); + + Error status = Error::Ok; + if (!FLAGS_dataset.empty()) { + // Go through entire dataset for this model. + FLAGS_dataset += "/"; + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + std::vector inputsData; + inputsData.push_back(FLAGS_dataset + dataset->d_name); + // Set input and call inferrence. + setInputs(method.get(), inputsData); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } + + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output, dataset->d_name); + // Print result with highest confidence. + printOutput(method.get(), FLAGS_output, dataset->d_name); + } + closedir(datasetDir); + } else if (!FLAGS_inputs.empty()) { + std::vector inputPaths; + + // Validate and process inputs and separate into two lists. + processInputs(inputPaths, FLAGS_inputs); + + if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) { + // Inputs are in directories - use files in each directory as the inputs. 
+ std::vector inputsData; + for (std::string& inputDir : inputPaths) { + datasetDir = opendir(inputDir.c_str()); + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + inputsData.push_back(inputDir + "/" + dataset->d_name); + } + closedir(datasetDir); + + setInputs(method.get(), inputsData); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } + + if (inputDir.back() == '/') + inputDir.pop_back(); + + auto pos = inputDir.find_last_of('/'); + if (pos != std::string::npos) + inputDir = inputDir.substr(pos + 1); + + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output, inputDir.c_str()); + inputsData.clear(); + } + } else { + // Inputs are files. + setInputs(method.get(), inputPaths); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } + + // Save outputs in binary files. 
+ saveOutputs(method.get(), FLAGS_output); + } + } + + printf("Finished...\n"); + + error = neutronDeinit(); + + return 0; +} diff --git a/examples/nxp/experimental/cifar_net/cifar_net.py b/examples/nxp/experimental/cifar_net/cifar_net.py index 18d89e39085..a139c894267 100644 --- a/examples/nxp/experimental/cifar_net/cifar_net.py +++ b/examples/nxp/experimental/cifar_net/cifar_net.py @@ -9,6 +9,8 @@ import os.path from typing import Iterator, Tuple +import numpy as np + import torch import torch.nn as nn import torch.nn.functional as F @@ -200,7 +202,7 @@ def train_cifarnet_model( return cifar_net -def test_cifarnet_model(cifar_net: nn.Module, batch_size: int = 1) -> float: +def verify_cifarnet_model(cifar_net: nn.Module, batch_size: int = 1) -> float: """Test the CifarNet model on the CifarNet10 testing dataset and return the accuracy. This function may at some point in the future be integrated into the `CifarNet` class. @@ -221,6 +223,26 @@ def test_cifarnet_model(cifar_net: nn.Module, batch_size: int = 1) -> float: return correct / total +def store_test_data(path="./cifar10_test_data", count=10): + test_loader = get_test_loader(batch_size=1) + + os.makedirs(path, exist_ok=True) + + index = 0 + count = 10 + for data in test_loader: + images, labels = data + for image, label in zip(images, labels): + arr = image.numpy().astype(np.float32) + file_name = f"img{str(index)}_class{str(int(label))}.bin" + arr.tofile(os.path.join(path, file_name)) + index = index + 1 + if index >= count: + break + if index >= count: + break + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -243,6 +265,12 @@ def test_cifarnet_model(cifar_net: nn.Module, batch_size: int = 1) -> float: ) parser.add_argument("-b", "--batch-size", required=False, type=int, default=1) parser.add_argument("-e", "--num-epochs", required=False, type=int, default=1) + parser.add_argument( + "--store-test-data", + required=False, + action="store_true", + help="Store the test 
data for the executor runner", + ) args = parser.parse_args() cifar_net = get_model( @@ -254,9 +282,12 @@ def test_cifarnet_model(cifar_net: nn.Module, batch_size: int = 1) -> float: if args.test: logger.info("Running tests.") - accuracy = test_cifarnet_model(cifar_net, args.batch_size) + accuracy = verify_cifarnet_model(cifar_net, args.batch_size) logger.info(f"Accuracy of the network on the 10000 test images: {accuracy}") + if args.store_test_data: + store_test_data() + if args.pte_file is not None: tracing_inputs = (torch.rand(args.batch_size, 3, 32, 32),) aten_dialect_program = torch.export.export(cifar_net, tracing_inputs) diff --git a/examples/nxp/run.sh b/examples/nxp/run.sh new file mode 100755 index 00000000000..a2dcd7a9da4 --- /dev/null +++ b/examples/nxp/run.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -eux + +SCRIPT_DIR=$(dirname $(readlink -fm $0)) +EXECUTORCH_DIR=$(dirname $(dirname $SCRIPT_DIR)) +MODEL=${1:-"cifar10"} + +cd ${EXECUTORCH_DIR} + +echo "** Build nxp_executor_runner" +if [ ! -d ${SCRIPT_DIR}/executor_runner/build ] ; then + mkdir ${SCRIPT_DIR}/executor_runner/build +fi +rm -rf ${SCRIPT_DIR}/executor_runner/build/* + +pushd ${SCRIPT_DIR}/executor_runner/build +cmake -DCMAKE_BUILD_TYPE=Debug .. 
+make -j8 nxp_executor_runner +popd + +echo "** Export cifar10 model to executorch" +# Run the AoT example +python -m examples.nxp.aot_neutron_compile --quantize \ + --delegate --neutron_converter_flavor SDK_25_12 -m "cifar10" +test -f cifar10_nxp_delegate.pte + +echo "** Generate test dataset" +python -m examples.nxp.experimental.cifar_net.cifar_net --store-test-data + +echo "** Run nxp_executor_runner" +${SCRIPT_DIR}/executor_runner/build/nxp_executor_runner \ + --firmware `make -C ${SCRIPT_DIR}/executor_runner/build locate_neutron_firmware | grep "NeutronFirmware.elf" ` \ + --nsys `which nsys` \ + --nsys_config ${SCRIPT_DIR}/executor_runner/neutron-imxrt700.ini \ + --model cifar10_nxp_delegate.pte \ + --dataset ./cifar10_test_data \ + --output ./cifar10_test_output diff --git a/examples/nxp/setup.sh b/examples/nxp/setup.sh index 9bdec38b508..22c1728acc9 100755 --- a/examples/nxp/setup.sh +++ b/examples/nxp/setup.sh @@ -5,12 +5,13 @@ # LICENSE file in the root directory of this source tree. set -u -EIQ_PYPI_URL=https://eiq.nxp.com/repository - +EIQ_PYPI_URL="${EIQ_PYPI_URL:-https://eiq.nxp.com/repository}" # Install neutron-converter pip install --index-url ${EIQ_PYPI_URL} neutron_converter_SDK_25_12 +pip install --index-url ${EIQ_PYPI_URL} eiq_neutron_sdk==2.2.2 eiq_nsys + # Get the directory of the current script SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/pyproject.toml b/pyproject.toml index 7a4ce277ade..aa7ab171ca8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,8 @@ license-files = ["LICENSE"] [tool.setuptools.package-data] # TODO(dbort): Prune /test[s]/ dirs, /third-party/ dirs, yaml files that we # don't need. +# TODO(RobertKalmar): When test[s] dirs pruned the PROJECT_DIR resolution in backends.nxp.tests_models.config.py can +# avoid exporting and reading env variable. "*" = [ # Some backends like XNNPACK need their .fbs files. 
"*.fbs", diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 9a16f1ae4f4..be74ed74ed1 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -124,6 +124,13 @@ define_overridable_option( define_overridable_option( EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_NXP_NEUTRON "Build the NXP eIQ Neutron backend" BOOL OFF +) +define_overridable_option( + EXECUTORCH_BUILD_NXP_NEUTRON_RUNNER "Build the NXP eIQ Neutron runner" BOOL + OFF +) define_overridable_option( EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" BOOL OFF )