From f5b9d7dd48992d6a7c982768bb767352a23b1f2c Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 13:54:52 -0800 Subject: [PATCH 01/12] feat: update transformers to v5.x, unsloth, and add MoE LoRA conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update core dependencies for transformers v5 ecosystem: - transformers: >=4.55.2,<=4.57.3 → >=5.1.0 - unsloth: 2025.12.9 → 2026.2.1 - unsloth-zoo: 2025.12.7 → 2026.2.1 (+ updated VCS pin) - trl: 0.20.0 → >=0.28.0 - peft: >=0.14.0 → >=0.18.0 (required by transformers v5) Fix transformers v5 breaking changes: - Replace removed dummy_pt_objects import with direct transformers import - Update masking_utils patch return type (now returns 5 values) - Remove deprecated TrainerArgs fields (overwrite_output_dir, jit_mode_eval, mp_parameters, logging_dir, fp16_backend, push_to_hub_token/model_id/organization) Add MoE LoRA adapter conversion utility for vLLM compatibility: - Unsloth + transformers v5 saves MoE LoRA as fused 2D tensors - vLLM expects per-expert format - Auto-detect and convert after checkpoint save Closes #575 Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 12 +- requirements/backend.vcs.txt | 2 +- src/art/dev/model.py | 8 -- src/art/transformers/patches.py | 4 +- src/art/unsloth/service.py | 5 +- src/art/utils/convert_moe_lora.py | 177 ++++++++++++++++++++++++++++++ 6 files changed, 191 insertions(+), 17 deletions(-) create mode 100644 src/art/utils/convert_moe_lora.py diff --git a/pyproject.toml b/pyproject.toml index e46320324..0d8c3df07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,21 +19,21 @@ dependencies = [ plotting = ["matplotlib>=3.10.1", "seaborn>=0.13.2"] backend = [ - "peft>=0.14.0", + "peft>=0.18.0", "hf-xet>=1.1.0", "bitsandbytes>=0.45.2", - "unsloth==2025.12.9", - "unsloth-zoo==2025.12.7", + "unsloth==2026.2.1", + "unsloth-zoo==2026.2.1", "torch>=2.8.0", "torchao==0.14.1", "accelerate==1.7.0", "awscli>=1.38.1", "setuptools>=78.1.0", "wandb==0.24.0", - "transformers>=4.55.2,<=4.57.3", + "transformers>=5.1.0", "duckdb>=1.0.0", "pyarrow>=15.0.0", - "trl==0.20.0", + "trl>=0.28.0", "nbclient>=0.10.1", "pytest>=8.4.1", "nbmake>=1.5.5", @@ -65,7 +65,7 @@ tinker = [ "pydantic>=2.12.5", "tinker>=0.8.1", "torch>=2.8.0", - "transformers>=4.55.2,<=4.57.3", + "transformers>=5.1.0", "uvicorn>=0.35.0", "datrie>=0.8.3", ] diff --git a/requirements/backend.vcs.txt b/requirements/backend.vcs.txt index 13539e64a..4013ce46a 100644 --- a/requirements/backend.vcs.txt +++ b/requirements/backend.vcs.txt @@ -8,4 +8,4 @@ torchtune @ git+https://github.com/pytorch/torchtune.git@2344509cf83bd886538fe3e8263e5145d1afb5c2 # Unsloth Zoo pinned to known-good commit - unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@323cf5e + unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@f536ee6 diff --git a/src/art/dev/model.py b/src/art/dev/model.py index 8bd342b81..813287b85 100644 --- a/src/art/dev/model.py +++ b/src/art/dev/model.py @@ -190,7 +190,6 @@ class PeftArgs(TypedDict, total=False): class TrainerArgs(TypedDict, total=False): output_dir: str | None - overwrite_output_dir: bool do_train: bool do_eval: bool do_predict: bool @@ -219,7 +218,6 @@ class TrainerArgs(TypedDict, total=False): log_level: str log_level_replica: str log_on_each_node: bool - logging_dir: str | None logging_strategy: "IntervalStrategy | str" logging_first_step: bool logging_steps: float @@ -236,7 +234,6 @@ class TrainerArgs(TypedDict, total=False): use_mps_device: bool seed: int data_seed: int | 
None - jit_mode_eval: bool use_ipex: bool bf16: bool fp16: bool @@ -295,11 +292,6 @@ class TrainerArgs(TypedDict, total=False): include_inputs_for_metrics: bool include_for_metrics: list[str] eval_do_concat_batches: bool - fp16_backend: str - push_to_hub_model_id: str | None - push_to_hub_organization: str | None - push_to_hub_token: str | None - mp_parameters: str auto_find_batch_size: bool full_determinism: bool torchdynamo: str | None diff --git a/src/art/transformers/patches.py b/src/art/transformers/patches.py index 97e09f6c8..4990db17b 100644 --- a/src/art/transformers/patches.py +++ b/src/art/transformers/patches.py @@ -19,7 +19,9 @@ def _patched_preprocess_mask_arguments( past_key_values: Optional[Cache], position_ids: Optional[torch.Tensor], layer_idx: Optional[int], -) -> tuple[bool, Optional[Union[torch.Tensor, "BlockMask"]], int, int]: +) -> tuple[ + bool, Optional[Union[torch.Tensor, "BlockMask"]], Optional[torch.Tensor], int, int +]: if position_ids is not None and len(position_ids.shape) == 3: position_ids = position_ids[0] return _preprocess_mask_arguments( diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 2417cff96..6209a65f3 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -9,8 +9,8 @@ from datasets import Dataset import peft import torch +from transformers import GenerationMixin, PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.utils.dummy_pt_objects import GenerationMixin, PreTrainedModel from trl import GRPOConfig, GRPOTrainer from vllm import AsyncEngineArgs from vllm.lora.request import LoRARequest @@ -25,6 +25,7 @@ packed_tensors_from_dir, ) from ..preprocessing.tokenize import SFTBatch +from ..utils.convert_moe_lora import convert_checkpoint_if_needed from ..utils.get_model_step import get_step_from_dir from ..utils.output_dirs import get_step_checkpoint_dir from ..vllm import get_llm, get_worker, openai_server_task, run_on_workers @@ -149,6 +150,7 @@ def save_checkpoint( checkpoint_dir = get_step_checkpoint_dir(output_dir, next_step) os.makedirs(checkpoint_dir, exist_ok=True) trainer.save_model(checkpoint_dir) + convert_checkpoint_if_needed(checkpoint_dir) return checkpoint_dir @@ -280,6 +282,7 @@ async def start_openai_server( lora_path = get_step_checkpoint_dir(self.output_dir, 0) os.makedirs(os.path.dirname(lora_path), exist_ok=True) self._state.trainer.save_model(lora_path) + convert_checkpoint_if_needed(lora_path) self._latest_step = 0 else: # Extract step from checkpoint path diff --git a/src/art/utils/convert_moe_lora.py b/src/art/utils/convert_moe_lora.py new file mode 100644 index 000000000..50c7c84c6 --- /dev/null +++ b/src/art/utils/convert_moe_lora.py @@ -0,0 +1,177 @@ +"""Convert fused MoE LoRA adapters to per-expert format for vLLM compatibility. + +Unsloth with transformers v5 saves MoE expert LoRA as fused 2D tensors: + mlp.experts.base_layer.lora_A [num_experts*rank, intermediate*2] (gate_up_proj) + mlp.experts.base_layer.lora_B [hidden, num_experts*rank] (gate_up_proj) + mlp.experts.lora_A [num_experts*rank, hidden] (down_proj) + mlp.experts.lora_B [intermediate, num_experts*rank] (down_proj) + +vLLM expects per-expert keys: + mlp.experts.0.gate_proj.lora_A [rank, hidden] + mlp.experts.0.gate_proj.lora_B [intermediate, rank] + ... 
+""" + +import json +import os +import re + +import safetensors.torch +import torch + + +def _has_fused_moe_lora(tensors: dict[str, torch.Tensor]) -> bool: + """Check if the adapter contains fused MoE LoRA tensors.""" + return any( + re.search(r"mlp\.experts\.(base_layer\.)?lora_[AB]\.weight$", key) + for key in tensors + ) + + +def _infer_moe_params( + tensors: dict[str, torch.Tensor], + adapter_config: dict, +) -> tuple[int, int, int, int]: + """Infer num_experts, rank, intermediate_size, hidden_size from tensor shapes.""" + rank = adapter_config.get("r", adapter_config.get("lora_rank", 8)) + + for key, tensor in tensors.items(): + # gate_up_proj lora_A: [num_experts*rank, intermediate*2] + if re.search(r"mlp\.experts\.base_layer\.lora_A\.weight$", key): + num_experts_times_rank = tensor.shape[0] + intermediate_times_2 = tensor.shape[1] + num_experts = num_experts_times_rank // rank + intermediate_size = intermediate_times_2 // 2 + break + # down_proj lora_B: [intermediate, num_experts*rank] + if re.search(r"mlp\.experts\.lora_B\.weight$", key): + intermediate_size = tensor.shape[0] + num_experts = tensor.shape[1] // rank + break + else: + raise ValueError("Could not find fused MoE tensors to infer parameters") + + # Get hidden_size from gate_up_proj lora_B: [hidden, num_experts*rank] + for key, tensor in tensors.items(): + if re.search(r"mlp\.experts\.base_layer\.lora_B\.weight$", key): + hidden_size = tensor.shape[0] + break + else: + raise ValueError("Could not find gate_up_proj lora_B to infer hidden_size") + + return num_experts, rank, intermediate_size, hidden_size + + +def convert_fused_moe_lora( + tensors: dict[str, torch.Tensor], + num_experts: int, + rank: int, + intermediate_size: int, + hidden_size: int, +) -> dict[str, torch.Tensor]: + """Convert fused MoE LoRA tensors to per-expert format. + + Non-expert tensors (e.g. self_attn) are passed through unchanged. 
+ """ + new_tensors: dict[str, torch.Tensor] = {} + + for key, tensor in tensors.items(): + # Non-expert tensors: keep as-is + m = re.match( + r"(.*\.mlp\.experts)\.(base_layer\.lora_(A|B)|lora_(A|B))\.weight$", + key, + ) + if not m: + new_tensors[key] = tensor + continue + + prefix = m.group(1) + is_base_layer = "base_layer" in key + is_A = "lora_A" in key + + if is_base_layer: + # gate_up_proj (fused gate + up) + if is_A: + # [num_experts*rank, intermediate*2] → per expert + per_expert = tensor.reshape(num_experts, rank, intermediate_size * 2) + for e in range(num_experts): + expert_a = per_expert[e] # [rank, intermediate*2] + gate_a = expert_a[:, :intermediate_size] + up_a = expert_a[:, intermediate_size:] + new_tensors[f"{prefix}.{e}.gate_proj.lora_B.weight"] = ( + gate_a.T.contiguous() + ) + new_tensors[f"{prefix}.{e}.up_proj.lora_B.weight"] = ( + up_a.T.contiguous() + ) + else: + # [hidden, num_experts*rank] → per expert + per_expert = tensor.reshape(hidden_size, num_experts, rank) + for e in range(num_experts): + expert_b = per_expert[:, e, :] # [hidden, rank] + new_tensors[f"{prefix}.{e}.gate_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + new_tensors[f"{prefix}.{e}.up_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + else: + # down_proj + if is_A: + # [num_experts*rank, hidden] → per expert + per_expert = tensor.reshape(num_experts, rank, hidden_size) + for e in range(num_experts): + expert_a = per_expert[e] # [rank, hidden] + new_tensors[f"{prefix}.{e}.down_proj.lora_B.weight"] = ( + expert_a.T.contiguous() + ) + else: + # [intermediate, num_experts*rank] → per expert + per_expert = tensor.reshape(intermediate_size, num_experts, rank) + for e in range(num_experts): + expert_b = per_expert[:, e, :] # [intermediate, rank] + new_tensors[f"{prefix}.{e}.down_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + + return new_tensors + + +def convert_checkpoint_if_needed(checkpoint_dir: str) -> None: + """Convert a checkpoint's MoE LoRA adapter to per-expert format if needed. + + This is a no-op for non-MoE adapters. + """ + adapter_path = os.path.join(checkpoint_dir, "adapter_model.safetensors") + config_path = os.path.join(checkpoint_dir, "adapter_config.json") + + if not os.path.exists(adapter_path) or not os.path.exists(config_path): + return + + tensors = safetensors.torch.load_file(adapter_path) + if not _has_fused_moe_lora(tensors): + return + + with open(config_path) as f: + adapter_config = json.load(f) + + num_experts, rank, intermediate_size, hidden_size = _infer_moe_params( + tensors, adapter_config + ) + + new_tensors = convert_fused_moe_lora( + tensors, num_experts, rank, intermediate_size, hidden_size + ) + + # Overwrite the adapter with the converted tensors + safetensors.torch.save_file(new_tensors, adapter_path) + + # Update adapter_config.json target_modules + adapter_config["target_modules"] = [ + m for m in adapter_config.get("target_modules", []) if "experts" not in m + ] + ["gate_proj", "up_proj", "down_proj"] + # Remove target_parameters if present (not needed for per-expert format) + adapter_config.pop("target_parameters", None) + + with open(config_path, "w") as f: + json.dump(adapter_config, f, indent=2) From 2aa7cc1f2ecffaddf4cddd0e9f3223995c21d3ba Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 14:02:49 -0800 Subject: [PATCH 02/12] fix: pin trl<=0.24.0 for unsloth 2026.2.1 compatibility Unsloth 2026.2.1 requires trl>0.18.2,!=0.19.0,<=0.24.0. 
Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0d8c3df07..0addc870b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ backend = [ "transformers>=5.1.0", "duckdb>=1.0.0", "pyarrow>=15.0.0", - "trl>=0.28.0", + "trl>=0.20.0,<=0.24.0", "nbclient>=0.10.1", "pytest>=8.4.1", "nbmake>=1.5.5", From bef21504b185e2ed0bd1a474ff6aad1c8ed906db Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 14:04:36 -0800 Subject: [PATCH 03/12] fix: override unsloth dep constraints for transformers v5 + trl compat Unsloth 2026.2.1's pyproject.toml has overly strict constraints (transformers<=4.57.6, trl<=0.24.0) but the February-2026 release notes confirm v5.1.0 + trl 0.27.1 work well. Use uv override-dependencies to allow the upgrade. Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0addc870b..8fb38088f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ backend = [ "transformers>=5.1.0", "duckdb>=1.0.0", "pyarrow>=15.0.0", - "trl>=0.20.0,<=0.24.0", + "trl>=0.20.0", "nbclient>=0.10.1", "pytest>=8.4.1", "nbmake>=1.5.5", @@ -122,7 +122,14 @@ required-version = ">=0.6.15" # Override numpy to <2.0 for compatibility with megatron-core in the training # environment. vLLM 0.15.1 pulls opencv-python-headless>=4.13 which wants # numpy>=2 on Python 3.9+, but megatron-core requires numpy<2. -override-dependencies = ["transformer-engine>=2.11.0", "numpy<2"] +override-dependencies = [ + "transformer-engine>=2.11.0", + "numpy<2", + # Override unsloth's overly strict constraints — v5.x and trl 0.27.x + # are confirmed working per unsloth February-2026 release notes + "transformers>=5.1.0", + "trl>=0.20.0", +] exclude-dependencies = ["pynvml"] no-build-isolation-package = ["apex", "transformer-engine-torch", "nv-grouped-gemm"] From 3b7c8222f1c837a7c256ce791a9ec09d8515d119 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 14:53:04 -0800 Subject: [PATCH 04/12] fix: add warnings_issued attr for transformers v5 + unsloth compat Transformers v5 removed `warnings_issued` from PreTrainedModel, but Unsloth's GRPOTrainer still accesses it during initialization. Add it as an empty dict on the PEFT model before creating the trainer. Co-Authored-By: Claude Opus 4.6 --- src/art/unsloth/service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 6209a65f3..1a623ba7c 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -670,6 +670,11 @@ def _state(self) -> UnslothState: ), ) + # Transformers v5 removed `warnings_issued` from PreTrainedModel, + # but Unsloth's GRPOTrainer still accesses it during init. + if not hasattr(peft_model, "warnings_issued"): + peft_model.warnings_issued = {} # type: ignore[attr-defined] + # Initialize trainer with dummy dataset data = {"prompt": ""} trainer = GRPOTrainer( From 57293b5a88e044672d450b94b39d7d477f297fec Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:01:34 -0800 Subject: [PATCH 05/12] fix: add return_dict=False to apply_chat_template calls for transformers v5 Transformers v5 changed apply_chat_template to return BatchEncoding by default when tokenize=True. Add return_dict=False to all calls that expect list[int] return type. 
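A minimal sketch of the difference (assuming a loaded `tokenizer` and a
`messages` list; these names are placeholders, not code from this repo):

    # transformers v5 default: tokenize=True returns a BatchEncoding
    enc = tokenizer.apply_chat_template(messages, tokenize=True)
    ids = enc["input_ids"]

    # with return_dict=False: plain token ids, as in v4
    ids = tokenizer.apply_chat_template(
        messages, tokenize=True, return_dict=False
    )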
Co-Authored-By: Claude Opus 4.6 --- src/art/preprocessing/tokenize.py | 3 +++ src/art/tinker/server.py | 1 + 2 files changed, 4 insertions(+) diff --git a/src/art/preprocessing/tokenize.py b/src/art/preprocessing/tokenize.py index 8fbcedca6..815af427c 100644 --- a/src/art/preprocessing/tokenize.py +++ b/src/art/preprocessing/tokenize.py @@ -195,6 +195,7 @@ def tokenize_trajectory( cast(list[dict], messages), tools=tools, continue_final_message=True, + return_dict=False, ), ) sentinal_token_id = max( @@ -230,6 +231,7 @@ def tokenize_trajectory( cast(list[dict], token_template_messages), tools=tools, continue_final_message=True, + return_dict=False, ), ) assistant_mask: list[int] = [0] * len(token_ids) @@ -406,6 +408,7 @@ def tokenize_sft_batch( tools=cast(Any, tools), tokenize=True, add_generation_prompt=False, + return_dict=False, ), ) diff --git a/src/art/tinker/server.py b/src/art/tinker/server.py index 22ee9bb9b..588c11dfc 100644 --- a/src/art/tinker/server.py +++ b/src/art/tinker/server.py @@ -66,6 +66,7 @@ async def prompt_tokens( messages, # type: ignore tools=tools, # type: ignore add_generation_prompt=True, + return_dict=False, ) async def chat_completion_and_token_discrepancies( From 408daefbce831e432eab92c5acc554ab2041b52a Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:11:59 -0800 Subject: [PATCH 06/12] fix: remove unsloth-zoo VCS pin, use PyPI 2026.2.1 instead The bradhilton/unsloth-zoo fork is at version 2025.8.4 which is missing modules needed by unsloth 2026.2.1 (e.g. unsloth_zoo.device_type). Switch to the official PyPI release which matches unsloth 2026.2.1. Co-Authored-By: Claude Opus 4.6 --- requirements/backend.vcs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/backend.vcs.txt b/requirements/backend.vcs.txt index 4013ce46a..5b340ccb4 100644 --- a/requirements/backend.vcs.txt +++ b/requirements/backend.vcs.txt @@ -7,5 +7,5 @@ # Torchtune pinned to known-good commit torchtune @ git+https://github.com/pytorch/torchtune.git@2344509cf83bd886538fe3e8263e5145d1afb5c2 -# Unsloth Zoo pinned to known-good commit - unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@f536ee6 +# Unsloth Zoo: using PyPI version (2026.2.1) instead of VCS fork, +# since unsloth 2026.2.1 requires the matching unsloth-zoo version. From 620f9ee3b0d93f6877f5e8ad10a8de7956d88c63 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:23:45 -0800 Subject: [PATCH 07/12] revert: remove unnecessary changes to backend.vcs.txt and model.py These changes were not needed for the transformers v5 upgrade: - backend.vcs.txt: not used for installation (pyproject.toml handles deps) - model.py TrainerArgs: TypedDict fields don't cause runtime errors Co-Authored-By: Claude Opus 4.6 --- requirements/backend.vcs.txt | 4 ++-- src/art/dev/model.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/requirements/backend.vcs.txt b/requirements/backend.vcs.txt index 5b340ccb4..13539e64a 100644 --- a/requirements/backend.vcs.txt +++ b/requirements/backend.vcs.txt @@ -7,5 +7,5 @@ # Torchtune pinned to known-good commit torchtune @ git+https://github.com/pytorch/torchtune.git@2344509cf83bd886538fe3e8263e5145d1afb5c2 -# Unsloth Zoo: using PyPI version (2026.2.1) instead of VCS fork, -# since unsloth 2026.2.1 requires the matching unsloth-zoo version. 
+# Unsloth Zoo pinned to known-good commit + unsloth-zoo @ git+https://github.com/bradhilton/unsloth-zoo@323cf5e diff --git a/src/art/dev/model.py b/src/art/dev/model.py index 813287b85..8bd342b81 100644 --- a/src/art/dev/model.py +++ b/src/art/dev/model.py @@ -190,6 +190,7 @@ class PeftArgs(TypedDict, total=False): class TrainerArgs(TypedDict, total=False): output_dir: str | None + overwrite_output_dir: bool do_train: bool do_eval: bool do_predict: bool @@ -218,6 +219,7 @@ class TrainerArgs(TypedDict, total=False): log_level: str log_level_replica: str log_on_each_node: bool + logging_dir: str | None logging_strategy: "IntervalStrategy | str" logging_first_step: bool logging_steps: float @@ -234,6 +236,7 @@ class TrainerArgs(TypedDict, total=False): use_mps_device: bool seed: int data_seed: int | None + jit_mode_eval: bool use_ipex: bool bf16: bool fp16: bool @@ -292,6 +295,11 @@ class TrainerArgs(TypedDict, total=False): include_inputs_for_metrics: bool include_for_metrics: list[str] eval_do_concat_batches: bool + fp16_backend: str + push_to_hub_model_id: str | None + push_to_hub_organization: str | None + push_to_hub_token: str | None + mp_parameters: str auto_find_batch_size: bool full_determinism: bool torchdynamo: str | None From 31ae447148b64e5c074d55900e184ce990f4c6d4 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:25:31 -0800 Subject: [PATCH 08/12] fix: remove TrainerArgs fields removed in transformers v5 Remove fields that transformers v5 dropped from TrainingArguments: overwrite_output_dir, logging_dir, jit_mode_eval, half_precision_backend, tpu_num_cores, past_index, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, torchdynamo, ray_scope. Co-Authored-By: Claude Opus 4.6 --- src/art/dev/model.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/art/dev/model.py b/src/art/dev/model.py index 8bd342b81..3f41350cd 100644 --- a/src/art/dev/model.py +++ b/src/art/dev/model.py @@ -190,7 +190,6 @@ class PeftArgs(TypedDict, total=False): class TrainerArgs(TypedDict, total=False): output_dir: str | None - overwrite_output_dir: bool do_train: bool do_eval: bool do_predict: bool @@ -219,7 +218,6 @@ class TrainerArgs(TypedDict, total=False): log_level: str log_level_replica: str log_on_each_node: bool - logging_dir: str | None logging_strategy: "IntervalStrategy | str" logging_first_step: bool logging_steps: float @@ -236,25 +234,21 @@ class TrainerArgs(TypedDict, total=False): use_mps_device: bool seed: int data_seed: int | None - jit_mode_eval: bool use_ipex: bool bf16: bool fp16: bool fp16_opt_level: str - half_precision_backend: str bf16_full_eval: bool fp16_full_eval: bool tf32: bool | None local_rank: int ddp_backend: str | None - tpu_num_cores: int | None tpu_metrics_debug: bool debug: str | list[DebugOption] dataloader_drop_last: bool eval_steps: float | None dataloader_num_workers: int dataloader_prefetch_factor: int | None - past_index: int run_name: str | None disable_tqdm: bool | None remove_unused_columns: bool | None @@ -295,15 +289,8 @@ class TrainerArgs(TypedDict, total=False): include_inputs_for_metrics: bool include_for_metrics: list[str] eval_do_concat_batches: bool - fp16_backend: str - push_to_hub_model_id: str | None - push_to_hub_organization: str | None - push_to_hub_token: str | None - mp_parameters: str auto_find_batch_size: bool full_determinism: bool - torchdynamo: str | None - ray_scope: str | None ddp_timeout: int torch_compile: bool torch_compile_backend: str | None From 
af60b727e6e1394bf1789b4ce21ec8a6cf8dc978 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:26:54 -0800 Subject: [PATCH 09/12] fix: pin transformers==5.1.0 to avoid breakage from future releases Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8fb38088f..04fd08540 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ backend = [ "awscli>=1.38.1", "setuptools>=78.1.0", "wandb==0.24.0", - "transformers>=5.1.0", + "transformers==5.1.0", "duckdb>=1.0.0", "pyarrow>=15.0.0", "trl>=0.20.0", @@ -65,7 +65,7 @@ tinker = [ "pydantic>=2.12.5", "tinker>=0.8.1", "torch>=2.8.0", - "transformers>=5.1.0", + "transformers==5.1.0", "uvicorn>=0.35.0", "datrie>=0.8.3", ] @@ -127,7 +127,7 @@ override-dependencies = [ "numpy<2", # Override unsloth's overly strict constraints — v5.x and trl 0.27.x # are confirmed working per unsloth February-2026 release notes - "transformers>=5.1.0", + "transformers==5.1.0", "trl>=0.20.0", ] exclude-dependencies = ["pynvml"] From 0de56ffdf161e9e3bfc342355407b0d414f6fb30 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 15:28:09 -0800 Subject: [PATCH 10/12] fix: restore trl==0.20.0 pin and remove unnecessary trl override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trl was originally pinned to 0.20.0. No reason to loosen it — 0.20.0 already satisfies unsloth's trl<=0.24.0 constraint. Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 04fd08540..7f016ed63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ backend = [ "transformers==5.1.0", "duckdb>=1.0.0", "pyarrow>=15.0.0", - "trl>=0.20.0", + "trl==0.20.0", "nbclient>=0.10.1", "pytest>=8.4.1", "nbmake>=1.5.5", @@ -125,10 +125,9 @@ required-version = ">=0.6.15" override-dependencies = [ "transformer-engine>=2.11.0", "numpy<2", - # Override unsloth's overly strict constraints — v5.x and trl 0.27.x - # are confirmed working per unsloth February-2026 release notes + # Override unsloth's overly strict constraint on transformers — v5.x + # is confirmed working per unsloth February-2026 release notes "transformers==5.1.0", - "trl>=0.20.0", ] exclude-dependencies = ["pynvml"] no-build-isolation-package = ["apex", "transformer-engine-torch", "nv-grouped-gemm"] From ab6a6da8df08864e73c699ee53c1ad9999cce988 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 17:28:06 -0800 Subject: [PATCH 11/12] refactor: centralize apply_chat_template return_dict=False patch Instead of adding return_dict=False to every call site, patch PreTrainedTokenizerBase.apply_chat_template once in patches.py to default return_dict=False. This restores transformers v4 behavior (returning list[int]) globally. 
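Sketch of the effect once the patch is applied on `import art` (the
`tokenizer` and `messages` names are placeholders):

    import art  # __init__.py applies patch_apply_chat_template()

    ids = tokenizer.apply_chat_template(messages, tokenize=True)
    # ids is a plain list[int] again; callers can still opt into the
    # v5 BatchEncoding by passing return_dict=True explicitly, since
    # the patch only sets a default via kwargs.setdefault.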
Co-Authored-By: Claude Opus 4.6 --- src/art/__init__.py | 6 +++++- src/art/preprocessing/tokenize.py | 3 --- src/art/tinker/server.py | 1 - src/art/transformers/patches.py | 18 ++++++++++++++++++ 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/art/__init__.py b/src/art/__init__.py index 3272944da..bc82c4dcb 100644 --- a/src/art/__init__.py +++ b/src/art/__init__.py @@ -40,9 +40,13 @@ def __init__(self, **kwargs): import transformers try: - from .transformers.patches import patch_preprocess_mask_arguments + from .transformers.patches import ( + patch_apply_chat_template, + patch_preprocess_mask_arguments, + ) patch_preprocess_mask_arguments() + patch_apply_chat_template() except Exception: pass except ImportError: diff --git a/src/art/preprocessing/tokenize.py b/src/art/preprocessing/tokenize.py index 815af427c..8fbcedca6 100644 --- a/src/art/preprocessing/tokenize.py +++ b/src/art/preprocessing/tokenize.py @@ -195,7 +195,6 @@ def tokenize_trajectory( cast(list[dict], messages), tools=tools, continue_final_message=True, - return_dict=False, ), ) sentinal_token_id = max( @@ -231,7 +230,6 @@ def tokenize_trajectory( cast(list[dict], token_template_messages), tools=tools, continue_final_message=True, - return_dict=False, ), ) assistant_mask: list[int] = [0] * len(token_ids) @@ -408,7 +406,6 @@ def tokenize_sft_batch( tools=cast(Any, tools), tokenize=True, add_generation_prompt=False, - return_dict=False, ), ) diff --git a/src/art/tinker/server.py b/src/art/tinker/server.py index 0027ec968..8a5534094 100644 --- a/src/art/tinker/server.py +++ b/src/art/tinker/server.py @@ -68,7 +68,6 @@ async def prompt_tokens( messages, # type: ignore tools=tools, # type: ignore add_generation_prompt=True, - return_dict=False, ), ) diff --git a/src/art/transformers/patches.py b/src/art/transformers/patches.py index 4990db17b..4c1dfc404 100644 --- a/src/art/transformers/patches.py +++ b/src/art/transformers/patches.py @@ -1,9 +1,11 @@ +import functools from typing import TYPE_CHECKING, Optional, Union import torch from transformers import masking_utils from transformers.cache_utils import Cache from transformers.configuration_utils import PretrainedConfig +from transformers.tokenization_utils_base import PreTrainedTokenizerBase if TYPE_CHECKING: from torch.nn.attention.flex_attention import BlockMask @@ -37,3 +39,19 @@ def _patched_preprocess_mask_arguments( def patch_preprocess_mask_arguments() -> None: masking_utils._preprocess_mask_arguments = _patched_preprocess_mask_arguments # ty:ignore[invalid-assignment] + + +def patch_apply_chat_template() -> None: + """Default return_dict=False in apply_chat_template for transformers v5. + + Transformers v5 changed the default from list[int] to BatchEncoding. + This restores the v4 behavior so all call sites get list[int] back. + """ + original = PreTrainedTokenizerBase.apply_chat_template + + @functools.wraps(original) + def _patched(self, *args, **kwargs): # type: ignore + kwargs.setdefault("return_dict", False) + return original(self, *args, **kwargs) + + PreTrainedTokenizerBase.apply_chat_template = _patched # type: ignore From 0ab1b94fdbfb1e01cde37a228cd35b5055cc3c73 Mon Sep 17 00:00:00 2001 From: Bohdan Date: Mon, 23 Feb 2026 17:57:30 -0800 Subject: [PATCH 12/12] fix: correct comment about warnings_issued workaround MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The attribute wasn't removed in transformers v5 — Unsloth's model patching can leave the PEFT model without it. 
Co-Authored-By: Claude Opus 4.6 --- src/art/unsloth/service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 1a623ba7c..209a1b80e 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -670,8 +670,8 @@ def _state(self) -> UnslothState: ), ) - # Transformers v5 removed `warnings_issued` from PreTrainedModel, - # but Unsloth's GRPOTrainer still accesses it during init. + # Unsloth's model patching can leave the PEFT model without + # `warnings_issued`, which GRPOTrainer expects during init. if not hasattr(peft_model, "warnings_issued"): peft_model.warnings_issued = {} # type: ignore[attr-defined]