From d435edae98ed22e929cd1be1360de5fba3aa1151 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 2 Mar 2026 01:12:38 +0000 Subject: [PATCH] fix(builtins): use char-based operations in expr for UTF-8 safety expr length now uses chars().count() instead of len() (byte count). expr substr now uses char-based skip/take instead of byte slicing. Prevents panics on multi-byte UTF-8 input. Closes #434 https://claude.ai/code/session_01WZjYqxm5xMPAEe7FSHJkDy --- crates/bashkit/src/builtins/expr.rs | 49 ++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/crates/bashkit/src/builtins/expr.rs b/crates/bashkit/src/builtins/expr.rs index 021a87ec..c1baba92 100644 --- a/crates/bashkit/src/builtins/expr.rs +++ b/crates/bashkit/src/builtins/expr.rs @@ -43,7 +43,8 @@ fn evaluate(args: &[&str]) -> std::result::Result { // Handle keyword operations first if args.len() >= 2 && args[0] == "length" { - return Ok(args[1].len().to_string()); + // TM-UNI-015: Use char count, not byte count + return Ok(args[1].chars().count().to_string()); } if args.len() >= 4 && args[0] == "substr" { @@ -54,12 +55,13 @@ fn evaluate(args: &[&str]) -> std::result::Result { let len: usize = args[3] .parse() .map_err(|_| "non-integer argument".to_string())?; - if pos == 0 || pos > s.len() { + let char_count = s.chars().count(); + if pos == 0 || pos > char_count { return Ok(String::new()); } - let start = pos - 1; // 1-based to 0-based - let end = (start + len).min(s.len()); - return Ok(s[start..end].to_string()); + // TM-UNI-015: Use char-based slicing, not byte-based + let result: String = s.chars().skip(pos - 1).take(len).collect(); + return Ok(result); } if args.len() >= 3 && args[0] == "index" { @@ -638,4 +640,41 @@ mod tests { assert_eq!(result.stdout.trim(), "0"); assert_eq!(result.exit_code, 1); // "0" => falsy } + + // Issue #434: length should count chars, not bytes + #[tokio::test] + async fn test_length_multibyte_utf8() { + let fs = Arc::new(crate::fs::InMemoryFs::new()); + let mut variables = HashMap::new(); + let mut cwd = std::path::PathBuf::from("/"); + let env = HashMap::new(); + // "café" = 4 chars but 5 bytes (é is 2 bytes) + let args = vec!["length".to_string(), "café".to_string()]; + let ctx = Context::new_for_test(&args, &env, &mut variables, &mut cwd, fs.clone(), None); + let result = Expr.execute(ctx).await.unwrap(); + assert_eq!(result.stdout.trim(), "4", "should count chars, not bytes"); + } + + // Issue #434: substr should use char-based slicing + #[tokio::test] + async fn test_substr_multibyte_utf8() { + let fs = Arc::new(crate::fs::InMemoryFs::new()); + let mut variables = HashMap::new(); + let mut cwd = std::path::PathBuf::from("/"); + let env = HashMap::new(); + // "日本語" - extract first 2 chars + let args = vec![ + "substr".to_string(), + "日本語".to_string(), + "1".to_string(), + "2".to_string(), + ]; + let ctx = Context::new_for_test(&args, &env, &mut variables, &mut cwd, fs.clone(), None); + let result = Expr.execute(ctx).await.unwrap(); + assert_eq!( + result.stdout.trim(), + "日本", + "should extract chars, not bytes" + ); + } }