nl: preserve raw bytes in output instead of using from_utf8_lossy

This commit is contained in:
Christopher Dryden 2025-12-16 13:04:18 +00:00
parent c085cd1c21
commit 93c8d5439b
2 changed files with 40 additions and 29 deletions

View file

@ -345,6 +345,13 @@ pub fn uu_app() -> Command {
)
}
/// Writes one output line as raw bytes: `prefix`, then `line`, then a single `\n`.
///
/// Nothing is decoded or re-encoded on the way out, so byte sequences that are
/// not valid UTF-8 reach `writer` unchanged. The first failing write aborts the
/// sequence and its error is returned to the caller.
fn write_line(writer: &mut impl Write, prefix: &[u8], line: &[u8]) -> std::io::Result<()> {
    for chunk in [prefix, line, &b"\n"[..]] {
        writer.write_all(chunk)?;
    }
    Ok(())
}
/// `nl` implements the main functionality for an individual buffer.
fn nl<T: Read>(reader: &mut BufReader<T>, stats: &mut Stats, settings: &Settings) -> UResult<()> {
let mut writer = BufWriter::new(stdout());
@ -409,24 +416,17 @@ fn nl<T: Read>(reader: &mut BufReader<T>, stats: &mut Stats, settings: &Settings
translate!("nl-error-line-number-overflow"),
));
};
writeln!(
writer,
"{}{}{}",
settings
.number_format
.format(line_number, settings.number_width),
settings.number_separator.to_string_lossy(),
String::from_utf8_lossy(&line),
)
.map_err_context(|| translate!("nl-error-could-not-write"))?;
// update line number for the potential next line
match line_number.checked_add(settings.line_increment) {
Some(new_line_number) => stats.line_number = Some(new_line_number),
None => stats.line_number = None, // overflow
}
let mut prefix = settings
.number_format
.format(line_number, settings.number_width)
.into_bytes();
prefix.extend_from_slice(settings.number_separator.as_encoded_bytes());
write_line(&mut writer, &prefix, &line)
.map_err_context(|| translate!("nl-error-could-not-write"))?;
stats.line_number = line_number.checked_add(settings.line_increment);
} else {
let spaces = " ".repeat(settings.number_width + 1);
writeln!(writer, "{spaces}{}", String::from_utf8_lossy(&line))
let prefix = " ".repeat(settings.number_width + 1);
write_line(&mut writer, prefix.as_bytes(), &line)
.map_err_context(|| translate!("nl-error-could-not-write"))?;
}
}

View file

@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
//
// spell-checker:ignore binvalid finvalid hinvalid iinvalid linvalid nabcabc nabcabcabc ninvalid vinvalid winvalid dabc näää
// spell-checker:ignore binvalid finvalid hinvalid iinvalid linvalid nabcabc nabcabcabc ninvalid vinvalid winvalid dabc näää févr
use uutests::{at_and_ucmd, new_ucmd, util::TestScenario, util_name};
#[test]
@ -209,23 +209,24 @@ fn test_number_separator() {
#[test]
#[cfg(target_os = "linux")]
fn test_number_separator_non_utf8() {
use std::{
ffi::{OsStr, OsString},
os::unix::ffi::{OsStrExt, OsStringExt},
};
use std::{ffi::OsString, os::unix::ffi::OsStringExt};
let separator_bytes = [0xFF, 0xFE];
let mut v = b"--number-separator=".to_vec();
v.extend_from_slice(&separator_bytes);
let arg = OsString::from_vec(v);
let separator = OsStr::from_bytes(&separator_bytes);
// Raw bytes should be preserved in the separator output
let mut expected = b" 1".to_vec();
expected.extend_from_slice(&separator_bytes);
expected.extend_from_slice(b"test\n");
new_ucmd!()
.arg(arg)
.pipe_in("test")
.succeeds()
.stdout_is(format!(" 1{}test\n", separator.to_string_lossy()));
.stdout_is_bytes(expected);
}
#[test]
@ -791,14 +792,24 @@ fn test_file_with_non_utf8_content() {
let filename = "file";
let content: &[u8] = b"a\n\xFF\xFE\nb";
let invalid_utf8: &[u8] = b"\xFF\xFE";
at.write_bytes(filename, content);
ucmd.arg(filename).succeeds().stdout_is(format!(
" 1\ta\n 2\t{}\n 3\tb\n",
String::from_utf8_lossy(invalid_utf8)
));
// Raw bytes should be preserved in output (not converted to UTF-8 replacement chars)
let expected: Vec<u8> = b" 1\ta\n 2\t\xFF\xFE\n 3\tb\n".to_vec();
ucmd.arg(filename).succeeds().stdout_is_bytes(expected);
}
#[test]
fn test_stdin_non_utf8_preserved() {
    // Input piped on stdin that is not valid UTF-8 must come out byte-for-byte,
    // rather than being rewritten with UTF-8 replacement characters.
    // The sample is "févr." encoded as Latin-1, where 0xE9 stands for "é".
    let latin1_line: Vec<u8> = b"f\xe9vr.\n".to_vec();
    // Same bytes, preceded by the line number and separator that nl emits.
    let numbered_line: Vec<u8> = b" 1\tf\xe9vr.\n".to_vec();

    let result = new_ucmd!().pipe_in(latin1_line).succeeds();
    result.stdout_is_bytes(numbered_line);
}
// Regression tests for issue #9132: repeated flags should use last value