nl: preserve raw bytes in output instead of using from_utf8_lossy

2025-12-23 08:47:37 +00:00 · 2025-12-16 13:04:18 +00:00 · 2025-12-16 13:04:18 +00:00 · 93c8d5439b
commit 93c8d5439b
parent c085cd1c21
2 changed files with 40 additions and 29 deletions
--- a/src/uu/nl/src/nl.rs
+++ b/src/uu/nl/src/nl.rs
@@ -345,6 +345,13 @@ pub fn uu_app() -> Command {
        )
 }

+/// Helper to write: prefix bytes + line bytes + newline
+fn write_line(writer: &mut impl Write, prefix: &[u8], line: &[u8]) -> std::io::Result<()> {
+    writer.write_all(prefix)?;
+    writer.write_all(line)?;
+    writeln!(writer)
+}
+
 /// `nl` implements the main functionality for an individual buffer.
 fn nl<T: Read>(reader: &mut BufReader<T>, stats: &mut Stats, settings: &Settings) -> UResult<()> {
    let mut writer = BufWriter::new(stdout());
@@ -409,24 +416,17 @@ fn nl<T: Read>(reader: &mut BufReader<T>, stats: &mut Stats, settings: &Settings
                        translate!("nl-error-line-number-overflow"),
                    ));
                };
-                writeln!(
-                    writer,
-                    "{}{}{}",
-                    settings
-                        .number_format
-                        .format(line_number, settings.number_width),
-                    settings.number_separator.to_string_lossy(),
-                    String::from_utf8_lossy(&line),
-                )
-                .map_err_context(|| translate!("nl-error-could-not-write"))?;
-                // update line number for the potential next line
-                match line_number.checked_add(settings.line_increment) {
-                    Some(new_line_number) => stats.line_number = Some(new_line_number),
-                    None => stats.line_number = None, // overflow
-                }
+                let mut prefix = settings
+                    .number_format
+                    .format(line_number, settings.number_width)
+                    .into_bytes();
+                prefix.extend_from_slice(settings.number_separator.as_encoded_bytes());
+                write_line(&mut writer, &prefix, &line)
+                    .map_err_context(|| translate!("nl-error-could-not-write"))?;
+                stats.line_number = line_number.checked_add(settings.line_increment);
            } else {
-                let spaces = " ".repeat(settings.number_width + 1);
-                writeln!(writer, "{spaces}{}", String::from_utf8_lossy(&line))
+                let prefix = " ".repeat(settings.number_width + 1);
+                write_line(&mut writer, prefix.as_bytes(), &line)
                    .map_err_context(|| translate!("nl-error-could-not-write"))?;
            }
        }
--- a/tests/by-util/test_nl.rs
+++ b/tests/by-util/test_nl.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 //
-// spell-checker:ignore binvalid finvalid hinvalid iinvalid linvalid nabcabc nabcabcabc ninvalid vinvalid winvalid dabc näää
+// spell-checker:ignore binvalid finvalid hinvalid iinvalid linvalid nabcabc nabcabcabc ninvalid vinvalid winvalid dabc näää févr
 use uutests::{at_and_ucmd, new_ucmd, util::TestScenario, util_name};

 #[test]
@@ -209,23 +209,24 @@ fn test_number_separator() {
 #[test]
 #[cfg(target_os = "linux")]
 fn test_number_separator_non_utf8() {
-    use std::{
-        ffi::{OsStr, OsString},
-        os::unix::ffi::{OsStrExt, OsStringExt},
-    };
+    use std::{ffi::OsString, os::unix::ffi::OsStringExt};

    let separator_bytes = [0xFF, 0xFE];
    let mut v = b"--number-separator=".to_vec();
    v.extend_from_slice(&separator_bytes);

    let arg = OsString::from_vec(v);
-    let separator = OsStr::from_bytes(&separator_bytes);
+
+    // Raw bytes should be preserved in the separator output
+    let mut expected = b"     1".to_vec();
+    expected.extend_from_slice(&separator_bytes);
+    expected.extend_from_slice(b"test\n");

    new_ucmd!()
        .arg(arg)
        .pipe_in("test")
        .succeeds()
-        .stdout_is(format!("     1{}test\n", separator.to_string_lossy()));
+        .stdout_is_bytes(expected);
 }

 #[test]
@@ -791,14 +792,24 @@ fn test_file_with_non_utf8_content() {

    let filename = "file";
    let content: &[u8] = b"a\n\xFF\xFE\nb";
-    let invalid_utf8: &[u8] = b"\xFF\xFE";

    at.write_bytes(filename, content);

-    ucmd.arg(filename).succeeds().stdout_is(format!(
-        "     1\ta\n     2\t{}\n     3\tb\n",
-        String::from_utf8_lossy(invalid_utf8)
-    ));
+    // Raw bytes should be preserved in output (not converted to UTF-8 replacement chars)
+    let expected: Vec<u8> = b"     1\ta\n     2\t\xFF\xFE\n     3\tb\n".to_vec();
+    ucmd.arg(filename).succeeds().stdout_is_bytes(expected);
+}
+
+#[test]
+fn test_stdin_non_utf8_preserved() {
+    // Verify that non-UTF8 bytes are preserved in output, not converted to replacement chars
+    // This is important for locale compatibility
+    let input: Vec<u8> = b"f\xe9vr.\n".to_vec(); // "févr." in Latin-1
+    let expected: Vec<u8> = b"     1\tf\xe9vr.\n".to_vec();
+    new_ucmd!()
+        .pipe_in(input)
+        .succeeds()
+        .stdout_is_bytes(expected);
 }

 // Regression tests for issue #9132: repeated flags should use last value