nl: allow non-UTF8 section delimiter

This commit is contained in:
Daniel Hofstetter 2025-09-05 16:38:00 +02:00
parent ee1b802612
commit 93feaccbcf
3 changed files with 98 additions and 10 deletions

View file

@ -16,11 +16,13 @@ pub fn parse_options(settings: &mut crate::Settings, opts: &clap::ArgMatches) ->
// This vector holds error messages encountered.
let mut errs: Vec<String> = vec![];
settings.renumber = opts.get_flag(options::NO_RENUMBER);
if let Some(delimiter) = opts.get_one::<String>(options::SECTION_DELIMITER) {
// check whether the delimiter is a single ASCII char (1 byte)
// because GNU nl doesn't add a ':' to single non-ASCII chars
if let Some(delimiter) = opts.get_one::<OsString>(options::SECTION_DELIMITER) {
// GNU nl determines whether a delimiter is a "single character" based on byte length, not
// character length. A "single character" implies the second character is a ':'.
settings.section_delimiter = if delimiter.len() == 1 {
format!("{delimiter}:")
let mut delimiter = delimiter.clone();
delimiter.push(":");
delimiter
} else {
delimiter.clone()
};

View file

@ -4,7 +4,7 @@
// file that was distributed with this source code.
use clap::{Arg, ArgAction, Command};
use std::ffi::OsString;
use std::ffi::{OsStr, OsString};
use std::fs::File;
use std::io::{BufRead, BufReader, Read, stdin};
use std::path::Path;
@ -20,7 +20,7 @@ pub struct Settings {
body_numbering: NumberingStyle,
footer_numbering: NumberingStyle,
// The variable corresponding to -d
section_delimiter: String,
section_delimiter: OsString,
// The variables corresponding to the options -v, -i, -l, -w.
starting_line_number: i64,
line_increment: i64,
@ -40,7 +40,7 @@ impl Default for Settings {
header_numbering: NumberingStyle::None,
body_numbering: NumberingStyle::NonEmpty,
footer_numbering: NumberingStyle::None,
section_delimiter: String::from("\\:"),
section_delimiter: OsString::from("\\:"),
starting_line_number: 1,
line_increment: 1,
join_blank_lines: 1,
@ -140,8 +140,8 @@ enum SectionDelimiter {
impl SectionDelimiter {
/// A valid section delimiter contains the pattern one to three times,
/// and nothing else.
fn parse(bytes: &[u8], pattern: &str) -> Option<Self> {
let pattern = pattern.as_bytes();
fn parse(bytes: &[u8], pattern: &OsStr) -> Option<Self> {
let pattern = pattern.as_encoded_bytes();
if bytes.is_empty() || pattern.is_empty() || bytes.len() % pattern.len() != 0 {
return None;
@ -270,6 +270,7 @@ pub fn uu_app() -> Command {
.short('d')
.long(options::SECTION_DELIMITER)
.help(translate!("nl-help-section-delimiter"))
.value_parser(clap::value_parser!(OsString))
.value_name("CC"),
)
.arg(

View file

@ -627,7 +627,50 @@ fn test_section_delimiter() {
}
#[test]
fn test_one_char_section_delimiter_expansion() {
#[cfg(target_os = "linux")]
fn test_section_delimiter_non_utf8() {
use std::{ffi::OsString, os::unix::ffi::OsStringExt};
fn create_arg(prefix: &[u8]) -> OsString {
let section_delimiter = [0xFF, 0xFE];
let mut v = prefix.to_vec();
v.extend_from_slice(&section_delimiter);
OsString::from_vec(v)
}
let short = create_arg(b"-d");
let long = create_arg(b"--section-delimiter=");
for arg in [short, long] {
let header_section: Vec<u8> =
vec![b'a', b'\n', 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, b'\n', b'b'];
new_ucmd!()
.arg(&arg)
.pipe_in(header_section)
.succeeds()
.stdout_is(" 1\ta\n\n b\n");
let body_section: Vec<u8> = vec![b'a', b'\n', 0xFF, 0xFE, 0xFF, 0xFE, b'\n', b'b'];
new_ucmd!()
.arg(&arg)
.pipe_in(body_section)
.succeeds()
.stdout_is(" 1\ta\n\n 1\tb\n");
let footer_section: Vec<u8> = vec![b'a', b'\n', 0xFF, 0xFE, b'\n', b'b'];
new_ucmd!()
.arg(&arg)
.pipe_in(footer_section)
.succeeds()
.stdout_is(" 1\ta\n\n b\n");
}
}
#[test]
fn test_one_char_section_delimiter() {
for arg in ["-da", "--section-delimiter=a"] {
new_ucmd!()
.arg(arg)
@ -649,6 +692,48 @@ fn test_one_char_section_delimiter_expansion() {
}
}
#[test]
#[cfg(target_os = "linux")]
fn test_one_byte_section_delimiter() {
    use std::{ffi::OsString, os::unix::ffi::OsStringExt};

    // Single non-UTF-8 byte passed as the delimiter on the command line.
    const DELIMITER_BYTE: u8 = 0xFF;

    /// Builds a raw argument made of `prefix` immediately followed by the delimiter byte.
    fn create_arg(prefix: &[u8]) -> OsString {
        let mut raw = Vec::with_capacity(prefix.len() + 1);
        raw.extend_from_slice(prefix);
        raw.push(DELIMITER_BYTE);
        OsString::from_vec(raw)
    }

    /// Builds the piped input: line `a`, then a line holding the byte pair
    /// `DELIMITER_BYTE` + `:` repeated `repeats` times, then `b` without a trailing newline.
    fn input_with_section_line(repeats: usize) -> Vec<u8> {
        let mut input = b"a\n".to_vec();
        input.extend([DELIMITER_BYTE, b':'].repeat(repeats));
        input.extend_from_slice(b"\nb");
        input
    }

    // (pair repetitions on the middle line, expected stdout)
    let cases: [(usize, &str); 3] = [
        // header section line: pair three times
        (3, " 1\ta\n\n b\n"),
        // body section line: pair twice
        (2, " 1\ta\n\n 1\tb\n"),
        // footer section line: pair once
        (1, " 1\ta\n\n b\n"),
    ];

    // Exercise both the short and the long spelling of the option.
    for prefix in [b"-d".as_slice(), b"--section-delimiter=".as_slice()] {
        let arg = create_arg(prefix);
        for (repeats, expected) in cases {
            new_ucmd!()
                .arg(&arg)
                .pipe_in(input_with_section_line(repeats))
                .succeeds()
                .stdout_is(expected);
        }
    }
}
#[test]
fn test_non_ascii_one_char_section_delimiter() {
for arg in ["-dä", "--section-delimiter=ä"] {