join: add support for multibyte separators (#6736)

* join: add test for multibyte separators

* join: implement support for multibyte separators

* join: use a trait instead of an enum for separator

* join: test whitespace merging
This commit is contained in:
Justin Tracey 2024-10-06 05:48:08 -04:00 committed by GitHub
parent d8eb4e2214
commit a51a731704
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 251 additions and 89 deletions

View file

@ -3,11 +3,11 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
use clap::builder::ValueParser;
use clap::{crate_version, Arg, ArgAction, Command};
use memchr::{memchr3_iter, memchr_iter};
use memchr::{memchr_iter, memmem::Finder, Memchr3};
use std::cmp::Ordering;
use std::error::Error;
use std::ffi::OsString;
@ -60,13 +60,114 @@ enum FileNum {
File2,
}
#[derive(Copy, Clone, PartialEq)]
enum Sep {
Char(u8),
#[derive(Clone)]
enum SepSetting {
/// Any single-byte separator.
Byte(u8),
/// A single character more than one byte long.
Char(Vec<u8>),
/// No separators, join on the entire line.
Line,
/// Whitespace separators.
Whitespaces,
}
trait Separator: Clone {
/// Using this separator, return the start and end index of all fields in the haystack.
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;
/// The separator as it appears when in the output.
fn output_separator(&self) -> &[u8];
}
/// Simple separators one byte in length.
#[derive(Copy, Clone)]
struct OneByteSep {
byte: [u8; 1],
}
impl Separator for OneByteSep {
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
let mut field_ranges = Vec::with_capacity(len_guess);
let mut last_end = 0;
for i in memchr_iter(self.byte[0], haystack) {
field_ranges.push((last_end, i));
last_end = i + 1;
}
field_ranges.push((last_end, haystack.len()));
field_ranges
}
fn output_separator(&self) -> &[u8] {
&self.byte
}
}
/// Multi-byte (but still single character) separators.
#[derive(Clone)]
struct MultiByteSep<'a> {
finder: Finder<'a>,
}
impl<'a> Separator for MultiByteSep<'a> {
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
let mut field_ranges = Vec::with_capacity(len_guess);
let mut last_end = 0;
for i in self.finder.find_iter(haystack) {
field_ranges.push((last_end, i));
last_end = i + self.finder.needle().len();
}
field_ranges.push((last_end, haystack.len()));
field_ranges
}
fn output_separator(&self) -> &[u8] {
self.finder.needle()
}
}
/// Whole-line separator.
#[derive(Copy, Clone)]
struct LineSep {}
impl Separator for LineSep {
fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
vec![(0, haystack.len())]
}
fn output_separator(&self) -> &[u8] {
&[]
}
}
/// Default whitespace separator.
#[derive(Copy, Clone)]
struct WhitespaceSep {}
impl Separator for WhitespaceSep {
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
let mut field_ranges = Vec::with_capacity(len_guess);
let mut last_end = 0;
// GNU join used Bourne shell field splitters by default
// FIXME: but now uses locale-dependent whitespace
for i in Memchr3::new(b' ', b'\t', b'\n', haystack) {
// leading whitespace should be dropped, contiguous whitespace merged
if i > last_end {
field_ranges.push((last_end, i));
}
last_end = i + 1;
}
field_ranges.push((last_end, haystack.len()));
field_ranges
}
fn output_separator(&self) -> &[u8] {
b" "
}
}
#[derive(Copy, Clone, PartialEq)]
enum CheckOrder {
Default,
@ -82,7 +183,7 @@ struct Settings {
print_joined: bool,
ignore_case: bool,
line_ending: LineEnding,
separator: Sep,
separator: SepSetting,
autoformat: bool,
format: Vec<Spec>,
empty: Vec<u8>,
@ -100,7 +201,7 @@ impl Default for Settings {
print_joined: true,
ignore_case: false,
line_ending: LineEnding::Newline,
separator: Sep::Whitespaces,
separator: SepSetting::Whitespaces,
autoformat: false,
format: vec![],
empty: vec![],
@ -111,15 +212,15 @@ impl Default for Settings {
}
/// Output representation.
struct Repr<'a> {
struct Repr<'a, Sep: Separator> {
line_ending: LineEnding,
separator: u8,
format: &'a [Spec],
separator: Sep,
format: Vec<Spec>,
empty: &'a [u8],
}
impl<'a> Repr<'a> {
fn new(line_ending: LineEnding, separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Self {
impl<'a, Sep: Separator> Repr<'a, Sep> {
fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
Repr {
line_ending,
separator,
@ -155,7 +256,7 @@ impl<'a> Repr<'a> {
) -> Result<(), std::io::Error> {
for i in 0..line.field_ranges.len() {
if i != index {
writer.write_all(&[self.separator])?;
writer.write_all(self.separator.output_separator())?;
writer.write_all(line.get_field(i).unwrap())?;
}
}
@ -169,7 +270,7 @@ impl<'a> Repr<'a> {
{
for i in 0..self.format.len() {
if i > 0 {
writer.write_all(&[self.separator])?;
writer.write_all(self.separator.output_separator())?;
}
let field = match f(&self.format[i]) {
@ -188,13 +289,13 @@ impl<'a> Repr<'a> {
}
/// Input processing parameters.
struct Input {
struct Input<Sep: Separator> {
separator: Sep,
ignore_case: bool,
check_order: CheckOrder,
}
impl Input {
impl<Sep: Separator> Input<Sep> {
fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
Self {
separator,
@ -271,24 +372,8 @@ struct Line {
}
impl Line {
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
let mut field_ranges = Vec::with_capacity(len_guess);
let mut last_end = 0;
if separator == Sep::Whitespaces {
// GNU join uses Bourne shell field splitters by default
for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
if i > last_end {
field_ranges.push((last_end, i));
}
last_end = i + 1;
}
} else if let Sep::Char(sep) = separator {
for i in memchr_iter(sep, &string) {
field_ranges.push((last_end, i));
last_end = i + 1;
}
}
field_ranges.push((last_end, string.len()));
fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
let field_ranges = separator.field_ranges(&string, len_guess);
Self {
field_ranges,
@ -351,7 +436,12 @@ impl<'a> State<'a> {
}
/// Skip the current unpaired line.
fn skip_line(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
fn skip_line<Sep: Separator>(
&mut self,
writer: &mut impl Write,
input: &Input<Sep>,
repr: &Repr<'a, Sep>,
) -> UResult<()> {
if self.print_unpaired {
self.print_first_line(writer, repr)?;
}
@ -362,7 +452,7 @@ impl<'a> State<'a> {
/// Keep reading line sequence until the key does not change, return
/// the first line whose key differs.
fn extend(&mut self, input: &Input) -> UResult<Option<Line>> {
fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
while let Some(line) = self.next_line(input)? {
let diff = input.compare(self.get_current_key(), line.get_field(self.key));
@ -377,11 +467,11 @@ impl<'a> State<'a> {
}
/// Print lines in the buffers as headers.
fn print_headers(
fn print_headers<Sep: Separator>(
&self,
writer: &mut impl Write,
other: &State,
repr: &Repr,
repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> {
if self.has_line() {
if other.has_line() {
@ -397,11 +487,11 @@ impl<'a> State<'a> {
}
/// Combine two line sequences.
fn combine(
fn combine<Sep: Separator>(
&self,
writer: &mut impl Write,
other: &State,
repr: &Repr,
repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> {
let key = self.get_current_key();
@ -444,13 +534,16 @@ impl<'a> State<'a> {
}
}
fn reset_read_line(&mut self, input: &Input) -> Result<(), std::io::Error> {
let line = self.read_line(input.separator)?;
fn reset_read_line<Sep: Separator>(
&mut self,
input: &Input<Sep>,
) -> Result<(), std::io::Error> {
let line = self.read_line(&input.separator)?;
self.reset(line);
Ok(())
}
fn reset_next_line(&mut self, input: &Input) -> Result<(), JoinError> {
fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
let line = self.next_line(input)?;
self.reset(line);
Ok(())
@ -460,7 +553,7 @@ impl<'a> State<'a> {
!self.seq.is_empty()
}
fn initialize(&mut self, read_sep: Sep, autoformat: bool) -> usize {
fn initialize<Sep: Separator>(&mut self, read_sep: &Sep, autoformat: bool) -> usize {
if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
self.seq.push(line);
@ -471,7 +564,12 @@ impl<'a> State<'a> {
0
}
fn finalize(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
fn finalize<Sep: Separator>(
&mut self,
writer: &mut impl Write,
input: &Input<Sep>,
repr: &Repr<'a, Sep>,
) -> UResult<()> {
if self.has_line() {
if self.print_unpaired {
self.print_first_line(writer, repr)?;
@ -491,7 +589,7 @@ impl<'a> State<'a> {
}
/// Get the next line without the order check.
fn read_line(&mut self, sep: Sep) -> Result<Option<Line>, std::io::Error> {
fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
match self.lines.next() {
Some(value) => {
self.line_num += 1;
@ -506,8 +604,8 @@ impl<'a> State<'a> {
}
/// Get the next line with the order check.
fn next_line(&mut self, input: &Input) -> Result<Option<Line>, JoinError> {
if let Some(line) = self.read_line(input.separator)? {
fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
if let Some(line) = self.read_line(&input.separator)? {
if input.check_order == CheckOrder::Disabled {
return Ok(Some(line));
}
@ -543,11 +641,11 @@ impl<'a> State<'a> {
self.seq[0].get_field(self.key)
}
fn print_line(
fn print_line<Sep: Separator>(
&self,
writer: &mut impl Write,
line: &Line,
repr: &Repr,
repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> {
if repr.uses_format() {
repr.print_format(writer, |spec| match *spec {
@ -568,31 +666,53 @@ impl<'a> State<'a> {
repr.print_line_ending(writer)
}
fn print_first_line(&self, writer: &mut impl Write, repr: &Repr) -> Result<(), std::io::Error> {
fn print_first_line<Sep: Separator>(
&self,
writer: &mut impl Write,
repr: &Repr<'a, Sep>,
) -> Result<(), std::io::Error> {
self.print_line(writer, &self.seq[0], repr)
}
}
fn parse_separator(value_os: &OsString) -> UResult<Sep> {
fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
// Five possible separator values:
// No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
// An empty string arg, whole line separation
// On unix-likes only, a single arbitrary byte
// The two-character "\0" string, interpreted as a single 0 byte
// A single scalar valid in the locale encoding (currently only UTF-8)
if value_os.is_empty() {
return Ok(SepSetting::Line);
}
#[cfg(unix)]
let value = value_os.as_bytes();
#[cfg(not(unix))]
let value = match value_os.to_str() {
Some(value) => value.as_bytes(),
None => {
return Err(USimpleError::new(
1,
"unprintable field separators are only supported on unix-like platforms",
));
{
let value = value_os.as_bytes();
if value.len() == 1 {
return Ok(SepSetting::Byte(value[0]));
}
}
let Some(value) = value_os.to_str() else {
#[cfg(unix)]
return Err(USimpleError::new(1, "non-UTF-8 multi-byte tab"));
#[cfg(not(unix))]
return Err(USimpleError::new(
1,
"unprintable field separators are only supported on unix-like platforms",
));
};
match value.len() {
0 => Ok(Sep::Line),
1 => Ok(Sep::Char(value[0])),
2 if value[0] == b'\\' && value[1] == b'0' => Ok(Sep::Char(0)),
let mut chars = value.chars();
let c = chars.next().expect("valid string with at least one byte");
match chars.next() {
None => Ok(SepSetting::Char(value.into())),
Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
_ => Err(USimpleError::new(
1,
format!("multi-character tab {}", value_os.to_string_lossy()),
format!("multi-character tab {}", value),
)),
}
}
@ -695,7 +815,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
return Err(USimpleError::new(1, "both files cannot be standard input"));
}
exec(file1, file2, settings)
let sep = settings.separator.clone();
match sep {
SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
SepSetting::Char(c) => exec(
file1,
file2,
settings,
MultiByteSep {
finder: Finder::new(&c),
},
),
SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
SepSetting::Line => exec(file1, file2, settings, LineSep {}),
}
}
pub fn uu_app() -> Command {
@ -816,7 +949,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
)
}
fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
fn exec<Sep: Separator>(file1: &str, file2: &str, settings: Settings, sep: Sep) -> UResult<()> {
let stdin = stdin();
let mut state1 = State::new(
@ -837,16 +970,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
settings.print_unpaired2,
)?;
let input = Input::new(
settings.separator,
settings.ignore_case,
settings.check_order,
);
let input = Input::new(sep.clone(), settings.ignore_case, settings.check_order);
let format = if settings.autoformat {
let mut format = vec![Spec::Key];
let mut initialize = |state: &mut State| {
let max_fields = state.initialize(settings.separator, settings.autoformat);
let max_fields = state.initialize(&sep, settings.autoformat);
for i in 0..max_fields {
if i != state.key {
format.push(Spec::Field(state.file_num, i));
@ -857,20 +986,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
initialize(&mut state2);
format
} else {
state1.initialize(settings.separator, settings.autoformat);
state2.initialize(settings.separator, settings.autoformat);
state1.initialize(&sep, settings.autoformat);
state2.initialize(&sep, settings.autoformat);
settings.format
};
let repr = Repr::new(
settings.line_ending,
match settings.separator {
Sep::Char(sep) => sep,
_ => b' ',
},
&format,
&settings.empty,
);
let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
let stdout = stdout();
let mut writer = BufWriter::new(stdout.lock());

View file

@ -58,6 +58,25 @@ fn default_arguments() {
.stdout_only_fixture("default.expected");
}
#[test]
fn only_whitespace_separators_merge() {
new_ucmd!()
.arg("contiguous_separators.txt")
.arg("-")
.pipe_in(" a ,c ")
.succeeds()
.stdout_only("a ,,,b ,c \n");
new_ucmd!()
.arg("contiguous_separators.txt")
.arg("-t")
.arg(",")
.arg("-")
.pipe_in(" a ,c ")
.succeeds()
.stdout_only(" a ,,,b,c \n");
}
#[test]
fn different_fields() {
new_ucmd!()
@ -208,9 +227,9 @@ fn tab_multi_character() {
.arg("semicolon_fields_1.txt")
.arg("semicolon_fields_2.txt")
.arg("-t")
.arg("э")
.arg("ab")
.fails()
.stderr_is("join: multi-character tab э\n");
.stderr_is("join: multi-character tab ab\n");
}
#[test]
@ -437,14 +456,22 @@ fn non_unicode() {
#[cfg(unix)]
{
let invalid_utf8: u8 = 167;
let non_utf8_byte: u8 = 167;
new_ucmd!()
.arg("-t")
.arg(OsStr::from_bytes(&[invalid_utf8]))
.arg(OsStr::from_bytes(&[non_utf8_byte]))
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.succeeds()
.stdout_only_fixture("non-unicode_sep.expected");
new_ucmd!()
.arg("-t")
.arg(OsStr::from_bytes(&[non_utf8_byte, non_utf8_byte]))
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.fails()
.stderr_is("join: non-UTF-8 multi-byte tab\n");
}
#[cfg(windows)]
@ -462,6 +489,16 @@ fn non_unicode() {
}
}
#[test]
fn multibyte_sep() {
new_ucmd!()
.arg("-t§")
.arg("multibyte_sep_1.txt")
.arg("multibyte_sep_2.txt")
.succeeds()
.stdout_only_fixture("multibyte_sep.expected");
}
#[test]
fn null_field_separators() {
new_ucmd!()

View file

@ -0,0 +1 @@
a ,,,b

View file

@ -0,0 +1 @@
a§b§c

View file

@ -0,0 +1 @@
a§b

View file

@ -0,0 +1 @@
a§c