mirror of
https://github.com/uutils/coreutils.git
synced 2025-07-07 21:45:01 +00:00
join: add support for multibyte separators (#6736)
* join: add test for multibyte separators * join: implement support for multibyte separators * join: use a trait instead of an enum for separator * join: test whitespace merging
This commit is contained in:
parent
d8eb4e2214
commit
a51a731704
6 changed files with 251 additions and 89 deletions
|
@ -3,11 +3,11 @@
|
|||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck
|
||||
// spell-checker:ignore (ToDO) autoformat FILENUM whitespaces pairable unpairable nocheck memmem
|
||||
|
||||
use clap::builder::ValueParser;
|
||||
use clap::{crate_version, Arg, ArgAction, Command};
|
||||
use memchr::{memchr3_iter, memchr_iter};
|
||||
use memchr::{memchr_iter, memmem::Finder, Memchr3};
|
||||
use std::cmp::Ordering;
|
||||
use std::error::Error;
|
||||
use std::ffi::OsString;
|
||||
|
@ -60,13 +60,114 @@ enum FileNum {
|
|||
File2,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq)]
|
||||
enum Sep {
|
||||
Char(u8),
|
||||
#[derive(Clone)]
|
||||
enum SepSetting {
|
||||
/// Any single-byte separator.
|
||||
Byte(u8),
|
||||
/// A single character more than one byte long.
|
||||
Char(Vec<u8>),
|
||||
/// No separators, join on the entire line.
|
||||
Line,
|
||||
/// Whitespace separators.
|
||||
Whitespaces,
|
||||
}
|
||||
|
||||
trait Separator: Clone {
|
||||
/// Using this separator, return the start and end index of all fields in the haystack.
|
||||
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)>;
|
||||
/// The separator as it appears when in the output.
|
||||
fn output_separator(&self) -> &[u8];
|
||||
}
|
||||
|
||||
/// Simple separators one byte in length.
|
||||
#[derive(Copy, Clone)]
|
||||
struct OneByteSep {
|
||||
byte: [u8; 1],
|
||||
}
|
||||
|
||||
impl Separator for OneByteSep {
|
||||
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||
let mut last_end = 0;
|
||||
|
||||
for i in memchr_iter(self.byte[0], haystack) {
|
||||
field_ranges.push((last_end, i));
|
||||
last_end = i + 1;
|
||||
}
|
||||
field_ranges.push((last_end, haystack.len()));
|
||||
field_ranges
|
||||
}
|
||||
|
||||
fn output_separator(&self) -> &[u8] {
|
||||
&self.byte
|
||||
}
|
||||
}
|
||||
|
||||
/// Multi-byte (but still single character) separators.
|
||||
#[derive(Clone)]
|
||||
struct MultiByteSep<'a> {
|
||||
finder: Finder<'a>,
|
||||
}
|
||||
|
||||
impl<'a> Separator for MultiByteSep<'a> {
|
||||
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||
let mut last_end = 0;
|
||||
|
||||
for i in self.finder.find_iter(haystack) {
|
||||
field_ranges.push((last_end, i));
|
||||
last_end = i + self.finder.needle().len();
|
||||
}
|
||||
field_ranges.push((last_end, haystack.len()));
|
||||
field_ranges
|
||||
}
|
||||
|
||||
fn output_separator(&self) -> &[u8] {
|
||||
self.finder.needle()
|
||||
}
|
||||
}
|
||||
|
||||
/// Whole-line separator.
|
||||
#[derive(Copy, Clone)]
|
||||
struct LineSep {}
|
||||
|
||||
impl Separator for LineSep {
|
||||
fn field_ranges(&self, haystack: &[u8], _len_guess: usize) -> Vec<(usize, usize)> {
|
||||
vec![(0, haystack.len())]
|
||||
}
|
||||
|
||||
fn output_separator(&self) -> &[u8] {
|
||||
&[]
|
||||
}
|
||||
}
|
||||
|
||||
/// Default whitespace separator.
|
||||
#[derive(Copy, Clone)]
|
||||
struct WhitespaceSep {}
|
||||
|
||||
impl Separator for WhitespaceSep {
|
||||
fn field_ranges(&self, haystack: &[u8], len_guess: usize) -> Vec<(usize, usize)> {
|
||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||
let mut last_end = 0;
|
||||
|
||||
// GNU join used Bourne shell field splitters by default
|
||||
// FIXME: but now uses locale-dependent whitespace
|
||||
for i in Memchr3::new(b' ', b'\t', b'\n', haystack) {
|
||||
// leading whitespace should be dropped, contiguous whitespace merged
|
||||
if i > last_end {
|
||||
field_ranges.push((last_end, i));
|
||||
}
|
||||
last_end = i + 1;
|
||||
}
|
||||
field_ranges.push((last_end, haystack.len()));
|
||||
field_ranges
|
||||
}
|
||||
|
||||
fn output_separator(&self) -> &[u8] {
|
||||
b" "
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq)]
|
||||
enum CheckOrder {
|
||||
Default,
|
||||
|
@ -82,7 +183,7 @@ struct Settings {
|
|||
print_joined: bool,
|
||||
ignore_case: bool,
|
||||
line_ending: LineEnding,
|
||||
separator: Sep,
|
||||
separator: SepSetting,
|
||||
autoformat: bool,
|
||||
format: Vec<Spec>,
|
||||
empty: Vec<u8>,
|
||||
|
@ -100,7 +201,7 @@ impl Default for Settings {
|
|||
print_joined: true,
|
||||
ignore_case: false,
|
||||
line_ending: LineEnding::Newline,
|
||||
separator: Sep::Whitespaces,
|
||||
separator: SepSetting::Whitespaces,
|
||||
autoformat: false,
|
||||
format: vec![],
|
||||
empty: vec![],
|
||||
|
@ -111,15 +212,15 @@ impl Default for Settings {
|
|||
}
|
||||
|
||||
/// Output representation.
|
||||
struct Repr<'a> {
|
||||
struct Repr<'a, Sep: Separator> {
|
||||
line_ending: LineEnding,
|
||||
separator: u8,
|
||||
format: &'a [Spec],
|
||||
separator: Sep,
|
||||
format: Vec<Spec>,
|
||||
empty: &'a [u8],
|
||||
}
|
||||
|
||||
impl<'a> Repr<'a> {
|
||||
fn new(line_ending: LineEnding, separator: u8, format: &'a [Spec], empty: &'a [u8]) -> Self {
|
||||
impl<'a, Sep: Separator> Repr<'a, Sep> {
|
||||
fn new(line_ending: LineEnding, separator: Sep, format: Vec<Spec>, empty: &'a [u8]) -> Self {
|
||||
Repr {
|
||||
line_ending,
|
||||
separator,
|
||||
|
@ -155,7 +256,7 @@ impl<'a> Repr<'a> {
|
|||
) -> Result<(), std::io::Error> {
|
||||
for i in 0..line.field_ranges.len() {
|
||||
if i != index {
|
||||
writer.write_all(&[self.separator])?;
|
||||
writer.write_all(self.separator.output_separator())?;
|
||||
writer.write_all(line.get_field(i).unwrap())?;
|
||||
}
|
||||
}
|
||||
|
@ -169,7 +270,7 @@ impl<'a> Repr<'a> {
|
|||
{
|
||||
for i in 0..self.format.len() {
|
||||
if i > 0 {
|
||||
writer.write_all(&[self.separator])?;
|
||||
writer.write_all(self.separator.output_separator())?;
|
||||
}
|
||||
|
||||
let field = match f(&self.format[i]) {
|
||||
|
@ -188,13 +289,13 @@ impl<'a> Repr<'a> {
|
|||
}
|
||||
|
||||
/// Input processing parameters.
|
||||
struct Input {
|
||||
struct Input<Sep: Separator> {
|
||||
separator: Sep,
|
||||
ignore_case: bool,
|
||||
check_order: CheckOrder,
|
||||
}
|
||||
|
||||
impl Input {
|
||||
impl<Sep: Separator> Input<Sep> {
|
||||
fn new(separator: Sep, ignore_case: bool, check_order: CheckOrder) -> Self {
|
||||
Self {
|
||||
separator,
|
||||
|
@ -271,24 +372,8 @@ struct Line {
|
|||
}
|
||||
|
||||
impl Line {
|
||||
fn new(string: Vec<u8>, separator: Sep, len_guess: usize) -> Self {
|
||||
let mut field_ranges = Vec::with_capacity(len_guess);
|
||||
let mut last_end = 0;
|
||||
if separator == Sep::Whitespaces {
|
||||
// GNU join uses Bourne shell field splitters by default
|
||||
for i in memchr3_iter(b' ', b'\t', b'\n', &string) {
|
||||
if i > last_end {
|
||||
field_ranges.push((last_end, i));
|
||||
}
|
||||
last_end = i + 1;
|
||||
}
|
||||
} else if let Sep::Char(sep) = separator {
|
||||
for i in memchr_iter(sep, &string) {
|
||||
field_ranges.push((last_end, i));
|
||||
last_end = i + 1;
|
||||
}
|
||||
}
|
||||
field_ranges.push((last_end, string.len()));
|
||||
fn new<Sep: Separator>(string: Vec<u8>, separator: &Sep, len_guess: usize) -> Self {
|
||||
let field_ranges = separator.field_ranges(&string, len_guess);
|
||||
|
||||
Self {
|
||||
field_ranges,
|
||||
|
@ -351,7 +436,12 @@ impl<'a> State<'a> {
|
|||
}
|
||||
|
||||
/// Skip the current unpaired line.
|
||||
fn skip_line(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
|
||||
fn skip_line<Sep: Separator>(
|
||||
&mut self,
|
||||
writer: &mut impl Write,
|
||||
input: &Input<Sep>,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> UResult<()> {
|
||||
if self.print_unpaired {
|
||||
self.print_first_line(writer, repr)?;
|
||||
}
|
||||
|
@ -362,7 +452,7 @@ impl<'a> State<'a> {
|
|||
|
||||
/// Keep reading line sequence until the key does not change, return
|
||||
/// the first line whose key differs.
|
||||
fn extend(&mut self, input: &Input) -> UResult<Option<Line>> {
|
||||
fn extend<Sep: Separator>(&mut self, input: &Input<Sep>) -> UResult<Option<Line>> {
|
||||
while let Some(line) = self.next_line(input)? {
|
||||
let diff = input.compare(self.get_current_key(), line.get_field(self.key));
|
||||
|
||||
|
@ -377,11 +467,11 @@ impl<'a> State<'a> {
|
|||
}
|
||||
|
||||
/// Print lines in the buffers as headers.
|
||||
fn print_headers(
|
||||
fn print_headers<Sep: Separator>(
|
||||
&self,
|
||||
writer: &mut impl Write,
|
||||
other: &State,
|
||||
repr: &Repr,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if self.has_line() {
|
||||
if other.has_line() {
|
||||
|
@ -397,11 +487,11 @@ impl<'a> State<'a> {
|
|||
}
|
||||
|
||||
/// Combine two line sequences.
|
||||
fn combine(
|
||||
fn combine<Sep: Separator>(
|
||||
&self,
|
||||
writer: &mut impl Write,
|
||||
other: &State,
|
||||
repr: &Repr,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let key = self.get_current_key();
|
||||
|
||||
|
@ -444,13 +534,16 @@ impl<'a> State<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
fn reset_read_line(&mut self, input: &Input) -> Result<(), std::io::Error> {
|
||||
let line = self.read_line(input.separator)?;
|
||||
fn reset_read_line<Sep: Separator>(
|
||||
&mut self,
|
||||
input: &Input<Sep>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let line = self.read_line(&input.separator)?;
|
||||
self.reset(line);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reset_next_line(&mut self, input: &Input) -> Result<(), JoinError> {
|
||||
fn reset_next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<(), JoinError> {
|
||||
let line = self.next_line(input)?;
|
||||
self.reset(line);
|
||||
Ok(())
|
||||
|
@ -460,7 +553,7 @@ impl<'a> State<'a> {
|
|||
!self.seq.is_empty()
|
||||
}
|
||||
|
||||
fn initialize(&mut self, read_sep: Sep, autoformat: bool) -> usize {
|
||||
fn initialize<Sep: Separator>(&mut self, read_sep: &Sep, autoformat: bool) -> usize {
|
||||
if let Some(line) = crash_if_err!(1, self.read_line(read_sep)) {
|
||||
self.seq.push(line);
|
||||
|
||||
|
@ -471,7 +564,12 @@ impl<'a> State<'a> {
|
|||
0
|
||||
}
|
||||
|
||||
fn finalize(&mut self, writer: &mut impl Write, input: &Input, repr: &Repr) -> UResult<()> {
|
||||
fn finalize<Sep: Separator>(
|
||||
&mut self,
|
||||
writer: &mut impl Write,
|
||||
input: &Input<Sep>,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> UResult<()> {
|
||||
if self.has_line() {
|
||||
if self.print_unpaired {
|
||||
self.print_first_line(writer, repr)?;
|
||||
|
@ -491,7 +589,7 @@ impl<'a> State<'a> {
|
|||
}
|
||||
|
||||
/// Get the next line without the order check.
|
||||
fn read_line(&mut self, sep: Sep) -> Result<Option<Line>, std::io::Error> {
|
||||
fn read_line<Sep: Separator>(&mut self, sep: &Sep) -> Result<Option<Line>, std::io::Error> {
|
||||
match self.lines.next() {
|
||||
Some(value) => {
|
||||
self.line_num += 1;
|
||||
|
@ -506,8 +604,8 @@ impl<'a> State<'a> {
|
|||
}
|
||||
|
||||
/// Get the next line with the order check.
|
||||
fn next_line(&mut self, input: &Input) -> Result<Option<Line>, JoinError> {
|
||||
if let Some(line) = self.read_line(input.separator)? {
|
||||
fn next_line<Sep: Separator>(&mut self, input: &Input<Sep>) -> Result<Option<Line>, JoinError> {
|
||||
if let Some(line) = self.read_line(&input.separator)? {
|
||||
if input.check_order == CheckOrder::Disabled {
|
||||
return Ok(Some(line));
|
||||
}
|
||||
|
@ -543,11 +641,11 @@ impl<'a> State<'a> {
|
|||
self.seq[0].get_field(self.key)
|
||||
}
|
||||
|
||||
fn print_line(
|
||||
fn print_line<Sep: Separator>(
|
||||
&self,
|
||||
writer: &mut impl Write,
|
||||
line: &Line,
|
||||
repr: &Repr,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if repr.uses_format() {
|
||||
repr.print_format(writer, |spec| match *spec {
|
||||
|
@ -568,31 +666,53 @@ impl<'a> State<'a> {
|
|||
repr.print_line_ending(writer)
|
||||
}
|
||||
|
||||
fn print_first_line(&self, writer: &mut impl Write, repr: &Repr) -> Result<(), std::io::Error> {
|
||||
fn print_first_line<Sep: Separator>(
|
||||
&self,
|
||||
writer: &mut impl Write,
|
||||
repr: &Repr<'a, Sep>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
self.print_line(writer, &self.seq[0], repr)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_separator(value_os: &OsString) -> UResult<Sep> {
|
||||
fn parse_separator(value_os: &OsString) -> UResult<SepSetting> {
|
||||
// Five possible separator values:
|
||||
// No argument supplied, separate on whitespace; handled implicitly as the default elsewhere
|
||||
// An empty string arg, whole line separation
|
||||
// On unix-likes only, a single arbitrary byte
|
||||
// The two-character "\0" string, interpreted as a single 0 byte
|
||||
// A single scalar valid in the locale encoding (currently only UTF-8)
|
||||
|
||||
if value_os.is_empty() {
|
||||
return Ok(SepSetting::Line);
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
let value = value_os.as_bytes();
|
||||
#[cfg(not(unix))]
|
||||
let value = match value_os.to_str() {
|
||||
Some(value) => value.as_bytes(),
|
||||
None => {
|
||||
return Err(USimpleError::new(
|
||||
1,
|
||||
"unprintable field separators are only supported on unix-like platforms",
|
||||
));
|
||||
{
|
||||
let value = value_os.as_bytes();
|
||||
if value.len() == 1 {
|
||||
return Ok(SepSetting::Byte(value[0]));
|
||||
}
|
||||
}
|
||||
|
||||
let Some(value) = value_os.to_str() else {
|
||||
#[cfg(unix)]
|
||||
return Err(USimpleError::new(1, "non-UTF-8 multi-byte tab"));
|
||||
#[cfg(not(unix))]
|
||||
return Err(USimpleError::new(
|
||||
1,
|
||||
"unprintable field separators are only supported on unix-like platforms",
|
||||
));
|
||||
};
|
||||
match value.len() {
|
||||
0 => Ok(Sep::Line),
|
||||
1 => Ok(Sep::Char(value[0])),
|
||||
2 if value[0] == b'\\' && value[1] == b'0' => Ok(Sep::Char(0)),
|
||||
|
||||
let mut chars = value.chars();
|
||||
let c = chars.next().expect("valid string with at least one byte");
|
||||
match chars.next() {
|
||||
None => Ok(SepSetting::Char(value.into())),
|
||||
Some('0') if c == '\\' => Ok(SepSetting::Byte(0)),
|
||||
_ => Err(USimpleError::new(
|
||||
1,
|
||||
format!("multi-character tab {}", value_os.to_string_lossy()),
|
||||
format!("multi-character tab {}", value),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
@ -695,7 +815,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
|||
return Err(USimpleError::new(1, "both files cannot be standard input"));
|
||||
}
|
||||
|
||||
exec(file1, file2, settings)
|
||||
let sep = settings.separator.clone();
|
||||
match sep {
|
||||
SepSetting::Byte(byte) => exec(file1, file2, settings, OneByteSep { byte: [byte] }),
|
||||
SepSetting::Char(c) => exec(
|
||||
file1,
|
||||
file2,
|
||||
settings,
|
||||
MultiByteSep {
|
||||
finder: Finder::new(&c),
|
||||
},
|
||||
),
|
||||
SepSetting::Whitespaces => exec(file1, file2, settings, WhitespaceSep {}),
|
||||
SepSetting::Line => exec(file1, file2, settings, LineSep {}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uu_app() -> Command {
|
||||
|
@ -816,7 +949,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
|
|||
)
|
||||
}
|
||||
|
||||
fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
||||
fn exec<Sep: Separator>(file1: &str, file2: &str, settings: Settings, sep: Sep) -> UResult<()> {
|
||||
let stdin = stdin();
|
||||
|
||||
let mut state1 = State::new(
|
||||
|
@ -837,16 +970,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
|||
settings.print_unpaired2,
|
||||
)?;
|
||||
|
||||
let input = Input::new(
|
||||
settings.separator,
|
||||
settings.ignore_case,
|
||||
settings.check_order,
|
||||
);
|
||||
let input = Input::new(sep.clone(), settings.ignore_case, settings.check_order);
|
||||
|
||||
let format = if settings.autoformat {
|
||||
let mut format = vec![Spec::Key];
|
||||
let mut initialize = |state: &mut State| {
|
||||
let max_fields = state.initialize(settings.separator, settings.autoformat);
|
||||
let max_fields = state.initialize(&sep, settings.autoformat);
|
||||
for i in 0..max_fields {
|
||||
if i != state.key {
|
||||
format.push(Spec::Field(state.file_num, i));
|
||||
|
@ -857,20 +986,12 @@ fn exec(file1: &str, file2: &str, settings: Settings) -> UResult<()> {
|
|||
initialize(&mut state2);
|
||||
format
|
||||
} else {
|
||||
state1.initialize(settings.separator, settings.autoformat);
|
||||
state2.initialize(settings.separator, settings.autoformat);
|
||||
state1.initialize(&sep, settings.autoformat);
|
||||
state2.initialize(&sep, settings.autoformat);
|
||||
settings.format
|
||||
};
|
||||
|
||||
let repr = Repr::new(
|
||||
settings.line_ending,
|
||||
match settings.separator {
|
||||
Sep::Char(sep) => sep,
|
||||
_ => b' ',
|
||||
},
|
||||
&format,
|
||||
&settings.empty,
|
||||
);
|
||||
let repr = Repr::new(settings.line_ending, sep, format, &settings.empty);
|
||||
|
||||
let stdout = stdout();
|
||||
let mut writer = BufWriter::new(stdout.lock());
|
||||
|
|
|
@ -58,6 +58,25 @@ fn default_arguments() {
|
|||
.stdout_only_fixture("default.expected");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn only_whitespace_separators_merge() {
|
||||
new_ucmd!()
|
||||
.arg("contiguous_separators.txt")
|
||||
.arg("-")
|
||||
.pipe_in(" a ,c ")
|
||||
.succeeds()
|
||||
.stdout_only("a ,,,b ,c \n");
|
||||
|
||||
new_ucmd!()
|
||||
.arg("contiguous_separators.txt")
|
||||
.arg("-t")
|
||||
.arg(",")
|
||||
.arg("-")
|
||||
.pipe_in(" a ,c ")
|
||||
.succeeds()
|
||||
.stdout_only(" a ,,,b,c \n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn different_fields() {
|
||||
new_ucmd!()
|
||||
|
@ -208,9 +227,9 @@ fn tab_multi_character() {
|
|||
.arg("semicolon_fields_1.txt")
|
||||
.arg("semicolon_fields_2.txt")
|
||||
.arg("-t")
|
||||
.arg("э")
|
||||
.arg("ab")
|
||||
.fails()
|
||||
.stderr_is("join: multi-character tab э\n");
|
||||
.stderr_is("join: multi-character tab ab\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -437,14 +456,22 @@ fn non_unicode() {
|
|||
|
||||
#[cfg(unix)]
|
||||
{
|
||||
let invalid_utf8: u8 = 167;
|
||||
let non_utf8_byte: u8 = 167;
|
||||
new_ucmd!()
|
||||
.arg("-t")
|
||||
.arg(OsStr::from_bytes(&[invalid_utf8]))
|
||||
.arg(OsStr::from_bytes(&[non_utf8_byte]))
|
||||
.arg("non-unicode_1.bin")
|
||||
.arg("non-unicode_2.bin")
|
||||
.succeeds()
|
||||
.stdout_only_fixture("non-unicode_sep.expected");
|
||||
|
||||
new_ucmd!()
|
||||
.arg("-t")
|
||||
.arg(OsStr::from_bytes(&[non_utf8_byte, non_utf8_byte]))
|
||||
.arg("non-unicode_1.bin")
|
||||
.arg("non-unicode_2.bin")
|
||||
.fails()
|
||||
.stderr_is("join: non-UTF-8 multi-byte tab\n");
|
||||
}
|
||||
|
||||
#[cfg(windows)]
|
||||
|
@ -462,6 +489,16 @@ fn non_unicode() {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn multibyte_sep() {
|
||||
new_ucmd!()
|
||||
.arg("-t§")
|
||||
.arg("multibyte_sep_1.txt")
|
||||
.arg("multibyte_sep_2.txt")
|
||||
.succeeds()
|
||||
.stdout_only_fixture("multibyte_sep.expected");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn null_field_separators() {
|
||||
new_ucmd!()
|
||||
|
|
1
tests/fixtures/join/contiguous_separators.txt
vendored
Normal file
1
tests/fixtures/join/contiguous_separators.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
a ,,,b
|
1
tests/fixtures/join/multibyte_sep.expected
vendored
Normal file
1
tests/fixtures/join/multibyte_sep.expected
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
a§b§c
|
1
tests/fixtures/join/multibyte_sep_1.txt
vendored
Normal file
1
tests/fixtures/join/multibyte_sep_1.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
a§b
|
1
tests/fixtures/join/multibyte_sep_2.txt
vendored
Normal file
1
tests/fixtures/join/multibyte_sep_2.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
a§c
|
Loading…
Add table
Add a link
Reference in a new issue