Merge pull request #8179 from RenjiSann/locale-aware-quoting

i18n: Locale-aware quoting
This commit is contained in:
Sylvestre Ledru 2025-06-25 08:39:19 +02:00 committed by GitHub
commit 2b5dfe612c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1543 additions and 586 deletions

View file

@ -61,7 +61,9 @@ use uucore::libc::{S_IXGRP, S_IXOTH, S_IXUSR};
use uucore::libc::{dev_t, major, minor};
use uucore::line_ending::LineEnding;
use uucore::locale::{get_message, get_message_with_args};
use uucore::quoting_style::{self, QuotingStyle, escape_name};
use uucore::quoting_style::{
self, QuotingStyle, locale_aware_escape_dir_name, locale_aware_escape_name,
};
use uucore::{
display::Quotable,
error::{UError, UResult, set_exit_code},
@ -2008,7 +2010,7 @@ fn show_dir_name(
config: &Config,
) -> std::io::Result<()> {
let escaped_name =
quoting_style::escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style);
locale_aware_escape_dir_name(path_data.p_buf.as_os_str(), &config.quoting_style);
let name = if config.hyperlink && !config.dired {
create_hyperlink(&escaped_name, path_data)
@ -2509,7 +2511,7 @@ fn display_items(
// option, print the security context to the left of the size column.
let quoted = items.iter().any(|item| {
let name = escape_name(&item.display_name, &config.quoting_style);
let name = locale_aware_escape_name(&item.display_name, &config.quoting_style);
os_str_starts_with(&name, b"'")
});
@ -3152,7 +3154,7 @@ fn classify_file(path: &PathData, out: &mut BufWriter<Stdout>) -> Option<char> {
/// Takes a [`PathData`] struct and returns a cell with a name ready for displaying.
///
/// This function relies on the following parameters in the provided `&Config`:
/// * `config.quoting_style` to decide how we will escape `name` using [`escape_name`].
/// * `config.quoting_style` to decide how we will escape `name` using [`locale_aware_escape_name`].
/// * `config.inode` decides whether to display inode numbers beside names using [`get_inode`].
/// * `config.color` decides whether it's going to color `name` using [`color_name`].
/// * `config.indicator_style` to append specific characters to `name` using [`classify_file`].
@ -3173,7 +3175,7 @@ fn display_item_name(
current_column: LazyCell<usize, Box<dyn FnOnce() -> usize + '_>>,
) -> OsString {
// This is our return value. We start by `&path.display_name` and modify it along the way.
let mut name = escape_name(&path.display_name, &config.quoting_style);
let mut name = locale_aware_escape_name(&path.display_name, &config.quoting_style);
let is_wrap =
|namelen: usize| config.width != 0 && *current_column + namelen > config.width.into();
@ -3265,7 +3267,7 @@ fn display_item_name(
name.push(path.p_buf.read_link().unwrap());
} else {
name.push(color_name(
escape_name(target.as_os_str(), &config.quoting_style),
locale_aware_escape_name(target.as_os_str(), &config.quoting_style),
path,
style_manager,
&mut state.out,
@ -3276,7 +3278,10 @@ fn display_item_name(
} else {
// If no coloring is required, we just use target as is.
// Apply the right quoting
name.push(escape_name(target.as_os_str(), &config.quoting_style));
name.push(locale_aware_escape_name(
target.as_os_str(),
&config.quoting_style,
));
}
}
Err(err) => {

View file

@ -259,7 +259,9 @@ impl<'a> Input<'a> {
Self::Path(path) => {
let path = path.as_os_str();
if path.to_string_lossy().contains('\n') {
Some(Cow::Owned(quoting_style::escape_name(path, QS_ESCAPE)))
Some(Cow::Owned(quoting_style::locale_aware_escape_name(
path, QS_ESCAPE,
)))
} else {
Some(Cow::Borrowed(path))
}
@ -759,7 +761,7 @@ fn files0_iter_file<'a>(path: &Path) -> UResult<impl Iterator<Item = InputIterIt
"wc-error-cannot-open-for-reading",
HashMap::from([(
"path".to_string(),
quoting_style::escape_name(path.as_os_str(), QS_QUOTE_ESCAPE)
quoting_style::locale_aware_escape_name(path.as_os_str(), QS_QUOTE_ESCAPE)
.into_string()
.expect("All escaped names with the escaping option return valid strings."),
)]),
@ -812,7 +814,7 @@ fn files0_iter<'a>(
}
fn escape_name_wrapper(name: &OsStr) -> String {
quoting_style::escape_name(name, QS_ESCAPE)
quoting_style::locale_aware_escape_name(name, QS_ESCAPE)
.into_string()
.expect("All escaped names with the escaping option return valid strings.")
}

View file

@ -27,6 +27,10 @@ dns-lookup = { workspace = true, optional = true }
dunce = { version = "1.0.4", optional = true }
wild = "2.2.1"
glob = { workspace = true, optional = true }
icu_collator = { workspace = true, optional = true, features = [
"compiled_data",
] }
icu_locale = { workspace = true, optional = true, features = ["compiled_data"] }
itertools = { workspace = true, optional = true }
time = { workspace = true, optional = true, features = [
"formatting",
@ -106,6 +110,7 @@ format = [
"num-traits",
"quoting-style",
]
i18n = ["icu_collator", "icu_locale"]
mode = ["libc"]
perms = ["entries", "libc", "walkdir"]
buf-copy = []
@ -113,7 +118,7 @@ parser = ["extendedbigdecimal", "glob", "num-traits"]
pipes = []
process = ["libc"]
proc-info = ["tty", "walkdir"]
quoting-style = []
quoting-style = ["i18n"]
ranges = []
ringbuffer = []
selinux = ["dep:selinux"]

View file

@ -26,6 +26,8 @@ pub mod format;
pub mod fs;
#[cfg(feature = "fsext")]
pub mod fsext;
#[cfg(feature = "i18n")]
pub mod i18n;
#[cfg(feature = "lines")]
pub mod lines;
#[cfg(feature = "parser")]

View file

@ -8,7 +8,7 @@ use crate::format::spec::ArgumentLocation;
use crate::{
error::set_exit_code,
parser::num_parser::{ExtendedParser, ExtendedParserError},
quoting_style::{Quotes, QuotingStyle, escape_name},
quoting_style::{Quotes, QuotingStyle, locale_aware_escape_name},
show_error, show_warning,
};
use os_display::Quotable;
@ -153,7 +153,7 @@ fn extract_value<T: Default>(p: Result<T, ExtendedParserError<'_, T>>, input: &s
Ok(v) => v,
Err(e) => {
set_exit_code(1);
let input = escape_name(
let input = locale_aware_escape_name(
OsStr::new(input),
&QuotingStyle::C {
quotes: Quotes::None,

View file

@ -5,7 +5,7 @@
// spell-checker:ignore (vars) intmax ptrdiff padlen
use crate::quoting_style::{QuotingStyle, escape_name};
use crate::quoting_style::{QuotingStyle, locale_aware_escape_name};
use super::{
ExtendedBigDecimal, FormatChar, FormatError, OctalParsing,
@ -402,7 +402,7 @@ impl Spec {
writer.write_all(&parsed).map_err(FormatError::IoError)
}
Self::QuotedString { position } => {
let s = escape_name(
let s = locale_aware_escape_name(
args.next_string(position).as_ref(),
&QuotingStyle::Shell {
escape: true,

View file

@ -0,0 +1,62 @@
use std::sync::OnceLock;
use icu_locale::{Locale, locale};
/// The encoding specified by the locale, if specified
/// Currently only supports ASCII and UTF-8 for the sake of simplicity.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum UEncoding {
/// ASCII. Also the fallback whenever the environment does not select UTF-8.
Ascii,
/// UTF-8, selected through the encoding part of the locale variable.
Utf8,
}
/// Locale used when the environment names no locale, or one that cannot be
/// parsed (see `get_collating_locale`).
const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
/// Deduce the collating locale and its encoding from the current environment.
///
/// The following variables are consulted, in this order, and the first one
/// holding a non-empty value wins:
///
/// 1. `LC_ALL`
/// 2. `LC_COLLATE`
/// 3. `LANG`
///
/// A variable that is set to the empty string is treated as unset, as POSIX
/// requires. When no variable is usable, the default (POSIX) locale with
/// ASCII encoding is returned.
///
/// The result is computed on first use and cached for the rest of the process.
fn get_collating_locale() -> &'static (Locale, UEncoding) {
    static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();

    COLLATING_LOCALE.get_or_init(|| {
        let locale_var = ["LC_ALL", "LC_COLLATE", "LANG"]
            .iter()
            .filter_map(|key| std::env::var(key).ok())
            .find(|value| !value.is_empty());

        let Some(locale_var_str) = locale_var else {
            // Default POSIX locale representing LC_ALL=C
            return (DEFAULT_LOCALE, UEncoding::Ascii);
        };

        // A locale name looks like `language[_territory][.codeset][@modifier]`.
        let mut split = locale_var_str.split(&['.', '@']);
        // `split` always yields at least one (possibly empty) item.
        let simple = split.next().unwrap_or_default();
        let bcp47 = simple.replace('_', "-");
        let locale = Locale::try_from_str(&bcp47).unwrap_or(DEFAULT_LOCALE);

        // Only honor the encoding part when the locale itself was recognized.
        // The special name "C" is not a valid BCP-47 tag and therefore ends up
        // as the default locale, but `C.UTF-8` must still select UTF-8.
        let locale_is_known = locale != DEFAULT_LOCALE || bcp47 == "C";
        // Accept the common spellings of the codeset (`UTF-8`, `utf8`, ...).
        let is_utf8 = locale_is_known
            && split.next().is_some_and(|codeset| {
                codeset.eq_ignore_ascii_case("UTF-8") || codeset.eq_ignore_ascii_case("UTF8")
            });

        let encoding = if is_utf8 {
            UEncoding::Utf8
        } else {
            UEncoding::Ascii
        };
        (locale, encoding)
    })
}
/// Return the encoding half of the cached `(locale, encoding)` pair deduced
/// from the locale environment variables.
pub fn get_locale_encoding() -> UEncoding {
    let (_, encoding) = get_collating_locale();
    *encoding
}

View file

@ -0,0 +1,57 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use super::{EscapedChar, Quoter, Quotes};
/// Quoter producing C-style escaped output, accumulated in `buffer`.
pub(super) struct CQuoter {
/// The type of quotes to use.
quotes: Quotes,
/// Forwarded to [`EscapedChar::new_c`] for every pushed character.
dirname: bool,
/// Output bytes built so far, starting with the opening quote if there is one.
buffer: Vec<u8>,
}
impl CQuoter {
    /// Create a C-style quoter whose buffer already holds the opening quote.
    ///
    /// * `quotes` - kind of quote surrounding the output, if any.
    /// * `dirname` - stored and later handed to the character escaper.
    /// * `size_hint` - initial capacity of the output buffer.
    pub fn new(quotes: Quotes, dirname: bool, size_hint: usize) -> Self {
        let opening_quote = match quotes {
            Quotes::None => None,
            Quotes::Single => Some(b'\''),
            Quotes::Double => Some(b'"'),
        };

        let mut buffer: Vec<u8> = Vec::with_capacity(size_hint);
        // Extending with an `Option` pushes zero or one byte.
        buffer.extend(opening_quote);

        Self {
            quotes,
            dirname,
            buffer,
        }
    }
}
impl Quoter for CQuoter {
    /// Append the C-style escape of `input` to the buffer.
    fn push_char(&mut self, input: char) {
        let mut utf8 = [0u8; 4];
        for ch in EscapedChar::new_c(input, self.quotes, self.dirname).hide_control() {
            self.buffer
                .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
        }
    }

    /// Append every byte of `input` as a `\nnn` octal escape.
    fn push_invalid(&mut self, input: &[u8]) {
        let mut utf8 = [0u8; 4];
        for &byte in input {
            for ch in EscapedChar::new_octal(byte).hide_control() {
                self.buffer
                    .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
            }
        }
    }

    /// Close the quote opened by `new` (if any) and return the output.
    fn finalize(mut self: Box<Self>) -> Vec<u8> {
        let closing_quote = match self.quotes {
            Quotes::None => None,
            Quotes::Single => Some(b'\''),
            Quotes::Double => Some(b'"'),
        };
        self.buffer.extend(closing_quote);
        self.buffer
    }
}

View file

@ -0,0 +1,201 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use std::char::from_digit;
use super::Quotes;
// PR#6559 : Remove `]{}` from special shell chars.
/// Characters for which [`EscapedChar::new_shell`] yields
/// [`EscapeState::ForceQuote`], unless an earlier rule of that function
/// already matched the character.
const SPECIAL_SHELL_CHARS: &str = "`$&*()|[;\\'\"<>?! ";
// This implementation is heavily inspired by the std::char::EscapeDefault implementation
// in the Rust standard library. This custom implementation is needed because the
// characters \a, \b, \e, \f & \v are not recognized by Rust.
/// Iterator over the characters that represent one escaped input character.
pub struct EscapedChar {
/// What remains to be emitted; advanced by the `Iterator` implementation.
pub state: EscapeState,
}
/// The states an [`EscapedChar`] goes through while being iterated.
pub enum EscapeState {
/// Nothing left to emit.
Done,
/// Emit the character as is, then finish.
Char(char),
/// Emit a backslash, then the character.
Backslash(char),
/// Emit the character as is, like `Char`; the shell quoters additionally
/// take it as a signal that the whole name must be quoted.
ForceQuote(char),
/// Emit the bytes as `\nnn` octal escapes.
Octal(EscapeOctal),
}
/// Bytes we need to present as escaped octal, in the form of `\nnn` per byte.
/// Only supports characters up to 2 bytes long in UTF-8.
pub struct EscapeOctal {
/// The bytes to print. A single byte is stored in `c[1]`, leaving `c[0]` unused.
c: [u8; 2],
/// What the iterator emits next.
state: EscapeOctalState,
/// Index of the next octal digit of the current byte, counting down from
/// 2 (most significant) to 0.
idx: u8,
}
/// Position of an [`EscapeOctal`] iterator within its output.
enum EscapeOctalState {
/// Everything has been emitted.
Done,
/// The backslash introducing the first byte comes next.
FirstBackslash,
/// The octal digits of the first byte are being emitted.
FirstValue,
/// The backslash introducing the last byte comes next.
LastBackslash,
/// The octal digits of the last byte are being emitted.
LastValue,
}
/// Extract the `idx`-th octal digit of `byte`, digit 0 being the least
/// significant one.
fn byte_to_octal_digit(byte: u8, idx: u8) -> u8 {
    // Each octal digit covers three bits.
    let shift = idx * 3;
    (byte >> shift) & 0b111
}
impl Iterator for EscapeOctal {
    type Item = char;

    /// Emit the next character of the `\nnn[\nnn]` sequence.
    fn next(&mut self) -> Option<char> {
        // For the digit states, pick the byte being printed together with the
        // state and digit index to switch to once its last digit is out.
        let (byte, state_after, idx_after) = match self.state {
            EscapeOctalState::Done => return None,
            EscapeOctalState::FirstBackslash => {
                self.state = EscapeOctalState::FirstValue;
                return Some('\\');
            }
            EscapeOctalState::LastBackslash => {
                self.state = EscapeOctalState::LastValue;
                return Some('\\');
            }
            // After the first byte, restart at the most significant digit.
            EscapeOctalState::FirstValue => (self.c[0], EscapeOctalState::LastBackslash, 2),
            EscapeOctalState::LastValue => (self.c[1], EscapeOctalState::Done, 0),
        };

        let octal_digit = byte_to_octal_digit(byte, self.idx);
        if self.idx == 0 {
            self.state = state_after;
            self.idx = idx_after;
        } else {
            self.idx -= 1;
        }
        Some(from_digit(u32::from(octal_digit), 8).unwrap())
    }
}
impl EscapeOctal {
    /// Build the escape for `c`, one `\nnn` group per UTF-8 byte.
    ///
    /// `c` must encode to at most two UTF-8 bytes: the two-byte buffer handed
    /// to `encode_utf8` is too small for anything longer.
    fn from_char(c: char) -> Self {
        match c.len_utf8() {
            1 => Self::from_byte(c as u8),
            _ => {
                let mut utf8 = [0; 2];
                let _ = c.encode_utf8(&mut utf8);
                Self {
                    c: utf8,
                    idx: 2,
                    state: EscapeOctalState::FirstBackslash,
                }
            }
        }
    }

    /// Build the escape for a single byte, stored in the last slot so that
    /// iteration starts directly at the last backslash.
    fn from_byte(b: u8) -> Self {
        let bytes = [0, b];
        Self {
            c: bytes,
            idx: 2,
            state: EscapeOctalState::LastBackslash,
        }
    }
}
impl EscapedChar {
pub fn new_literal(c: char) -> Self {
Self {
state: EscapeState::Char(c),
}
}
pub fn new_octal(b: u8) -> Self {
Self {
state: EscapeState::Octal(EscapeOctal::from_byte(b)),
}
}
pub fn new_c(c: char, quotes: Quotes, dirname: bool) -> Self {
use EscapeState::*;
let init_state = match c {
'\x07' => Backslash('a'),
'\x08' => Backslash('b'),
'\t' => Backslash('t'),
'\n' => Backslash('n'),
'\x0B' => Backslash('v'),
'\x0C' => Backslash('f'),
'\r' => Backslash('r'),
'\\' => Backslash('\\'),
'\'' => match quotes {
Quotes::Single => Backslash('\''),
_ => Char('\''),
},
'"' => match quotes {
Quotes::Double => Backslash('"'),
_ => Char('"'),
},
' ' if !dirname => match quotes {
Quotes::None => Backslash(' '),
_ => Char(' '),
},
':' if dirname => Backslash(':'),
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ => Char(c),
};
Self { state: init_state }
}
pub fn new_shell(c: char, escape: bool, quotes: Quotes) -> Self {
use EscapeState::*;
let init_state = match c {
_ if !escape && c.is_control() => Char(c),
'\x07' => Backslash('a'),
'\x08' => Backslash('b'),
'\t' => Backslash('t'),
'\n' => Backslash('n'),
'\x0B' => Backslash('v'),
'\x0C' => Backslash('f'),
'\r' => Backslash('r'),
'\'' => match quotes {
Quotes::Single => Backslash('\''),
_ => Char('\''),
},
_ if c.is_control() => Octal(EscapeOctal::from_char(c)),
_ if SPECIAL_SHELL_CHARS.contains(c) => ForceQuote(c),
_ => Char(c),
};
Self { state: init_state }
}
pub fn hide_control(self) -> Self {
match self.state {
EscapeState::Char(c) if c.is_control() => Self {
state: EscapeState::Char('?'),
},
_ => self,
}
}
}
impl Iterator for EscapedChar {
    type Item = char;

    /// Emit the next character of the escaped representation.
    fn next(&mut self) -> Option<char> {
        let (emitted, next_state) = match self.state {
            // The octal escape keeps its own cursor; just forward to it.
            EscapeState::Octal(ref mut octal) => return octal.next(),
            EscapeState::Done => return None,
            // The backslash goes out first, the character on the next call.
            EscapeState::Backslash(c) => ('\\', EscapeState::Char(c)),
            EscapeState::Char(c) | EscapeState::ForceQuote(c) => (c, EscapeState::Done),
        };
        self.state = next_state;
        Some(emitted)
    }
}

View file

@ -0,0 +1,31 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use super::{EscapedChar, Quoter};
/// Quoter that emits characters through [`EscapedChar::new_literal`]; the
/// wrapped vector is the output buffer.
pub(super) struct LiteralQuoter(Vec<u8>);
impl LiteralQuoter {
    /// Create a quoter with an empty output buffer able to hold `size_hint`
    /// bytes without reallocating.
    pub fn new(size_hint: usize) -> Self {
        let buffer = Vec::with_capacity(size_hint);
        Self(buffer)
    }
}
impl Quoter for LiteralQuoter {
    /// Append `input` as is, except that a control character becomes `?`.
    fn push_char(&mut self, input: char) {
        let mut utf8 = [0u8; 4];
        for ch in EscapedChar::new_literal(input).hide_control() {
            self.0
                .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
        }
    }

    /// Append one `?` per invalid byte.
    fn push_invalid(&mut self, input: &[u8]) {
        let padded_len = self.0.len() + input.len();
        self.0.resize(padded_len, b'?');
    }

    /// Hand out the accumulated bytes.
    fn finalize(self: Box<Self>) -> Vec<u8> {
        let Self(buffer) = *self;
        buffer
    }
}

View file

@ -0,0 +1,241 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use super::{EscapeState, EscapedChar, Quoter, Quotes};
// These are characters with special meaning in the shell (e.g. bash), but
// only when they appear at the beginning of a name. A name starting with one
// of them gets quoted (see `finalize_shell_quoter`).
const SPECIAL_SHELL_CHARS_START: &[u8] = b"~#";
// Escaped and NonEscaped shell quoting strategies are very different.
// Therefore, we are using separate Quoter structures for each of them.
/// Shell quoter that does not produce escape sequences: control characters
/// and invalid bytes are either passed through or replaced by `?`, depending
/// on `show_control`.
pub(super) struct NonEscapedShellQuoter<'a> {
// INIT
/// Original name.
reference: &'a [u8],
/// The quotes to be used if necessary
quotes: Quotes,
/// Whether to show control and non-unicode characters, or replace them
/// with `?`.
show_control: bool,
// INTERNAL STATE
/// Whether the name should be quoted.
must_quote: bool,
/// Output built so far, without the surrounding quotes (those are added
/// when finalizing).
buffer: Vec<u8>,
}
impl<'a> NonEscapedShellQuoter<'a> {
    /// Create a quoter for the name `reference`.
    ///
    /// The quote kind and the initial "must quote" flag are derived from
    /// `reference`, `dirname` and `always_quote` by `initial_quoting`;
    /// `size_hint` is the initial capacity of the output buffer.
    pub fn new(
        reference: &'a [u8],
        show_control: bool,
        always_quote: bool,
        dirname: bool,
        size_hint: usize,
    ) -> Self {
        let buffer = Vec::with_capacity(size_hint);
        let (quotes, must_quote) = initial_quoting(reference, dirname, always_quote);

        Self {
            reference,
            quotes,
            show_control,
            must_quote,
            buffer,
        }
    }
}
impl<'a> Quoter for NonEscapedShellQuoter<'a> {
    /// Append `input`, recording whether it forces the name to be quoted.
    fn push_char(&mut self, input: char) {
        let mut escaped = EscapedChar::new_shell(input, false, self.quotes);
        if !self.show_control {
            escaped = escaped.hide_control();
        }

        let mut utf8 = [0u8; 4];
        match escaped.state {
            // Inside single quotes: close them, emit an escaped quote, reopen.
            EscapeState::Backslash('\'') => self.buffer.extend_from_slice(b"'\\''"),
            EscapeState::ForceQuote(c) => {
                self.must_quote = true;
                self.buffer
                    .extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            }
            _ => {
                for ch in escaped {
                    self.buffer
                        .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
                }
            }
        }
    }

    /// Append invalid bytes verbatim, or one `?` per byte when control
    /// characters are hidden.
    fn push_invalid(&mut self, input: &[u8]) {
        if self.show_control {
            self.buffer.extend_from_slice(input);
        } else {
            let padded_len = self.buffer.len() + input.len();
            self.buffer.resize(padded_len, b'?');
        }
    }

    /// Add the surrounding quotes if needed and return the output.
    fn finalize(self: Box<Self>) -> Vec<u8> {
        let Self {
            buffer,
            reference,
            must_quote,
            quotes,
            ..
        } = *self;
        finalize_shell_quoter(buffer, reference, must_quote, quotes)
    }
}
// We need to keep track of whether we are in a dollar expression
// because e.g. \b\n is escaped as $'\b\n' and not like $'\b'$'\n'
/// Shell quoter with escaping: characters that need an escape sequence are
/// emitted inside `$'...'` segments.
pub(super) struct EscapedShellQuoter<'a> {
// INIT
/// Original name.
reference: &'a [u8],
/// The quotes to be used if necessary
quotes: Quotes,
// INTERNAL STATE
/// Whether the name should be quoted.
must_quote: bool,
/// Whether we are currently in a dollar escaped environment.
in_dollar: bool,
/// Output built so far, without the surrounding quotes (those are added
/// when finalizing).
buffer: Vec<u8>,
}
impl<'a> EscapedShellQuoter<'a> {
    /// Create a quoter for the name `reference`.
    ///
    /// The quote kind and the initial "must quote" flag are derived from
    /// `reference`, `dirname` and `always_quote` by `initial_quoting`;
    /// `size_hint` is the initial capacity of the output buffer.
    pub fn new(reference: &'a [u8], always_quote: bool, dirname: bool, size_hint: usize) -> Self {
        let buffer = Vec::with_capacity(size_hint);
        let (quotes, must_quote) = initial_quoting(reference, dirname, always_quote);

        Self {
            reference,
            quotes,
            must_quote,
            in_dollar: false,
            buffer,
        }
    }

    /// Open a `$'...'` segment unless one is already open.
    fn enter_dollar(&mut self) {
        if self.in_dollar {
            return;
        }
        // Close the regular quotes, then open the dollar-quoted segment.
        self.buffer.extend_from_slice(b"'$'");
        self.in_dollar = true;
    }

    /// Close the current `$'...'` segment, if any, and reopen regular quotes.
    fn exit_dollar(&mut self) {
        if !self.in_dollar {
            return;
        }
        self.buffer.extend_from_slice(b"''");
        self.in_dollar = false;
    }
}
impl<'a> Quoter for EscapedShellQuoter<'a> {
    /// Append `input`, opening or closing a `$'...'` segment as required.
    fn push_char(&mut self, input: char) {
        let mut utf8 = [0u8; 4];
        let escaped = EscapedChar::new_shell(input, true, self.quotes);

        match escaped.state {
            EscapeState::Char(c) => {
                self.exit_dollar();
                self.buffer
                    .extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            }
            EscapeState::ForceQuote(c) => {
                self.exit_dollar();
                self.must_quote = true;
                self.buffer
                    .extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            }
            // Single quotes are not put in dollar expressions, but are escaped
            // if the string also contains double quotes. In that case, they
            // must be handled separately.
            EscapeState::Backslash('\'') => {
                self.must_quote = true;
                self.in_dollar = false;
                self.buffer.extend_from_slice(b"'\\''");
            }
            // Anything else is an escape sequence and lives in a dollar segment.
            _ => {
                self.enter_dollar();
                self.must_quote = true;
                for ch in escaped {
                    self.buffer
                        .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
                }
            }
        }
    }

    /// Append invalid bytes as octal escapes inside a dollar segment.
    fn push_invalid(&mut self, input: &[u8]) {
        // Early return on empty inputs.
        if input.is_empty() {
            return;
        }

        self.enter_dollar();
        self.must_quote = true;

        let mut utf8 = [0u8; 4];
        for &byte in input {
            for ch in EscapedChar::new_octal(byte) {
                self.buffer
                    .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
            }
        }
    }

    /// Add the surrounding quotes if needed and return the output.
    fn finalize(self: Box<Self>) -> Vec<u8> {
        let Self {
            buffer,
            reference,
            must_quote,
            quotes,
            ..
        } = *self;
        finalize_shell_quoter(buffer, reference, must_quote, quotes)
    }
}
/// Deduce the initial quoting status of a name before any of its characters
/// is processed.
///
/// Returns the kind of quotes to use together with whether quoting is already
/// known to be required:
/// * a byte from the shell-escaped set is present: single quotes, required;
/// * otherwise a single quote is present: double quotes, required;
/// * otherwise single quotes, required only if `always_quote` is set or the
///   name is empty.
fn initial_quoting(input: &[u8], dirname: bool, always_quote: bool) -> (Quotes, bool) {
    let escaped_set = shell_escaped_char_set(dirname);

    if input.iter().any(|byte| escaped_set.contains(byte)) {
        return (Quotes::Single, true);
    }
    if input.contains(&b'\'') {
        return (Quotes::Double, true);
    }
    (Quotes::Single, always_quote || input.is_empty())
}
/// Check whether the first byte of `bytes` is one of the bytes in `pattern`.
/// An empty `bytes` never matches.
fn bytes_start_with(bytes: &[u8], pattern: &[u8]) -> bool {
    bytes
        .first()
        .is_some_and(|first| pattern.contains(first))
}
/// Return the set of bytes whose presence implies quoting of the word in
/// shell-quoting mode.
fn shell_escaped_char_set(is_dirname: bool) -> &'static [u8] {
    const ESCAPED_CHARS: &[u8] = b":\"`$\\^\n\t\r=";

    // The leading ':' only induces quoting in the context of ls displaying a
    // directory name before listing its content (e.g. with the recursive
    // flag -R); everywhere else it is skipped.
    if is_dirname {
        ESCAPED_CHARS
    } else {
        &ESCAPED_CHARS[1..]
    }
}
/// Surround `buffer` with the chosen quotes when the name requires quoting.
///
/// Quoting is required when `must_quote` is set, or when `reference` (the
/// original name) starts with one of the bytes of `SPECIAL_SHELL_CHARS_START`.
/// With `Quotes::None` the buffer is always returned untouched.
fn finalize_shell_quoter(
    buffer: Vec<u8>,
    reference: &[u8],
    must_quote: bool,
    quotes: Quotes,
) -> Vec<u8> {
    let needs_quotes = must_quote || bytes_start_with(reference, SPECIAL_SHELL_CHARS_START);
    if !needs_quotes {
        return buffer;
    }

    let quote = match quotes {
        Quotes::None => return buffer,
        Quotes::Single => b'\'',
        Quotes::Double => b'"',
    };

    let mut quoted = Vec::with_capacity(buffer.len() + 2);
    quoted.push(quote);
    quoted.extend(buffer);
    quoted.push(quote);
    quoted
}

View file

@ -51,6 +51,8 @@ pub use crate::features::fast_inc;
pub use crate::features::format;
#[cfg(feature = "fs")]
pub use crate::features::fs;
#[cfg(feature = "i18n")]
pub use crate::features::i18n;
#[cfg(feature = "lines")]
pub use crate::features::lines;
#[cfg(feature = "parser")]