i18n: small refactor, add decimal stuff

This commit is contained in:
Dorian Peron 2025-06-30 02:11:05 +02:00 committed by Dorian Péron
parent f5a862c55d
commit bb8744f115
7 changed files with 161 additions and 45 deletions

36
Cargo.lock generated
View file

@ -990,6 +990,17 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "fixed_decimal"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35943d22b2f19c0cb198ecf915910a8158e94541c89dcc63300d7799d46c2c5e"
dependencies = [
"displaydoc",
"smallvec",
"writeable",
]
[[package]]
name = "flate2"
version = "1.1.2"
@ -1290,6 +1301,29 @@ dependencies = [
"zerovec",
]
[[package]]
name = "icu_decimal"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fec61c43fdc4e368a9f450272833123a8ef0d7083a44597660ce94d791b8a2e2"
dependencies = [
"displaydoc",
"fixed_decimal",
"icu_decimal_data",
"icu_locale",
"icu_locale_core",
"icu_provider",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_decimal_data"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b70963bc35f9bdf1bc66a5c1f458f4991c1dc71760e00fa06016b2c76b2738d5"
[[package]]
name = "icu_locale"
version = "2.0.0"
@ -3946,7 +3980,9 @@ dependencies = [
"glob",
"hex",
"icu_collator",
"icu_decimal",
"icu_locale",
"icu_provider",
"itertools 0.14.0",
"libc",
"md-5",

View file

@ -314,7 +314,9 @@ glob = "0.3.1"
half = "2.4.1"
hostname = "0.4"
icu_collator = "2.0.0"
icu_decimal = "2.0.0"
icu_locale = "2.0.0"
icu_provider = "2.0.0"
indicatif = "0.18.0"
itertools = "0.14.0"
jiff = { version = "0.2.10", default-features = false, features = [

View file

@ -27,10 +27,6 @@ dns-lookup = { workspace = true, optional = true }
dunce = { version = "1.0.4", optional = true }
wild = "2.2.1"
glob = { workspace = true, optional = true }
icu_collator = { workspace = true, optional = true, features = [
"compiled_data",
] }
icu_locale = { workspace = true, optional = true, features = ["compiled_data"] }
itertools = { workspace = true, optional = true }
time = { workspace = true, optional = true, features = [
"formatting",
@ -59,6 +55,16 @@ bigdecimal = { workspace = true, optional = true }
num-traits = { workspace = true, optional = true }
selinux = { workspace = true, optional = true }
# icu stuff
icu_collator = { workspace = true, optional = true, features = [
"compiled_data",
] }
icu_decimal = { workspace = true, optional = true, features = [
"compiled_data",
] }
icu_locale = { workspace = true, optional = true, features = ["compiled_data"] }
icu_provider = { workspace = true, optional = true }
# Fluent dependencies
fluent = { workspace = true }
fluent-syntax = { workspace = true }
@ -108,7 +114,9 @@ format = [
"num-traits",
"quoting-style",
]
i18n = ["icu_collator", "icu_locale"]
i18n-all = ["i18n-decimal"]
i18n-common = ["icu_locale", "icu_provider"]
i18n-decimal = ["i18n-common", "icu_decimal", "icu_locale", "icu_provider"]
mode = ["libc"]
perms = ["entries", "libc", "walkdir"]
buf-copy = []
@ -116,7 +124,7 @@ parser = ["extendedbigdecimal", "glob", "num-traits"]
pipes = []
process = ["libc"]
proc-info = ["tty", "walkdir"]
quoting-style = ["i18n"]
quoting-style = ["i18n-common"]
ranges = []
ringbuffer = []
selinux = ["dep:selinux"]

View file

@ -26,7 +26,7 @@ pub mod format;
pub mod fs;
#[cfg(feature = "fsext")]
pub mod fsext;
#[cfg(feature = "i18n")]
#[cfg(feature = "i18n-common")]
pub mod i18n;
#[cfg(feature = "lines")]
pub mod lines;

View file

@ -0,0 +1,51 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use std::sync::OnceLock;
use icu_decimal::provider::DecimalSymbolsV1;
use icu_locale::Locale;
use icu_provider::prelude::*;
use crate::i18n::get_numeric_locale;
/// Return the decimal separator for the given locale.
///
/// The separator is read from the decimal symbols compiled into
/// `icu_decimal` (the `Baked` provider).
///
/// If the provider cannot supply data for `loc`, this falls back to `"."`,
/// the radix character of the POSIX/C locale, instead of panicking: the
/// locale ultimately comes from the user's environment, and a utility should
/// not abort because of it.
fn get_decimal_separator(loc: Locale) -> String {
    let data_locale = DataLocale::from(loc);

    let request = DataRequest {
        id: DataIdentifierBorrowed::for_locale(&data_locale),
        metadata: DataRequestMetadata::default(),
    };

    // The annotation selects which data marker `Baked` is asked to load.
    let response: Result<DataResponse<DecimalSymbolsV1>, _> =
        icu_decimal::provider::Baked.load(request);

    response.map_or_else(
        |_| String::from("."),
        |symbols| symbols.payload.get().decimal_separator().to_string(),
    )
}
/// Decimal separator of the numeric locale reported by
/// [`get_numeric_locale`].
///
/// The value is computed on the first call and cached for the rest of the
/// process lifetime.
///
/// For instance, when formatting 1000.5:
///
/// - en_US gives `1,000.5`, so the decimal separator is `.`
/// - fr_FR gives `1 000,5`, so the decimal separator is `,`
pub fn locale_decimal_separator() -> &'static str {
    static DECIMAL_SEP: OnceLock<String> = OnceLock::new();

    let separator = DECIMAL_SEP.get_or_init(|| {
        // Only the locale matters here; the encoding half is irrelevant.
        let (numeric_locale, _encoding) = get_numeric_locale();
        get_decimal_separator(numeric_locale.clone())
    });
    separator.as_str()
}
#[cfg(test)]
mod tests {
    use icu_locale::locale;

    use super::get_decimal_separator;

    /// English uses a dot and French a comma as decimal separator.
    #[test]
    fn test_simple_separator() {
        let cases = [(locale!("en"), "."), (locale!("fr"), ",")];

        for (loc, expected) in cases {
            assert_eq!(get_decimal_separator(loc), expected);
        }
    }
}

View file

@ -1,7 +1,15 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use std::sync::OnceLock;
use icu_locale::{Locale, locale};
#[cfg(feature = "i18n-decimal")]
pub mod decimal;
/// The encoding specified by the locale, if specified
/// Currently only supports ASCII and UTF-8 for the sake of simplicity.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@ -12,48 +20,59 @@ pub enum UEncoding {
/// Locale used whenever the environment provides no usable locale: either no
/// variable is readable or the locale name cannot be parsed.
const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
/// Deduce the locale and its encoding from the current environment.
///
/// Looks at 3 environment variables in the following order and uses the
/// first one holding a non-empty, valid Unicode value:
///
/// 1. LC_ALL
/// 2. `locale_name` (the category-specific variable, e.g. `LC_COLLATE`)
/// 3. LANG
///
/// A variable that is set but empty is treated as unset, as POSIX requires,
/// so that e.g. `LC_ALL=` does not hide a meaningful `LANG`.
///
/// Falls back on the POSIX locale, with ASCII encoding, when no variable
/// qualifies. A value whose locale name cannot be parsed also yields the
/// POSIX locale.
fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
    let locale_var = ["LC_ALL", locale_name, "LANG"]
        .iter()
        .find_map(|&key| std::env::var(key).ok().filter(|value| !value.is_empty()));

    let Some(locale_var_str) = locale_var else {
        // Default POSIX locale representing LC_ALL=C
        return (DEFAULT_LOCALE, UEncoding::Ascii);
    };

    // Expected shape: `language[_territory][.codeset][@modifier]`.
    let mut parts = locale_var_str.split(&['.', '@']);

    // `split` always yields at least one (possibly empty) item.
    let simple = parts.next().unwrap_or_default();

    // Naively convert the locale name to BCP47 tag format.
    //
    // See https://en.wikipedia.org/wiki/IETF_language_tag
    let bcp47 = simple.replace('_', "-");
    let locale = Locale::try_from_str(&bcp47).unwrap_or(DEFAULT_LOCALE);

    // Only honor the encoding part when the locale name was understood:
    // either it parsed to something other than the default locale, or it is
    // the special name "C", which maps to the default locale. Any other
    // value is treated as plain ASCII.
    let locale_understood = locale != DEFAULT_LOCALE || bcp47 == "C";
    let encoding = if locale_understood
        && parts
            .next()
            .is_some_and(|enc| enc.eq_ignore_ascii_case("utf-8"))
    {
        UEncoding::Utf8
    } else {
        UEncoding::Ascii
    };

    (locale, encoding)
}
/// Get the collating locale from the environment
fn get_collating_locale() -> &'static (Locale, UEncoding) {
static COLLATING_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
COLLATING_LOCALE.get_or_init(|| {
// Look at 3 environment variables in the following order
//
// 1. LC_ALL
// 2. LC_COLLATE
// 3. LANG
//
// Or fallback on Posix locale, with ASCII encoding.
COLLATING_LOCALE.get_or_init(|| {get_locale_from_env("LC_COLLATE")})
}
let locale_var = std::env::var("LC_ALL")
.or_else(|_| std::env::var("LC_COLLATE"))
.or_else(|_| std::env::var("LANG"));
/// Get the numeric locale from the environment
pub fn get_numeric_locale() -> &'static (Locale, UEncoding) {
static NUMERIC_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new();
if let Ok(locale_var_str) = locale_var {
let mut split = locale_var_str.split(&['.', '@']);
if let Some(simple) = split.next() {
let bcp47 = simple.replace("_", "-");
let locale = Locale::try_from_str(&bcp47).unwrap_or(DEFAULT_LOCALE);
// If locale parsing failed, parse the encoding part of the
// locale. Treat the special case of the given locale being "C"
// which becomes the default locale.
let encoding = if (locale != DEFAULT_LOCALE || bcp47 == "C")
&& split.next() == Some("UTF-8")
{
UEncoding::Utf8
} else {
UEncoding::Ascii
};
return (locale, encoding);
} else {
return (DEFAULT_LOCALE, UEncoding::Ascii);
};
}
// Default POSIX locale representing LC_ALL=C
(DEFAULT_LOCALE, UEncoding::Ascii)
})
NUMERIC_LOCALE.get_or_init(|| {get_locale_from_env("LC_NUMERIC")})
}
/// Return the encoding deduced from the locale environment variable.

View file

@ -51,7 +51,7 @@ pub use crate::features::fast_inc;
pub use crate::features::format;
#[cfg(feature = "fs")]
pub use crate::features::fs;
#[cfg(feature = "i18n")]
#[cfg(feature = "i18n-common")]
pub use crate::features::i18n;
#[cfg(feature = "lines")]
pub use crate::features::lines;