fmt: fix the last two gnu issues

2025-07-07 21:45:01 +00:00 · 2025-06-22 23:07:36 +02:00 · 2025-06-22 23:07:36 +02:00 · 2df7f01c15
commit 2df7f01c15
parent 56ce0e28ad
3 changed files with 95 additions and 10 deletions
--- a/src/uu/fmt/src/linebreak.rs
+++ b/src/uu/fmt/src/linebreak.rs
@@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
    let mut next_active_breaks = vec![];

    let stretch = args.opts.width - args.opts.goal;
-    let minlength = args.opts.goal.max(stretch + 1) - stretch;
+    let minlength = if args.opts.goal <= 10 {
+        1
+    } else {
+        args.opts.goal.max(stretch + 1) - stretch
+    };
    let mut new_linebreaks = vec![];
    let mut is_sentence_start = false;
    let mut least_demerits = 0;
@@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo
 const BAD_INFTY: i64 = 10_000_000;
 const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
 // badness = BAD_MULT * abs(r) ^ 3
-const BAD_MULT: f32 = 100.0;
+const BAD_MULT: f32 = 200.0;
 // DR_MULT is multiplier for delta-R between lines
 const DR_MULT: f32 = 600.0;
 // DL_MULT is penalty multiplier for short words at end of line
-const DL_MULT: f32 = 300.0;
+const DL_MULT: f32 = 10.0;

 fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
    // how much stretch are we using?
--- a/src/uu/fmt/src/parasplit.rs
+++ b/src/uu/fmt/src/parasplit.rs
@@ -26,6 +26,14 @@ fn char_width(c: char) -> usize {
    }
 }

+// Mirrors GNU fmt, which treats only the six ASCII characters accepted by C's
+// `isspace` in the "C" locale as whitespace. Unicode spaces such as U+00A0
+// (no-break space) are ordinary word characters here and never separate words.
+fn is_fmt_whitespace(c: char) -> bool {
+    // Not `char::is_ascii_whitespace`: that one omits '\x0B' (vertical tab).
+    matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
+}
+
 // lines with PSKIP, lacking PREFIX, or which are entirely blank are
 // NoFormatLines; otherwise, they are FormatLines
 #[derive(Debug)]
@@ -109,7 +117,7 @@ impl FileLines<'_> {
            for (i, char) in line.char_indices() {
                if line[i..].starts_with(pfx) {
                    return (true, i);
-                } else if !char.is_whitespace() {
+                } else if !is_fmt_whitespace(char) {
                    break;
                }
            }
@@ -128,7 +136,7 @@ impl FileLines<'_> {
                prefix_len = indent_len;
            }

-            if (os >= prefix_end) && !c.is_whitespace() {
+            if (os >= prefix_end) && !is_fmt_whitespace(c) {
                // found first non-whitespace after prefix, this is indent_end
                indent_end = os;
                break;
@@ -154,7 +162,7 @@ impl Iterator for FileLines<'_> {
        // emit a blank line
        // Err(true) indicates that this was a linebreak,
        // which is important to know when detecting mail headers
-        if n.chars().all(char::is_whitespace) {
+        if n.chars().all(is_fmt_whitespace) {
            return Some(Line::NoFormatLine(String::new(), true));
        }

@@ -174,7 +182,7 @@ impl Iterator for FileLines<'_> {
        if pmatch
            && n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
                .chars()
-                .all(char::is_whitespace)
+                .all(is_fmt_whitespace)
        {
            return Some(Line::NoFormatLine(n, false));
        }
@@ -498,7 +506,7 @@ impl WordSplit<'_> {
        let mut aftertab = 0;
        let mut word_start = None;
        for (os, c) in string.char_indices() {
-            if !c.is_whitespace() {
+            if !is_fmt_whitespace(c) {
                word_start = Some(os);
                break;
            } else if c == '\t' {
@@ -519,7 +527,7 @@ impl WordSplit<'_> {
 impl WordSplit<'_> {
    fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
        // wordsplits *must* start at a non-whitespace character
-        let trim_string = string.trim_start();
+        let trim_string = string.trim_start_matches(is_fmt_whitespace);
        WordSplit {
            opts,
            string: trim_string,
@@ -571,7 +579,7 @@ impl<'a> Iterator for WordSplit<'a> {
        // points to whitespace character OR end of string
        let mut word_nchars = 0;
        self.position = match self.string[word_start..].find(|x: char| {
-            if x.is_whitespace() {
+            if is_fmt_whitespace(x) {
                true
            } else {
                word_nchars += char_width(x);