mirror of
https://github.com/uutils/coreutils.git
synced 2025-07-07 21:45:01 +00:00
fmt: fix the last two gnu issues
This commit is contained in:
parent
56ce0e28ad
commit
2df7f01c15
3 changed files with 95 additions and 10 deletions
|
@ -236,7 +236,11 @@ fn find_kp_breakpoints<'a, T: Iterator<Item = &'a WordInfo<'a>>>(
|
||||||
let mut next_active_breaks = vec![];
|
let mut next_active_breaks = vec![];
|
||||||
|
|
||||||
let stretch = args.opts.width - args.opts.goal;
|
let stretch = args.opts.width - args.opts.goal;
|
||||||
let minlength = args.opts.goal.max(stretch + 1) - stretch;
|
let minlength = if args.opts.goal <= 10 {
|
||||||
|
1
|
||||||
|
} else {
|
||||||
|
args.opts.goal.max(stretch + 1) - stretch
|
||||||
|
};
|
||||||
let mut new_linebreaks = vec![];
|
let mut new_linebreaks = vec![];
|
||||||
let mut is_sentence_start = false;
|
let mut is_sentence_start = false;
|
||||||
let mut least_demerits = 0;
|
let mut least_demerits = 0;
|
||||||
|
@ -384,11 +388,11 @@ fn build_best_path<'a>(paths: &[LineBreak<'a>], active: &[usize]) -> Vec<(&'a Wo
|
||||||
const BAD_INFTY: i64 = 10_000_000;
|
const BAD_INFTY: i64 = 10_000_000;
|
||||||
const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
|
const BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
|
||||||
// badness = BAD_MULT * abs(r) ^ 3
|
// badness = BAD_MULT * abs(r) ^ 3
|
||||||
const BAD_MULT: f32 = 100.0;
|
const BAD_MULT: f32 = 200.0;
|
||||||
// DR_MULT is multiplier for delta-R between lines
|
// DR_MULT is multiplier for delta-R between lines
|
||||||
const DR_MULT: f32 = 600.0;
|
const DR_MULT: f32 = 600.0;
|
||||||
// DL_MULT is penalty multiplier for short words at end of line
|
// DL_MULT is penalty multiplier for short words at end of line
|
||||||
const DL_MULT: f32 = 300.0;
|
const DL_MULT: f32 = 10.0;
|
||||||
|
|
||||||
fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
|
fn compute_demerits(delta_len: isize, stretch: usize, wlen: usize, prev_rat: f32) -> (i64, f32) {
|
||||||
// how much stretch are we using?
|
// how much stretch are we using?
|
||||||
|
|
|
@ -26,6 +26,14 @@ fn char_width(c: char) -> usize {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GNU fmt has a more restrictive definition of whitespace than Unicode.
|
||||||
|
// It only considers ASCII whitespace characters (space, tab, newline, etc.)
|
||||||
|
// and excludes many Unicode whitespace characters like non-breaking spaces.
|
||||||
|
fn is_fmt_whitespace(c: char) -> bool {
|
||||||
|
// Only ASCII whitespace characters are considered whitespace in GNU fmt
|
||||||
|
matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
|
||||||
|
}
|
||||||
|
|
||||||
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
|
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
|
||||||
// NoFormatLines; otherwise, they are FormatLines
|
// NoFormatLines; otherwise, they are FormatLines
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
@ -109,7 +117,7 @@ impl FileLines<'_> {
|
||||||
for (i, char) in line.char_indices() {
|
for (i, char) in line.char_indices() {
|
||||||
if line[i..].starts_with(pfx) {
|
if line[i..].starts_with(pfx) {
|
||||||
return (true, i);
|
return (true, i);
|
||||||
} else if !char.is_whitespace() {
|
} else if !is_fmt_whitespace(char) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -128,7 +136,7 @@ impl FileLines<'_> {
|
||||||
prefix_len = indent_len;
|
prefix_len = indent_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (os >= prefix_end) && !c.is_whitespace() {
|
if (os >= prefix_end) && !is_fmt_whitespace(c) {
|
||||||
// found first non-whitespace after prefix, this is indent_end
|
// found first non-whitespace after prefix, this is indent_end
|
||||||
indent_end = os;
|
indent_end = os;
|
||||||
break;
|
break;
|
||||||
|
@ -154,7 +162,7 @@ impl Iterator for FileLines<'_> {
|
||||||
// emit a blank line
|
// emit a blank line
|
||||||
// Err(true) indicates that this was a linebreak,
|
// Err(true) indicates that this was a linebreak,
|
||||||
// which is important to know when detecting mail headers
|
// which is important to know when detecting mail headers
|
||||||
if n.chars().all(char::is_whitespace) {
|
if n.chars().all(is_fmt_whitespace) {
|
||||||
return Some(Line::NoFormatLine(String::new(), true));
|
return Some(Line::NoFormatLine(String::new(), true));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -174,7 +182,7 @@ impl Iterator for FileLines<'_> {
|
||||||
if pmatch
|
if pmatch
|
||||||
&& n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
|
&& n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
|
||||||
.chars()
|
.chars()
|
||||||
.all(char::is_whitespace)
|
.all(is_fmt_whitespace)
|
||||||
{
|
{
|
||||||
return Some(Line::NoFormatLine(n, false));
|
return Some(Line::NoFormatLine(n, false));
|
||||||
}
|
}
|
||||||
|
@ -498,7 +506,7 @@ impl WordSplit<'_> {
|
||||||
let mut aftertab = 0;
|
let mut aftertab = 0;
|
||||||
let mut word_start = None;
|
let mut word_start = None;
|
||||||
for (os, c) in string.char_indices() {
|
for (os, c) in string.char_indices() {
|
||||||
if !c.is_whitespace() {
|
if !is_fmt_whitespace(c) {
|
||||||
word_start = Some(os);
|
word_start = Some(os);
|
||||||
break;
|
break;
|
||||||
} else if c == '\t' {
|
} else if c == '\t' {
|
||||||
|
@ -519,7 +527,7 @@ impl WordSplit<'_> {
|
||||||
impl WordSplit<'_> {
|
impl WordSplit<'_> {
|
||||||
fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
|
fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> {
|
||||||
// wordsplits *must* start at a non-whitespace character
|
// wordsplits *must* start at a non-whitespace character
|
||||||
let trim_string = string.trim_start();
|
let trim_string = string.trim_start_matches(is_fmt_whitespace);
|
||||||
WordSplit {
|
WordSplit {
|
||||||
opts,
|
opts,
|
||||||
string: trim_string,
|
string: trim_string,
|
||||||
|
@ -571,7 +579,7 @@ impl<'a> Iterator for WordSplit<'a> {
|
||||||
// points to whitespace character OR end of string
|
// points to whitespace character OR end of string
|
||||||
let mut word_nchars = 0;
|
let mut word_nchars = 0;
|
||||||
self.position = match self.string[word_start..].find(|x: char| {
|
self.position = match self.string[word_start..].find(|x: char| {
|
||||||
if x.is_whitespace() {
|
if is_fmt_whitespace(x) {
|
||||||
true
|
true
|
||||||
} else {
|
} else {
|
||||||
word_nchars += char_width(x);
|
word_nchars += char_width(x);
|
||||||
|
|
|
@ -2,6 +2,9 @@
|
||||||
//
|
//
|
||||||
// For the full copyright and license information, please view the LICENSE
|
// For the full copyright and license information, please view the LICENSE
|
||||||
// file that was distributed with this source code.
|
// file that was distributed with this source code.
|
||||||
|
|
||||||
|
// spell-checker:ignore plass samp
|
||||||
|
|
||||||
use uutests::new_ucmd;
|
use uutests::new_ucmd;
|
||||||
use uutests::util::TestScenario;
|
use uutests::util::TestScenario;
|
||||||
use uutests::util_name;
|
use uutests::util_name;
|
||||||
|
@ -303,3 +306,73 @@ fn prefix_equal_skip_prefix_equal_two() {
|
||||||
.stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt");
|
.stdout_is_fixture("prefixed-one-word-per-line_p=_P=2.txt");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fmt_unicode_whitespace_handling() {
|
||||||
|
// Character classification fix: Test that Unicode whitespace characters like non-breaking space
|
||||||
|
// are NOT treated as whitespace by fmt, maintaining GNU fmt compatibility.
|
||||||
|
// GNU fmt only recognizes ASCII whitespace (space, tab, newline, etc.) and excludes
|
||||||
|
// Unicode whitespace characters to ensure consistent formatting behavior.
|
||||||
|
// This prevents regression of the character classification fix
|
||||||
|
let non_breaking_space = "\u{00A0}"; // U+00A0 NO-BREAK SPACE
|
||||||
|
let figure_space = "\u{2007}"; // U+2007 FIGURE SPACE
|
||||||
|
let narrow_no_break_space = "\u{202F}"; // U+202F NARROW NO-BREAK SPACE
|
||||||
|
|
||||||
|
// When fmt splits on width=1, these characters should NOT cause line breaks
|
||||||
|
// because they should not be considered whitespace
|
||||||
|
for (name, char) in [
|
||||||
|
("non-breaking space", non_breaking_space),
|
||||||
|
("figure space", figure_space),
|
||||||
|
("narrow no-break space", narrow_no_break_space),
|
||||||
|
] {
|
||||||
|
let input = format!("={char}=");
|
||||||
|
let result = new_ucmd!()
|
||||||
|
.args(&["-s", "-w1"])
|
||||||
|
.pipe_in(input.as_bytes())
|
||||||
|
.succeeds();
|
||||||
|
|
||||||
|
// Should be 1 line since the Unicode char is not treated as whitespace
|
||||||
|
assert_eq!(
|
||||||
|
result.stdout_str().lines().count(),
|
||||||
|
1,
|
||||||
|
"Failed for {name}: Unicode character should not be treated as whitespace"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_fmt_knuth_plass_line_breaking() {
|
||||||
|
// Line breaking algorithm improvements: Test the enhanced Knuth-Plass optimal line breaking
|
||||||
|
// algorithm that better handles sentence boundaries, word positioning constraints,
|
||||||
|
// and produces more natural line breaks for complex text formatting.
|
||||||
|
// This prevents regression of the line breaking algorithm improvements
|
||||||
|
let input = "@command{fmt} prefers breaking lines at the end of a sentence, and tries to\n\
|
||||||
|
avoid line breaks after the first word of a sentence or before the last word\n\
|
||||||
|
of a sentence. A @dfn{sentence break} is defined as either the end of a\n\
|
||||||
|
paragraph or a word ending in any of @samp{.?!}, followed by two spaces or end\n\
|
||||||
|
of line, ignoring any intervening parentheses or quotes. Like @TeX{},\n\
|
||||||
|
@command{fmt} reads entire ''paragraphs'' before choosing line breaks; the\n\
|
||||||
|
algorithm is a variant of that given by\n\
|
||||||
|
Donald E. Knuth and Michael F. Plass\n\
|
||||||
|
in ''Breaking Paragraphs Into Lines'',\n\
|
||||||
|
@cite{Software---Practice & Experience}\n\
|
||||||
|
@b{11}, 11 (November 1981), 1119--1184.";
|
||||||
|
|
||||||
|
let expected = "@command{fmt} prefers breaking lines at the end of a sentence,\n\
|
||||||
|
and tries to avoid line breaks after the first word of a sentence\n\
|
||||||
|
or before the last word of a sentence. A @dfn{sentence break}\n\
|
||||||
|
is defined as either the end of a paragraph or a word ending\n\
|
||||||
|
in any of @samp{.?!}, followed by two spaces or end of line,\n\
|
||||||
|
ignoring any intervening parentheses or quotes. Like @TeX{},\n\
|
||||||
|
@command{fmt} reads entire ''paragraphs'' before choosing line\n\
|
||||||
|
breaks; the algorithm is a variant of that given by Donald\n\
|
||||||
|
E. Knuth and Michael F. Plass in ''Breaking Paragraphs Into\n\
|
||||||
|
Lines'', @cite{Software---Practice & Experience} @b{11}, 11\n\
|
||||||
|
(November 1981), 1119--1184.\n";
|
||||||
|
|
||||||
|
new_ucmd!()
|
||||||
|
.args(&["-g", "60", "-w", "72"])
|
||||||
|
.pipe_in(input)
|
||||||
|
.succeeds()
|
||||||
|
.stdout_is(expected);
|
||||||
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue