Ensure LF line endings

2025-07-07 21:35:16 +00:00 · 2025-05-17 01:28:52 +02:00 · 2025-05-17 01:28:52 +02:00 · 4055262e47
commit 4055262e47
parent d17cf66c7c
3 changed files with 245 additions and 280 deletions
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -3,4 +3,5 @@ use_small_heuristics = "Max"
 group_imports = "StdExternalCrate"
 imports_granularity = "Module"
 format_code_in_doc_comments = true
+newline_style = "Unix"
 use_field_init_shorthand = true
--- a/src/fuzzy.rs
+++ b/src/fuzzy.rs
@@ -1,221 +1,221 @@
-// Copyright (c) Microsoft Corporation.
+// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
+// Licensed under the MIT License.
-
+
-//! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`).
+//! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`).
-//! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`,
+//! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`,
-//! but I figured that this one is what lots of people may be familiar with.
+//! but I figured that this one is what lots of people may be familiar with.
-
+
-use std::vec;
+use std::vec;
-
+
-use crate::arena::{Arena, scratch_arena};
+use crate::arena::{Arena, scratch_arena};
-use crate::icu;
+use crate::icu;
-
+
-const NO_MATCH: i32 = 0;
+const NO_MATCH: i32 = 0;
-
+
-pub fn score_fuzzy<'a>(
+pub fn score_fuzzy<'a>(
-    arena: &'a Arena,
+    arena: &'a Arena,
-    haystack: &str,
+    haystack: &str,
-    needle: &str,
+    needle: &str,
-    allow_non_contiguous_matches: bool,
+    allow_non_contiguous_matches: bool,
-) -> (i32, Vec<usize, &'a Arena>) {
+) -> (i32, Vec<usize, &'a Arena>) {
-    if haystack.is_empty() || needle.is_empty() {
+    if haystack.is_empty() || needle.is_empty() {
-        // return early if target or query are empty
+        // return early if target or query are empty
-        return (NO_MATCH, Vec::new_in(arena));
+        return (NO_MATCH, Vec::new_in(arena));
-    }
+    }
-
+
-    let scratch = scratch_arena(Some(arena));
+    let scratch = scratch_arena(Some(arena));
-    let target = map_chars(&scratch, haystack);
+    let target = map_chars(&scratch, haystack);
-    let query = map_chars(&scratch, needle);
+    let query = map_chars(&scratch, needle);
-
+
-    if target.len() < query.len() {
+    if target.len() < query.len() {
-        // impossible for query to be contained in target
+        // impossible for query to be contained in target
-        return (NO_MATCH, Vec::new_in(arena));
+        return (NO_MATCH, Vec::new_in(arena));
-    }
+    }
-
+
-    let target_lower = icu::fold_case(&scratch, haystack);
+    let target_lower = icu::fold_case(&scratch, haystack);
-    let query_lower = icu::fold_case(&scratch, needle);
+    let query_lower = icu::fold_case(&scratch, needle);
-    let target_lower = map_chars(&scratch, &target_lower);
+    let target_lower = map_chars(&scratch, &target_lower);
-    let query_lower = map_chars(&scratch, &query_lower);
+    let query_lower = map_chars(&scratch, &query_lower);
-
+
-    let area = query.len() * target.len();
+    let area = query.len() * target.len();
-    let mut scores = vec::from_elem_in(0, area, &*scratch);
+    let mut scores = vec::from_elem_in(0, area, &*scratch);
-    let mut matches = vec::from_elem_in(0, area, &*scratch);
+    let mut matches = vec::from_elem_in(0, area, &*scratch);
-
+
-    //
+    //
-    // Build Scorer Matrix:
+    // Build Scorer Matrix:
-    //
+    //
-    // The matrix is composed of query q and target t. For each index we score
+    // The matrix is composed of query q and target t. For each index we score
-    // q[i] with t[i] and compare that with the previous score. If the score is
+    // q[i] with t[i] and compare that with the previous score. If the score is
-    // equal or larger, we keep the match. In addition to the score, we also keep
+    // equal or larger, we keep the match. In addition to the score, we also keep
-    // the length of the consecutive matches to use as boost for the score.
+    // the length of the consecutive matches to use as boost for the score.
-    //
+    //
-    //      t   a   r   g   e   t
+    //      t   a   r   g   e   t
-    //  q
+    //  q
-    //  u
+    //  u
-    //  e
+    //  e
-    //  r
+    //  r
-    //  y
+    //  y
-    //
+    //
-    for query_index in 0..query.len() {
+    for query_index in 0..query.len() {
-        let query_index_offset = query_index * target.len();
+        let query_index_offset = query_index * target.len();
-        let query_index_previous_offset =
+        let query_index_previous_offset =
-            if query_index > 0 { (query_index - 1) * target.len() } else { 0 };
+            if query_index > 0 { (query_index - 1) * target.len() } else { 0 };
-
+
-        for target_index in 0..target.len() {
+        for target_index in 0..target.len() {
-            let current_index = query_index_offset + target_index;
+            let current_index = query_index_offset + target_index;
-            let diag_index = if query_index > 0 && target_index > 0 {
+            let diag_index = if query_index > 0 && target_index > 0 {
-                query_index_previous_offset + target_index - 1
+                query_index_previous_offset + target_index - 1
-            } else {
+            } else {
-                0
+                0
-            };
+            };
-            let left_score = if target_index > 0 { scores[current_index - 1] } else { 0 };
+            let left_score = if target_index > 0 { scores[current_index - 1] } else { 0 };
-            let diag_score =
+            let diag_score =
-                if query_index > 0 && target_index > 0 { scores[diag_index] } else { 0 };
+                if query_index > 0 && target_index > 0 { scores[diag_index] } else { 0 };
-            let matches_sequence_len =
+            let matches_sequence_len =
-                if query_index > 0 && target_index > 0 { matches[diag_index] } else { 0 };
+                if query_index > 0 && target_index > 0 { matches[diag_index] } else { 0 };
-
+
-            // If we are not matching on the first query character any more, we only produce a
+            // If we are not matching on the first query character any more, we only produce a
-            // score if we had a score previously for the last query index (by looking at the diagScore).
+            // score if we had a score previously for the last query index (by looking at the diagScore).
-            // This makes sure that the query always matches in sequence on the target. For example
+            // This makes sure that the query always matches in sequence on the target. For example
-            // given a target of "ede" and a query of "de", we would otherwise produce a wrong high score
+            // given a target of "ede" and a query of "de", we would otherwise produce a wrong high score
-            // for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost.
+            // for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost.
-            let score = if diag_score == 0 && query_index != 0 {
+            let score = if diag_score == 0 && query_index != 0 {
-                0
+                0
-            } else {
+            } else {
-                compute_char_score(
+                compute_char_score(
-                    query[query_index],
+                    query[query_index],
-                    query_lower[query_index],
+                    query_lower[query_index],
-                    if target_index != 0 { Some(target[target_index - 1]) } else { None },
+                    if target_index != 0 { Some(target[target_index - 1]) } else { None },
-                    target[target_index],
+                    target[target_index],
-                    target_lower[target_index],
+                    target_lower[target_index],
-                    matches_sequence_len,
+                    matches_sequence_len,
-                )
+                )
-            };
+            };
-
+
-            // We have a score and its equal or larger than the left score
+            // We have a score and its equal or larger than the left score
-            // Match: sequence continues growing from previous diag value
+            // Match: sequence continues growing from previous diag value
-            // Score: increases by diag score value
+            // Score: increases by diag score value
-            let is_valid_score = score != 0 && diag_score + score >= left_score;
+            let is_valid_score = score != 0 && diag_score + score >= left_score;
-            if is_valid_score
+            if is_valid_score
-                && (
+                && (
-                    // We don't need to check if it's contiguous if we allow non-contiguous matches
+                    // We don't need to check if it's contiguous if we allow non-contiguous matches
-                    allow_non_contiguous_matches ||
+                    allow_non_contiguous_matches ||
-                        // We must be looking for a contiguous match.
+                        // We must be looking for a contiguous match.
-                        // Looking at an index higher than 0 in the query means we must have already
+                        // Looking at an index higher than 0 in the query means we must have already
-                        // found out this is contiguous otherwise there wouldn't have been a score
+                        // found out this is contiguous otherwise there wouldn't have been a score
-                        query_index > 0 ||
+                        query_index > 0 ||
-                        // lastly check if the query is completely contiguous at this index in the target
+                        // lastly check if the query is completely contiguous at this index in the target
-                        target_lower[target_index..].starts_with(&query_lower)
+                        target_lower[target_index..].starts_with(&query_lower)
-                )
+                )
-            {
+            {
-                matches[current_index] = matches_sequence_len + 1;
+                matches[current_index] = matches_sequence_len + 1;
-                scores[current_index] = diag_score + score;
+                scores[current_index] = diag_score + score;
-            } else {
+            } else {
-                // We either have no score or the score is lower than the left score
+                // We either have no score or the score is lower than the left score
-                // Match: reset to 0
+                // Match: reset to 0
-                // Score: pick up from left hand side
+                // Score: pick up from left hand side
-                matches[current_index] = NO_MATCH;
+                matches[current_index] = NO_MATCH;
-                scores[current_index] = left_score;
+                scores[current_index] = left_score;
-            }
+            }
-        }
+        }
-    }
+    }
-
+
-    // Restore Positions (starting from bottom right of matrix)
+    // Restore Positions (starting from bottom right of matrix)
-    let mut positions = Vec::new_in(arena);
+    let mut positions = Vec::new_in(arena);
-
+
-    if !query.is_empty() && !target.is_empty() {
+    if !query.is_empty() && !target.is_empty() {
-        let mut query_index = query.len() - 1;
+        let mut query_index = query.len() - 1;
-        let mut target_index = target.len() - 1;
+        let mut target_index = target.len() - 1;
-
+
-        loop {
+        loop {
-            let current_index = query_index * target.len() + target_index;
+            let current_index = query_index * target.len() + target_index;
-            if matches[current_index] == NO_MATCH {
+            if matches[current_index] == NO_MATCH {
-                if target_index == 0 {
+                if target_index == 0 {
-                    break;
+                    break;
-                }
+                }
-                target_index -= 1; // go left
+                target_index -= 1; // go left
-            } else {
+            } else {
-                positions.push(target_index);
+                positions.push(target_index);
-
+
-                // go up and left
+                // go up and left
-                if query_index == 0 || target_index == 0 {
+                if query_index == 0 || target_index == 0 {
-                    break;
+                    break;
-                }
+                }
-                query_index -= 1;
+                query_index -= 1;
-                target_index -= 1;
+                target_index -= 1;
-            }
+            }
-        }
+        }
-
+
-        positions.reverse();
+        positions.reverse();
-    }
+    }
-
+
-    (scores[area - 1], positions)
+    (scores[area - 1], positions)
-}
+}
-
+
-fn compute_char_score(
+fn compute_char_score(
-    query: char,
+    query: char,
-    query_lower: char,
+    query_lower: char,
-    target_prev: Option<char>,
+    target_prev: Option<char>,
-    target_curr: char,
+    target_curr: char,
-    target_curr_lower: char,
+    target_curr_lower: char,
-    matches_sequence_len: i32,
+    matches_sequence_len: i32,
-) -> i32 {
+) -> i32 {
-    let mut score = 0;
+    let mut score = 0;
-
+
-    if !consider_as_equal(query_lower, target_curr_lower) {
+    if !consider_as_equal(query_lower, target_curr_lower) {
-        return score; // no match of characters
+        return score; // no match of characters
-    }
+    }
-
+
-    // Character match bonus
+    // Character match bonus
-    score += 1;
+    score += 1;
-
+
-    // Consecutive match bonus
+    // Consecutive match bonus
-    if matches_sequence_len > 0 {
+    if matches_sequence_len > 0 {
-        score += matches_sequence_len * 5;
+        score += matches_sequence_len * 5;
-    }
+    }
-
+
-    // Same case bonus
+    // Same case bonus
-    if query == target_curr {
+    if query == target_curr {
-        score += 1;
+        score += 1;
-    }
+    }
-
+
-    if let Some(target_prev) = target_prev {
+    if let Some(target_prev) = target_prev {
-        // After separator bonus
+        // After separator bonus
-        let separator_bonus = score_separator_at_pos(target_prev);
+        let separator_bonus = score_separator_at_pos(target_prev);
-        if separator_bonus > 0 {
+        if separator_bonus > 0 {
-            score += separator_bonus;
+            score += separator_bonus;
-        }
+        }
-        // Inside word upper case bonus (camel case). We only give this bonus if we're not in a contiguous sequence.
+        // Inside word upper case bonus (camel case). We only give this bonus if we're not in a contiguous sequence.
-        // For example:
+        // For example:
-        // NPE => NullPointerException = boost
+        // NPE => NullPointerException = boost
-        // HTTP => HTTP = not boost
+        // HTTP => HTTP = not boost
-        else if target_curr != target_curr_lower && matches_sequence_len == 0 {
+        else if target_curr != target_curr_lower && matches_sequence_len == 0 {
-            score += 2;
+            score += 2;
-        }
+        }
-    } else {
+    } else {
-        // Start of word bonus
+        // Start of word bonus
-        score += 8;
+        score += 8;
-    }
+    }
-
+
-    score
+    score
-}
+}
-
+
-fn consider_as_equal(a: char, b: char) -> bool {
+fn consider_as_equal(a: char, b: char) -> bool {
-    // Special case path separators: ignore platform differences
+    // Special case path separators: ignore platform differences
-    a == b || a == '/' || a == '\\' && b == '/' || b == '\\'
+    a == b || a == '/' || a == '\\' && b == '/' || b == '\\'
-}
+}
-
+
-fn score_separator_at_pos(ch: char) -> i32 {
+fn score_separator_at_pos(ch: char) -> i32 {
-    match ch {
+    match ch {
-        '/' | '\\' => 5,                               // prefer path separators...
+        '/' | '\\' => 5,                               // prefer path separators...
-        '_' | '-' | '.' | ' ' | '\'' | '"' | ':' => 4, // ...over other separators
+        '_' | '-' | '.' | ' ' | '\'' | '"' | ':' => 4, // ...over other separators
-        _ => 0,
+        _ => 0,
-    }
+    }
-}
+}
-
+
-fn map_chars<'a>(arena: &'a Arena, s: &str) -> Vec<char, &'a Arena> {
+fn map_chars<'a>(arena: &'a Arena, s: &str) -> Vec<char, &'a Arena> {
-    let mut chars = Vec::with_capacity_in(s.len(), arena);
+    let mut chars = Vec::with_capacity_in(s.len(), arena);
-    chars.extend(s.chars());
+    chars.extend(s.chars());
-    chars.shrink_to_fit();
+    chars.shrink_to_fit();
-    chars
+    chars
-}
+}
--- a/tools/grapheme-table-gen/src/main.rs
+++ b/tools/grapheme-table-gen/src/main.rs
@@ -3,16 +3,18 @@
 
 mod rules;
 
-use crate::rules::{JOIN_RULES_GRAPHEME_CLUSTER, JOIN_RULES_LINE_BREAK};
-use anyhow::{bail, Context};
-use indoc::writedoc;
-use rayon::prelude::*;
 use std::collections::HashMap;
 use std::fmt::Write as FmtWrite;
 use std::io::Write as IoWrite;
 use std::ops::RangeInclusive;
 use std::path::PathBuf;
 
+use anyhow::{Context, bail};
+use indoc::writedoc;
+use rayon::prelude::*;
+
+use crate::rules::{JOIN_RULES_GRAPHEME_CLUSTER, JOIN_RULES_LINE_BREAK};
+
 // `CharacterWidth` is 2 bits.
 #[derive(Clone, Copy, PartialEq, Eq)]
 enum CharacterWidth {
@@ -285,19 +287,12 @@ fn main() -> anyhow::Result<()> {
         .iter()
         .map(|t| {
             let rules_gc_len = if out.arg_extended { t.len() } else { 16 };
-            t[..rules_gc_len]
-                .iter()
-                .map(|row| prepare_rules_row(row, 2, 3))
-                .collect()
+            t[..rules_gc_len].iter().map(|row| prepare_rules_row(row, 2, 3)).collect()
         })
         .collect();
 
     // Same for line breaks, but in 2D.
-    let rules_lb_len = if out.arg_extended {
-        JOIN_RULES_LINE_BREAK.len()
-    } else {
-        24
-    };
+    let rules_lb_len = if out.arg_extended { JOIN_RULES_LINE_BREAK.len() } else { 24 };
     out.rules_lb = JOIN_RULES_LINE_BREAK[..rules_lb_len]
         .iter()
         .map(|row| prepare_rules_row(row, 1, 0))
@@ -315,12 +310,7 @@ fn main() -> anyhow::Result<()> {
         for s in &out.trie.stages {
             actual = s.values[actual as usize + ((cp >> s.shift) & s.mask)];
         }
-        assert_eq!(
-            expected.value(),
-            actual,
-            "trie sanity check failed for U+{:04X}",
-            cp
-        );
+        assert_eq!(expected.value(), actual, "trie sanity check failed for U+{:04X}", cp);
     }
     for (cp, &expected) in out.ucd.values[..0x80].iter().enumerate() {
         let last = out.trie.stages.last().unwrap();
@@ -363,11 +353,7 @@ fn generate_c(out: Output) -> String {
             width = stage.mask + 1;
         }
 
-        _ = write!(
-            buf,
-            "static const uint{}_t s_stage{}[] = {{",
-            stage.bits, stage.index
-        );
+        _ = write!(buf, "static const uint{}_t s_stage{}[] = {{", stage.bits, stage.index);
         for (j, &value) in stage.values.iter().enumerate() {
             if j % width == 0 {
                 buf.push_str("\n   ");
@@ -701,21 +687,14 @@ fn generate_rust(out: Output) -> String {
 
 fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::Result<Ucd> {
     let packing = BitPacking::new(out.arg_line_breaks, out.arg_extended);
-    let ambiguous_value = if out.arg_no_ambiguous {
-        CharacterWidth::Narrow
-    } else {
-        CharacterWidth::Ambiguous
-    };
+    let ambiguous_value =
+        if out.arg_no_ambiguous { CharacterWidth::Narrow } else { CharacterWidth::Ambiguous };
 
-    let mut values = vec![
-        TrieType::new(
-            &packing,
-            ClusterBreak::Other,
-            LineBreak::Other,
-            CharacterWidth::Narrow,
-        );
-        1114112
-    ];
+    let mut values =
+        vec![
+            TrieType::new(&packing, ClusterBreak::Other, LineBreak::Other, CharacterWidth::Narrow,);
+            1114112
+        ];
 
     let ns = "http://www.unicode.org/ns/2003/ucd/1.0";
     let root = doc.root_element();
@@ -904,11 +883,7 @@ fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::R
     // but for us that's equivalent to Other.
     values[0xFE0F].change_width(&packing, CharacterWidth::Wide);
 
-    Ok(Ucd {
-        description,
-        values,
-        packing,
-    })
+    Ok(Ucd { description, values, packing })
 }
 
 struct UcdAttributes<'a> {
@@ -927,15 +902,9 @@ fn extract_attributes<'a>(
     UcdAttributes {
         general_category: node.attribute("gc").unwrap_or(default.general_category),
         line_break: node.attribute("lb").unwrap_or(default.line_break),
-        grapheme_cluster_break: node
-            .attribute("GCB")
-            .unwrap_or(default.grapheme_cluster_break),
-        indic_conjunct_break: node
-            .attribute("InCB")
-            .unwrap_or(default.indic_conjunct_break),
-        extended_pictographic: node
-            .attribute("ExtPict")
-            .unwrap_or(default.extended_pictographic),
+        grapheme_cluster_break: node.attribute("GCB").unwrap_or(default.grapheme_cluster_break),
+        indic_conjunct_break: node.attribute("InCB").unwrap_or(default.indic_conjunct_break),
+        extended_pictographic: node.attribute("ExtPict").unwrap_or(default.extended_pictographic),
         east_asian: node.attribute("ea").unwrap_or(default.east_asian),
     }
 }
@@ -1050,18 +1019,13 @@ fn build_trie(uncompressed: Vec<TrieType>, shifts: &[usize]) -> Trie {
         };
     }
 
-    let total_size: usize = stages
-        .iter()
-        .map(|stage| (stage.bits / 8) * stage.values.len())
-        .sum();
+    let total_size: usize = stages.iter().map(|stage| (stage.bits / 8) * stage.values.len()).sum();
 
     Trie { stages, total_size }
 }
 
 fn find_existing(haystack: &[u32], needle: &[u32]) -> Option<usize> {
-    haystack
-        .windows(needle.len())
-        .position(|window| window == needle)
+    haystack.windows(needle.len()).position(|window| window == needle)
 }
 
 fn measure_overlap(prev: &[u32], next: &[u32]) -> usize {