Ensure LF line endings

This commit is contained in:
Leonard Hecker 2025-05-17 01:28:52 +02:00
parent d17cf66c7c
commit 4055262e47
3 changed files with 245 additions and 280 deletions

View file

@ -3,4 +3,5 @@ use_small_heuristics = "Max"
group_imports = "StdExternalCrate" group_imports = "StdExternalCrate"
imports_granularity = "Module" imports_granularity = "Module"
format_code_in_doc_comments = true format_code_in_doc_comments = true
newline_style = "Unix"
use_field_init_shorthand = true use_field_init_shorthand = true

View file

@ -1,221 +1,221 @@
// Copyright (c) Microsoft Corporation. // Copyright (c) Microsoft Corporation.
// Licensed under the MIT License. // Licensed under the MIT License.
//! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`). //! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`).
//! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`, //! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`,
//! but I figured that this one is what lots of people may be familiar with. //! but I figured that this one is what lots of people may be familiar with.
use std::vec; use std::vec;
use crate::arena::{Arena, scratch_arena}; use crate::arena::{Arena, scratch_arena};
use crate::icu; use crate::icu;
/// Sentinel shared by the scorer: returned as the score when nothing matches, and
/// stored in the `matches` matrix for cells that do not continue a match run.
const NO_MATCH: i32 = 0; const NO_MATCH: i32 = 0;
pub fn score_fuzzy<'a>( pub fn score_fuzzy<'a>(
arena: &'a Arena, arena: &'a Arena,
haystack: &str, haystack: &str,
needle: &str, needle: &str,
allow_non_contiguous_matches: bool, allow_non_contiguous_matches: bool,
) -> (i32, Vec<usize, &'a Arena>) { ) -> (i32, Vec<usize, &'a Arena>) {
if haystack.is_empty() || needle.is_empty() { if haystack.is_empty() || needle.is_empty() {
// return early if target or query are empty // return early if target or query are empty
return (NO_MATCH, Vec::new_in(arena)); return (NO_MATCH, Vec::new_in(arena));
} }
let scratch = scratch_arena(Some(arena)); let scratch = scratch_arena(Some(arena));
let target = map_chars(&scratch, haystack); let target = map_chars(&scratch, haystack);
let query = map_chars(&scratch, needle); let query = map_chars(&scratch, needle);
if target.len() < query.len() { if target.len() < query.len() {
// impossible for query to be contained in target // impossible for query to be contained in target
return (NO_MATCH, Vec::new_in(arena)); return (NO_MATCH, Vec::new_in(arena));
} }
let target_lower = icu::fold_case(&scratch, haystack); let target_lower = icu::fold_case(&scratch, haystack);
let query_lower = icu::fold_case(&scratch, needle); let query_lower = icu::fold_case(&scratch, needle);
let target_lower = map_chars(&scratch, &target_lower); let target_lower = map_chars(&scratch, &target_lower);
let query_lower = map_chars(&scratch, &query_lower); let query_lower = map_chars(&scratch, &query_lower);
let area = query.len() * target.len(); let area = query.len() * target.len();
let mut scores = vec::from_elem_in(0, area, &*scratch); let mut scores = vec::from_elem_in(0, area, &*scratch);
let mut matches = vec::from_elem_in(0, area, &*scratch); let mut matches = vec::from_elem_in(0, area, &*scratch);
// //
// Build Scorer Matrix: // Build Scorer Matrix:
// //
// The matrix is composed of query q and target t. For each index we score // The matrix is composed of query q and target t. For each index we score
// q[i] with t[i] and compare that with the previous score. If the score is // q[i] with t[i] and compare that with the previous score. If the score is
// equal or larger, we keep the match. In addition to the score, we also keep // equal or larger, we keep the match. In addition to the score, we also keep
// the length of the consecutive matches to use as boost for the score. // the length of the consecutive matches to use as boost for the score.
// //
// t a r g e t // t a r g e t
// q // q
// u // u
// e // e
// r // r
// y // y
// //
for query_index in 0..query.len() { for query_index in 0..query.len() {
let query_index_offset = query_index * target.len(); let query_index_offset = query_index * target.len();
let query_index_previous_offset = let query_index_previous_offset =
if query_index > 0 { (query_index - 1) * target.len() } else { 0 }; if query_index > 0 { (query_index - 1) * target.len() } else { 0 };
for target_index in 0..target.len() { for target_index in 0..target.len() {
let current_index = query_index_offset + target_index; let current_index = query_index_offset + target_index;
let diag_index = if query_index > 0 && target_index > 0 { let diag_index = if query_index > 0 && target_index > 0 {
query_index_previous_offset + target_index - 1 query_index_previous_offset + target_index - 1
} else { } else {
0 0
}; };
let left_score = if target_index > 0 { scores[current_index - 1] } else { 0 }; let left_score = if target_index > 0 { scores[current_index - 1] } else { 0 };
let diag_score = let diag_score =
if query_index > 0 && target_index > 0 { scores[diag_index] } else { 0 }; if query_index > 0 && target_index > 0 { scores[diag_index] } else { 0 };
let matches_sequence_len = let matches_sequence_len =
if query_index > 0 && target_index > 0 { matches[diag_index] } else { 0 }; if query_index > 0 && target_index > 0 { matches[diag_index] } else { 0 };
// If we are not matching on the first query character any more, we only produce a // If we are not matching on the first query character any more, we only produce a
// score if we had a score previously for the last query index (by looking at the diagScore). // score if we had a score previously for the last query index (by looking at the diagScore).
// This makes sure that the query always matches in sequence on the target. For example // This makes sure that the query always matches in sequence on the target. For example
// given a target of "ede" and a query of "de", we would otherwise produce a wrong high score // given a target of "ede" and a query of "de", we would otherwise produce a wrong high score
// for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost. // for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost.
let score = if diag_score == 0 && query_index != 0 { let score = if diag_score == 0 && query_index != 0 {
0 0
} else { } else {
compute_char_score( compute_char_score(
query[query_index], query[query_index],
query_lower[query_index], query_lower[query_index],
if target_index != 0 { Some(target[target_index - 1]) } else { None }, if target_index != 0 { Some(target[target_index - 1]) } else { None },
target[target_index], target[target_index],
target_lower[target_index], target_lower[target_index],
matches_sequence_len, matches_sequence_len,
) )
}; };
// We have a score and its equal or larger than the left score // We have a score and its equal or larger than the left score
// Match: sequence continues growing from previous diag value // Match: sequence continues growing from previous diag value
// Score: increases by diag score value // Score: increases by diag score value
let is_valid_score = score != 0 && diag_score + score >= left_score; let is_valid_score = score != 0 && diag_score + score >= left_score;
if is_valid_score if is_valid_score
&& ( && (
// We don't need to check if it's contiguous if we allow non-contiguous matches // We don't need to check if it's contiguous if we allow non-contiguous matches
allow_non_contiguous_matches || allow_non_contiguous_matches ||
// We must be looking for a contiguous match. // We must be looking for a contiguous match.
// Looking at an index higher than 0 in the query means we must have already // Looking at an index higher than 0 in the query means we must have already
// found out this is contiguous otherwise there wouldn't have been a score // found out this is contiguous otherwise there wouldn't have been a score
query_index > 0 || query_index > 0 ||
// lastly check if the query is completely contiguous at this index in the target // lastly check if the query is completely contiguous at this index in the target
target_lower[target_index..].starts_with(&query_lower) target_lower[target_index..].starts_with(&query_lower)
) )
{ {
matches[current_index] = matches_sequence_len + 1; matches[current_index] = matches_sequence_len + 1;
scores[current_index] = diag_score + score; scores[current_index] = diag_score + score;
} else { } else {
// We either have no score or the score is lower than the left score // We either have no score or the score is lower than the left score
// Match: reset to 0 // Match: reset to 0
// Score: pick up from left hand side // Score: pick up from left hand side
matches[current_index] = NO_MATCH; matches[current_index] = NO_MATCH;
scores[current_index] = left_score; scores[current_index] = left_score;
} }
} }
} }
// Restore Positions (starting from bottom right of matrix) // Restore Positions (starting from bottom right of matrix)
let mut positions = Vec::new_in(arena); let mut positions = Vec::new_in(arena);
if !query.is_empty() && !target.is_empty() { if !query.is_empty() && !target.is_empty() {
let mut query_index = query.len() - 1; let mut query_index = query.len() - 1;
let mut target_index = target.len() - 1; let mut target_index = target.len() - 1;
loop { loop {
let current_index = query_index * target.len() + target_index; let current_index = query_index * target.len() + target_index;
if matches[current_index] == NO_MATCH { if matches[current_index] == NO_MATCH {
if target_index == 0 { if target_index == 0 {
break; break;
} }
target_index -= 1; // go left target_index -= 1; // go left
} else { } else {
positions.push(target_index); positions.push(target_index);
// go up and left // go up and left
if query_index == 0 || target_index == 0 { if query_index == 0 || target_index == 0 {
break; break;
} }
query_index -= 1; query_index -= 1;
target_index -= 1; target_index -= 1;
} }
} }
positions.reverse(); positions.reverse();
} }
(scores[area - 1], positions) (scores[area - 1], positions)
} }
fn compute_char_score( fn compute_char_score(
query: char, query: char,
query_lower: char, query_lower: char,
target_prev: Option<char>, target_prev: Option<char>,
target_curr: char, target_curr: char,
target_curr_lower: char, target_curr_lower: char,
matches_sequence_len: i32, matches_sequence_len: i32,
) -> i32 { ) -> i32 {
let mut score = 0; let mut score = 0;
if !consider_as_equal(query_lower, target_curr_lower) { if !consider_as_equal(query_lower, target_curr_lower) {
return score; // no match of characters return score; // no match of characters
} }
// Character match bonus // Character match bonus
score += 1; score += 1;
// Consecutive match bonus // Consecutive match bonus
if matches_sequence_len > 0 { if matches_sequence_len > 0 {
score += matches_sequence_len * 5; score += matches_sequence_len * 5;
} }
// Same case bonus // Same case bonus
if query == target_curr { if query == target_curr {
score += 1; score += 1;
} }
if let Some(target_prev) = target_prev { if let Some(target_prev) = target_prev {
// After separator bonus // After separator bonus
let separator_bonus = score_separator_at_pos(target_prev); let separator_bonus = score_separator_at_pos(target_prev);
if separator_bonus > 0 { if separator_bonus > 0 {
score += separator_bonus; score += separator_bonus;
} }
// Inside word upper case bonus (camel case). We only give this bonus if we're not in a contiguous sequence. // Inside word upper case bonus (camel case). We only give this bonus if we're not in a contiguous sequence.
// For example: // For example:
// NPE => NullPointerException = boost // NPE => NullPointerException = boost
// HTTP => HTTP = not boost // HTTP => HTTP = not boost
else if target_curr != target_curr_lower && matches_sequence_len == 0 { else if target_curr != target_curr_lower && matches_sequence_len == 0 {
score += 2; score += 2;
} }
} else { } else {
// Start of word bonus // Start of word bonus
score += 8; score += 8;
} }
score score
} }
/// Returns `true` if `a` and `b` should be treated as the same character.
///
/// Characters are equal if they are identical, or if both are path separators
/// (`/` or `\`), so that platform differences in paths are ignored.
fn consider_as_equal(a: char, b: char) -> bool {
    let is_path_separator = |ch: char| matches!(ch, '/' | '\\');

    // The separator test must be grouped explicitly: `&&` binds tighter than `||`, so an
    // ungrouped chain would let a lone separator on either side match any character.
    a == b || (is_path_separator(a) && is_path_separator(b))
}
/// Returns the bonus granted to a match that directly follows the character `ch`.
///
/// Path separators (`/`, `\`) are worth 5, the other recognized separators
/// (`_`, `-`, `.`, space, `'`, `"`, `:`) are worth 4, and anything else is worth 0.
fn score_separator_at_pos(ch: char) -> i32 {
    // Path separators are preferred...
    const PATH_SEPARATOR_BONUS: i32 = 5;
    // ...over other separators.
    const OTHER_SEPARATOR_BONUS: i32 = 4;

    if matches!(ch, '/' | '\\') {
        PATH_SEPARATOR_BONUS
    } else if matches!(ch, '_' | '-' | '.' | ' ' | '\'' | '"' | ':') {
        OTHER_SEPARATOR_BONUS
    } else {
        0
    }
}
/// Collects the `char`s of `s` into a vector allocated in `arena`.
///
/// The vector is created with room for `s.len()` elements (the byte length, which is
/// never smaller than the number of characters) and shrunk once it has been filled.
fn map_chars<'a>(arena: &'a Arena, s: &str) -> Vec<char, &'a Arena> {
    let mut out = Vec::with_capacity_in(s.len(), arena);
    for ch in s.chars() {
        out.push(ch);
    }
    out.shrink_to_fit();
    out
}

View file

@ -3,16 +3,18 @@
mod rules; mod rules;
use crate::rules::{JOIN_RULES_GRAPHEME_CLUSTER, JOIN_RULES_LINE_BREAK};
use anyhow::{bail, Context};
use indoc::writedoc;
use rayon::prelude::*;
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt::Write as FmtWrite; use std::fmt::Write as FmtWrite;
use std::io::Write as IoWrite; use std::io::Write as IoWrite;
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use std::path::PathBuf; use std::path::PathBuf;
use anyhow::{Context, bail};
use indoc::writedoc;
use rayon::prelude::*;
use crate::rules::{JOIN_RULES_GRAPHEME_CLUSTER, JOIN_RULES_LINE_BREAK};
// `CharacterWidth` is 2 bits. // `CharacterWidth` is 2 bits.
#[derive(Clone, Copy, PartialEq, Eq)] #[derive(Clone, Copy, PartialEq, Eq)]
enum CharacterWidth { enum CharacterWidth {
@ -285,19 +287,12 @@ fn main() -> anyhow::Result<()> {
.iter() .iter()
.map(|t| { .map(|t| {
let rules_gc_len = if out.arg_extended { t.len() } else { 16 }; let rules_gc_len = if out.arg_extended { t.len() } else { 16 };
t[..rules_gc_len] t[..rules_gc_len].iter().map(|row| prepare_rules_row(row, 2, 3)).collect()
.iter()
.map(|row| prepare_rules_row(row, 2, 3))
.collect()
}) })
.collect(); .collect();
// Same for line breaks, but in 2D. // Same for line breaks, but in 2D.
let rules_lb_len = if out.arg_extended { let rules_lb_len = if out.arg_extended { JOIN_RULES_LINE_BREAK.len() } else { 24 };
JOIN_RULES_LINE_BREAK.len()
} else {
24
};
out.rules_lb = JOIN_RULES_LINE_BREAK[..rules_lb_len] out.rules_lb = JOIN_RULES_LINE_BREAK[..rules_lb_len]
.iter() .iter()
.map(|row| prepare_rules_row(row, 1, 0)) .map(|row| prepare_rules_row(row, 1, 0))
@ -315,12 +310,7 @@ fn main() -> anyhow::Result<()> {
for s in &out.trie.stages { for s in &out.trie.stages {
actual = s.values[actual as usize + ((cp >> s.shift) & s.mask)]; actual = s.values[actual as usize + ((cp >> s.shift) & s.mask)];
} }
assert_eq!( assert_eq!(expected.value(), actual, "trie sanity check failed for U+{:04X}", cp);
expected.value(),
actual,
"trie sanity check failed for U+{:04X}",
cp
);
} }
for (cp, &expected) in out.ucd.values[..0x80].iter().enumerate() { for (cp, &expected) in out.ucd.values[..0x80].iter().enumerate() {
let last = out.trie.stages.last().unwrap(); let last = out.trie.stages.last().unwrap();
@ -363,11 +353,7 @@ fn generate_c(out: Output) -> String {
width = stage.mask + 1; width = stage.mask + 1;
} }
_ = write!( _ = write!(buf, "static const uint{}_t s_stage{}[] = {{", stage.bits, stage.index);
buf,
"static const uint{}_t s_stage{}[] = {{",
stage.bits, stage.index
);
for (j, &value) in stage.values.iter().enumerate() { for (j, &value) in stage.values.iter().enumerate() {
if j % width == 0 { if j % width == 0 {
buf.push_str("\n "); buf.push_str("\n ");
@ -701,21 +687,14 @@ fn generate_rust(out: Output) -> String {
fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::Result<Ucd> { fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::Result<Ucd> {
let packing = BitPacking::new(out.arg_line_breaks, out.arg_extended); let packing = BitPacking::new(out.arg_line_breaks, out.arg_extended);
let ambiguous_value = if out.arg_no_ambiguous { let ambiguous_value =
CharacterWidth::Narrow if out.arg_no_ambiguous { CharacterWidth::Narrow } else { CharacterWidth::Ambiguous };
} else {
CharacterWidth::Ambiguous
};
let mut values = vec![ let mut values =
TrieType::new( vec![
&packing, TrieType::new(&packing, ClusterBreak::Other, LineBreak::Other, CharacterWidth::Narrow,);
ClusterBreak::Other, 1114112
LineBreak::Other, ];
CharacterWidth::Narrow,
);
1114112
];
let ns = "http://www.unicode.org/ns/2003/ucd/1.0"; let ns = "http://www.unicode.org/ns/2003/ucd/1.0";
let root = doc.root_element(); let root = doc.root_element();
@ -904,11 +883,7 @@ fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::R
// but for us that's equivalent to Other. // but for us that's equivalent to Other.
values[0xFE0F].change_width(&packing, CharacterWidth::Wide); values[0xFE0F].change_width(&packing, CharacterWidth::Wide);
Ok(Ucd { Ok(Ucd { description, values, packing })
description,
values,
packing,
})
} }
struct UcdAttributes<'a> { struct UcdAttributes<'a> {
@ -927,15 +902,9 @@ fn extract_attributes<'a>(
UcdAttributes { UcdAttributes {
general_category: node.attribute("gc").unwrap_or(default.general_category), general_category: node.attribute("gc").unwrap_or(default.general_category),
line_break: node.attribute("lb").unwrap_or(default.line_break), line_break: node.attribute("lb").unwrap_or(default.line_break),
grapheme_cluster_break: node grapheme_cluster_break: node.attribute("GCB").unwrap_or(default.grapheme_cluster_break),
.attribute("GCB") indic_conjunct_break: node.attribute("InCB").unwrap_or(default.indic_conjunct_break),
.unwrap_or(default.grapheme_cluster_break), extended_pictographic: node.attribute("ExtPict").unwrap_or(default.extended_pictographic),
indic_conjunct_break: node
.attribute("InCB")
.unwrap_or(default.indic_conjunct_break),
extended_pictographic: node
.attribute("ExtPict")
.unwrap_or(default.extended_pictographic),
east_asian: node.attribute("ea").unwrap_or(default.east_asian), east_asian: node.attribute("ea").unwrap_or(default.east_asian),
} }
} }
@ -1050,18 +1019,13 @@ fn build_trie(uncompressed: Vec<TrieType>, shifts: &[usize]) -> Trie {
}; };
} }
let total_size: usize = stages let total_size: usize = stages.iter().map(|stage| (stage.bits / 8) * stage.values.len()).sum();
.iter()
.map(|stage| (stage.bits / 8) * stage.values.len())
.sum();
Trie { stages, total_size } Trie { stages, total_size }
} }
/// Returns the index of the first occurrence of `needle` as a contiguous sub-slice of
/// `haystack`, or `None` if it does not occur.
///
/// An empty `needle` is found at index 0.
fn find_existing(haystack: &[u32], needle: &[u32]) -> Option<usize> {
    // `slice::windows` panics for a window size of zero, so handle that case up front.
    if needle.is_empty() {
        return Some(0);
    }

    haystack.windows(needle.len()).position(|window| window == needle)
}
fn measure_overlap(prev: &[u32], next: &[u32]) -> usize { fn measure_overlap(prev: &[u32], next: &[u32]) -> usize {