Use characters instead of u32 in confusable map (#8463)

This commit is contained in:
Charlie Marsh 2023-11-03 06:57:47 -07:00 committed by GitHub
parent 41e538a748
commit 7c12eaf322
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 1602 additions and 1594 deletions

View file

@ -163,7 +163,7 @@ pub(crate) fn ambiguous_unicode_character(
let candidate = Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char,
char::from_u32(representant).unwrap(),
representant,
);
if let Some(diagnostic) = candidate.into_diagnostic(context, settings) {
diagnostics.push(diagnostic);
@ -178,7 +178,7 @@ pub(crate) fn ambiguous_unicode_character(
word_candidates.push(Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char,
char::from_u32(representant).unwrap(),
representant,
));
} else {
// The current word contains at least one unambiguous unicode character.

File diff suppressed because it is too large Load diff

View file

@ -13,7 +13,7 @@ prelude = """
/// Via: <https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json>
/// See: <https://github.com/microsoft/vscode/blob/095ddabc52b82498ee7f718a34f9dd11d59099a8/src/vs/base/common/strings.ts#L1094>
pub(crate) fn confusable(c: u32) -> Option<u8> {
pub(crate) fn confusable(c: u32) -> Option<char> {
let result = match c {
""".lstrip()
@ -49,6 +49,14 @@ def format_number(number: int) -> str:
return f"{number}u32"
def format_char(number: int) -> str:
    """Format a Python integer as the body of a Rust character literal.

    The returned text does not include the surrounding single quotes; the
    caller is expected to wrap it as ``'<result>'``.

    Args:
        number: The Unicode code point to render.

    Returns:
        The character itself, or its backslash escape when the raw character
        cannot appear verbatim between the quotes of a Rust character literal.
    """
    # Rust rejects these characters when written raw inside `'…'`, so each one
    # is emitted as its escape sequence instead.
    escapes = {
        "\\": "\\\\",
        "'": "\\'",
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    char = chr(number)
    return escapes.get(char, char)
def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
"""Format the downloaded data into a Rust source file."""
# The input data contains duplicate entries.
@ -59,7 +67,7 @@ def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
flattened_items.add((items[i], items[i + 1]))
tuples = [
f" {format_number(left)} => {right},\n"
f" {format_number(left)} => '{format_char(right)}',\n"
for left, right in sorted(flattened_items)
]
@ -67,13 +75,13 @@ def format_confusables_rs(raw_data: dict[str, list[int]]) -> str:
# as they're unicode-to-unicode confusables, not unicode-to-ASCII confusables.
confusable_units = [
# ANGSTROM SIGN → LATIN CAPITAL LETTER A WITH RING ABOVE
("0x212B", "0x00C5"),
("0x212B", chr(0x00C5)),
# OHM SIGN → GREEK CAPITAL LETTER OMEGA
("0x2126", "0x03A9"),
("0x2126", chr(0x03A9)),
# MICRO SIGN → GREEK SMALL LETTER MU
("0x00B5", "0x03BC"),
("0x00B5", chr(0x03BC)),
]
tuples += [f" {left} => {right},\n" for left, right in confusable_units]
tuples += [f" {left} => '{right}',\n" for left, right in confusable_units]
print(f"{len(tuples)} confusable tuples.")