normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does

Benjamin Peterson 2009-10-09 21:43:09 +00:00
parent ffc08fcad6
commit d3afadaa49
3 changed files with 44 additions and 2 deletions


@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
            codec = lookup(encoding)
        except LookupError:
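
Below is a rough usage sketch, not part of the commit, of how the normalization becomes visible through detect_encoding(). It assumes the diff above targets the standard library tokenize module and uses a hypothetical source snippet whose coding cookie is spelled with an underscore ("latin_1"):

    from io import BytesIO
    from tokenize import detect_encoding

    # Source whose coding cookie uses an underscore variant spelling.
    source = b"# -*- coding: latin_1 -*-\nprint('hi')\n"

    encoding, lines_read = detect_encoding(BytesIO(source).readline)
    # With this change the variant spelling is normalized, so encoding
    # should be "iso-8859-1"; before, the raw cookie text was returned.
    print(encoding)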