gh-82927: Update files related to HTML entities. (GH-92504)

This commit is contained in:
Ezio Melotti 2022-06-21 22:03:12 +02:00 committed by GitHub
parent 4e08fbcfdf
commit f28ec34c5c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 29 additions and 78 deletions

1
.github/CODEOWNERS vendored
View file

@ -53,6 +53,7 @@ Python/pythonrun.c @iritkatriel
/Lib/html/ @ezio-melotti /Lib/html/ @ezio-melotti
/Lib/_markupbase.py @ezio-melotti /Lib/_markupbase.py @ezio-melotti
/Lib/test/test_html*.py @ezio-melotti /Lib/test/test_html*.py @ezio-melotti
/Tools/scripts/*html5* @ezio-melotti
# Import (including importlib). # Import (including importlib).
# Ignoring importlib.h so as to not get flagged on # Ignoring importlib.h so as to not get flagged on

View file

@ -34,12 +34,12 @@ This module defines four dictionaries, :data:`html5`,
.. data:: name2codepoint .. data:: name2codepoint
A dictionary that maps HTML entity names to the Unicode code points. A dictionary that maps HTML4 entity names to the Unicode code points.
.. data:: codepoint2name .. data:: codepoint2name
A dictionary that maps Unicode code points to HTML entity names. A dictionary that maps Unicode code points to HTML4 entity names.
.. rubric:: Footnotes .. rubric:: Footnotes

View file

@ -3,8 +3,7 @@
__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
# maps the HTML entity name to the Unicode code point # maps HTML4 entity name to the Unicode code point
# from https://html.spec.whatwg.org/multipage/named-characters.html
name2codepoint = { name2codepoint = {
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
@ -261,7 +260,11 @@ name2codepoint = {
} }
# maps the HTML5 named character references to the equivalent Unicode character(s) # HTML5 named character references
# Generated by 'Tools/scripts/parse_html5_entities.py'
# from https://html.spec.whatwg.org/entities.json and
# https://html.spec.whatwg.org/multipage/named-characters.html.
# Map HTML5 named character references to the equivalent Unicode character(s).
html5 = { html5 = {
'Aacute': '\xc1', 'Aacute': '\xc1',
'aacute': '\xe1', 'aacute': '\xe1',

View file

@ -0,0 +1,2 @@
The ``Tools/scripts/parseentities.py`` script used to parse HTML4 entities
has been removed.

View file

@ -2,10 +2,14 @@
""" """
Utility for parsing HTML5 entity definitions available from: Utility for parsing HTML5 entity definitions available from:
http://dev.w3.org/html5/spec/entities.json https://html.spec.whatwg.org/entities.json
https://html.spec.whatwg.org/multipage/named-characters.html
The page now contains the following note:
"This list is static and will not be expanded or changed in the future."
Written by Ezio Melotti and Iuliia Proskurnia. Written by Ezio Melotti and Iuliia Proskurnia.
""" """
import os import os
@ -14,7 +18,9 @@ import json
from urllib.request import urlopen from urllib.request import urlopen
from html.entities import html5 from html.entities import html5
entities_url = 'http://dev.w3.org/html5/spec/entities.json' PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
HTML5_SECTION_START = '# HTML5 named character references'
def get_json(url): def get_json(url):
"""Download the json file from the url and returns a decoded object.""" """Download the json file from the url and returns a decoded object."""
@ -62,9 +68,15 @@ def write_items(entities, file=sys.stdout):
# be before their equivalent lowercase version. # be before their equivalent lowercase version.
keys = sorted(entities.keys()) keys = sorted(entities.keys())
keys = sorted(keys, key=str.lower) keys = sorted(keys, key=str.lower)
print(HTML5_SECTION_START, file=file)
print(f'# Generated by {sys.argv[0]!r}\n'
f'# from {ENTITIES_URL} and\n'
f'# {PAGE_URL}.\n'
f'# Map HTML5 named character references to the '
f'equivalent Unicode character(s).', file=file)
print('html5 = {', file=file) print('html5 = {', file=file)
for name in keys: for name in keys:
print(' {!r}: {!a},'.format(name, entities[name]), file=file) print(f' {name!r}: {entities[name]!a},', file=file)
print('}', file=file) print('}', file=file)
@ -72,11 +84,8 @@ if __name__ == '__main__':
# without args print a diff between html.entities.html5 and new_html5 # without args print a diff between html.entities.html5 and new_html5
# with --create print the new html5 dict # with --create print the new html5 dict
# with --patch patch the Lib/html/entities.py file # with --patch patch the Lib/html/entities.py file
new_html5 = create_dict(get_json(entities_url)) new_html5 = create_dict(get_json(ENTITIES_URL))
if '--create' in sys.argv: if '--create' in sys.argv:
print('# map the HTML5 named character references to the '
'equivalent Unicode character(s)')
print('# Generated by {}. Do not edit manually.'.format(__file__))
write_items(new_html5) write_items(new_html5)
elif '--patch' in sys.argv: elif '--patch' in sys.argv:
fname = 'Lib/html/entities.py' fname = 'Lib/html/entities.py'
@ -84,7 +93,7 @@ if __name__ == '__main__':
with open(fname) as f1, open(temp_fname, 'w') as f2: with open(fname) as f1, open(temp_fname, 'w') as f2:
skip = False skip = False
for line in f1: for line in f1:
if line.startswith('html5 = {'): if line.startswith(HTML5_SECTION_START):
write_items(new_html5, file=f2) write_items(new_html5, file=f2)
skip = True skip = True
continue continue

View file

@ -1,64 +0,0 @@
#!/usr/bin/env python3
""" Utility for parsing HTML entity definitions available from:
http://www.w3.org/ as e.g.
http://www.w3.org/TR/REC-html40/HTMLlat1.ent
Input is read from stdin, output is written to stdout in form of a
Python snippet defining a dictionary "entitydefs" mapping literal
entity name to character or numeric entity.
Marc-Andre Lemburg, mal@lemburg.com, 1999.
Use as you like. NO WARRANTIES.
"""
import re,sys
# Matches one SGML entity declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing the entity name, the quoted value and the (possibly multi-line)
# comment text.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text, pos=0, endpos=None):
    """Collect the entity declarations found in *text*.

    Only the slice ``text[pos:endpos]`` is scanned; *endpos* defaults to the
    end of *text*.  The result maps each entity name to a
    ``(charcode, comment)`` tuple holding the quoted value and the raw
    comment text of its declaration.  If a name is declared more than once,
    the last declaration wins.
    """
    # Note: the caller-supplied *pos* is honoured here; it must not be reset
    # to 0, otherwise a requested start offset would be silently ignored.
    if endpos is None:
        endpos = len(text)
    return {
        m.group(1): (m.group(2), m.group(3))
        for m in entityRE.finditer(text, pos, endpos)
    }
def writefile(f, defs):
    """Write *defs* to the file object *f* as an ``entitydefs`` dict literal.

    *defs* maps entity names to ``(charcode, comment)`` tuples.  Entries are
    emitted in sorted order, one per line, each followed by its comment with
    runs of whitespace collapsed to single spaces.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        # Default: emit the value as an ordinary string literal.
        literal = repr(charcode)
        if charcode.startswith('&#'):
            # Numeric reference such as "&#160;": strip the "&#" prefix and
            # the trailing character to get the decimal code.
            code = int(charcode[2:-1])
            if code < 256:
                # Codes below 256 are written as an octal escape instead.
                literal = r"'\%o'" % code
        summary = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, literal, summary))
    f.write('\n}\n')
if __name__ == '__main__':
    # Usage: parseentities.py [infile [outfile]]
    # A missing infile means stdin; a missing outfile means stdout.
    args = sys.argv[1:]

    if args:
        with open(args[0]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    if len(args) > 1:
        with open(args[1], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)