mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 11:49:12 +00:00 
			
		
		
		
	Create Tools/build/ directory. Move the following scripts from Tools/scripts/ to Tools/build/: * check_extension_modules.py * deepfreeze.py * freeze_modules.py * generate_global_objects.py * generate_levenshtein_examples.py * generate_opcode_h.py * generate_re_casefix.py * generate_sre_constants.py * generate_stdlib_module_names.py * generate_token.py * parse_html5_entities.py * smelly.py * stable_abi.py * umarshal.py * update_file.py * verify_ensurepip_wheels.py Update references to these scripts.
		
			
				
	
	
		
			115 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			115 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
#!/usr/bin/env python3
 | 
						|
"""
Utility for parsing HTML5 entity definitions available from:

    https://html.spec.whatwg.org/entities.json
    https://html.spec.whatwg.org/multipage/named-characters.html

The page now contains the following note:

    "This list is static and will not be expanded or changed in the future."

Written by Ezio Melotti and Iuliia Proskurnia.
"""
 | 
						|
 | 
						|
import os
 | 
						|
import sys
 | 
						|
import json
 | 
						|
from urllib.request import urlopen
 | 
						|
from html.entities import html5
 | 
						|
 | 
						|
# Path of this script relative to the root of the source tree; it is
# embedded in the "# Generated by ..." header emitted by write_items().
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Human-readable page listing the named character references.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# Machine-readable JSON version of the same list (the one that is downloaded).
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# First line of the generated section; also used to locate the section to
# replace when patching Lib/html/entities.py.
HTML5_SECTION_START = '# HTML5 named character references'
 | 
						|
 | 
						|
def get_json(url, timeout=30):
    """Download the JSON document at *url* and return the decoded object.

    The response body is decoded as UTF-8 before being parsed.

    *timeout* is the number of seconds to wait on a blocking network
    operation before giving up; without it a stalled connection would
    hang the script indefinitely.

    Raises whatever urlopen() raises on network errors (including timeouts),
    UnicodeDecodeError if the body is not valid UTF-8, and
    json.JSONDecodeError if it is not valid JSON.
    """
    with urlopen(url, timeout=timeout) as response:
        text = response.read().decode('utf-8')
    return json.loads(text)
 | 
						|
 | 
						|
def create_dict(entities):
    """Create the html5 dict from the decoded json object.

    *entities* maps each reference name (e.g. '&amp;') to a dict that has
    a 'characters' key holding the replacement text.  The returned dict
    maps the name with its leading '&' stripped (e.g. 'amp;') to that
    replacement text.

    Raises KeyError if an entry has no 'characters' key.
    """
    return {
        name.lstrip('&'): value['characters']
        for name, value in entities.items()
    }
 | 
						|
 | 
						|
def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences.

    Three sections are printed to stdout, each only when non-empty and
    each sorted by name: names only in *new* (added), names only in *old*
    (removed), and names present in both whose values differ (modified).
    Nothing is printed when the dicts are equal.
    """
    added = new.keys() - old.keys()
    if added:
        print(f'{len(added)} entitie(s) have been added:')
        for name in sorted(added):
            print(f'  {name!r}: {new[name]!r}')

    removed = old.keys() - new.keys()
    if removed:
        print(f'{len(removed)} entitie(s) have been removed:')
        for name in sorted(removed):
            print(f'  {name!r}: {old[name]!r}')

    # Names are unique, so sorting the (name, old, new) tuples orders the
    # entries by name alone.
    changed = sorted(
        (name, old[name], new[name])
        for name in old.keys() & new.keys()
        if old[name] != new[name]
    )
    if changed:
        print(f'{len(changed)} entitie(s) have been modified:')
        for name, before, after in changed:
            print(f'  {name!r}: {before!r} -> {after!r}')
 | 
						|
 | 
						|
def write_items(entities, file=None):
    """Write the items of the dictionary in the specified file.

    The output is the complete generated section: the section-start
    marker, a header comment naming this script and the source URLs, and
    an ``html5 = {...}`` dict literal with one entry per line.  Values are
    written with ascii() so the output contains only ASCII characters.

    *file* is any object accepted by print(); when it is None (the
    default) the output goes to whatever sys.stdout is at call time, so
    redirecting sys.stdout after import is honoured.
    """
    # The keys in the generated dictionary should be sorted
    # in a case-insensitive way, however, when two keys are equal,
    # the uppercase version should come first so that the result
    # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # To do this we first sort in a case-sensitive way (so all the
    # uppercase chars come first) and then sort with key=str.lower.
    # Since the sorting is stable the uppercase keys will eventually
    # be before their equivalent lowercase version.
    keys = sorted(sorted(entities), key=str.lower)
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {SCRIPT_NAME}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        print(f'    {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)
 | 
						|
 | 
						|
 | 
						|
if __name__ == '__main__':
    # Without arguments: print a diff between html.entities.html5 and the
    # freshly downloaded definitions.
    # With --create: print the new html5 dict to stdout.
    # With --patch: rewrite the html5 section of Lib/html/entities.py
    # (the path is relative, so run this from the root of the source tree).
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        patched = False  # set once the section marker has been seen
        skip = False     # True while inside the old section being dropped
        # Python source files are UTF-8 by default, so do not depend on
        # the locale encoding when copying the file.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            for line in f1:
                if line.startswith(HTML5_SECTION_START):
                    write_items(new_html5, file=f2)
                    patched = True
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        if not patched or skip:
            # Either the marker was never found (nothing was replaced) or
            # the closing brace was never found (the rest of the file was
            # dropped): keep the original file rather than damage it.
            os.remove(temp_fname)
            sys.exit(f'{fname}: could not find a complete '
                     f'{HTML5_SECTION_START!r} section; file left unchanged')
        # Replace in a single step so that the target file never goes
        # missing if the script is interrupted.
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))
 |