mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 18:07:37 +00:00 
			
		
		
		
	 1863302d61
			
		
	
	
		1863302d61
		
			
		
	
	
	
	
		
			
			Create Tools/build/ directory. Move the following scripts from Tools/scripts/ to Tools/build/: * check_extension_modules.py * deepfreeze.py * freeze_modules.py * generate_global_objects.py * generate_levenshtein_examples.py * generate_opcode_h.py * generate_re_casefix.py * generate_sre_constants.py * generate_stdlib_module_names.py * generate_token.py * parse_html5_entities.py * smelly.py * stable_abi.py * umarshal.py * update_file.py * verify_ensurepip_wheels.py Update references to these scripts.
		
			
				
	
	
		
			115 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			115 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
#!/usr/bin/env python3
"""
Utility for parsing HTML5 entity definitions available from:

    https://html.spec.whatwg.org/entities.json
    https://html.spec.whatwg.org/multipage/named-characters.html

The page now contains the following note:

    "This list is static and will not be expanded or changed in the future."

Written by Ezio Melotti and Iuliia Proskurnia.
"""
| 
 | |
| import os
 | |
| import sys
 | |
| import json
 | |
| from urllib.request import urlopen
 | |
| from html.entities import html5
 | |
| 
 | |
# Repository-relative path of this script; written into the generated header.
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Human-readable page listing the named character references; written into
# the generated header.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# JSON document with the entity definitions; this is what gets downloaded.
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# First line of the generated section.  It is also the marker used to find
# the section to replace when patching an existing file.
HTML5_SECTION_START = '# HTML5 named character references'
| 
 | |
def get_json(url):
    """Download the JSON document at *url* and return the decoded object.

    The response body is read in full, decoded as UTF-8 and parsed with
    json.loads().  Errors raised by urlopen(), by the UTF-8 decoding or by
    the JSON parser are not caught here.
    """
    with urlopen(url) as response:
        payload = response.read().decode('utf-8')
    return json.loads(payload)
| 
 | |
def create_dict(entities):
    """Create the html5 dict from the decoded json object.

    *entities* maps entity names to mappings that have a 'characters' item.
    Leading '&' characters are stripped from each name and the result is
    mapped to the corresponding 'characters' value.
    """
    return {
        name.lstrip('&'): value['characters']
        for name, value in entities.items()
    }
| 
 | |
def compare_dicts(old, new):
    """Compare the old and new dicts and print the differences.

    Three groups are reported on standard output, each sorted by name and
    each omitted when empty: names only in *new* (added), names only in
    *old* (removed), and names present in both whose values differ
    (modified, shown as "old -> new").
    """
    added = new.keys() - old.keys()
    if added:
        print(f'{len(added)} entitie(s) have been added:')
        for name in sorted(added):
            print(f'  {name!r}: {new[name]!r}')

    removed = old.keys() - new.keys()
    if removed:
        print(f'{len(removed)} entitie(s) have been removed:')
        for name in sorted(removed):
            print(f'  {name!r}: {old[name]!r}')

    # Names are unique, so sorting the (name, old, new) tuples orders the
    # report by name.
    changed = sorted(
        (name, old[name], new[name])
        for name in old.keys() & new.keys()
        if old[name] != new[name]
    )
    if changed:
        print(f'{len(changed)} entitie(s) have been modified:')
        for name, old_value, new_value in changed:
            print(f'  {name!r}: {old_value!r} -> {new_value!r}')
| 
 | |
def write_items(entities, file=None):
    """Write the items of the dictionary in the specified file.

    The output is a block of Python source: the section marker, a
    "Generated by" header naming this script and the source URLs, and an
    ``html5 = {...}`` dict literal with one entry per line.  Values are
    written with ascii() so the output contains only ASCII characters.

    *file* is a writable text stream.  When it is None (the default) the
    current sys.stdout is used; it is looked up at call time so that a
    redirected sys.stdout is honoured.
    """
    if file is None:
        file = sys.stdout
    # The keys in the generated dictionary should be sorted in a
    # case-insensitive way, however, when two keys are equal, the uppercase
    # version should come first so that the result looks like:
    # ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased key, key) does this in one pass: ties on the
    # lowercased key are broken case-sensitively, and uppercase characters
    # sort before lowercase ones.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {SCRIPT_NAME}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        print(f'    {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Without arguments: print a diff between html.entities.html5 and the
    # freshly downloaded definitions.
    # With --create: print the new html5 dict.
    # With --patch: rewrite the html5 section of Lib/html/entities.py
    # (the path is relative to the current working directory).
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        # Use an explicit encoding so the result does not depend on the
        # locale of the machine running the script.
        with open(fname, encoding='utf-8') as f1, \
                open(temp_fname, 'w', encoding='utf-8') as f2:
            skip = False
            for line in f1:
                if line.startswith(HTML5_SECTION_START):
                    # Emit the regenerated section in place of the old one.
                    write_items(new_html5, file=f2)
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                f2.write(line)
        # os.replace() swaps the file in a single step, so there is no
        # moment at which fname is missing (unlike remove + rename).
        os.replace(temp_fname, fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))