HTMLParser is now able to handle slashes in the start tag.

This commit is contained in:
Ezio Melotti 2012-02-21 09:22:16 +02:00
parent 9be6c3ddf0
commit 36b7361fe7
3 changed files with 28 additions and 5 deletions

View file

@@ -28,19 +28,19 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
attrfind = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
)?\s*
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace