mirror of
https://github.com/python/cpython.git
synced 2025-09-14 04:37:29 +00:00
Preliminary code to mark paragraphs. Seems to work, but very slow.
This commit is contained in:
parent
5c355201e2
commit
fcc5910090
1 changed files with 150 additions and 2 deletions
|
@ -20,6 +20,9 @@ import xml.dom.core
|
||||||
import xml.dom.esis_builder
|
import xml.dom.esis_builder
|
||||||
|
|
||||||
|
|
||||||
|
DEBUG_PARA_FIXER = 0
|
||||||
|
|
||||||
|
|
||||||
# Workaround to deal with invalid documents (multiple root elements). This
|
# Workaround to deal with invalid documents (multiple root elements). This
|
||||||
# does not indicate a bug in the DOM implementation.
|
# does not indicate a bug in the DOM implementation.
|
||||||
#
|
#
|
||||||
|
@ -323,8 +326,153 @@ def cleanup_synopses(doc):
|
||||||
create_module_info(doc, node)
|
create_module_info(doc, node)
|
||||||
|
|
||||||
|
|
||||||
|
FIXUP_PARA_ELEMENTS = (
|
||||||
|
"chapter",
|
||||||
|
"section", "subsection", "subsubsection",
|
||||||
|
"paragraph", "subparagraph")
|
||||||
|
|
||||||
|
PARA_LEVEL_ELEMENTS = (
|
||||||
|
"moduleinfo", "title", "opcodedesc",
|
||||||
|
"verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc",
|
||||||
|
"funcdescni", "methoddescni", "excdescni", "datadescni",
|
||||||
|
"tableii", "tableiii", "tableiv", "localmoduletable",
|
||||||
|
"sectionauthor",
|
||||||
|
# include <para>, so we can just do it again to get subsequent paras:
|
||||||
|
"para",
|
||||||
|
)
|
||||||
|
|
||||||
|
PARA_LEVEL_PRECEEDERS = (
|
||||||
|
"index", "indexii", "indexiii", "indexiv",
|
||||||
|
"stindex", "obindex", "COMMENT", "label",
|
||||||
|
)
|
||||||
|
|
||||||
def fixup_paras(doc):
|
def fixup_paras(doc):
|
||||||
pass
|
for child in doc.childNodes:
|
||||||
|
if child.nodeType == xml.dom.core.ELEMENT \
|
||||||
|
and child.tagName in FIXUP_PARA_ELEMENTS:
|
||||||
|
fixup_paras_helper(doc, child)
|
||||||
|
descriptions = child.getElementsByTagName("description")
|
||||||
|
for description in descriptions:
|
||||||
|
if DEBUG_PARA_FIXER:
|
||||||
|
sys.stderr.write("-- Fixing up <description> element...\n")
|
||||||
|
fixup_paras_helper(doc, description)
|
||||||
|
|
||||||
|
|
||||||
|
def fixup_paras_helper(doc, container):
|
||||||
|
# document is already normalized
|
||||||
|
children = container.childNodes
|
||||||
|
start = 0
|
||||||
|
start_fixed = 0
|
||||||
|
i = 0
|
||||||
|
SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
|
||||||
|
for child in children:
|
||||||
|
if child.nodeType == xml.dom.core.ELEMENT:
|
||||||
|
if child.tagName in FIXUP_PARA_ELEMENTS:
|
||||||
|
fixup_paras_helper(doc, child)
|
||||||
|
break
|
||||||
|
elif child.tagName in SKIP_ELEMENTS:
|
||||||
|
if not start_fixed:
|
||||||
|
start = i + 1
|
||||||
|
elif not start_fixed:
|
||||||
|
start_fixed = 1
|
||||||
|
i = i + 1
|
||||||
|
else:
|
||||||
|
if child.nodeType == xml.dom.core.TEXT \
|
||||||
|
and string.strip(child.data) and not start_fixed:
|
||||||
|
start_fixed = 1
|
||||||
|
i = i + 1
|
||||||
|
if DEBUG_PARA_FIXER:
|
||||||
|
sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
|
||||||
|
% (container.tagName, start, i))
|
||||||
|
if i > start:
|
||||||
|
# the first [start:i] children shoudl be rewritten as <para> elements
|
||||||
|
# start by breaking text nodes that contain \n\n+ into multiple nodes
|
||||||
|
nstart, i = skip_leading_nodes(container.childNodes, start, i)
|
||||||
|
if i > nstart:
|
||||||
|
build_para(doc, container, nstart, i)
|
||||||
|
fixup_paras_helper(doc, container)
|
||||||
|
|
||||||
|
|
||||||
|
def build_para(doc, parent, start, i):
|
||||||
|
children = parent.childNodes
|
||||||
|
# collect all children until \n\n+ is found in a text node or a
|
||||||
|
# PARA_LEVEL_ELEMENT is found.
|
||||||
|
after = start + 1
|
||||||
|
have_last = 0
|
||||||
|
BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
|
||||||
|
for j in range(start, i):
|
||||||
|
after = j + 1
|
||||||
|
child = children[j]
|
||||||
|
nodeType = child.nodeType
|
||||||
|
if nodeType == xml.dom.core.ELEMENT:
|
||||||
|
if child.tagName in BREAK_ELEMENTS:
|
||||||
|
after = j
|
||||||
|
break
|
||||||
|
elif nodeType == xml.dom.core.TEXT:
|
||||||
|
pos = string.find(child.data, "\n\n")
|
||||||
|
if pos == 0:
|
||||||
|
after = j
|
||||||
|
break
|
||||||
|
if pos >= 1:
|
||||||
|
child.splitText(pos)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
have_last = 1
|
||||||
|
if children[after - 1].nodeType == xml.dom.core.TEXT:
|
||||||
|
# we may need to split off trailing white space:
|
||||||
|
child = children[after - 1]
|
||||||
|
data = child.data
|
||||||
|
if string.rstrip(data) != data:
|
||||||
|
have_last = 0
|
||||||
|
child.splitText(len(string.rstrip(data)))
|
||||||
|
children = parent.childNodes
|
||||||
|
para = doc.createElement("para")
|
||||||
|
prev = None
|
||||||
|
indexes = range(start, after)
|
||||||
|
indexes.reverse()
|
||||||
|
for j in indexes:
|
||||||
|
node = children[j]
|
||||||
|
parent.removeChild(node)
|
||||||
|
para.insertBefore(node, prev)
|
||||||
|
prev = node
|
||||||
|
if have_last:
|
||||||
|
parent.appendChild(para)
|
||||||
|
else:
|
||||||
|
parent.insertBefore(para, parent.childNodes[start])
|
||||||
|
|
||||||
|
|
||||||
|
def skip_leading_nodes(children, start, i):
|
||||||
|
i = min(i, len(children))
|
||||||
|
while i > start:
|
||||||
|
# skip over leading comments and whitespace:
|
||||||
|
try:
|
||||||
|
child = children[start]
|
||||||
|
except IndexError:
|
||||||
|
sys.stderr.write(
|
||||||
|
"skip_leading_nodes() failed at index %d\n" % start)
|
||||||
|
raise
|
||||||
|
nodeType = child.nodeType
|
||||||
|
if nodeType == xml.dom.core.COMMENT:
|
||||||
|
start = start + 1
|
||||||
|
elif nodeType == xml.dom.core.TEXT:
|
||||||
|
data = child.data
|
||||||
|
shortened = string.lstrip(data)
|
||||||
|
if shortened:
|
||||||
|
if data != shortened:
|
||||||
|
# break into two nodes: whitespace and non-whitespace
|
||||||
|
child.splitText(len(data) - len(shortened))
|
||||||
|
return start + 1, i + 1
|
||||||
|
break
|
||||||
|
# all whitespace, just skip
|
||||||
|
start = start + 1
|
||||||
|
elif nodeType == xml.dom.core.ELEMENT:
|
||||||
|
if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
|
||||||
|
start = start + 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
return start, i
|
||||||
|
|
||||||
|
|
||||||
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
|
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue