cpython/Doc/tools/sgmlconv/docfixer.py
Fred Drake d24167baf2 Make <rfc> no longer an empty element but a container. The text
currently generated by the LaTeX and LaTeX2HTML processes is generated
here as well, making it more flexible in the SGML version.

Reduce the <args> element so that <optional> goes away; just use
square brackets to indicate what's optional.  This makes it easier to
read than the LaTeX, and the processor can do any checking it needs to
in order to make sure it's legit.  Possible shortcoming: DSSSL
processors may need more explicit markup.  Can probably hack around it
for this case, but we'll see.
1999-01-14 21:18:03 +00:00

718 lines
24 KiB
Python
Executable file

#! /usr/bin/env python
"""Promote the IDs from <label/> elements to the enclosing section / chapter /
whatever, then remove the <label/> elements. This allows *ML style internal
linking rather than the bogus LaTeX model.
Note that <label/>s in <title> elements are promoted two steps, since the
<title> elements are artificially created from the section parameter, and the
label really refers to the sectioning construct.
"""
__version__ = '$Revision$'
import errno
import esistools
import re
import string
import sys
import xml.dom.core
import xml.dom.esis_builder
class ConversionError(Exception):
    """Raised when the document has structure the converter cannot handle.

    fixup_table() raises this for unexpected content inside a table.
    """
# Set to a true value to make the paragraph-fixing code (fixup_paras() and
# fixup_paras_helper()) write trace messages to sys.stderr.
DEBUG_PARA_FIXER = 0
# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return an Element for the last element child of the document.

    Every raw child is examined; when several element children exist the
    last one wins.  None is returned when there is no element child.
    """
    chosen = None
    for raw in self._node.children:
        if raw.type == xml.dom.core.ELEMENT:
            chosen = raw
    if chosen is None:
        return None
    return xml.dom.core.Element(chosen, self, self)

xml.dom.core.Document.get_documentElement = get_documentElement
# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    """Return the document's children as a NodeList owned by the document."""
    raw_children = self._node.children
    return xml.dom.core.NodeList(raw_children, self, self)

xml.dom.core.Document.get_childNodes = get_childNodes
def get_first_element(doc, gi):
    """Return the first direct child element of `doc` named `gi`, or None."""
    for candidate in doc.childNodes:
        if candidate.nodeType != xml.dom.core.ELEMENT:
            continue
        if candidate.tagName == gi:
            return candidate
    return None
def extract_first_element(doc, gi):
    """Detach and return the first direct child element of `doc` named `gi`.

    Returns None, leaving `doc` untouched, when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
def simplify(doc):
    """Pull stray top-level elements into the root element.

    The top-level <documentclass> is removed and its "classname" attribute
    becomes the name of the top-level <document> element.  A top-level
    <title> and all top-level <input> elements are removed and re-inserted,
    in that order and separated by newlines, at the start of the document
    element.  Leading text nodes of the document are then dropped.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    root_name = "document"
    hoisted = []
    classnode = extract_first_element(doc, "documentclass")
    if classnode is not None:
        root_name = classnode.getAttribute("classname")
    titlenode = extract_first_element(doc, "title")
    if titlenode is not None:
        hoisted.append(titlenode)
    # update the name of the root element
    rootnode = get_first_element(doc, "document")
    if rootnode is not None:
        rootnode._node.name = root_name
    inputnode = extract_first_element(doc, "input")
    while inputnode is not None:
        hoisted.append(inputnode)
        inputnode = extract_first_element(doc, "input")
    if hoisted:
        docelem = doc.documentElement
        # Each node is inserted at the front, so work backwards to keep the
        # original order.
        hoisted.reverse()
        for moved in hoisted:
            newline = doc.createTextNode("\n")
            docelem.insertBefore(newline, docelem.firstChild)
            docelem.insertBefore(moved, newline)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    while doc.firstChild.nodeType == xml.dom.core.TEXT:
        doc.removeChild(doc.firstChild)
def cleanup_root_text(doc):
    """Remove top-level text nodes, except one directly after a <COMMENT>.

    A text node is kept only when the node immediately before it is an
    element named "COMMENT".
    """
    doomed = []
    after_comment = 0
    for child in doc.childNodes:
        follows_comment = after_comment
        after_comment = 0
        kind = child.nodeType
        if kind == xml.dom.core.TEXT:
            if not follows_comment:
                doomed.append(child)
        elif kind == xml.dom.core.ELEMENT and child.tagName == "COMMENT":
            after_comment = 1
    # Removal is deferred so the child list is not modified while scanning.
    for child in doomed:
        doc.removeChild(child)
def rewrite_desc_entries(doc, argname_gi):
    """Move the siblings of each `argname_gi` element into a <description>.

    For every element named `argname_gi` found through
    doc.getElementsByTagName(), all children of its parent that are not
    themselves `argname_gi` elements are moved, in order, into a new
    <description> element appended to that parent.  An `argname_gi` element
    without children is removed; one with children is kept and preceded by
    a newline-and-indent text node.
    """
    for argnode in doc.getElementsByTagName(argname_gi):
        parent = argnode.parentNode
        # Collect first, move afterwards, so the child list is not modified
        # while it is being scanned.
        described = []
        for sibling in parent.childNodes:
            if sibling.nodeType != xml.dom.core.ELEMENT \
               or sibling.tagName != argname_gi:
                described.append(sibling)
        desc = doc.createElement("description")
        for sibling in described:
            parent.removeChild(sibling)
            desc.appendChild(sibling)
        # createTextNode() is the factory used by every other function in
        # this file (and the standard DOM name); use it here as well.
        if argnode.childNodes:
            # keep the <args>...</args>, newline & indent
            parent.insertBefore(doc.createTextNode("\n "), argnode)
        else:
            # no arguments, remove the <args/> node
            parent.removeChild(argnode)
        parent.appendChild(doc.createTextNode("\n "))
        parent.appendChild(desc)
        parent.appendChild(doc.createTextNode("\n"))
def handle_args(doc):
    """Run rewrite_desc_entries() for <args>, then for <constructor-args>."""
    for gi in ("args", "constructor-args"):
        rewrite_desc_entries(doc, gi)
def handle_appendix(doc):
    """Move everything after the <appendix> marker into <back-matter>.

    The children of the document element are scanned for the first one
    containing an <appendix> element.  That <appendix> element is removed
    and its former parent normalized.  All later children of the document
    element are moved into a new <back-matter> element appended to the
    document element; leading whitespace-only text nodes among them are
    dropped rather than moved.  Nothing is added when there are no later
    children.
    """
    # must be called after simplify() if document is multi-rooted to begin
    # with
    docelem = doc.documentElement
    seen_appendix = 0
    trailing = []
    for node in docelem.childNodes:
        if seen_appendix:
            trailing.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                seen_appendix = 1
                marker = appnodes[0]
                parent = marker.parentNode
                parent.removeChild(marker)
                parent.normalize()
    if not trailing:
        return
    # Explicit loops rather than map(): the calls are made only for their
    # side effects.
    for node in trailing:
        docelem.removeChild(node)
    docelem.appendChild(doc.createTextNode("\n\n\n"))
    back = doc.createElement("back-matter")
    docelem.appendChild(back)
    back.appendChild(doc.createTextNode("\n"))
    while trailing and trailing[0].nodeType == xml.dom.core.TEXT \
          and not string.strip(trailing[0].data):
        del trailing[0]
    for node in trailing:
        back.appendChild(node)
    docelem.appendChild(doc.createTextNode("\n"))
def handle_labels(doc):
    """Promote each <label id="..."> to an "id" attribute, then remove it.

    The id is set on the label's parent, or on the grandparent when the
    parent is a <title>.  Labels without an id are left in place.
    """
    for label in doc.getElementsByTagName("label"):
        label_id = label.getAttribute("id")
        if label_id:
            holder = label.parentNode
            target = holder
            if holder.tagName == "title":
                target = holder.parentNode
            target.setAttribute("id", label_id)
            # now, remove <label id="..."/> from parent:
            holder.removeChild(label)
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends selected elements.

    `wsmap` maps element names to the whitespace that should end them.  The
    tree below `doc` is walked breadth-first; for every element named in
    `wsmap` whose last child is a text node, that text is right-stripped
    and the mapped whitespace appended.  Elements with no children, or
    whose last child is not text, are left alone.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # Guard against empty elements: indexing an empty child list
            # would raise IndexError.
            if children and children[-1].nodeType == xml.dom.core.TEXT:
                last = children[-1]
                last.data = string.rstrip(last.data) + ws
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)
def normalize(doc):
    """Call normalize() on every direct child element of `doc`."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        child.normalize()
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    Only elements whose sole child is a text node are changed.  The search
    does not descend into an element whose name is in `element_names`.
    """
    wanted = {}
    for gi in element_names:
        wanted[gi] = gi
    pending = []
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT:
            pending.append(child)
    while pending:
        node = pending[0]
        del pending[0]
        if not wanted.has_key(node.tagName):
            # not one of ours; keep looking inside it
            for child in node.childNodes:
                if child.nodeType == xml.dom.core.ELEMENT:
                    pending.append(child)
            continue
        kids = node.childNodes
        if len(kids) != 1:
            continue
        only = kids[0]
        if only.nodeType != xml.dom.core.TEXT:
            continue
        text = only.data
        if text[-2:] == "()":
            only.data = text[:-2]
def contents_match(left, right):
    """Return 1 if `left` and `right` have equivalent children, else 0.

    Children are compared pairwise: elements must share a tag name and
    match recursively, text nodes must carry equal data.  Attributes are
    not compared, and any other node type counts as a mismatch.
    """
    lkids = left.childNodes
    rkids = right.childNodes
    count = len(lkids)
    if count != len(rkids):
        return 0
    for index in range(count):
        lnode = lkids[index]
        rnode = rkids[index]
        kind = lnode.nodeType
        if kind != rnode.nodeType:
            return 0
        if kind == xml.dom.core.ELEMENT:
            if lnode.tagName != rnode.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(lnode, rnode):
                return 0
        elif kind == xml.dom.core.TEXT:
            if lnode.data != rnode.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
def create_module_info(doc, section):
    """Collect a section's module metadata into a new <moduleinfo> element.

    Only direct children of `section` are examined.  If there is no
    <modulesynopsis> child the function returns without changing anything.
    Otherwise that element is removed, renamed to <synopsis> and stripped
    of one trailing "." in its last text node.  When `section` is a
    <section>, a <moduleinfo> is built from the <declaremodule> child (its
    "name" and optional "type" attributes), the synopsis and, when its
    contents differ from the synopsis, the <title>; the <moduleinfo> is
    then inserted into `section`.  A <title> that starts with a <module>
    element naming the declared module is renamed to <short-synopsis>
    first.  Diagnostics go to sys.stderr when the title is missing or does
    not match the declared module name.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == xml.dom.core.TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    if section.tagName == "section":
        # Index at which <moduleinfo> is inserted; reset to 0 below once
        # the title has been taken out of the section.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeType == xml.dom.core.ELEMENT \
               and children[0].tagName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                if children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                # NOTE(review): this drops whatever is now the section's
                # first child -- presumably the whitespace that surrounded
                # the title; confirm against real input.
                section.removeChild(section.childNodes[0])
                # drop the leading <module> element from the title
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                sys.stderr.write(
                    "module name in title doesn't match"
                    " <declaremodule>; no <short-synopsis>\n")
        else:
            sys.stderr.write(
                "Unexpected condition: <section> without <title>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
def cleanup_synopses(doc):
    """Run create_module_info() on every top-level <section> element."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == "section":
            create_module_info(doc, child)
def remap_element_names(root, name_map):
    """Rename elements below `root` according to `name_map`.

    `name_map` maps an element name to a (new name, attributes) pair; the
    attributes dictionary is applied to each renamed element with
    setAttribute().  All descendant elements are visited.
    """
    stack = []
    for child in root.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT:
            stack.append(child)
    while stack:
        current = stack.pop()
        old_name = current.tagName
        if name_map.has_key(old_name):
            new_name, new_attrs = name_map[old_name]
            current._node.name = new_name
            for attr_name, attr_value in new_attrs.items():
                current.setAttribute(attr_name, attr_value)
        for child in current.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                stack.append(child)
def fixup_table_structures(doc):
    """Run fixup_table() on every <table> below a top-level element."""
    # must be done after remap_element_names(), or the tables won't be found
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        for table in child.getElementsByTagName("table"):
            fixup_table(doc, table)
def fixup_table(doc, table):
    """Restructure `table` as <tgroup> containing <thead> and <tbody>.

    The table's direct <entry> children become a single header <row> in
    <thead>; its direct <row> children are moved into <tbody>.  A row
    directly followed (ignoring other nodes) by an <hline> gets
    rowsep="1".  Everything left in the table afterwards must be
    whitespace-only text or <hline> elements, which are discarded.

    Raises ConversionError if the table holds any other leftover content.
    """
    # create the table head
    thead = doc.createElement("thead")
    row = doc.createElement("row")
    move_elements_by_name(doc, table, row, "entry")
    thead.appendChild(doc.createTextNode("\n "))
    thead.appendChild(row)
    thead.appendChild(doc.createTextNode("\n "))
    # create the table body
    tbody = doc.createElement("tbody")
    prev_row = None
    children = table.childNodes
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            tagName = child.tagName
            if tagName == "hline" and prev_row is not None:
                prev_row.setAttribute("rowsep", "1")
            elif tagName == "row":
                prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest; `children` reflects each removal, so the loop ends
    # once the table is empty:
    while children:
        child = children[0]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in table")
        elif nodeType == xml.dom.core.ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(thead)
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(tbody)
    tgroup.appendChild(doc.createTextNode("\n "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
def fixup_row(doc, row):
    """Insert a newline-and-indent text node before every child of `row`
    except the first.
    """
    # Snapshot the children first: the row is modified while we insert.
    # An explicit loop is used instead of map() because the calls are made
    # purely for their side effect.
    entries = []
    for entry in row.childNodes[1:]:
        entries.append(entry)
    for entry in entries:
        row.insertBefore(doc.createTextNode("\n "), entry)
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct child elements of `source` named `name` to `dest`.

    The elements are appended to `dest` in their original order.  When
    `sep` is true, a text node containing `sep` is appended after each
    moved element.
    """
    movable = []
    for child in source.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName != name:
            continue
        movable.append(child)
    for element in movable:
        source.removeChild(element)
        dest.appendChild(element)
        if sep:
            dest.appendChild(doc.createTextNode(sep))
# Sectioning elements: fixup_paras() and fixup_paras_helper() process the
# children of these, and build_para() stops collecting a paragraph when it
# reaches one.
FIXUP_PARA_ELEMENTS = (
    "chapter",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph")
# Elements that stand on their own at paragraph level: they are skipped
# when looking for the start of a paragraph, and build_para() stops
# collecting a paragraph when it reaches one.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "opcodedesc",
    "verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc",
    "funcdescni", "methoddescni", "excdescni", "datadescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor",
    # include <para>, so we can just do it again to get subsequent paras:
    "para",
    )
# Elements that are skipped when they come before the first content of a
# paragraph (see fixup_paras_helper() and skip_leading_nodes()).
PARA_LEVEL_PRECEEDERS = (
    "index", "indexii", "indexiii", "indexiv",
    "stindex", "obindex", "COMMENT", "label",
    )
def fixup_paras(doc):
    """Group loose content into <para> elements throughout the document.

    First, every top-level element named in FIXUP_PARA_ELEMENTS is handed
    to fixup_paras_helper().  Then every <description> element found below
    any top-level element is handed to fixup_paras_helper() as well.
    """
    # Take an explicit list of the top-level elements so that the
    # <description> pass does not depend on which node a loop variable
    # happened to be bound to last.
    roots = []
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT:
            roots.append(child)
    for root in roots:
        if root.tagName in FIXUP_PARA_ELEMENTS:
            fixup_paras_helper(doc, root)
    for root in roots:
        for description in root.getElementsByTagName("description"):
            if DEBUG_PARA_FIXER:
                sys.stderr.write("-- Fixing up <description> element...\n")
            fixup_paras_helper(doc, description)
def fixup_paras_helper(doc, container):
    """Wrap loose content of `container` in <para> elements.

    The children are scanned to find the index range [start:i] that may
    need wrapping.  If the range is non-empty after skip_leading_nodes()
    has trimmed it, build_para() creates one paragraph and this function
    calls itself again on the same container to handle what follows.
    """
    # document is already normalized
    children = container.childNodes
    # start: index of the first child that may go into a paragraph.
    # start_fixed: true once real content has been seen; `start` no longer
    #   moves after that.
    # i: number of children scanned.
    start = 0
    start_fixed = 0
    i = 0
    SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            if child.tagName in FIXUP_PARA_ELEMENTS:
                # nested sectioning element: fix it up recursively and stop
                # scanning this container
                fixup_paras_helper(doc, child)
                break
            elif child.tagName in SKIP_ELEMENTS:
                # still before any content: the paragraph starts after this
                if not start_fixed:
                    start = i + 1
            elif not start_fixed:
                # any other element counts as content
                start_fixed = 1
            i = i + 1
        else:
            # non-whitespace text counts as content
            if child.nodeType == xml.dom.core.TEXT \
               and string.strip(child.data) and not start_fixed:
                start_fixed = 1
            i = i + 1
    if DEBUG_PARA_FIXER:
        sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
                         % (container.tagName, start, i))
    if i > start:
        # the first [start:i] children should be rewritten as <para> elements
        # start by breaking text nodes that contain \n\n+ into multiple nodes
        nstart, i = skip_leading_nodes(container.childNodes, start, i)
        if i > nstart:
            build_para(doc, container, nstart, i)
            fixup_paras_helper(doc, container)
def build_para(doc, parent, start, i):
    """Move a run of `parent`'s children, beginning at `start`, into a new
    <para> element.

    Children from index `start` up to (not including) `i` are examined.
    The run ends early at an element named in PARA_LEVEL_ELEMENTS or
    FIXUP_PARA_ELEMENTS, or at a text node containing "\\n\\n"; such a text
    node is split at that position unless the blank line is at its very
    start.  Trailing whitespace of a final text node is split off so it
    stays outside the paragraph.
    """
    children = parent.childNodes
    # collect all children until \n\n+ is found in a text node or a
    # PARA_LEVEL_ELEMENT is found.
    # after: index one past the last child that goes into the paragraph.
    after = start + 1
    # have_last: true when the paragraph runs to the end of the examined
    # range without leaving anything behind; the <para> is then appended
    # to the parent instead of being inserted at `start`.
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == xml.dom.core.TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                # blank line right at the start: this node is not included
                after = j
                break
            if pos >= 1:
                # keep the text before the blank line; the remainder goes
                # into the node created by the split
                child.splitText(pos)
                break
    else:
        # the loop ran to completion without hitting a break
        have_last = 1
    if children[after - 1].nodeType == xml.dom.core.TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    # fetch the child list again now that nodes may have been split
    children = parent.childNodes
    para = doc.createElement("para")
    prev = None
    # Move the nodes from the highest index down so the lower indexes stay
    # valid; each node is inserted in front of the one moved before it.
    # NOTE(review): the first insertBefore() gets prev=None, which
    # presumably appends -- confirm against the DOM implementation.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = children[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
    else:
        parent.insertBefore(para, parent.childNodes[start])
def skip_leading_nodes(children, start, i):
    """Advance `start` past nodes that should not begin a paragraph.

    Comment nodes, whitespace-only text nodes and elements named in
    PARA_LEVEL_ELEMENTS or PARA_LEVEL_PRECEEDERS are skipped.  A text node
    with leading whitespace followed by content is split at the boundary.
    `i` is first clamped to len(children).

    Returns the adjusted (start, i) pair.  An IndexError from indexing
    `children` is reported on sys.stderr and re-raised.
    """
    i = min(i, len(children))
    while i > start:
        # skip over leading comments and whitespace:
        try:
            child = children[start]
        except IndexError:
            sys.stderr.write(
                "skip_leading_nodes() failed at index %d\n" % start)
            raise
        nodeType = child.nodeType
        if nodeType == xml.dom.core.COMMENT:
            start = start + 1
        elif nodeType == xml.dom.core.TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
                if data != shortened:
                    # break into two nodes: whitespace and non-whitespace
                    child.splitText(len(data) - len(shortened))
                    # step over the whitespace node; both indexes grow by
                    # one, presumably because the split added a sibling
                    return start + 1, i + 1
                # text already starts with content; nothing to skip
                break
            # all whitespace, just skip
            start = start + 1
        elif nodeType == xml.dom.core.ELEMENT:
            if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                start = start + 1
            else:
                break
        else:
            break
    return start, i
def fixup_rfc_references(doc):
    """Append the text "RFC <num>" to every <rfc> element.

    <rfc> elements are looked up below each top-level element; <num> is
    the value of the element's "num" attribute.
    """
    found = []
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        for rfc in child.getElementsByTagName("rfc"):
            found.append(rfc)
    for rfc in found:
        label = "RFC " + rfc.getAttribute("num")
        rfc.appendChild(doc.createTextNode(label))
def fixup_signatures(doc):
    """Flatten <optional> markup in every argument list.

    For each top-level element, every <args> and then every
    <constructor-args> element below it is passed to fixup_args() and
    normalized afterwards, so the "[" and "]" text nodes that fixup_args()
    inserts are merged with the neighbouring text in both kinds of list.
    """
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        for gi in ("args", "constructor-args"):
            for arglist in child.getElementsByTagName(gi):
                fixup_args(doc, arglist)
                arglist.normalize()
def fixup_args(doc, arglist):
    """Replace each <optional> child of `arglist` by "[" + contents + "]".

    The children of an <optional> element are moved into `arglist` in its
    place, between two new text nodes.  The list is rescanned after every
    replacement, so <optional> elements that were nested inside another
    one are handled too.
    """
    while 1:
        optional = None
        for child in arglist.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT \
               and child.tagName == "optional":
                optional = child
                break
        if optional is None:
            return None
        # found one; splice its contents into the argument list
        arglist.insertBefore(doc.createTextNode("["), optional)
        kids = optional.childNodes
        while kids:
            first = kids[0]
            optional.removeChild(first)
            arglist.insertBefore(first, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
# Attribute values matching this pattern are written with type TOKEN by
# write_esis(); all others are written as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of `doc` to `ofp`, recursively.

    Output lines, as produced below: "e" before an element for which
    knownempty(name) is true, "A<name> <type> <value>" per attribute,
    "(<name>" and ")<name>" around an element's content, and "-<data>" for
    text.  Attribute values and text are passed through
    esistools.encode().

    Raises ValueError if an element declared empty has children and
    RuntimeError for any node that is neither an element nor text.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
            continue
        if kind != xml.dom.core.ELEMENT:
            raise RuntimeError("unsupported node type: %s" % kind)
        gi = node.tagName
        if knownempty(gi):
            if node.hasChildNodes():
                raise ValueError("declared-empty node has children")
            ofp.write("e\n")
        for attr_name, attr in node.attributes.items():
            value = attr.value
            dtype = "CDATA"
            if _token_rx.match(value):
                dtype = "TOKEN"
            ofp.write("A%s %s %s\n"
                      % (attr_name, dtype, esistools.encode(value)))
        ofp.write("(%s\n" % gi)
        write_esis(node, ofp, knownempty)
        ofp.write(")%s\n" % gi)
def convert(ifp, ofp):
    """Read a document from `ifp`, apply all fixups, and write it to `ofp`.

    The whole of `ifp` is fed to an esistools.ExtendedEsisBuilder; the
    resulting document is passed through the fixup functions in the order
    below (several of them depend on running after earlier ones) and then
    written with write_esis().

    An IOError whose first argument is errno.EPIPE during output is
    ignored; any other IOError propagates.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    normalize(doc)
    fixup_paras(doc)
    remap_element_names(doc, {
        "tableii": ("table", {"cols": "2"}),
        "tableiii": ("table", {"cols": "3"}),
        "tableiv": ("table", {"cols": "4"}),
        "lineii": ("row", {}),
        "lineiii": ("row", {}),
        "lineiv": ("row", {}),
        })
    fixup_table_structures(doc)
    fixup_rfc_references(doc)
    fixup_signatures(doc)
    #
    # Build the "is this element declared empty?" predicate.  <rfc> is
    # taken out because fixup_rfc_references() has just given every <rfc>
    # a text child, and write_esis() rejects declared-empty elements that
    # have children.
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    if d.has_key("rfc"):
        del d["rfc"]
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.  The exception is inspected
        # here rather than unpacked in the except clause, so an IOError
        # that does not carry exactly two arguments is re-raised as itself
        # instead of triggering a secondary unpacking error.
        args = sys.exc_info()[1].args
        if len(args) < 1 or args[0] != errno.EPIPE:
            raise
def usage():
    """Write a one-line usage summary to sys.stderr."""
    sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])


def main():
    """Command-line entry point.

    With no arguments, convert sys.stdin to sys.stdout.  With one
    argument, read from the named file and write to sys.stdout.  With two,
    read from the first named file and write to the second.  Any other
    argument count prints a usage message and exits with status 2.
    """
    if len(sys.argv) == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(sys.argv) == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif len(sys.argv) == 3:
        ifp = open(sys.argv[1])
        ofp = open(sys.argv[2], "w")
    else:
        usage()
        sys.exit(2)
    convert(ifp, ofp)


if __name__ == "__main__":
    main()