mirror of
https://github.com/python/cpython.git
synced 2025-07-23 11:15:24 +00:00

LaTeX and we have at least one occurrence of that in the content, so this script needs to support it as well.
555 lines
19 KiB
Python
Executable file
555 lines
19 KiB
Python
Executable file
#! /usr/bin/env python
|
||
|
||
"""Generate ESIS events based on a LaTeX source document and
|
||
configuration data.
|
||
|
||
The conversion is not strong enough to work with arbitrary LaTeX
|
||
documents; it has only been designed to work with the highly stylized
|
||
markup used in the standard Python documentation. A lot of
|
||
information about specific markup is encoded in the control table
|
||
passed to the convert() function; changing this table can allow this
|
||
tool to support additional LaTeX markups.
|
||
|
||
The format of the table is largely undocumented; see the commented
|
||
headers where the table is specified in main(). There is no provision
|
||
to load an alternate table from an external file.
|
||
"""
|
||
|
||
import errno
|
||
import getopt
|
||
import os
|
||
import re
|
||
import string
|
||
import sys
|
||
import UserList
|
||
import xml.sax.saxutils
|
||
|
||
from types import ListType, StringType, TupleType
|
||
|
||
try:
|
||
from xml.parsers.xmllib import XMLParser
|
||
except ImportError:
|
||
from xmllib import XMLParser
|
||
|
||
|
||
from esistools import encode
|
||
|
||
|
||
# Debug level.  main() adds one for every -D/--debug option given; when it
# is non-zero dbgmsg() writes its messages to stderr and new_stack() hands
# out a _Stack instead of a plain list.
DEBUG = 0
|
||
|
||
|
||
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be converted."""
|
||
|
||
|
||
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the element stack.

    The name of the environment being closed is kept in ``found`` and a
    copy of the stack at that moment in ``stack``.
    """

    def __init__(self, found, stack):
        self.found = found
        # Slice so the exception keeps its own copy of the stack contents.
        self.stack = stack[:]
        LaTeXFormatError.__init__(
            self,
            "environment close for %s doesn't match;\n stack = %s"
            % (found, stack))
|
||
|
||
|
||
# Regular expressions used to tokenize the LaTeX input.  Every use in this
# file goes through .match(), i.e. they are applied at the start of the
# remaining text.

# \begin{name} and \end{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# \name or \name*; group 1 is the macro name, group 2 is either an opening
# brace or a (possibly empty) run of whitespace.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# One or more "%", the comment text up to the newline (group 1), and any
# blanks that start the following line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: no "]", "~", "%", backslash or braces.
_text_rx = re.compile(r"[^]~%\\{}]+")
# An optional [...] argument; group 1 is the text between the brackets.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# Attribute values matching this are written as TOKEN, all others as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Opening "{" or "[" preceded by optional blanks/newlines.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters that are written out as plain text when they follow a backslash.
ESCAPED_CHARS = "$%#^ {}&~"
|
||
|
||
|
||
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
|
||
|
||
def pushing(name, point, depth):
    """Emit a debug trace for opening element *name* at *point*.

    *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
|
||
|
||
def popping(name, point, depth):
    """Emit a debug trace for closing element *name* at *point*.

    *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
|
||
|
||
|
||
class _Stack(UserList.UserList):
|
||
def append(self, entry):
|
||
if type(entry) is not StringType:
|
||
raise LaTeXFormatError("cannot push non-string on stack: "
|
||
+ `entry`)
|
||
#dbgmsg("%s<%s>" % (" "*len(self.data), entry))
|
||
self.data.append(entry)
|
||
|
||
def pop(self, index=-1):
|
||
entry = self.data[index]
|
||
del self.data[index]
|
||
#dbgmsg("%s</%s>" % (" "*len(self.data), entry))
|
||
|
||
def __delitem__(self, index):
|
||
entry = self.data[index]
|
||
del self.data[index]
|
||
#dbgmsg("%s</%s>" % (" "*len(self.data), entry))
|
||
|
||
|
||
def new_stack():
    """Return an empty element stack.

    A plain list is used normally; when DEBUG is set a _Stack is returned
    instead.
    """
    if not DEBUG:
        return []
    return _Stack()
|
||
|
||
|
||
class Conversion:
    """Convert a LaTeX document into a stream of ESIS events.

    The complete input is read when the object is created; convert() then
    walks through it and writes events using ``ofp.write``.  *table* maps
    macro and environment names to entry objects (see TableEntry); names
    that are not in the table get a default entry added on first use.
    """

    def __init__(self, ifp, ofp, table):
        """Read all of *ifp*; output will go to *ofp* using *table*."""
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is stripped from every input line and the
        # lines are rejoined with single newlines.
        self.line = "\n".join([text.rstrip() for text in ifp.readlines()])
        self.preamble = 1

    def convert(self):
        """Convert the whole input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input until *endchar* or the end of the data.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        When *endchar* is found the remaining input, still starting with
        that character, is stored in self.line and returned.  When the
        input runs out an empty string is returned.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) when the
        markup cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its checks and to register a default entry;
                # the entry itself is looked up again by the macro handler.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    self.combining_char("c", line[endpos])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end{...}
                    # is copied through as text.
                    terminator = "\\end{%s}" % macroname
                    pos = line.find(terminator)
                    if pos < 0:
                        # Without this check the -1 would be used as a
                        # slice index and silently mangle the input.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % terminator)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(terminator):]
                    continue
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname:
                    if entry.empty:
                        self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname
                if params:
                    line = line[m.end(1):]
                elif empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Skip the opening brace, convert the group,
                            # then skip the closing brace.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # line[:1] rather than line[0]: the input may
                            # end right after the macro name.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced '}': " + repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                stack.append("")
                line = line[1:]
                continue
            # The length checks below let a truncated escape at the very
            # end of the input reach the "could not identify markup"
            # error instead of failing with an IndexError.
            if line[0] == "\\" and len(line) > 1 \
               and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"') and len(line) > 2:
                # combining characters...
                self.combining_char(line[1], line[2])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...  Return a string so
        # that callers which slice the result keep working.
        return ""

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent *prefix* plus *char*.

        Raises KeyError for a pair that is not in _combinations.
        """
        ordinal = self._combinations[(prefix, char)]
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for *name*.

        *optional* is the ``optional`` flag of the first parameter, or the
        (empty) parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for *name*, creating a default if needed.

        The default entry is a macro with a single content parameter; it
        is stored in the table so later lookups return the same object.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%s) failing; building default entry!"
                   % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment *name*.

        An unknown name gets a default environment entry with implied
        content.  Raises LaTeXFormatError if *name* is defined in the
        table but not as an environment.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for parameter *pentry* with *value*.

        Nothing is written when the parameter has no name or the value is
        empty.  The declared type is TOKEN when the value matches
        _token_rx and CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
|
||
|
||
|
||
def convert(ifp, ofp, table):
    """Convert the LaTeX document read from *ifp* to ESIS events on *ofp*.

    *table* is the conversion table handed to Conversion.  An IOError
    whose errno is EPIPE is ignored; any other IOError propagates
    unchanged.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Look at the errno attribute instead of unpacking the exception:
        # unpacking fails for an IOError that does not carry exactly two
        # arguments and would hide the real error.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
|
||
|
||
|
||
def skip_white(line):
    """Return *line* without its leading whitespace and "%" characters.

    "%" is skipped along with blanks, tabs, newlines and carriage
    returns; everything from the first other character on is returned
    unchanged.
    """
    while line and line[0] in " %\n\t\r":
        line = line[1:].lstrip()
    return line
|
||
|
||
|
||
|
||
class TableEntry:
    """Conversion-table record for a single LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # The output name starts out equal to the LaTeX name.
        self.name = self.outputname = name
        self.environment = environment
        # An entry starts out "empty" unless it describes an environment.
        self.empty = not environment
        self.has_content = self.verbatim = self.auto_close = 0
        # Every instance gets its own fresh lists.
        self.parameters = []
        self.closes = []
        self.endcloses = []
|
||
|
||
class Parameter:
    """Description of one parameter of a macro or environment.

    The values of ``type`` used in this file are "attribute", "child",
    "content", "text" and "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        # Filled in later by whoever builds the conversion table.
        self.text = ''
        self.implied = 0
|
||
|
||
|
||
class TableParser(XMLParser):
    """Build a conversion table from its XML description.

    The start_*/end_* methods handle the <environment>, <macro>,
    <attribute>, <entityref>, <child>, <content> and <text> elements
    (presumably dispatched by the XMLParser base class by element name).
    Finished entries are stored in the table under their LaTeX name.
    """

    def __init__(self, table=None):
        """Start with *table* (a dict), or with an empty table if None."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None
        self.__buffer = ''
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving content to environments lacking it.

        Every environment entry without content gets an implied content
        parameter appended before the table is returned.
        """
        for entry in self.__table.values():
            if entry.environment and not entry.has_content:
                p = Parameter("content")
                p.implied = 1
                entry.parameters.append(p)
                entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        """Begin a new environment entry from the element's attributes."""
        entry = TableEntry(attrs["name"], environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        if "outputname" in attrs:
            entry.outputname = attrs.get("outputname")
        entry.endcloses = attrs.get("endcloses", "").split()
        self.__current = entry

    def end_environment(self):
        """Store the finished environment entry in the table."""
        self.end_macro()

    def start_macro(self, attrs):
        """Begin a new macro entry from the element's attributes."""
        entry = TableEntry(attrs["name"])
        entry.closes = attrs.get("closes", "").split()
        if "outputname" in attrs:
            entry.outputname = attrs.get("outputname")
        self.__current = entry

    def end_macro(self):
        """Store the finished entry in the table under its name."""
        self.__table[self.__current.name] = self.__current
        self.__current = None

    def start_attribute(self, attrs):
        """Add an attribute parameter and start collecting its text."""
        optional = attrs.get("optional") == "yes"
        # A missing or empty name is stored as None.
        p = Parameter("attribute", attrs.get("name") or None,
                      optional=optional)
        self.__current.parameters.append(p)
        self.__buffer = ''

    def end_attribute(self):
        """Use the collected character data as the attribute's text."""
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        """Add an entity-reference parameter."""
        p = Parameter("entityref", attrs["name"])
        self.__current.parameters.append(p)

    def start_child(self, attrs):
        """Add a child-element parameter; the entry is no longer empty."""
        p = Parameter("child", attrs["name"], attrs.get("optional") == "yes")
        self.__current.parameters.append(p)
        self.__current.empty = 0

    def start_content(self, attrs):
        """Add a content parameter; always implied for environments."""
        p = Parameter("content")
        p.implied = attrs.get("implied") == "yes"
        if self.__current.environment:
            p.implied = 1
        self.__current.parameters.append(p)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        """Start collecting literal text; the entry is no longer empty."""
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        """Add a text parameter holding the collected character data."""
        p = Parameter("text")
        p.text = self.__buffer
        self.__current.parameters.append(p)

    def handle_data(self, data):
        """Accumulate character data for the element being read."""
        self.__buffer = self.__buffer + data
|
||
|
||
|
||
def load_table(fp, table=None):
    """Parse the XML table description in *fp* and return the table.

    *table*, when given, is passed to TableParser as the table to start
    from.
    """
    builder = TableParser(table=table)
    xml_text = fp.read()
    builder.feed(xml_text)
    builder.close()
    return builder.get_table()
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Usage: latex2esis [-D|--debug] [input-file [output-file]]

    Input defaults to stdin and output to stdout.  The conversion table
    is read from 'conversion.xml' in the directory named by sys.path[0].
    With more than two file arguments a usage message is written to
    stderr and the program exits with status 2.
    """
    global DEBUG
    #
    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        sys.stderr.write(
            "usage: %s [-D|--debug] [input-file [output-file]]\n"
            % os.path.basename(sys.argv[0]))
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if args:
        # open() needs the file name, not the whole argument list.
        ifp = open(args[0])
    if len(args) == 2:
        ofp = open(args[1], "w")

    table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
    try:
        table = load_table(table_fp)
    finally:
        table_fp.close()
    convert(ifp, ofp, table)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|