# Perforce Defect Tracking Integration Project
#
#
# CHECK_XHTML.PY -- CHECK XHTML DOCUMENT
#
# Gareth Rees, Ravenbrook Limited, 2001-04-30
#
#
# 1. INTRODUCTION
#
# This Python script checks that XHTML documents conform to the XHTML
# 1.0 Transitional specification [XHTML 1.0] and to the Ravenbrook
# document rules [Rules].
#
# The intended readership is Ravenbrook staff.
#
# This document is not confidential.
#
#
# 1.1. Use
#
# This module is intended for use in two circumstances. It may be run
# from the command line, passing a list of paths. It then checks those
# paths and writes its output to stderr. This is convenient for use in
# Emacs under M-x compile, for then the errors can be browsed using
# `next-error' (C-x `).
#
# It may also be run from other Python programs, which should
# instantiate an object from the checker class, and call the check()
# method, passing appropriate paths, or the check_stream() method,
# passing a file stream. When constructing the checker object, you may
# supply an error stream object with a write() method: all error
# messages will be written to this object and you may then divert them
# for example to unittest's fail() method.
import dircache
import getopt
import os
import re
import string
import sys
import types
import xml.sax
# 2. XHTML DEFINITION
#
# This section defines a bunch of tables that describe the XHTML 1.0
# Transitional document type [XHTML 1.0 DTD]. It would be nice to
# generate these tables automatically by parsing the document type
# definition, but as far as I know there's no DTD parser for XML, and I
# didn't want to write one just for the purpose of checking one document
# type.
#
# Instead, these tables have been derived mechanically (typically using
# Emacs Lisp) from the XHTML DTD [XHTML 1.0 DTD].
#
# In any case, there are a number of constraints on XHTML documents that
# are not specified in the DTD; see [XHTML 1.0]. So parsing the DTD
# wouldn't be the whole story.
#
#
# 2.1. Useful element contents
#
# These variables contain lists of elements: they will be used in the
# element definitions [2.2] to define the set of elements that may
# legally appear in the content of each XHTML element.
# Leaf groups: plain lists of element names.  Each group is written as a
# string of whitespace-separated names and split into a list, so every
# name below is one list item.
elt_special = 'br span bdo object applet img map iframe'.split()
elt_fontstyle = 'tt i b big small u s strike font basefont'.split()
elt_phrase = ('em strong dfn code q sub sup samp kbd var cite abbr '
              'acronym').split()
elt_inline_forms = 'input select textarea label button'.split()
elt_misc = 'ins del script noscript'.split()
elt_heading = 'h1 h2 h3 h4 h5 h6'.split()
elt_lists = 'ul ol dl menu dir'.split()
elt_blocktext = 'pre hr blockquote address center noframes'.split()
elt_head_misc = 'script style meta link object isindex'.split()

# Composite groups: concatenations of the leaf groups above.  Every
# concatenation builds a new list, so no two names share a list object.
# '#PCDATA' stands for character data appearing in the content.
elt_inline = (
    ['a']
    + elt_special
    + elt_fontstyle
    + elt_phrase
    + elt_inline_forms
)
elt_Inline = ['#PCDATA'] + elt_inline + elt_misc
elt_block = (
    ['p', 'div', 'isindex', 'fieldset', 'table']
    + elt_heading
    + elt_lists
    + elt_blocktext
)
elt_Block = elt_block + ['form'] + elt_misc
elt_Flow = ['#PCDATA', 'form'] + elt_block + elt_inline + elt_misc

# Content lists used by particular elements.
elt_a_content = (
    ['#PCDATA']
    + elt_special
    + elt_fontstyle
    + elt_phrase
    + elt_inline_forms
    + elt_misc
)
elt_pre_content = (
    '#PCDATA a br span bdo map tt i b u s'.split()
    + elt_phrase
    + elt_inline_forms
)
elt_form_content = ['#PCDATA'] + elt_block + elt_inline + elt_misc
elt_button_content = (
    '#PCDATA p div table br span bdo object applet img map'.split()
    + elt_heading
    + elt_lists
    + elt_blocktext
    + elt_fontstyle
    + elt_phrase
    + elt_misc
)
# 2.2. Element definitions
#
# The legal_elements dictionary maps the name of an XHTML element to the
# list of elements that are legal members of that element.
# Build the element table group by group: elements with the same content
# list are filled in together, and the one-off content lists are added
# at the end.
legal_elements = {}

# Elements with no legal members.  Each one gets its own fresh empty
# list, so that no two entries share a list object.
for _elt in ('area', 'base', 'basefont', 'br', 'col', 'hr', 'img',
             'input', 'isindex', 'link', 'meta', 'param'):
    legal_elements[_elt] = []

# Elements whose content is the shared elt_Inline list.
for _elt in ('abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'caption',
             'cite', 'code', 'dfn', 'dt', 'em', 'font',
             'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
             'i', 'kbd', 'label', 'legend', 'p', 'q', 's', 'samp',
             'small', 'span', 'strike', 'strong', 'sub', 'sup', 'tt',
             'u', 'var'):
    legal_elements[_elt] = elt_Inline

# Elements whose content is the shared elt_Flow list.
for _elt in ('blockquote', 'body', 'center', 'dd', 'del', 'div',
             'iframe', 'ins', 'li', 'noframes', 'noscript', 'td', 'th'):
    legal_elements[_elt] = elt_Flow

# Containers that hold only list items.
for _elt in ('dir', 'menu', 'ol', 'ul'):
    legal_elements[_elt] = ['li']

# Elements that hold only character data.
for _elt in ('option', 'script', 'style', 'textarea', 'title'):
    legal_elements[_elt] = ['#PCDATA']

# Table sections that hold only rows.
for _elt in ('tbody', 'tfoot', 'thead'):
    legal_elements[_elt] = ['tr']

# Elements that accept character data, forms, block, inline and misc
# elements plus one extra child element of their own.
for _elt, _extra in (('applet', 'param'),
                     ('fieldset', 'legend'),
                     ('object', 'param')):
    legal_elements[_elt] = (['#PCDATA', _extra, 'form'] + elt_block
                            + elt_inline + elt_misc)

# Elements whose content list is not shared with any other element.
legal_elements.update({
    'a': elt_a_content,
    'button': elt_button_content,
    'colgroup': ['col'],
    'dl': ['dt', 'dd'],
    'form': elt_form_content,
    'head': elt_head_misc + ['title', 'base'],
    'html': ['head', 'body'],
    'map': ['form', 'area'] + elt_block + elt_misc,
    'optgroup': ['option'],
    'pre': elt_pre_content,
    'select': ['optgroup', 'option'],
    'table': ['caption', 'col', 'colgroup', 'thead', 'tfoot', 'tbody',
              'tr'],
    'tr': ['th', 'td'],
})

# Remove the loop variables so the module namespace gains no new names.
del _elt, _extra
# 2.3. Nonempty elements
#
# The nonempty_element array is a list of elements that may not be empty
# (that is, they must have at least one element in their contents: for
# example