#             Perforce Defect Tracking Integration Project
#              <http://www.ravenbrook.com/project/p4dti/>
#
#                CHECK_XHTML.PY -- CHECK XHTML DOCUMENT
#
#             Gareth Rees, Ravenbrook Limited, 2001-04-30
#
#
# 1. INTRODUCTION
#
# This Python script checks that XHTML documents conform to the XHTML 1.0
# Transitional specification [XHTML 1.0] and to the Ravenbrook document rules
# [Rules].
#
# The intended readership is Ravenbrook staff.
#
# This document is not confidential.
#
#
# 1.1. Use
#
# This module is intended for use in two circumstances.  It may be run
# from the command line, passing a list of paths.  It then checks those
# paths and writes its output to stderr.  This is convenient for use in
# Emacs under M-x compile, for then the errors can be browsed using
# `next-error' (C-x `).
#
# It may also be run from other Python programs, which should
# instantiate an object from the checker class, and call the check()
# method, passing appropriate paths, or the check_stream() method,
# passing a file stream.  When constructing the checker object, you may
# supply an error stream object with a write() method: all error
# messages will be written to this object and you may then divert them
# for example to unittest's fail() method.

import dircache
import getopt
import os
import re
import string
import sys
import types
import xml.sax


# 2. XHTML DEFINITION
#
# This section defines a bunch of tables that describe the XHTML 1.0
# Transitional document type [XHTML 1.0 DTD].  It would be nice to generate
# these tables automatically by parsing the document type definition, but as
# far as I know there's no DTD parser for XML, and I didn't want to write one
# just for the purpose of checking one document type.
#
# Instead, these tables have been derived mechanically (typically using Emacs
# Lisp) from the XHTML DTD [XHTML 1.0 DTD].
#
# In any case, there are a number of constraints on XHTML documents that are
# not specified in the DTD; see [XHTML 1.0].  So parsing the DTD wouldn't be
# the whole story.
#
#
# 2.1. Useful element contents
#
# These variables contain lists of elements: they will be used in the element
# definitions [2.2] to define the set of elements that may legally appear in
# the content of each XHTML element.

# Each list mirrors one parameter entity of the XHTML 1.0 Transitional DTD.
# Leaf groups are written as whitespace-separated names; composite groups
# are concatenations of the leaf groups, in the order the DTD gives them.

elt_special = 'br span bdo object applet img map iframe'.split()
elt_fontstyle = 'tt i b big small u s strike font basefont'.split()
elt_phrase = ('em strong dfn code q sub sup samp kbd var cite abbr '
              'acronym').split()
elt_inline_forms = 'input select textarea label button'.split()
elt_misc = 'ins del script noscript'.split()

# %inline; and %Inline; (the latter also admits character data and %misc;).
elt_inline = (['a'] + elt_special + elt_fontstyle + elt_phrase
              + elt_inline_forms)
elt_Inline = ['#PCDATA'] + elt_inline + elt_misc

elt_heading = 'h1 h2 h3 h4 h5 h6'.split()
elt_lists = 'ul ol dl menu dir'.split()
elt_blocktext = 'pre hr blockquote address center noframes'.split()

# %block;, %Block; and %Flow;.
elt_block = ('p div isindex fieldset table'.split() + elt_heading
             + elt_lists + elt_blocktext)
elt_Block = elt_block + ['form'] + elt_misc
elt_Flow = ['#PCDATA', 'form'] + elt_block + elt_inline + elt_misc

# Content models with exclusions: <a> (no nested <a>), <pre>, <form> and
# <button>.
elt_a_content = (['#PCDATA'] + elt_special + elt_fontstyle + elt_phrase
                 + elt_inline_forms + elt_misc)
elt_pre_content = ('#PCDATA a br span bdo map tt i b u s'.split()
                   + elt_phrase + elt_inline_forms)
elt_form_content = ['#PCDATA'] + elt_block + elt_inline + elt_misc
elt_button_content = (
    '#PCDATA p div table br span bdo object applet img map'.split()
    + elt_heading + elt_lists + elt_blocktext + elt_fontstyle + elt_phrase
    + elt_misc)

elt_head_misc = 'script style meta link object isindex'.split()


# 2.2. Elements definitions
#
# The legal_elements dictionary maps the name of an XHTML element to the list
# of elements that are legal members of that element.

# Keys are element names; each value is the list of child element names
# (plus '#PCDATA' where character data is allowed) that may appear directly
# inside that element.  Elements declared EMPTY in the DTD map to [].

legal_elements = {
    'a': elt_a_content,
    'abbr': elt_Inline,
    'acronym': elt_Inline,
    # In the Transitional DTD <address> is %Inline; content plus <p>:
    # (#PCDATA | %inline; | %misc; | p)*.
    'address': elt_Inline + ['p'],
    'applet': ['#PCDATA', 'param', 'form'] + elt_block + elt_inline + elt_misc,
    'area': [],
    'b': elt_Inline,
    'base': [],
    'basefont': [],
    'bdo': elt_Inline,
    'big': elt_Inline,
    'blockquote': elt_Flow,
    'body': elt_Flow,
    'br': [],
    'button': elt_button_content,
    'caption': elt_Inline,
    'center': elt_Flow,
    'cite': elt_Inline,
    'code': elt_Inline,
    'col': [],
    'colgroup': ['col'],
    'dd': elt_Flow,
    'del': elt_Flow,
    'dfn': elt_Inline,
    'dir': ['li'],
    'div': elt_Flow,
    'dl': ['dt', 'dd'],
    'dt': elt_Inline,
    'em': elt_Inline,
    'fieldset': ['#PCDATA', 'legend', 'form'] + elt_block + elt_inline + elt_misc,
    'font': elt_Inline,
    'form': elt_form_content,
    'h1': elt_Inline,
    'h2': elt_Inline,
    'h3': elt_Inline,
    'h4': elt_Inline,
    'h5': elt_Inline,
    'h6': elt_Inline,
    'head': elt_head_misc + ['title', 'base'],
    'hr': [],
    'html': ['head', 'body'],
    'i': elt_Inline,
    'iframe': elt_Flow,
    'img': [],
    'input': [],
    'ins': elt_Flow,
    'isindex': [],
    'kbd': elt_Inline,
    'label': elt_Inline,
    'legend': elt_Inline,
    'li': elt_Flow,
    'link': [],
    'map': ['form', 'area'] + elt_block + elt_misc,
    'menu': ['li'],
    'meta': [],
    'noframes': elt_Flow,
    'noscript': elt_Flow,
    'object': ['#PCDATA', 'param', 'form'] + elt_block + elt_inline + elt_misc,
    'ol': ['li'],
    'optgroup': ['option'],
    'option': ['#PCDATA'],
    'p': elt_Inline,
    'param': [],
    'pre': elt_pre_content,
    'q': elt_Inline,
    's': elt_Inline,
    'samp': elt_Inline,
    'script': ['#PCDATA'],
    'select': ['optgroup', 'option'],
    'small': elt_Inline,
    'span': elt_Inline,
    'strike': elt_Inline,
    'strong': elt_Inline,
    'style': ['#PCDATA'],
    'sub': elt_Inline,
    'sup': elt_Inline,
    'table': ['caption', 'col', 'colgroup', 'thead', 'tfoot', 'tbody', 'tr'],
    'tbody': ['tr'],
    'td': elt_Flow,
    'textarea': ['#PCDATA'],
    'tfoot': ['tr'],
    'th': elt_Flow,
    'thead': ['tr'],
    'title': ['#PCDATA'],
    'tr': ['th', 'td'],
    'tt': elt_Inline,
    'u': elt_Inline,
    'ul': ['li'],
    'var': elt_Inline,
    }


# 2.3. Nonempty elements
#
# The nonempty_elements list names the elements that may not be empty (that
# is, they must have at least one element in their contents): for example <ul>
# must contain at least one <li>, and <optgroup> must contain at least one
# <option>.

# Elements whose DTD content model requires at least one child element.
nonempty_elements = [
    'dir', 'dl',
    'head', 'html',
    'map', 'menu',
    'ol', 'optgroup',
    'select',
    'table', 'tbody', 'tfoot', 'thead', 'tr',
    'ul',
    ]


# 2.4. Other constraints
#
# There are constraints on <head>, <html>, <map> and <table> which are not
# specified in [2.2] and [2.3] (that is, the constraints can't be expressed as
# a combination of "these elements are legal contents" and "the contents must
# not be empty").  The constraints dictionary maps element name to a pair
# consisting of the element definition from [XHTML 1.0 DTD], and a regular
# expression that matches legal contents of the element, when the content
# elements have '<' appended and are joined together.  The reason for using '<'
# as the terminator is that it may not appear in an element name.

# Building blocks for the regular expressions below.  Each child element name
# is followed by '<' when the contents are matched.
_head_misc_re = "(script<|style<|meta<|link<|object<|isindex<)*"
_map_block_re = ("(p<|div<|isindex<|fieldset<|table<|h1<|h2<|h3<|h4<|h5<|h6<"
                 "|ul<|ol<|dl<|menu<|dir<|pre<|hr<|blockquote<|address<"
                 "|center<|noframes<|form<|ins<|del<|script<|noscript<)+")

# Maps element name to (content model quoted from the DTD, regular expression
# that the '<'-terminated sequence of child element names must match).
constraints = {
    'html': ( "(head, body)",
              "^head<body<$" ),
    'head': ( "(%head.misc;, ((title, %head.misc;, (base, %head.misc;)?) | (base, %head.misc;, (title, %head.misc;))))",
              "^" + _head_misc_re
              + "(title<" + _head_misc_re + "(base<)?"
              + "|base<" + _head_misc_re + "title<)"
              + _head_misc_re + "$" ),
    # The two alternatives are grouped so that '^' and '$' anchor both of
    # them: without the outer group, contents that merely begin with a block
    # element (for example "p<area<") would be accepted.
    'map': ( "((%block; | form | %misc;)+ | area+)",
             "^(" + _map_block_re + "|(area<)+)$" ),
    'table': ( "(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))",
               "^(caption<)?((col<)*|(colgroup<)*)(thead<)?(tfoot<)?((tbody<)+|(tr<)+)$" ),
    }

# The illegal_ancestors dictionary is a map from element name to a list of
# elements that the element may not be found in (no matter how deep in the
# document tree).  This list is derived from [XHTML 1.0, B].

# Element name -> elements it must not appear inside, at any depth.
illegal_ancestors = dict(
    # <a> may not be nested.
    a=['a'],
    # Elements excluded from <pre>.
    big=['pre'],
    img=['pre'],
    object=['pre'],
    small=['pre'],
    sub=['pre'],
    sup=['pre'],
    # Elements excluded from <button>.
    button=['button'],
    fieldset=['button'],
    iframe=['button'],
    input=['button'],
    isindex=['button'],
    select=['button'],
    textarea=['button'],
    # <form> and <label> are excluded from <button> and may not be nested.
    form=['button', 'form'],
    label=['button', 'label'],
    )


# 2.5. Attribute definitions
#
# These variables define sets of attributes that are common to a number of
# elements.  The variables will be used to help build the attributes table in
# [2.6] below.

# Every entry is a (NAME, TYPE, DISPOSITION) triple; all the attributes in
# these shared groups are optional (disposition 0).

# Horizontal and vertical alignment of table cells.
attrs_cellhalign = [
    ('align', ['left', 'center', 'right', 'justify', 'char'], 0),
    ('char', 'Character', 0),
    ('charoff', 'Length', 0),
    ]
attrs_cellvalign = [('valign', ['top', 'middle', 'bottom', 'baseline'], 0)]

# Core attributes common to most elements.
attrs_coreattrs = [
    ('id', 'ID', 0),
    ('class', 'Class', 0),
    ('style', 'StyleSheet', 0),
    ('title', 'Text', 0),
    ]

# Intrinsic mouse and keyboard event handlers: all are optional scripts.
attrs_events = [
    ('on' + _event, 'Script', 0)
    for _event in ('click', 'dblclick', 'mousedown', 'mouseup', 'mouseover',
                   'mousemove', 'mouseout', 'keypress', 'keydown', 'keyup')
    ]

# Attributes of elements that can receive the input focus.
attrs_focus = [
    ('accesskey', 'Character', 0),
    ('tabindex', 'Number', 0),
    ('onfocus', 'Script', 0),
    ('onblur', 'Script', 0),
    ]

# Internationalization attributes.
attrs_i18n = [
    ('lang', 'LanguageCode', 0),
    ('xml:lang', 'LanguageCode', 0),
    ('dir', ['ltr', 'rtl'], 0),
    ]

# Text alignment for <div>, <p> and the headings.
attrs_TextAlign = [('align', ['left', 'center', 'right'], 0)]

# The usual combination: core attributes, i18n attributes and events.
attrs_attrs = attrs_coreattrs + attrs_i18n + attrs_events


# 2.6. Attributes
#
# This dictionary maps element name to a list of legal attributes for that
# element.  Each member of the list is a triple (NAME, TYPE, DISPOSITION).
# NAME is the name of the attribute.  TYPE is either a string naming the type
# of the attribute value, or a list of strings which are the valid values for
# the attribute.  DISPOSITION is either 0 (meaning optional), 1 (meaning
# required), or a string which is the default value for the attribute.

# Enumerated attribute types from the DTD.  An attribute whose TYPE is one of
# these lists must have a value that is a member of the list.
type_CAlign = ['top', 'bottom', 'left', 'right']
type_ImgAlign = ['top', 'middle', 'bottom', 'left', 'right']
type_InputType = ['text', 'password', 'checkbox', 'radio', 'submit', 'reset',
                  'file', 'hidden', 'image', 'button']
type_LAlign = ['top', 'bottom', 'left', 'right']
type_Scope = ['row', 'col', 'rowgroup', 'colgroup']
type_Shape = ['rect', 'circle', 'poly', 'default']
type_TAlign = ['left', 'center', 'right']
type_TFrame = ['void', 'above', 'below', 'hsides', 'lhs', 'rhs', 'vsides', 'box', 'border']
type_TRules = ['none', 'groups', 'rows', 'cols', 'all']

# Element name -> list of (NAME, TYPE, DISPOSITION) triples.  The align
# attributes of <legend> and <table>, and the type attribute of <input>, use
# the enumerated lists above so that their values are actually validated (a
# TYPE given as a string is only checked when attribute_checkers has an entry
# for it).
attributes = {
    'a': attrs_attrs + attrs_focus + [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 0),
        ('name', 'NMTOKEN', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('target', 'FrameTarget', 0),
        ],
    'abbr': attrs_attrs,
    'acronym': attrs_attrs,
    'address': attrs_attrs,
    'applet': attrs_coreattrs + [
        ('codebase', 'URI', 0),
        ('archive', 'CDATA', 0),
        ('code', 'CDATA', 0),
        ('object', 'CDATA', 0),
        ('alt', 'Text', 0),
        ('name', 'NMTOKEN', 0),
        ('width', 'Length', 1),
        ('height', 'Length', 1),
        ('align', type_ImgAlign, 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'area': attrs_attrs + attrs_focus + [
        ('shape', type_Shape, "rect"),
        ('coords', 'Coords', 0),
        ('href', 'URI', 0),
        ('nohref', ['nohref'], 0),
        ('alt', 'Text', 1),
        ('target', 'FrameTarget', 0),
        ],
    'b': attrs_attrs,
    'base': [
        ('href', 'URI', 0),
        ('target', 'FrameTarget', 0),
        ],
    'basefont': [
        ('id', 'ID', 0),
        ('size', 'CDATA', 1),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
        ],
    'bdo': attrs_coreattrs + attrs_events + [
        ('lang', 'LanguageCode', 0),
        ('xml:lang', 'LanguageCode', 0),
        ('dir', ['ltr', 'rtl'], 1),
        ],
    'big': attrs_attrs,
    'blockquote': attrs_attrs + [
        ('cite', 'URI', 0),
        ],
    'body': attrs_attrs + [
        ('onload', 'Script', 0),
        ('onunload', 'Script', 0),
        ('background', 'URI', 0),
        ('bgcolor', 'Color', 0),
        ('text', 'Color', 0),
        ('link', 'Color', 0),
        ('vlink', 'Color', 0),
        ('alink', 'Color', 0),
        ],
    'br': attrs_coreattrs + [
        ('clear', ['left', 'all', 'right', 'none'], "none"),
        ],
    'button': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('type', ['button', 'submit', 'reset'], "submit"),
        ('disabled', ['disabled'], 0),
        ],
    'caption': attrs_attrs + [
        ('align', type_CAlign, 0),
        ],
    'center': attrs_attrs,
    'cite': attrs_attrs,
    'code': attrs_attrs,
    'col': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
        ],
    'colgroup': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('span', 'Number', "1"),
        ('width', 'MultiLength', 0),
        ],
    'dd': attrs_attrs,
    'del': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
        ],
    'dfn': attrs_attrs,
    'dir': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'div': attrs_attrs + attrs_TextAlign,
    'dl': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'dt': attrs_attrs,
    'em': attrs_attrs,
    'fieldset': attrs_attrs,
    'font': attrs_coreattrs + attrs_i18n + [
        ('size', 'CDATA', 0),
        ('color', 'Color', 0),
        ('face', 'CDATA', 0),
        ],
    'form': attrs_attrs + [
        ('action', 'URI', 1),
        ('method', ['get', 'post'], "get"),
        ('name', 'NMTOKEN', 0),
        ('enctype', 'ContentType',  "application/x-www-form-urlencoded"),
        ('onsubmit', 'Script', 0),
        ('onreset', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('accept-charset', 'Charsets', 0),
        ('target', 'FrameTarget', 0),
        ],
    'h1': attrs_attrs + attrs_TextAlign,
    'h2': attrs_attrs + attrs_TextAlign,
    'h3': attrs_attrs + attrs_TextAlign,
    'h4': attrs_attrs + attrs_TextAlign,
    'h5': attrs_attrs + attrs_TextAlign,
    'h6': attrs_attrs + attrs_TextAlign,
    'head': attrs_i18n + [
        ('profile', 'URI', 0),
        ],
    'hr': attrs_attrs + [
        ('align', ['left','center','right'], 0),
        ('noshade', ['noshade'], 0),
        ('size', 'Pixels', 0),
        ('width', 'Length', 0),
        ],
    'html': attrs_i18n + [
        ('xmlns', 'URI', 'http://www.w3.org/1999/xhtml'),
        ],
    'i': attrs_attrs,
    'iframe': attrs_coreattrs + [
        ('longdesc', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('src', 'URI', 0),
        ('frameborder', ['1','0'], "1"),
        ('marginwidth', 'Pixels', 0),
        ('marginheight', 'Pixels', 0),
        ('scrolling', ['yes','no','auto'], "auto"),
        ('align', type_ImgAlign, 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ],
    'img': attrs_attrs + [
        ('src', 'URI', 1),
        ('alt', 'Text', 1),
        ('name', 'NMTOKEN', 0),
        ('longdesc', 'URI', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('ismap', ['ismap'], 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Length', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'input': attrs_attrs + attrs_focus + [
        ('type', type_InputType, "text"),
        ('name', 'CDATA', 0),
        ('value', 'CDATA', 0),
        ('checked', ['checked'], 0),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('size', 'CDATA', 0),
        ('maxlength', 'Number', 0),
        ('src', 'URI', 0),
        ('alt', 'CDATA', 0),
        ('usemap', 'URI', 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
        ('accept', 'ContentTypes', 0),
        ('align', type_ImgAlign, 0),
        ],
    'ins': attrs_attrs + [
        ('cite', 'URI', 0),
        ('datetime', 'Datetime', 0),
        ],
    'isindex': attrs_coreattrs + attrs_i18n + [
        ('prompt', 'Text', 0),
        ],
    'kbd': attrs_attrs,
    'label': attrs_attrs + [
        ('for', 'IDREF', 0),
        ('accesskey', 'Character', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
        ],
    'legend': attrs_attrs + [
        ('accesskey', 'Character', 0),
        ('align', type_LAlign, 0),
        ],
    'li': attrs_attrs + [
        ('type', 'LIStyle', 0),
        ('value', 'Number', 0),
        ],
    'link': attrs_attrs + [
        ('charset', 'Charset', 0),
        ('href', 'URI', 0),
        ('hreflang', 'LanguageCode', 0),
        ('type', 'ContentType', 0),
        ('rel', 'LinkTypes', 0),
        ('rev', 'LinkTypes', 0),
        ('media', 'MediaDesc', 0),
        ('target', 'FrameTarget', 0),
        ],
    'map': attrs_i18n + attrs_events + [
        ('id', 'ID', 1),
        ('class', 'CDATA', 0),
        ('style', 'StyleSheet', 0),
        ('title', 'Text', 0),
        ('name', 'CDATA', 0),
        ],
    'menu': attrs_attrs + [
        ('compact', ['compact'], 0),
        ],
    'meta': attrs_i18n + [
        ('http-equiv', 'CDATA', 0),
        ('name', 'CDATA', 0),
        ('content', 'CDATA', 1),
        ('scheme', 'CDATA', 0),
        ],
    'noframes': attrs_attrs,
    'noscript': attrs_attrs,
    'object': attrs_attrs + [
        ('declare', ['declare'], 0),
        ('classid', 'URI', 0),
        ('codebase', 'URI', 0),
        ('data', 'URI', 0),
        ('type', 'ContentType', 0),
        ('codetype', 'ContentType', 0),
        ('archive', 'UriList', 0),
        ('standby', 'Text', 0),
        ('height', 'Length', 0),
        ('width', 'Length', 0),
        ('usemap', 'URI', 0),
        ('name', 'NMTOKEN', 0),
        ('tabindex', 'Number', 0),
        ('align', type_ImgAlign, 0),
        ('border', 'Pixels', 0),
        ('hspace', 'Pixels', 0),
        ('vspace', 'Pixels', 0),
        ],
    'ol': attrs_attrs + [
        ('type', ['1', 'a', 'A', 'i', 'I'], 0),
        ('compact', ['compact'], 0),
        ('start', 'Number', 0),
        ],
    'optgroup': attrs_attrs + [
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 1),
        ],
    'option': attrs_attrs + [
        ('selected', ['selected'], 0),
        ('disabled', ['disabled'], 0),
        ('label', 'Text', 0),
        ('value', 'CDATA', 0),
        ],
    'p': attrs_attrs + attrs_TextAlign,
    'param': [
        ('id', 'ID', 0),
        ('name', 'CDATA', 1),
        ('value', 'CDATA', 0),
        ('valuetype', ['data', 'ref', 'object'], "data"),
        ('type', 'ContentType', 0),
        ],
    'pre': attrs_attrs + [
        ('width', 'Number', 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'q': attrs_attrs + [
        ('cite', 'URI', 0),
        ],
    's': attrs_attrs,
    'samp': attrs_attrs,
    'script': [
        ('charset', 'Charset', 0),
        ('type', 'ContentType', 1),
        ('language', 'CDATA', 0),
        ('src', 'URI', 0),
        ('defer', ['defer'], 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'select': attrs_attrs + [
        ('name', 'CDATA', 0),
        ('size', 'Number', 0),
        ('multiple', ['multiple'], 0),
        ('disabled', ['disabled'], 0),
        ('tabindex', 'Number', 0),
        ('onfocus', 'Script', 0),
        ('onblur', 'Script', 0),
        ('onchange', 'Script', 0),
        ],
    'small': attrs_attrs,
    'span': attrs_attrs,
    'strike': attrs_attrs,
    'strong': attrs_attrs,
    'style': attrs_i18n + [
        ('type', 'ContentType', 1),
        ('media', 'MediaDesc', 0),
        ('title', 'Text', 0),
        ('xml:space', ['preserve'], 'preserve'),
        ],
    'sub': attrs_attrs,
    'sup': attrs_attrs,
    'table': attrs_attrs + [
        ('summary', 'Text', 0),
        ('width', 'Length', 0),
        ('border', 'Pixels', 0),
        ('frame', type_TFrame, 0),
        ('rules', type_TRules, 0),
        ('cellspacing', 'Length', 0),
        ('cellpadding', 'Length', 0),
        ('align', type_TAlign, 0),
        ('bgcolor', 'Color', 0),
        ],
    'tbody': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'td': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
        ],
    'textarea': attrs_attrs + attrs_focus + [
        ('name', 'CDATA', 0),
        ('rows', 'Number', 1),
        ('cols', 'Number', 1),
        ('disabled', ['disabled'], 0),
        ('readonly', ['readonly'], 0),
        ('onselect', 'Script', 0),
        ('onchange', 'Script', 0),
        ],
    'tfoot': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'th': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('abbr', 'Text', 0),
        ('axis', 'CDATA', 0),
        ('headers', 'IDREFS', 0),
        ('scope', type_Scope, 0),
        ('rowspan', 'Number', "1"),
        ('colspan', 'Number', "1"),
        ('nowrap', ['nowrap'], 0),
        ('bgcolor', 'Color', 0),
        ('width', 'Pixels', 0),
        ('height', 'Pixels', 0),
        ],
    'thead': attrs_attrs + attrs_cellhalign + attrs_cellvalign,
    'title': attrs_i18n,
    'tr': attrs_attrs + attrs_cellhalign + attrs_cellvalign + [
        ('bgcolor', 'Color', 0),
        ],
    'tt': attrs_attrs,
    'u': attrs_attrs,
    'ul': attrs_attrs + [
        ('type', ['disc', 'square', 'circle'], 0),
        ('compact', ['compact'], 0),
        ],
    'var': attrs_attrs,
    }

# legal_attributes[ELEMENT][NAME] is the (TYPE, DISPOSITION) pair for the
# attribute NAME of ELEMENT: the same information as the attributes table,
# indexed for direct lookup.  The loop variables are underscore-prefixed so
# that this module-level loop does not rebind the builtin 'type' or leave
# generic names such as 'name' behind as module globals.

legal_attributes = {}

for (_element, _attr_list) in attributes.items():
    _by_name = {}
    for (_attr_name, _attr_type, _attr_disposition) in _attr_list:
        _by_name[_attr_name] = (_attr_type, _attr_disposition)
    legal_attributes[_element] = _by_name

# The recommended_attributes dictionary maps element name to a list of
# attributes that are recommended by [Chisholm 2000-11-06] and [HTML 4.01].

# Element name -> attributes that ought to be present on that element.
recommended_attributes = dict([
    ('abbr', ['title']),
    ('acronym', ['title']),
    ('applet', ['alt']),
    ('area', ['alt']),
    ('img', ['height', 'width']),
    ('input', ['alt']),
    ])


# 2.7. Attribute checkers
#
# The attribute_checkers dictionary maps attribute type to a function of one
# argument that checks that an attribute value is legal.  See [HTML 4.01, 6].

# The sixteen colour names that are accepted in addition to '#' followed by
# hexadecimal digits.
_color_names = ['black', 'green', 'silver', 'lime', 'gray', 'olive', 'white',
                'yellow', 'maroon', 'navy', 'red', 'blue', 'purple', 'teal',
                'fuchsia', 'aqua']

# Characters allowed after the first letter of an ID, IDREF or NMTOKEN.  The
# hyphen is written last so that it cannot be misread as part of a range.
_name_char = "[A-Za-z0-9_:.-]"


def _matcher(pattern):
    """Return a function of one argument that matches it against pattern.

    The returned function gives a match object (true) when the value
    matches from its start, and None (false) otherwise.
    """
    return re.compile(pattern).match


def _check_color(v):
    """Return a true value if v is '#' plus hex digits or a colour name."""
    return re.match("^#[0-9A-Fa-f]+$", v) or v.lower() in _color_names


# Attribute type -> function of one argument (the attribute value) whose
# result is true if and only if the value is legal for that type.  Types
# without an entry here are not checked.
attribute_checkers = {
    'Character': _matcher(r"^.$"),
    'Color': _check_color,
    'Datetime': _matcher(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}"
                         r"(Z|[+-]\d{2}:\d{2})$"),
    'ID': _matcher("^[A-Za-z]" + _name_char + "*$"),
    'IDREF': _matcher("^[A-Za-z]" + _name_char + "*$"),
    'IDREFS': _matcher("^[A-Za-z](" + _name_char + "| +[A-Za-z])*$"),
    'Length': _matcher(r"^\d+%?$"),
    'MultiLength': _matcher(r"^\d+[%*]?$"),
    'NMTOKEN': _matcher("^[A-Za-z]" + _name_char + "*$"),
    'Number': _matcher(r"^\d+$"),
    'Pixels': _matcher(r"^\d+$"),
    }


# 3. CHECK XHTML
#
# The error_sets dictionary defines sets of error messages.  'accessibility'
# errors violate rules in [Chisholm 2000-11-06]; 'ravenbrook' errors violate
# rules in [Rules]; 'xhtml-1.0' errors violate rules in [XHTML 1.0].

# Name of a rule set -> the error codes that belong to it.
error_sets = {
    'accessibility': [26],
    'ravenbrook': [5, 6, 17, 18, 19, 20, 23, 21, 25],
    'xhtml-1.0': [
        1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 15, 16, 22, 27,
        ],
    }

class handler(xml.sax.handler.ContentHandler):
    """Check one XHTML document while a SAX parser reads it.

    An instance is used both as the content handler and as the error
    handler of the parse (see check()).  Each problem found is written to
    the error stream as "DOC(LINE) [CODE] MESSAGE"; problems whose code is
    listed in noerrors are suppressed.  Parser errors are written as
    "DOC(LINE) MESSAGE" and are never suppressed.
    """

    class exception(Exception):
        """Raised by fatalError() to abandon the parse; caught by check().

        This is a real exception class because string exceptions can no
        longer be raised or caught.
        """

    locator = None
    current_heading_level = 0
    current_section = None
    error_stream = None

    def __init__(self, doc, noerrors = None, error_stream = sys.stderr):
        """Prepare to check one document.

        doc is the document name used as the prefix of every message.
        noerrors is a list of error codes that must not be reported
        (default: report everything).  error_stream is an object with a
        write() method that receives the messages.
        """
        if noerrors is None:
            noerrors = []
        self.doc = doc
        self.noerrors = noerrors
        # stack[-1] collects the names of the children seen so far in the
        # innermost open element; stack[0] collects the top-level elements.
        self.stack = [[]]
        # stack2 holds an (element name, line number) pair for each open
        # element, outermost first.
        self.stack2 = []
        self.ids = {}
        self.nocase_ids = {} # Lower-case keys; values are line numbers.
        self.cross_refs = []
        self.refs = {}
        self.idrefs = []
        self.error_stream = error_stream

    def err(self, code, msg):
        """Report error 'code' with text 'msg' at the parser's current line."""
        self.write_error(code, msg, self.locator.getLineNumber())

    def write_error(self, code, msg, line):
        """Write an error message for 'line' unless 'code' is suppressed."""
        if code not in self.noerrors:
            msg = "%s(%d) [%d] %s\n" % (self.doc, line, code, msg)
            self.error_stream.write(msg)

    def setDocumentLocator(self, locator):
        """Remember the parser's locator, used to obtain line numbers."""
        self.locator = locator

    def startElement(self, element, attrs):
        """Check an opening tag and record it on the stacks."""
        if self.stack2:
            parent = self.stack2[-1][0]
        else:
            parent = None
        self.stack2.append((element, self.locator.getLineNumber()))
        self.stack[-1].append(element)
        self.stack.append([])

        # Check that element is legal in XHTML.  Nothing else can be
        # checked for an unknown element.
        if element not in legal_elements:
            self.err(1, "<%s> element is not legal in XHTML 1.0 Transitional."
                     % element)
            return

        self._check_placement(element, parent)
        self._check_attributes(element, attrs)
        self._check_document_rules(element, parent, attrs)

    def _check_placement(self, element, parent):
        """Check that element may appear at this point in the document."""
        if (parent and parent in legal_elements
            and element not in legal_elements[parent]):
            self.err(2, "Element <%s> appears in <%s> (not allowed)."
                     % (element, parent))
        if not parent and element != 'html':
            self.err(3, "Top-level element is <%s> (should be <html>)."
                     % element)
        if element in illegal_ancestors:
            # Every open element except the one just pushed is an ancestor.
            for (ancestor, line) in self.stack2[0:-1]:
                if ancestor in illegal_ancestors[element]:
                    self.err(16, "Element <%s> appears below <%s> on line %d "
                             "(not allowed)." % (element, ancestor, line))

    def _check_attributes(self, element, attrs):
        """Check the names, values and presence of element's attributes."""
        allowed = legal_attributes[element]
        for (attr, value) in attrs.items():
            # Check that the attribute is legal for the element.
            if attr not in allowed:
                self.err(4, "Attribute '%s' is not allowed in <%s>."
                         % (attr, element))
                continue

            # Check that the value for the attribute is legal.  A list
            # type enumerates the legal values; a string type is checked
            # only if it has an entry in attribute_checkers.
            attr_type, disposition = allowed[attr]
            if isinstance(attr_type, list):
                if value not in attr_type:
                    self.err(10, "Attribute '%s' for element <%s> has value "
                             "'%s' (must be one of [%s])."
                             % (attr, element, value,
                                ', '.join(attr_type)))
            elif attr_type in attribute_checkers:
                if not attribute_checkers[attr_type](value):
                    self.err(11, "Attribute '%s' for element <%s> has illegal "
                             "value '%s' (not a %s)."
                             % (attr, element, value, attr_type))

            # Check that id is unique (ignoring case) within the document.
            if attr == 'id':
                lowered = value.lower()
                if lowered in self.nocase_ids:
                    self.err(12, "Duplicate id '%s' (original on line %d)."
                             % (value, self.nocase_ids[lowered]))
                self.ids[value] = 1
                self.nocase_ids[lowered] = self.locator.getLineNumber()

            # Remember IDREF and IDREFS for checking in endDocument().
            if attr_type in ['IDREF', 'IDREFS']:
                for idref in re.split(" +", value):
                    self.idrefs.append((idref, self.locator.getLineNumber()))

        # Check that required attributes are present.
        for (attr, (attr_type, disposition)) in allowed.items():
            if disposition == 1 and attr not in attrs.keys():
                self.err(13, "Attribute '%s' is required for <%s> but not "
                         "present." % (attr, element))

        # Check that recommended attributes are present.
        if element in recommended_attributes:
            for attr in recommended_attributes[element]:
                if attr not in attrs:
                    self.err(26, "Attribute '%s' is recommended for <%s> but "
                             "not present." % (attr, element))

    def _check_document_rules(self, element, parent, attrs):
        """Check the document rules and record cross-reference data."""
        has_id = 'id' in attrs
        has_name = 'name' in attrs

        # <td valign="..."> is deprecated: should set same valign for all cells
        # in the row using <tr valign="...">.
        if element in ['td','th'] and 'valign' in attrs:
            self.err(20, "<%s> has valign attribute: better in the <tr>."
                     % element)

        # Test against rule xhtml/id.
        if element == 'a' and has_id and not has_name:
            self.err(5, "<%s> element has 'id' attribute but no 'name' "
                     "attribute." % element)
        if element == 'a' and has_name and not has_id:
            self.err(6, "<%s> element has 'name' attribute but no 'id' "
                     "attribute." % element)
        if has_name and has_id and attrs['name'] != attrs['id']:
            self.err(7, "<%s> element has id '%s' but name '%s'."
                     % (element, attrs['id'], attrs['name']))

        # Test against rule xhtml/section.  Remember current section number.
        if element == 'a' and parent in ['h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(parent[1])
            if not has_id:
                self.err(17, "Section anchor has no 'id' attribute.")
            elif re.match("section-", attrs['id']):
                # An <h2> anchor is "section-N"; each deeper heading level
                # adds one more ".N" component.
                depth = max(0, level - 2)
                ref_re = "section-(\\w+" + "\\.[0-9]+" * depth + ")$"
                ref_example = "section-1" + ".1" * depth
                match = re.match(ref_re, attrs['id'])
                if not match:
                    self.err(18, "Anchor for <%s> has id '%s': should look "
                             "like '%s'." % (parent, attrs['id'], ref_example))
                else:
                    self.current_section = match.group(1)

        # Check (some of) rule xhtml/ref-id: anchors in table cells of
        # section A.
        if (self.current_section == 'A' and len(self.stack2) >= 4
            and element == 'a' and self.stack2[-2][0] == 'td'
            and self.stack2[-3][0] == 'tr' and self.stack2[-4][0] == 'table'):
            if has_id and not re.match("ref-", attrs['id']):
                self.err(21, "Reference anchor has id '%s': should start with "
                         "'ref-'." % attrs['id'])
            # Remember the target of the reference so that we can check the
            # rule xhtml/ref-link in the endDocument() method.
            if has_id and 'href' in attrs:
                self.refs[attrs['id']] = attrs['href']

        # Remember cross-refs for later checking against the set of ids.
        if (element == 'a' and 'href' in attrs and attrs['href']
            and attrs['href'][0] == '#'):
            self.cross_refs.append((attrs['href'][1:],
                                    self.locator.getLineNumber()))

        # Check ordering of sections (<h3> can't follow <h1>).
        if element in elt_heading:
            level = int(element[1])
            if level > self.current_heading_level + 1:
                self.err(25, "<%s> follows <h%d>."
                         % (element, self.current_heading_level))
            self.current_heading_level = level

    def characters(self, content):
        """Check that character data is allowed in the current element."""
        if len(self.stack2) > 0:
            element = self.stack2[-1][0]
            # Check that non-blank character data is part of an element only
            # when #PCDATA is legal contents.
            if (element in legal_elements
                and '#PCDATA' not in legal_elements[element]
                and not re.match('^\\s+$', content)):
                self.err(8, "<%s> element contains character data '%s'."
                         % (element, content))

    def endElement(self, element):
        """Check the complete contents of an element as it is closed."""
        contents = self.stack[-1]

        # Check that required contents are present.
        if not contents and element in nonempty_elements:
            self.err(9, "<%s> element is empty." % element)

        # Extra constraints.  See [2.4].
        if contents and element in constraints:
            description, pattern = constraints[element]
            if not re.match(pattern, '<'.join(contents) + '<'):
                self.err(15, "Contents of <%s> element [%s] doesn't match "
                         "XHTML specification %s."
                         % (element, ', '.join(contents), description))

        # Section headings (except <h1>) must have anchors so they can be
        # referred to (rule xhtml/section).
        if element in ['h2', 'h3', 'h4', 'h5', 'h6'] and 'a' not in contents:
            self.err(19, "<%s> has no section anchor." % element)

        # Pop the stacks.  The element being closed must be the last child
        # recorded for its parent.
        self.stack2.pop()
        self.stack.pop()
        assert self.stack[-1][-1] == element

    def endDocument(self):
        """Check references that can only be resolved at end of document."""
        # Check that cross-references have a target.
        for (target, line) in self.cross_refs:
            if target not in self.ids:
                self.write_error(22, "Cross-reference '#%s' has no target."
                                 % target, line)

            # Check the xhtml/ref-link rule.
            elif target in self.refs:
                self.write_error(23, "Cross-reference '#%s' to references "
                                 "section should link to target '%s' instead."
                                 % (target, self.refs[target]), line)

        # Check that IDREFs have a target.
        for (idref, line) in self.idrefs:
            if idref not in self.ids:
                self.write_error(27, "IDREF '%s' has no target." % idref,
                                 line)

    def error(self, exception):
        """Write a parser error to the error stream."""
        line = exception.getLineNumber()
        message = exception.getMessage()
        # This unfortunately depends on the XML parser producing this
        # error.
        if message == 'mismatched tag' and self.stack2:
            message = ("Mismatched closing tag (opening tag was <%s> "
                       "at line %d)." % self.stack2[-1])
        self.error_stream.write("%s(%d) %s\n" % (self.doc, line, message))

    def fatalError(self, exception):
        """Report a fatal parser error, then abandon the parse."""
        self.error(exception)
        raise self.exception("Fatal XML parsing error.")

    def warning(self, exception):
        """Report a parser warning in the same way as an error."""
        self.error(exception)

    def check(self, path_or_stream):
        """Parse and check the document at path_or_stream.

        This object is passed to xml.sax.parse() as both the content
        handler and the error handler.
        """
        try:
            xml.sax.parse(path_or_stream, self, self)
        except self.exception:
            # fatalError() has already written the message.
            pass


# 4. CHECK DIRECTORIES AND FILES

class checker:
    """Check XHTML documents found by path, or supplied as a stream.

    skip          names of directory entries that check() ignores while
                  walking a directory.
    noerrors      passed through unchanged to each handler; presumably
                  the error numbers to suppress (run() supplies a list
                  of ints) -- confirm against the handler class.
    error_stream  object with a write() method; passed through to each
                  handler for its messages.
    """

    # Class-level fallbacks; __init__ always replaces them per instance.
    noerrors = []
    skip = []

    def __init__(self, skip = None, noerrors = None,
                 error_stream = sys.stderr):
        # Use None rather than [] as the default: a list written in the
        # signature is created once and would be shared (and could be
        # modified) by every checker built without explicit arguments.
        if skip is None:
            skip = []
        if noerrors is None:
            noerrors = []
        self.skip = skip
        self.noerrors = noerrors
        self.error_stream = error_stream

    def _has_xml_declaration(self, path):
        # Return true if the first line of the file starts with '<?xml'.
        # The file is closed explicitly instead of being left for the
        # garbage collector.
        stream = open(path)
        try:
            first_line = stream.readline()
        finally:
            stream.close()
        return re.match(r"<\?xml", first_line) is not None

    def check(self, path):
        """Check the file at 'path', or everything below it if it is a
        directory.

        Directory entries whose name appears in self.skip are ignored
        (the skip list is not applied to 'path' itself).  A file is only
        checked if its name ends in '.html' and its first line starts
        with an XML declaration; anything else is silently passed over.
        """
        if os.path.isdir(path):
            # os.listdir() returns entries in arbitrary order; sort them
            # so that files are checked, and errors reported, in a
            # stable order.
            names = os.listdir(path)
            names.sort()
            for name in names:
                if name not in self.skip:
                    self.check(os.path.join(path, name))
        elif (os.path.isfile(path) and re.search(r"\.html$", path)
              and self._has_xml_declaration(path)):
            handler(path, self.noerrors, self.error_stream).check(path)

    def check_stream(self, name, stream):
        """Check the document read from 'stream'.

        'name' is passed to the handler as the document name; no file
        name or content filtering is applied.
        """
        handler(name, self.noerrors, self.error_stream).check(stream)


# 5. COMMAND-LINE INTERFACE

def run():
    """Command-line entry point.

    Options (each takes a comma-separated list and may be repeated):

      -s, --skip     directory entry names for the checker to skip
      -n, --noerror  integers, passed to the checker as 'noerrors'

    Every remaining argument is a path to check.
    """
    short_options = 's:n:'
    long_options = ['skip=', 'noerror=']
    options, targets = getopt.getopt(sys.argv[1:], short_options,
                                     long_options)
    skipped_names = []
    suppressed = []
    for (flag, value) in options:
        if flag in ('-s', '--skip'):
            skipped_names.extend(value.split(','))
        if flag in ('-n', '--noerror'):
            suppressed.extend([int(field) for field in value.split(',')])
    xhtml_checker = checker(skipped_names, suppressed)
    for target in targets:
        xhtml_checker.check(target)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module by another program.
if __name__ == "__main__":
    run()


# A. REFERENCES
#
# [Chisholm 2000-11-06] "HTML Techniques for Web Content Accessibility
# Guidelines 1.0"; Wendy Chisholm, Gregg Vanderheiden, Ian Jacobs;
# <http://www.w3.org/TR/WCAG10-TECHS/>; 2000-11-06.
#
# [HTML 4.01] "HTML 4.01 Specification"; World Wide Web Consortium;
# <URL:http://www.w3.org/TR/html4/>; 1999-12-24.
#
# [Jacobs 2001-04-09] "User Agent Accessibility Guidelines 1.0" (W3C Working
# Draft); Ian Jacobs, Jon Gunderson, Eric Hansen;
# <http://www.w3.org/TR/2001/WD-UAAG10-20010409/>; 2001-04-09.
#
# [Rules] "Rules"; Gareth Rees; Ravenbrook Limited;
# <http://info.ravenbrook.com/rule/>; 2001-04-22.
#
# [SAX] "xml.sax -- Support for SAX2 parsers"; Python;
# <http://www.python.org/doc/current/lib/module-xml.sax.html>.
#
# [XHTML 1.0] "XHTML 1.0: The Extensible HyperText Markup Language"; World Wide
# Web Consortium; <http://www.w3.org/TR/xhtml1/>; 2000-01-26.
#
# [XHTML 1.0 DTD] "XHTML 1.0 Transitional Document Type Definition"; World Wide
# Web Consortium;
# <URL:http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>; 2000-01-26.
#
#
# B. DOCUMENT HISTORY
#
# 2001-04-29 GDR Created.
#
# 2001-04-30 GDR Added checking of attributes to the check_xhtml.py
# script. Added Ravenbrook document format checks for section headers (rule
# xhtml/section) and reference ids (rule xhtml/ref-id).
#
# 2001-05-01 GDR Added checks for attribute values, cross-references, links to
# the references section, ordering of sections, XHTML constraints for <head>,
# <html>, <map> and <table>, recommended attributes, IDREFs.
#
# 2001-05-03 GDR For mismatched tags, report the opening tag and its line
# number.
#
# 2001-05-07 GDR Use the 'handler' class as both ContentHandler and
# ErrorHandler.  New method check_stream takes a stream argument, not a path.
#
# 2001-07-25 GDR Handler and checker classes take error_stream object as
# parameter (defaults to sys.stderr) so that they can be used in other
# checking situations (e.g., under unittest.py).  Added section on use.
#
#
# C. COPYRIGHT AND LICENCE
#
# This file is copyright (c) 2001 Perforce Software, Inc.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1.  Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#
# 2.  Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#
# $Id: //info.ravenbrook.com/project/p4dti/version/1.1/test/check_xhtml.py#4 $
