# Perforce Defect Tracking Integration Project # # # CHECK_XHTML.PY -- CHECK XHTML DOCUMENT # # Gareth Rees, Ravenbrook Limited, 2001-04-30 # # # 1. INTRODUCTION # # This Python script checks that XHTML documents conform to the XHTML # 1.0 Transitional specification [XHTML 1.0] and to the Ravenbrook # document rules [Rules]. # # The intended readership is Ravenbrook staff. # # This document is not confidential. # # # 1.1. Use # # This module is intended for use in two circumstances. It may be run # from the command line, passing a list of paths. It then checks those # paths and writes its output to stderr. This is convenient for use in # Emacs under M-x compile, for then the errors can be browsed using # `next-error' (C-x `). # # It may also be run from other Python programs, which should # instantiate an object from the checker class, and call the check() # method, passing appropriate paths, or the check_stream() method, # passing a file stream. When constructing the checker object, you may # supply an error stream object with a write() method: all error # messages will be written to this object and you may then divert them # for example to unittest's fail() method. import dircache import getopt import os import re import string import sys import types import xml.sax # 2. XHTML DEFINITION # # This section defines a bunch of tables that describe the XHTML 1.0 # Transitional document type [XHTML 1.0 DTD]. It would be nice to # generate these tables automatically by parsing the document type # definition, but as far as I know there's no DTD parser for XML, and I # didn't want to write one just for the purpose of checking one document # type. # # Instead, these tables have been derived mechanically (typically using # Emacs Lisp) from the XHTML DTD [XHTML 1.0 DTD]. # # In any case, there are a number of constraints on XHTML documents that # are not specified in the DTD; see [XHTML 1.0]. So parsing the DTD # wouldn't be the whole story. # # # 2.1. 
# Useful element contents
#
# These variables contain lists of elements: they will be used in the
# element definitions [2.2] to define the set of elements that may
# legally appear in the content of each XHTML element.
#
# Naming: the lower-case names (elt_inline, elt_block) are plain groups
# of element names; the capitalised names (elt_Inline, elt_Block,
# elt_Flow) are complete content lists built from those groups.  The
# string '#PCDATA' appears in a list when character data is allowed
# alongside the listed elements.

# Groups of inline-level elements.
elt_special = ['br', 'span', 'bdo', 'object', 'applet', 'img', 'map',
               'iframe']
elt_fontstyle = ['tt', 'i', 'b', 'big', 'small', 'u', 's', 'strike',
                 'font', 'basefont']
elt_phrase = ['em', 'strong', 'dfn', 'code', 'q', 'sub', 'sup', 'samp',
              'kbd', 'var', 'cite', 'abbr', 'acronym']
elt_inline_forms = ['input', 'select', 'textarea', 'label', 'button']

# Elements that are included in both the inline and the block content
# lists below.
elt_misc = ['ins', 'del', 'script', 'noscript']

# All inline elements, and the full inline content list (text, inline
# elements and the miscellaneous elements).
elt_inline = (['a'] + elt_special + elt_fontstyle + elt_phrase
              + elt_inline_forms)
elt_Inline = ['#PCDATA'] + elt_inline + elt_misc

# Groups of block-level elements.
elt_heading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
elt_lists = ['ul', 'ol', 'dl', 'menu', 'dir']
elt_blocktext = ['pre', 'hr', 'blockquote', 'address', 'center',
                 'noframes']

# All block elements except 'form', and the full block content list
# (which adds 'form' and the miscellaneous elements, but no text).
elt_block = (['p', 'div', 'isindex', 'fieldset', 'table'] + elt_heading
             + elt_lists + elt_blocktext)
elt_Block = elt_block + ['form'] + elt_misc

# Flow content: text plus anything block-level or inline-level.
elt_Flow = (['#PCDATA', 'form'] + elt_block + elt_inline + elt_misc)

# Content of 'a': the same groups as elt_Inline, but without 'a' itself.
elt_a_content = (['#PCDATA'] + elt_special + elt_fontstyle + elt_phrase
                 + elt_inline_forms + elt_misc)

# Content of 'pre': a reduced set of inline elements, with no
# elt_misc members.
elt_pre_content = (['#PCDATA', 'a', 'br', 'span', 'bdo', 'map', 'tt',
                    'i', 'b', 'u', 's']
                   + elt_phrase + elt_inline_forms)

# Content of 'form': the same groups as elt_Flow, but without 'form'
# itself.
elt_form_content = ['#PCDATA'] + elt_block + elt_inline + elt_misc

# Content of 'button': selected block and inline elements; none of the
# elt_inline_forms members, 'a', 'form', 'isindex', 'fieldset' or
# 'iframe' appear here.
elt_button_content = (['#PCDATA', 'p', 'div', 'table', 'br', 'span',
                       'bdo', 'object', 'applet', 'img', 'map']
                      + elt_heading + elt_lists + elt_blocktext
                      + elt_fontstyle + elt_phrase + elt_misc)

# Elements allowed in 'head' in addition to 'title' and 'base'.
elt_head_misc = ['script', 'style', 'meta', 'link', 'object', 'isindex']


# 2.2. Elements definitions
#
# The legal_elements dictionary maps the name of an XHTML element to the
# list of elements that are legal members of that element.
# Content table for every XHTML element, keyed by element name.  Each
# value is the list of names that may appear directly inside that
# element; an empty list means the element may have no members at all.
# Entries are kept in alphabetical order.  Many entries deliberately
# refer to the same shared list object (elt_Inline, elt_Flow, ...).
legal_elements = {
    'a': elt_a_content,
    'abbr': elt_Inline,
    'acronym': elt_Inline,
    'address': elt_Inline,
    'applet': ['#PCDATA', 'param', 'form'] + elt_block + elt_inline
              + elt_misc,
    'area': [],
    'b': elt_Inline,
    'base': [],
    'basefont': [],
    'bdo': elt_Inline,
    'big': elt_Inline,
    'blockquote': elt_Flow,
    'body': elt_Flow,
    'br': [],
    'button': elt_button_content,
    'caption': elt_Inline,
    'center': elt_Flow,
    'cite': elt_Inline,
    'code': elt_Inline,
    'col': [],
    'colgroup': ['col'],
    'dd': elt_Flow,
    'del': elt_Flow,
    'dfn': elt_Inline,
    'dir': ['li'],
    'div': elt_Flow,
    'dl': ['dt', 'dd'],
    'dt': elt_Inline,
    'em': elt_Inline,
    'fieldset': ['#PCDATA', 'legend', 'form'] + elt_block + elt_inline
                + elt_misc,
    'font': elt_Inline,
    'form': elt_form_content,
    'h1': elt_Inline,
    'h2': elt_Inline,
    'h3': elt_Inline,
    'h4': elt_Inline,
    'h5': elt_Inline,
    'h6': elt_Inline,
    'head': elt_head_misc + ['title', 'base'],
    'hr': [],
    'html': ['head', 'body'],
    'i': elt_Inline,
    'iframe': elt_Flow,
    'img': [],
    'input': [],
    'ins': elt_Flow,
    'isindex': [],
    'kbd': elt_Inline,
    'label': elt_Inline,
    'legend': elt_Inline,
    'li': elt_Flow,
    'link': [],
    'map': ['form', 'area'] + elt_block + elt_misc,
    'menu': ['li'],
    'meta': [],
    'noframes': elt_Flow,
    'noscript': elt_Flow,
    'object': ['#PCDATA', 'param', 'form'] + elt_block + elt_inline
              + elt_misc,
    'ol': ['li'],
    'optgroup': ['option'],
    'option': ['#PCDATA'],
    'p': elt_Inline,
    'param': [],
    'pre': elt_pre_content,
    'q': elt_Inline,
    's': elt_Inline,
    'samp': elt_Inline,
    'script': ['#PCDATA'],
    'select': ['optgroup', 'option'],
    'small': elt_Inline,
    'span': elt_Inline,
    'strike': elt_Inline,
    'strong': elt_Inline,
    'style': ['#PCDATA'],
    'sub': elt_Inline,
    'sup': elt_Inline,
    'table': ['caption', 'col', 'colgroup', 'thead', 'tfoot', 'tbody',
              'tr'],
    'tbody': ['tr'],
    'td': elt_Flow,
    'textarea': ['#PCDATA'],
    'tfoot': ['tr'],
    'th': elt_Flow,
    'thead': ['tr'],
    'title': ['#PCDATA'],
    'tr': ['th', 'td'],
    'tt': elt_Inline,
    'u': elt_Inline,
    'ul': ['li'],
    'var': elt_Inline,
}


# 2.3.
Nonempty elements # # The nonempty_element array is a list of elements that may not be empty # (that is, they must have at least one element in their contents: for # example