# Perforce Defect Tracking Integration Project
#
#
# RELOCATE_XHTML.PY -- MAKE DOCUMENTS RELOCATABLE
#
# Gareth Rees, Ravenbrook Limited, 2001-07-10
#
#
# 1. INTRODUCTION
#
# This module defines a class that edits links in XHTML documents so
# that a set of documents can be packaged into a distribution, unpacked
# on a random machine, and the links will still work.
#
# The intended readership is project developers.
#
# This document is not confidential.
#
#
# 1.1. What it does
#
# It goes through XHTML documents looking at every 'href' attribute.
# (It does so in a naive way, grepping for href="[^"]+".  I tried the
# approach of analyzing the XHTML directly and so guaranteeing only to
# find the href attributes of anchor tags, but the XML parser in Python
# 2.0 [xml.sax] is not accurate enough: it does not call back with all
# document entities, for example &nbsp; appears to be ignored.)
#
# The xhtml/url rule [GDR 2001-04-22] means that every URL in a
# Ravenbrook document either specifies a method, like
# "http://info.ravenbrook.com/mail/2001/04/18/14-13-41/0.txt" or
# "mailto:gd@ravenbrook.com", or else specifies no method, no host, and
# an absolute path like "/project/p4dti/issue/job000331/".  See [RFC
# 1738] for the specification of URLs.
#
# Case 1.  If the URL specifies a method, a host, or no path (for
# example, fragment identifier only) then we leave it unchanged.
#
# Case 2.  If the URL specifies a relative path, convert to an absolute
# path and apply case 3 or 4 as appropriate.
#
# Case 3.  If the URL names a document that will belong to the
# distribution, we replace the absolute URL by a relative URL that will
# point to the target on the local disk or local web site.  We add
# "index.txt" or "index.html" as appropriate so that the link will
# resolve properly on servers that don't automatically supply index
# files, such as public.perforce.com.  We use ../ to specify the parent
# directory in the URL path, in accordance with [RFC 1808].
#
# Case 4.
# Otherwise, the URL names a document that will not belong to
# the distribution.  We add the method "http" and the host
# "www.ravenbrook.com" so that readers can get to the document online.
#
#
# 1.2. Terminology
#
# A "file path" (abbreviated to "fp") names a file or directory in the
# file system.  File paths have different conventions on different
# operating systems, for example 'd:\\p4\\project\\p4dti' on Windows,
# '/home/gdr/p4dti' on Unix, or 'Grouse:p4:project:p4dti' on MacOS.
#
# A "URL path" (abbreviated to "up") is the path component of a URL.
# URL paths always use '/' as the separator, regardless of operating
# system.
#
# A "file path list" (abbreviated to "fpl") is a list of components
# making up a file path, with the empty string indicating an empty
# component.
#
# A "URL path list" (abbreviated to "upl") is a list of components
# making up a URL path, with the empty string indicating an empty
# component.
#
#
# 1.3. How to use it
#
# Create a relocater object and pass:
#
# 1. The file path for the root of input to the distribution (no
#    trailing separator).
# 2. The URL path for the root of the distribution (no trailing
#    slash).
# 3. The file path for the root of the output (where converted files
#    are written).  If omitted, this defaults to the input root file
#    path.  No trailing separator.
#
# For example, when building the Integration Kit on Windows, you might
# specify
#
#   from relocate_xhtml import relocater
#   r = relocater('d:\\p4\\project\\p4dti\\version\\1.1',
#                 '/project/p4dti/version/1.1',
#                 'c:\\temp\\build')
#
# Or when building the P4DTI release on Unix, you might specify
#
#   r = relocater('/home/gdr/p4dti/version/1.1/manual',
#                 '/project/p4dti/version/1.1/manual',
#                 '/tmp/build')
#
# Then call the relocate_distribution() method, passing a file path (or
# list of file paths) specifying the distribution.
#
# This program can also be run as a script from the command line.  Use
# these arguments:
#
#   -i, --input    Input root file path.
#   -u, --url      Corresponding URL.
#   -o, --output   Output root file path.
#   -d, --dist     Distribution file path.
#
# You must make sure that all -d paths are below the -i path.  You can
# specify multiple -d options if your distribution isn't a complete
# subtree.

import getopt
import os
import re
import string
import sys
import types

try:
    import urlparse
except ImportError:
    # Python 3 provides the same functions in urllib.parse.
    from urllib import parse as urlparse


# 2. THE RELOCATER CLASS

class relocater:

    # 2.1. Path variables and initialization

    default_hostname = None     # Host for unhosted URLs.
    dist_fpl_list = None        # File path lists in distribution.
    dist_upl_list = None        # URL path lists in distribution.
    input_fp = None             # Path to file currently being relocated.
    input_up = None             # URL path to current file.
    input_upl = None            # The same, converted to a path list
    output_root_fp = None       # Output root file path.
    root_fp = None              # Input root file path.
    root_fpl = None             # The same, converted to a path list
    root_up = None              # Root URL path for the distribution.
    root_upl = None             # The same, converted to a path list

    # Create a relocater.  root_fp is the input root file path, root_up
    # is the URL path that corresponds to it, output_root_fp is the
    # output root file path (if it is omitted or empty the input root
    # is used, so files are modified in place) and default_hostname is
    # the host added to URLs that are outside the distribution.  None
    # of the paths may end with a separator; an AssertionError is
    # raised if one does.
    def __init__(self, root_fp, root_up, output_root_fp = None,
                 default_hostname = "www.ravenbrook.com"):
        self.root_fp = root_fp
        self.root_fpl = self.fp_to_fpl(root_fp)
        self.root_up = root_up
        self.root_upl = self.up_to_upl(root_up)
        if output_root_fp:
            self.output_root_fp = output_root_fp
        else:
            # Modify in-place
            self.output_root_fp = root_fp
        self.default_hostname = default_hostname
        # Check there's no trailing separator on file paths.
        assert self.root_fpl[-1], (
            "root file path %r has a trailing separator" % (root_fp,))
        assert self.fp_to_fpl(self.output_root_fp)[-1], (
            "output root file path %r has a trailing separator"
            % (self.output_root_fp,))
        # Check there's no trailing slash to the root URL path.
        assert self.root_upl[-1], (
            "root URL path %r has a trailing slash" % (root_up,))


    # 2.2. Path conversion utilities

    # Return true if the path list starts with all the components of
    # prefix_list (the two lists may be equal).  Works on file path
    # lists and URL path lists alike.  A path list shorter than the
    # prefix can never match, because the slice is then shorter too.
    def _has_prefix(self, path_list, prefix_list):
        return path_list[0:len(prefix_list)] == prefix_list

    # Convert a URL path into a list of its components, for example
    # '/project/p4dti/version/1.1/' -> ['', 'project', 'p4dti',
    # 'version', '1.1', ''].
    def up_to_upl(self, up):
        return up.split('/')

    # Convert a list of path components to a URL path, for example ['',
    # 'project', 'p4dti', ''] -> '/project/p4dti/'.
    def upl_to_up(self, upl):
        return '/'.join(upl)

    # Convert a file path to a list of its components, for example
    # 'd:\\p4\\project\\p4dti\\' -> ['d:\\', 'p4', 'project', 'p4dti',
    # ''].
    def fp_to_fpl(self, fp):
        components = []
        while 1:
            dirname, basename = os.path.split(fp)
            if dirname == fp:
                # os.path.split() can't shorten the path any further:
                # what is left is the first component of the list.
                components.append(dirname)
                break
            components.append(basename)
            fp = dirname
        # Components were collected from the last to the first.
        components.reverse()
        return components

    # Convert a list of path components to a file path, for example
    # ['/', 'home', 'gdr', 'p4dti', 'index.html'] ->
    # '/home/gdr/p4dti/index.html'.
    def fpl_to_fp(self, fpl):
        return os.path.join(*fpl)

    # Convert a file path list to the URL path list that names the same
    # file (based on the correspondence between root_fp and root_up).
    # For example, if
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['d:\\', 'p4dti', 'index.html']
    #   -> ['', 'project', 'p4dti', 'version', '1.1', 'index.html']
    #
    # It is an error if the argument is not under the root file path.
    def fpl_to_upl(self, fpl):
        assert self._has_prefix(fpl, self.root_fpl), (
            "%r is not under the root file path %r" % (fpl, self.root_fpl))
        return self.root_upl + fpl[len(self.root_fpl):]

    # Convert a URL path list to the file path list that names the same
    # file (based on the correspondence between root_fp and root_up),
    # choosing an index file if appropriate and if one exists.  For
    # example, if
    #
    #   root_fp = '/home/gdr/p4dti'
    #   root_up = '/project/p4dti/version/1.1'
    #
    # then this method will perform this conversion:
    #
    #   ['', 'project', 'p4dti', 'version', '1.1', 'manual', 'ag', '']
    #   -> ['/', 'home', 'gdr', 'p4dti', 'manual', 'ag', 'index.html']
    #
    # It is an error if the argument is not under the root URL path.
    def upl_to_fpl(self, upl):
        assert self._has_prefix(upl, self.root_upl), (
            "%r is not under the root URL path %r" % (upl, self.root_upl))
        fpl = self.root_fpl + upl[len(self.root_upl):]
        if fpl[-1] == '':
            # The URL path names a directory.  Use the first index file
            # that exists on disk; if there is none, the empty last
            # component is left as it is.
            for index_name in ['index.html', 'index.txt']:
                index_fp = self.fpl_to_fp(fpl[0:-1] + [index_name])
                if os.path.isfile(index_fp):
                    fpl[-1] = index_name
                    break
        return fpl

    # Convert file path to URL path.
    def fp_to_up(self, fp):
        return self.upl_to_up(self.fpl_to_upl(self.fp_to_fpl(fp)))

    # Convert URL path to file path.
    def up_to_fp(self, up):
        return self.fpl_to_fp(self.upl_to_fpl(self.up_to_upl(up)))

    # Convert a file path from one root to another.  It is an error if
    # the fp argument isn't under root_fp_1.
    def fp_to_fp(self, fp, root_fp_1, root_fp_2):
        fpl = self.fp_to_fpl(fp)
        root_fpl_1 = self.fp_to_fpl(root_fp_1)
        assert self._has_prefix(fpl, root_fpl_1), (
            "%r is not under %r" % (fp, root_fp_1))
        return self.fpl_to_fp(self.fp_to_fpl(root_fp_2)
                              + fpl[len(root_fpl_1):])


    # 2.3. Make a relative URL path
    #
    # relative_up(source_fp, target_up) returns a URL path that could be
    # inserted into the file named by source_fp and would link to the
    # same target as target_up, specifying an index file if appropriate
    # and one exists.  For example, if we have
    #
    #   root_fp = 'd:\\p4dti'
    #   root_up = '/project/p4dti'
    #   source_fp = 'd:\\p4dti\\version\\1.1\\manual\\ag\\index.html'
    #   target_up is '/project/p4dti/version/1.1/manual/ug/'
    #
    # then this method returns '../ug/index.html'

    def relative_up(self, source_fp, target_up):
        source_fpl = self.fp_to_fpl(source_fp)
        target_fpl = self.upl_to_fpl(self.up_to_upl(target_up))
        # Count the leading components that the two paths share.
        common = 0
        limit = min(len(source_fpl), len(target_fpl))
        while (common < limit
               and source_fpl[common] == target_fpl[common]):
            common = common + 1
        # One '..' for each directory left in the source path below the
        # common part; the source's last component is the file itself,
        # hence the "- 1".  A negative count gives an empty list.
        parents = ['..'] * (len(source_fpl) - common - 1)
        return self.upl_to_up(parents + target_fpl[common:])


    # 2.4. Determine if something is in the distribution

    # fp_in_distribution(fp) returns 1 if the file path is in the
    # distribution, 0 otherwise.  If no distribution file path lists
    # have been set (dist_fpl_list is None) nothing is in the
    # distribution.
    def fp_in_distribution(self, fp):
        fpl = self.fp_to_fpl(fp)
        for dist_fpl in self.dist_fpl_list or []:
            if self._has_prefix(fpl, dist_fpl):
                return 1
        return 0

    # up_in_distribution(up) returns 1 if the URL path is in the
    # distribution, 0 otherwise.  If no distribution URL path lists
    # have been set (dist_upl_list is None) nothing is in the
    # distribution.
    def up_in_distribution(self, up):
        upl = self.up_to_upl(up)
        for dist_upl in self.dist_upl_list or []:
            if self._has_prefix(upl, dist_upl):
                return 1
        return 0


    # 2.5. Replace a URL
    #
    # This method is designed to be used as an argument to the re.sub
    # method.  It takes a match object whose group 0 is 'href="TARGET"'
    # and whose group 1 is the target itself.  It returns replacement
    # text of the form 'href="REVISED-TARGET"'.
    #
    # It reads self.input_fp and self.input_upl, which describe the
    # file currently being relocated.

    def replace_url(self, match):
        # urlparse() gives (method, host, path, parameters, query,
        # fragment); only the first three are examined or changed.
        target = list(urlparse.urlparse(match.group(1)))
        path = target[2]
        if target[0] or target[1] or path == '':
            # Case 1.  Target URL specifies a method, a host, or no path.
            # Leave it unchanged.
            return match.group(0)
        if path[0] != '/':
            # Case 2.  Target URL specifies a relative URL path.
            # Convert to absolute URL path (relative to the directory
            # of the current file) and continue.
            path = self.upl_to_up(self.input_upl[0:-1]
                                  + self.up_to_upl(path))
        if self.up_in_distribution(path):
            # Case 3.  Target URL names a document that will belong
            # to the distribution.  Replace by relative URL.
            target[2] = self.relative_up(self.input_fp, path)
        else:
            # Case 4.  Target not in distribution, add method, host.
            target[0] = 'http'
            target[1] = self.default_hostname
            target[2] = path
        return 'href="%s"' % urlparse.urlunparse(tuple(target))


    # 2.6.
Relocate a file url_re = re.compile('href="([^"]+)"') def relocate_file(self, fp): self.input_fp = fp self.input_up = self.fp_to_up(fp) self.input_upl = self.up_to_upl(self.input_up) output_fp = self.fp_to_fp(self.input_fp, self.root_fp, self.output_root_fp) dirname = os.path.dirname(output_fp) if not os.path.isdir(dirname): os.makedirs(dirname) if self.input_fp != output_fp: print " Converting", self.input_fp, "to", output_fp else: print " Converting", self.input_fp input = open(self.input_fp, 'r') lines = map(lambda l, s=self: s.url_re.sub(s.replace_url, l), input.readlines()) input.close() output = open(output_fp, 'w') output.writelines(lines) output.close() # 2.7. Relocate files in a path # # relocate_path(fp) recursively descends directories below the file # path given by fp, relocating all the XHTML files it finds there . def relocate_path(self, fp): if os.path.isdir(fp): for f in os.listdir(fp): new_fp = os.path.join(fp, f) self.relocate_path(new_fp) elif (os.path.isfile(fp) and os.path.splitext(fp)[1] == '.html' and open(fp, 'r').readline()[0:5] == '. # # [RFC 1738] "Uniform Resource Locators (URL)"; T Berners-Lee, L # Masinter, M McCahill; 1994-12; # . # # [RFC 1808] "Relative Uniform Resource Locators"; R Fielding; 1995-06; # . # # [xml.sax] "xml.sax -- Support for SAX2 parsers"; Guido van Rossum; # 2000-10-16; # . # # # B. DOCUMENT HISTORY # # 2001-07-10 GDR Created. # # # C. COPYRIGHT AND LICENSE # # This file is copyright (c) 2001 Perforce Software, Inc. All # rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH # DAMAGE. # # # $Id: //info.ravenbrook.com/project/p4dti/branch/2005-03-02/agena-extension/tool/relocate_xhtml.py#1 $