(in-package "CL-USER") ;; $Id: //info.ravenbrook.com/project/mps/tool/xml-import/parse-xml.lisp#5 $ ;; Code to parse xml files. Also for splitting ;; Load this file and the accompanying transformation file (eg "mmdoc-transforms"). ;; (transform "e:/p4/project/mps/import/2001-09-17/mmdoc") ;; I will assume at this stage that I do not need to preserve comments ;; or whitespace, apart from the whitespace of a string contained ;; within a tag. ;; Also, for now, don't build seperate classes for each tag type. (defvar *file*) ;; find xml files in directory/docs, write html files under directory/html (defun transform (directory) (let* ((truename (truename directory)) (subdirs (directory (namestring (truename (merge-pathnames "docs" truename)))))) (dolist (subdir subdirs) (when (file-directory-p subdir) (let ((tag nil) (directory (directory (namestring subdir)))) (dolist (*file* directory) (print *file*) (catch 'next-file (with-open-file (istream *file*) (let ((new-tag (transform-tag (read-tag istream) ;; doc.mps.guide.appendix.plinth contained ;; only one file. It had a parent. Growl. ;; [I think it was a filing error at Hqn?] (null (cdr directory))))) (if tag (error "Found two documents with no parent in ~a." directory) (setf tag new-tag)))))) (let ((where (format nil "~ahtml/~a/" truename (car (last (pathname-directory subdir)))))) (ensure-directories-exist where) (with-open-file (ostream (format nil "~aindex.html" where) :direction :output :external-format '(:latin-1 :eol-style :lf) :if-exists :supersede) (write-tag tag ostream)))))))) (defstruct (tag (:constructor make-tag (name attributes &optional contents)) (:print-object print-tag)) name attributes contents) (defun print-tag (tag stream) (if *print-escape* (print-unreadable-object (tag stream :type t :identity t) (format stream "~(~a, ~:[no attributes~;~:*~d attribute~:p~], ~:[no contents~;~:*~d content~:p~]~)" (tag-name tag) (length (tag-attributes tag)) (length (tag-contents tag)))) (write-tag tag stream 0))) (defparameter *html-header* " ") (defun write-tag (tag &optional (stream *standard-output*) depth) (or depth (progn (write-string *html-header* stream) (dolist (content (tag-contents tag)) (write-tag content stream 1)) (return-from write-tag tag))) (let ((name (tag-name tag))) (format stream "~&~vt<~(~a~)" depth name) (dolist (attribute (tag-attributes tag)) (let ((name (car attribute)) (value (cadr attribute))) (format stream " ~(~a~)='~a'" name value))) (let ((contents (tag-contents tag))) (if contents (progn (format stream ">") (if (or (member name '(pre code)) (and (= (length contents) 1) (stringp (car contents)))) (dolist (content contents) (format stream "~a" content)) (progn (dolist (content contents) (if (tag-p content) (write-tag content stream (1+ depth)) (format stream "~&~vt~a" (1+ depth) content))) (format stream "~&~vt" depth))) (format stream "~%" name)) (format stream " />~%"))))) ;; returns either a tag structure or the name within a tag-close (defun read-tag (stream) (check-read-char #\< stream) (let ((name (or (let ((sym (safe-read-symbol stream))) (when (eq sym :comment) (return-from read-tag nil)) sym) (return-from read-tag (read-close-tag stream)))) (attributes (read-attributes stream)) (next-char (read-char stream))) (case next-char ((#\/) (check-read-char #\> stream :skip nil) (make-tag name attributes)) ((#\>) (multiple-value-bind (contents close-tag) (read-contents stream) (or (eq close-tag name) (xml-parse-error stream "close tag '~a' was expected to be '~a'." close-tag name)) (make-tag name attributes contents))) (t (xml-parse-error stream "'~a' should have been one of '/' or '>'." next-char))))) (defun read-symbol (stream) (skip-to-start stream) (let ((chars nil)) (loop (let ((char (read-char stream))) (if (alpha-char-p char) (push char chars) (return (unread-char char stream))))) (when chars (intern (nstring-upcase (nreverse (coerce chars 'simple-base-string))))))) (defun safe-read-symbol (stream) (or (read-symbol stream) ;; did it funk because we're on a comment? (let ((char (peek-char nil stream))) (when (char= char #\!) (read-char stream) (skip-to-end-of-comment stream) (or (safe-read-symbol stream) :comment))))) (defun read-close-tag (stream) (check-read-char #\/ stream :skip nil) (let ((name (read-symbol stream))) (check-read-char #\> stream) name)) (defun read-attributes (stream) (skip-to-start stream) (let ((attributes nil)) (loop (let ((attr (read-symbol stream))) (if attr (progn (check-read-char #\= stream) (let ((value (read-value stream))) (push (list attr value) attributes))) (return)))) (nreverse attributes))) (defun read-contents (stream) (let* ((whitespaces (skip-to-start stream)) (char (peek-char nil stream)) (contents nil)) (if (char= char #\<) (let ((next (read-tag stream))) (cond ((null next) ;; false alarm - it was a comment, not a tag (multiple-value-bind (contents close-tag) (read-contents stream) (when (stringp contents) (setf contents (concatenate 'simple-base-string (nreverse whitespaces) contents))) (values contents close-tag))) ((symbolp next) (values (nreverse contents) next)) (t (multiple-value-bind (contents close-tag) (read-contents stream) (values (cons next contents) close-tag))))) (progn (setf contents whitespaces) (loop (let ((char (read-char stream))) (if (char= char #\<) (progn (unread-char char stream) (let ((string (nreverse (coerce contents 'simple-base-string))) (name (read-tag stream))) (return-from read-contents (if (tag-p name) (multiple-value-bind (more-contents close-tag) (read-contents stream) (values (list* string name more-contents) close-tag)) (values (list string) name))))) (push char contents)))))))) ;; this is to gobble whitespace, unreading the character that wasn't. ;; return (reversed) list containing whitespace found. (defun skip-to-start (stream &key (eof-error t)) (let ((whitespaces nil)) (loop (let ((char (read-char stream eof-error :eof))) (cond ((eq char :eof) (return-from skip-to-start :eof)) ((whitespace-char-p char) (push char whitespaces)) (t (return (unread-char char stream)))))) whitespaces)) ;; advance to character (not yet read) outside comment which has just started (defun skip-to-end-of-comment (stream) (dotimes (i 2) (or (char= (read-char stream) #\-) (xml-parse-error stream "expected two hyphens at start of comment."))) (let ((one (read-char stream)) (two (read-char stream))) (loop (let ((three (read-char stream))) (when (and (char= one #\-) (char= two #\-) (char= three #\>)) (return)) (shiftf one two three))))) (defun read-value (stream) (check-read-char #\' stream) (let ((chars nil)) (loop (let ((char (read-char stream))) (if (char= char #\') (return) (push char chars)))) (nreverse (coerce chars 'simple-base-string)))) (defun check-read-char (expected stream &key (skip t)) (when skip (skip-to-start stream)) (let ((char (read-char stream))) (or (char= char expected) (xml-parse-error stream "expected '~a' and got '~a'." expected char)))) (defun xml-parse-error (stream format-string &rest format-args) (error "XML parse error, at position ~d: ~?" (1- (file-position stream)) format-string format-args)) ;; Transforms ;; Each deftransform is a non-destructive transformation from one set ;; of content siblings to another. The final form returns a list - ;; possibly empty - of siblings (to replace the old one). All previous ;; forms must return non-nil, otherwise no transformation takes place. ;; The content siblings which might be transformed are those ;; explicitly named in the parameter list. The parameter list ;; specialises with defmethod syntax: the types are tag names combined ;; with and, or and not. (A string sibling is (not tag).) Within the ;; body of the deftransform the contents of each (tag) parameter are ;; bound to a variable of the form -contents. ;; This might be tidied somewhat by specialising item tags by their ;; name attribute, as that's always what we look at first. (editor:setup-indent "deftransform" 1) (defparameter *transforms* (make-hash-table :test 'equal)) (defmacro deftransform (name arglist &body body) (multiple-value-bind (args types discriminations bindings) (parse-arglist arglist) (if body `(top-level-form (deftransform ,name ,types) (setf (gethash ',name *transforms*) (lambda (contents) (if (<= ,(length args) (length contents)) (destructuring-bind (,@args &rest tail) contents (if (and ,@discriminations) (let ,bindings (declare (ignorable ,@(mapcar 'car bindings))) (if (and ,@(butlast body)) (append ,(car (last body)) tail) ; transformation contents)) ; no change because subform returned nil contents)) ; no change because pattern not matched contents)))) ; no change because pattern too long `(remhash ',name *transforms*)))) (defun parse-arglist (arglist) (let ((args nil) (types nil) (discriminations nil) (bindings nil)) (dolist (arg arglist) (if (atom arg) (push arg args) (let ((name (car arg)) (type (cadr arg))) (push name args) (push type types) (push (walk-discrimination name type) discriminations) (push `(,(intern (format nil "~a-CONTENTS" name)) (and (tag-p ,name) (tag-contents ,name))) bindings)))) (values (nreverse args) (nreverse types) (nreverse discriminations) (nreverse bindings)))) (defun walk-discrimination (name type) (labels ((walk (type) (cond ((eq type 'tag) `(tag-p ,name)) ((atom type) `(and (tag-p ,name) (eq (tag-name ,name) ',type))) (t (cons (car type) (mapcar #'walk (cdr type))))))) (walk type))) (defun transform-tag (tag &optional force) (if (tag-p tag) (let* ((name (tag-name tag)) (attributes (tag-attributes tag)) (original-contents (tag-contents tag)) (contents (mapcar 'transform-tag original-contents))) (when (and (not force) (eq name 'document) (assoc 'parent attributes)) (throw 'next-file nil)) (loop (let ((new-contents (transform-subcontents contents))) (if (eq new-contents contents) (return) (setf contents new-contents)))) (if (eq contents original-contents) tag (make-tag name attributes contents))) tag)) ;; one of the very large files was blowing the stack. ;; this ugly hack is the recomended fix. (eval-when (compile) (hcl:toggle-source-debugging nil)) (defun transform-subcontents (subcontents) (declare (optimize (debug 0))) (when subcontents (let* ((old-tail (cdr subcontents)) (new-tail (transform-subcontents old-tail))) (when (not (eq old-tail new-tail)) (setf subcontents (cons (car subcontents) new-tail)))) (maphash (lambda (key function) (declare (ignore key)) (setf subcontents (funcall function subcontents))) *transforms*) subcontents)) (eval-when (compile) (hcl:toggle-source-debugging t))