Yaclml

Miles Egan's XMLS 

(defpackage :it.bese.yaclml.xmls
  (:use :cl)
  (:export #:node-name
           #:node-ns
           #:node-attrs
           #:node-children
           #:make-node
           #:parse
           #:*entities*
           #:*convert-entities*))
(in-package :it.bese.yaclml.xmls)
(define-condition xml-parse-error (error) 
  ((offset :accessor offset :initform nil :initarg :offset)
   (message :accessor message :initform nil :initarg :message))
  (:report (lambda (c s)
             (if (message c)
                 (format s (message c))
                 (format s "XML-PARSE-ERROR at offset ~D." (offset c))))))
(define-condition unresovable-entity (xml-parse-error)
  ((entity :initarg :entity :accessor entity))
  (:report (lambda (c s)
             (format s "Unable to resolve entity ~S at offset ~D." (entity c) (offset c)))))
(define-condition reference-to-undeclared-namespace (xml-parse-error)
  ((namespace :accessor namespace :initarg :namespace))
  (:report (lambda (c s)
             (format s  "Undeclared namespace ~S referenced." (namespace c)))))
(define-condition unmatched-end-tag (xml-parse-error)
  ((expected :accessor expected :initarg :expected)
   (found :accessor found :initarg :found))
  (:report (lambda (c s)
             (format s "Unmatched end tag, found ~S and was expecting ~S." (found c) (expected c)))))
(defstruct state
  "Represents parser state.  Passed among rules to avoid threading issues."
  (got-doctype nil)
  (lines 1 :type integer)
  nsstack
  stream)
(declaim (inline peek-stream))
(define-symbol-macro next-char (peek-stream (state-stream s)))
(defstruct element
  "Common return type of all rule functions."
  (type nil :type symbol)
  (val nil))
(defmatch digit ()
  (and c (digit-char-p c)))
(defmatch letter ()
  (and c (alpha-char-p c)))
(defmatch ws-char ()
  (case c
    ((#\Newline #\Space #\Tab #\Return . #.(unless (char= #\Newline #\Linefeed) (list #\Linefeed))) t)
    (t nil)))
(defmatch namechar ()
  (or 
   (and c (alpha-char-p c))
   (and c (digit-char-p c))
   (case c
     ((#\. #\- #\_ #\:) t))))
(defmatch ncname-char ()
  (or 
   (and c (alpha-char-p c))
   (and c (digit-char-p c))
   (case c
     ((#\. #\- #\_) t))))
(defmatch attr-text-dq ()
  (and c
       (case c
         ((#\< #\") nil)
         (t t))))
(defmatch attr-text-sq ()
  (and c
       (case c
         ((#\< #\') nil)
         (t t))))
(defmatch chardata ()
  (and c (not (char= c #\<))))
(defmatch comment-char ()
  (and c (not (eql c #\-))))
(defrule ncname ()
  (and (peek letter #\_)
       (match+ ncname-char)))
(defrule qname ()
  (let (name suffix)
    (and
     (setf name (ncname s))
     (or
      (and
       (match #\:)
       (setf suffix (ncname s)))
      t))
    (values name suffix)))
(defrule attr-or-nsdecl ()
  (let (suffix name val)
    (and
     (setf (values name suffix) (qname s))
     (or
      (and
       (match #\=)
       (or
        (and
         (match #\")
         (setf val (match* attr-text-dq))
         (match #\"))
        (and
         (match #\')
         (setf val (match* attr-text-sq))
         (match #\'))))
      t)
     (if (string= "xmlns" name)
         (cons 'nsdecl (cons suffix val)) 
         (list
          'attr 
          (if suffix
              (cons suffix name)
              (cons name nil))
          val)))))
(defrule ws ()
  (and (match+ ws-char)
       (make-element :type 'whitespace :val nil)))
(defrule name ()
  (and
   (peek namechar #\_ #\:)
   (match* namechar)))
(defrule ws-attr-or-nsdecl ()
  (and
   (ws s)
   (attr-or-nsdecl s)))
(defrule start-tag ()
  (let (name suffix attrs nsdecls)
    (and
     (peek namechar)
     (setf (values name suffix) (qname s))
     (multiple-value-bind (res a)
         (none-or-more s #'ws-attr-or-nsdecl)
       (mapcar (lambda (x) (if (eq (car x) 'attr)
                               (setf attrs (nconc (cdr x) attrs))
                               (push (cdr x) nsdecls)))
               a)
       res)
     (or (ws s) t)
     (values
      (make-node
       :name (or suffix name)
       :ns (and suffix name)
       :attrs attrs)
      nsdecls))))
(defrule end-tag ()
  (let (name suffix)
    (and
     (match #\/)
     (setf (values name suffix) (qname s))
     (or (ws s) t)
     (match #\>)
     (make-element :type 'end-tag :val (intern-xml-name (if suffix
                                                            suffix
                                                            name)
                                                        (if suffix
                                                            name
                                                            nil)
                                                        (state-nsstack s))))))
(defrule comment ()
  (and
   (match-seq #\! #\- #\-)
   (progn
     (loop until (match-seq #\- #\- #\>)
           do (eat))
     t)
   (make-element :type 'comment)))
(defrule comment-or-cdata ()
  (and
   (peek #\!)
   (must (xml-parse-error :offset (file-position (state-stream s)))
         (or (comment s)
             (and
              (match-seq #\[ #\C #\D #\A #\T #\A #\[)
              (loop with data = (make-extendable-string 50)
                 with state = 0
                 do (case state
                      (0 (if (match #\])
                             (incf state)
                             (push-string (eat) data)))
                      (1 (if (match #\])
                             (incf state)
                             (progn 
                               (setf state 0)
                               (push-string #\] data)
                               (push-string (eat) data))))
                      (2 (if (match #\>)
                             (incf state)
                             (progn 
                               (setf state 0)
                               (push-string #\] data)
                               (push-string #\] data)
                               (push-string (eat) data)))))
                 until (eq state 3)
                 finally (return (make-element :type 'cdata :val data))))))))
(declaim (ftype function element))
(defrule content ()
  (if (match #\<)
      (must (xml-parse-error :offset (file-position (state-stream s)))
            (or (comment-or-cdata s)
                (element s)
                (end-tag s)))
      (or (let (content)
            (and (setf content (match+ chardata))
                 (make-element :type 'data :val content))))))
(defrule element ()
  (let (elem children nsdecls end-name)
    (and
     ;; parse front end of tag
     (multiple-value-bind (e n)
         (start-tag s)
       (setf elem e)
       (setf nsdecls n)
       e)
     ;; resolve namespaces *before* parsing children
     (if nsdecls
         (push nsdecls (state-nsstack s))
         t)
     (resolve-namespace elem (state-nsstack s))
     ;; parse end-tag and children
     (or
      (match-seq #\/ #\>)
      (and
       (match #\>)
       (loop for c = (content s)
             while c
             do (etypecase c
                  (element (case (element-type c)
                             ('end-tag
                              (return (setf end-name (element-val c))))
                             (t (if (element-val c)
                                    (push (element-val c) children)))))))
       (or (eql (node-name elem) end-name)
           (error 'unmatched-end-tag :found end-name :expected (node-name elem)))))
     ;; package up new node
     (progn
       (setf (node-children elem) (nreverse children))
       (make-element :type 'elem :val elem)))))
(defrule processing-instruction-or-xmldecl ()
  (let (name)
    (and
     (match #\?)
     (setf name (name s))
     (none-or-more s #'ws-attr-or-nsdecl)
     (match-seq #\? #\>)
     (make-element :type 'pi :val name))))
(defrule processing-instruction ()
  (let ((p (processing-instruction-or-xmldecl s)))
    (and p
         (not (string= (element-val p) "xml"))
         p)))
(defrule xmldecl ()
  (let ((p (processing-instruction-or-xmldecl s)))
    (and p
         (string= (element-val p) "xml")
         p)))
(defrule comment-or-doctype ()
  ;; skip dtd - bail out to comment if it's a comment
  ;; only match doctype once
  (and
   (peek #\!)
   (or (comment s)
       (and (not (state-got-doctype s))
            (must (xml-parse-error :offset (file-position (state-stream s)))
                  (match-seq #\D #\O #\C #\T #\Y #\P #\E))
            (loop with level = 1
                  do (case (eat)
                       (#\> (decf level))
                       (#\< (incf level)))
                  until (eq level 0)
                  finally (return t))
            (setf (state-got-doctype s) t)
            (make-element :type 'doctype)))))
(defrule misc ()
  (or 
   (ws s)
   (and (match #\<) (must (xml-parse-error :offset (file-position (state-stream s)))
                          (or (processing-instruction s)
                              (comment-or-doctype s)
                              (element s))))))
(defrule document ()
  (let (elem)
    (if (match #\<)
        (must (xml-parse-error :offset (file-position (state-stream s)))
              (or (processing-instruction-or-xmldecl s)
                  (comment-or-doctype s)
                  (setf elem (element s)))))
    (unless elem
      (loop for c = (misc s)
            while c do (if (eql (element-type c) 'elem)
                           (return (setf elem c)))))
    (and elem (element-val elem))))