(defpackage :it.bese.yaclml.xmls (:use :cl) (:export #:node-name #:node-ns #:node-attrs #:node-children #:make-node #:parse #:*entities* #:*convert-entities*))
(in-package :it.bese.yaclml.xmls)
Variable *CONVERT-ENTITIES* When true we convert entities found in the data to their corresponding chars, when false we leave ignore entities.
(define-condition xml-parse-error (error) ((offset :accessor offset :initform nil :initarg :offset) (message :accessor message :initform nil :initarg :message)) (:report (lambda (c s) (if (message c) (format s (message c)) (format s "XML-PARSE-ERROR at offset ~D." (offset c))))))
(define-condition unresovable-entity (xml-parse-error) ((entity :initarg :entity :accessor entity)) (:report (lambda (c s) (format s "Unable to resolve entity ~S at offset ~D." (entity c) (offset c)))))
(define-condition reference-to-undeclared-namespace (xml-parse-error) ((namespace :accessor namespace :initarg :namespace)) (:report (lambda (c s) (format s "Undeclared namespace ~S referenced." (namespace c)))))
(define-condition unmatched-end-tag (xml-parse-error) ((expected :accessor expected :initarg :expected) (found :accessor found :initarg :found)) (:report (lambda (c s) (format s "Unmatched end tag, found ~S and was expecting ~S." (found c) (expected c)))))
Function MAKE-NODE Convenience function for creating a new xml node.
Function ENTITY-OF Returns the xml entity corresponding to CHAR, without the leading ampersand.
Function ENTITIFY Converts OBJECT to its string representation, if necessary, and then replaces the characters of OBJECT with their corresponding entities.
Function MAKE-EXTENDABLE-STRING Creates an adjustable string with a fill pointer.
Function PUSH-STRING Shorthand function for adding characters to an extendable string.
(defstruct state "Represents parser state. Passed among rules to avoid threading issues." (got-doctype nil) (lines 1 :type integer) nsstack stream)
Function RESOLVE-ENTITY Resolves the xml entity ENT to a character.
(declaim (inline peek-stream))
Function PEEK-STREAM Looks one character ahead in the input stream.
Function READ-STREAM Reads a character from the stream, translating entities as it goes (assuming *convert-entities* is non-NIL).
(define-symbol-macro next-char (peek-stream (state-stream s)))
Macro EAT Consumes one character from the input stream.
Macro MATCH Attempts to match the next input character with one of the supplied matchers.
Macro MATCH-SEQ Tries to match the supplied matchers in sequence with characters in the input stream.
Macro MATCH* Matches any occurances of any of the supplied matchers.
Macro MATCH+ Matches one or more occurances of any of the supplied matchers.
Macro PEEK Looks ahead for an occurance of any of the supplied matchers.
Macro MUST Throws a parse error if the supplied forms do not succeed.
(defstruct element "Common return type of all rule functions." (type nil :type symbol) (val nil))
Function RESOLVE-NAMESPACE Maps the ns prefix to its associated url via the supplied ns env.
Macro DEFMATCH Match definition macro that provides a common lexical environment for matchers.
Macro DEFRULE Rule definition macro that provides a common lexical environment for rules.
Macro MATCHFN Convenience macro for creating an anonymous function wrapper around a matcher macro.
Function NONE-OR-MORE Collects any matches of the supplied rule with the input stream.
Function ONE-OR-MORE Collects one or more matches of the supplied rule with the input stream.
(defmatch digit () (and c (digit-char-p c)))
(defmatch letter () (and c (alpha-char-p c)))
(defmatch ws-char () (case c ((#\Newline #\Space #\Tab #\Return . #.(unless (char= #\Newline #\Linefeed) (list #\Linefeed))) t) (t nil)))
(defmatch namechar () (or (and c (alpha-char-p c)) (and c (digit-char-p c)) (case c ((#\. #\- #\_ #\:) t))))
(defmatch ncname-char () (or (and c (alpha-char-p c)) (and c (digit-char-p c)) (case c ((#\. #\- #\_) t))))
(defmatch attr-text-dq () (and c (case c ((#\< #\") nil) (t t))))
(defmatch attr-text-sq () (and c (case c ((#\< #\') nil) (t t))))
(defmatch chardata () (and c (not (char= c #\<))))
(defmatch comment-char () (and c (not (eql c #\-))))
(defrule ncname () (and (peek letter #\_) (match+ ncname-char)))
(defrule qname () (let (name suffix) (and (setf name (ncname s)) (or (and (match #\:) (setf suffix (ncname s))) t)) (values name suffix)))
(defrule attr-or-nsdecl () (let (suffix name val) (and (setf (values name suffix) (qname s)) (or (and (match #\=) (or (and (match #\") (setf val (match* attr-text-dq)) (match #\")) (and (match #\') (setf val (match* attr-text-sq)) (match #\')))) t) (if (string= "xmlns" name) (cons 'nsdecl (cons suffix val)) (list 'attr (if suffix (cons suffix name) (cons name nil)) val)))))
(defrule ws () (and (match+ ws-char) (make-element :type 'whitespace :val nil)))
(defrule name () (and (peek namechar #\_ #\:) (match* namechar)))
(defrule ws-attr-or-nsdecl () (and (ws s) (attr-or-nsdecl s)))
(defrule start-tag () (let (name suffix attrs nsdecls) (and (peek namechar) (setf (values name suffix) (qname s)) (multiple-value-bind (res a) (none-or-more s #'ws-attr-or-nsdecl) (mapcar (lambda (x) (if (eq (car x) 'attr) (setf attrs (nconc (cdr x) attrs)) (push (cdr x) nsdecls))) a) res) (or (ws s) t) (values (make-node :name (or suffix name) :ns (and suffix name) :attrs attrs) nsdecls))))
(defrule end-tag () (let (name suffix) (and (match #\/) (setf (values name suffix) (qname s)) (or (ws s) t) (match #\>) (make-element :type 'end-tag :val (intern-xml-name (if suffix suffix name) (if suffix name nil) (state-nsstack s))))))
(defrule comment () (and (match-seq #\! #\- #\-) (progn (loop until (match-seq #\- #\- #\>) do (eat)) t) (make-element :type 'comment)))
(defrule comment-or-cdata () (and (peek #\!) (must (xml-parse-error :offset (file-position (state-stream s))) (or (comment s) (and (match-seq #\[ #\C #\D #\A #\T #\A #\[) (loop with data = (make-extendable-string 50) with state = 0 do (case state (0 (if (match #\]) (incf state) (push-string (eat) data))) (1 (if (match #\]) (incf state) (progn (setf state 0) (push-string #\] data) (push-string (eat) data)))) (2 (if (match #\>) (incf state) (progn (setf state 0) (push-string #\] data) (push-string #\] data) (push-string (eat) data))))) until (eq state 3) finally (return (make-element :type 'cdata :val data))))))))
(declaim (ftype function element)) (defrule content () (if (match #\<) (must (xml-parse-error :offset (file-position (state-stream s))) (or (comment-or-cdata s) (element s) (end-tag s))) (or (let (content) (and (setf content (match+ chardata)) (make-element :type 'data :val content))))))
(defrule element () (let (elem children nsdecls end-name) (and ;; parse front end of tag (multiple-value-bind (e n) (start-tag s) (setf elem e) (setf nsdecls n) e) ;; resolve namespaces *before* parsing children (if nsdecls (push nsdecls (state-nsstack s)) t) (resolve-namespace elem (state-nsstack s)) ;; parse end-tag and children (or (match-seq #\/ #\>) (and (match #\>) (loop for c = (content s) while c do (etypecase c (element (case (element-type c) ('end-tag (return (setf end-name (element-val c)))) (t (if (element-val c) (push (element-val c) children))))))) (or (eql (node-name elem) end-name) (error 'unmatched-end-tag :found end-name :expected (node-name elem))))) ;; package up new node (progn (setf (node-children elem) (nreverse children)) (make-element :type 'elem :val elem)))))
(defrule processing-instruction-or-xmldecl () (let (name) (and (match #\?) (setf name (name s)) (none-or-more s #'ws-attr-or-nsdecl) (match-seq #\? #\>) (make-element :type 'pi :val name))))
(defrule processing-instruction () (let ((p (processing-instruction-or-xmldecl s))) (and p (not (string= (element-val p) "xml")) p)))
(defrule xmldecl () (let ((p (processing-instruction-or-xmldecl s))) (and p (string= (element-val p) "xml") p)))
(defrule comment-or-doctype () ;; skip dtd - bail out to comment if it's a comment ;; only match doctype once (and (peek #\!) (or (comment s) (and (not (state-got-doctype s)) (must (xml-parse-error :offset (file-position (state-stream s))) (match-seq #\D #\O #\C #\T #\Y #\P #\E)) (loop with level = 1 do (case (eat) (#\> (decf level)) (#\< (incf level))) until (eq level 0) finally (return t)) (setf (state-got-doctype s) t) (make-element :type 'doctype)))))
(defrule misc () (or (ws s) (and (match #\<) (must (xml-parse-error :offset (file-position (state-stream s))) (or (processing-instruction s) (comment-or-doctype s) (element s))))))
(defrule document () (let (elem) (if (match #\<) (must (xml-parse-error :offset (file-position (state-stream s))) (or (processing-instruction-or-xmldecl s) (comment-or-doctype s) (setf elem (element s))))) (unless elem (loop for c = (misc s) while c do (if (eql (element-type c) 'elem) (return (setf elem c))))) (and elem (element-val elem))))
Function PARSE Parses the supplied stream or string into a lisp node tree.