Changeset 17
- Timestamp:
- 05/28/07 16:09:07 (18 months ago)
- Files:
-
- 1 modified
-
trunk/code/xparser/xml-stream-coding.lisp (modified) (14 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/code/xparser/xml-stream-coding.lisp
r4 r17 30 30 <DELTA DATE='20020118'>corrections to encoding-stream-writer for non-mcl; 31 31 fixed autodetect code to recognize allegro :EOF; fixed us-ascii stream-reader for same.</DELTA> 32 <DELTA DATE='20030602'>provisioins for allegro simple-streams with character element-type</DELTA> 32 <DELTA DATE='20030602'>provisions for allegro simple-streams with character element-type</DELTA> 33 <DELTA DATE='20030602'>scl</DELTA> 34 <DELTA DATE='20040123'>correction to utf-16-{21,12} order discovered from initial declaration 35 bytes rather than from BOM.</DELTA> 36 <DELTA DATE='20060116'>lsh ccl-specific</DELTA> 37 <DELTA DATE='20070415'>corrected utf-8 encoding bounds.</DELTA> 33 38 </CHRONOLOGY> 34 39 </DOCUMENTATION> … … 37 42 (in-package "XML-PARSER") 38 43 39 #+(or ALLEGRO LispWorks CormanLisp CMU sbcl) 44 (defvar *illegal-utf-data-condition* nil) 45 46 #+(or ALLEGRO LispWorks CormanLisp CMU sbcl scl) 40 47 (defMacro lsh (x y) `(ash ,x ,y)) 48 49 #+ccl 50 (unless (eq (symbol-package 'lsh) (find-package :ccl)) 51 (defmacro lsh (x y) `(ccl:lsh ,x ,y))) 41 52 42 53 #+(and allegro allegro-version>= (not (version>= 6 0))) … … 173 184 stream)) 174 185 186 #+scl 187 (defMethod stream-reader ((stream stream)) 188 (values (if (subtypep (stream-element-type stream) 'character) 189 #'(lambda (stream) 190 (read-char stream nil nil)) 191 #'(lambda (stream) 192 (read-byte stream nil nil))) 193 stream)) 194 #+scl 195 (defMethod stream-writer ((stream stream)) 196 (values (if (subtypep (stream-element-type stream) 'character) 197 #'(lambda (stream char) 198 (write-char char stream)) 199 #'(lambda (stream byte) 200 (write-byte byte stream))) 201 stream)) 202 203 175 204 #+sbcl 176 205 (defmethod stream-element-type ((stream stream)) … … 221 250 ;; ? not sure that continuability is a good idea, but 222 251 (unless (typep io-buffer 'ccl::io-buffer) 223 (setq io-buffer ( require-type io-buffer 'ccl::io-buffer)))252 (setq io-buffer (ccl:require-type io-buffer 'ccl::io-buffer))) 224 253 (if (characterp byte) (setf byte (char-code byte))) 225 254 (ccl::%io-buffer-write-byte io-buffer byte))) … … 342 371 (multiple-value-bind (function arg) 343 372 (stream-reader stream) 344 (values #'(lambda (function.arg &aux byte1) 345 (block read-utf-8-datum 346 (flet ((read-byte-code (&aux byte) 347 (setf byte (funcall-function.arg function.arg)) 348 (if (integerp byte) byte 349 (return-from read-utf-8-datum nil)))) 350 (declare (type fixnum byte1) 351 (ftype (function () fixnum) read-byte-code) 352 (optimize (speed 3) (safety 0))) 353 (setf byte1 (read-byte-code)) 354 (cond ((= 0 (logand #x80 byte1)) 355 byte1) 356 ((= #xc0 (logand #xe0 byte1)) 357 (logior (lsh (logand byte1 #x1f) 6) 358 (logand (read-byte-code) #x3f))) 359 ((= #xe0 (logand #xf0 byte1)) 360 (logior (logior (lsh (logand byte1 #x0f) 12) 361 (lsh (logand (read-byte-code) #x3f) 6)) 362 (logand (read-byte-code) #x3f))) 363 ((= #xf0 (logand #xf8 byte1)) 364 (let ((byte2 (read-byte-code)) 365 (byte3 (read-byte-code)) 366 (byte4 (read-byte-code))) 367 (xml-error "unsupported unicode datum: ~s." 368 (list byte1 byte2 byte3 byte4)))) 369 (t 370 (xml-error "illegal UTF-8 data: x~2,'0x." byte1)))))) 371 (cons function arg) 372 :UTF-8))) 373 (let ((reader nil)) 374 (setf reader #'(lambda (function.arg &aux byte1) 375 (block read-utf-8-datum 376 (flet ((read-byte-code (&aux byte) 377 (setf byte (funcall-function.arg function.arg)) 378 (if (integerp byte) byte 379 (return-from read-utf-8-datum nil)))) 380 (declare (type fixnum byte1) 381 (ftype (function () fixnum) read-byte-code) 382 (optimize (speed 3) (safety 0))) 383 (setf byte1 (read-byte-code)) 384 (cond ((= 0 (logand #x80 byte1)) 385 byte1) 386 ((= #xc0 (logand #xe0 byte1)) 387 (logior (lsh (logand byte1 #x1f) 6) 388 (logand (read-byte-code) #x3f))) 389 ((= #xe0 (logand #xf0 byte1)) 390 (logior (logior (lsh (logand byte1 #x0f) 12) 391 (lsh (logand (read-byte-code) #x3f) 6)) 392 (logand (read-byte-code) #x3f))) 393 ((= #xf0 (logand #xf8 byte1)) 394 (let ((byte2 (read-byte-code)) 395 (byte3 (read-byte-code)) 396 (byte4 (read-byte-code))) 397 (xml-error "unsupported unicode datum: ~s." 398 (list byte1 byte2 byte3 byte4)))) 399 (t 400 (ecase *illegal-utf-data-condition* 401 ((nil) (funcall reader function.arg)) 402 (error (xml-error "illegal UTF-8 data: x~2,'0x." byte1)) 403 (warning (xml-warn "illegal UTF-8 data: x~2,'0x." byte1) 404 (funcall reader function.arg))))))))) 405 (values reader 406 (cons function arg) 407 :UTF-8)))) 373 408 374 409 … … 379 414 (stream-writer stream) 380 415 (values #'(lambda (function.arg char &aux (code (char-code char))) 381 (cond ((<= code 255)416 (cond ((<= code #x007f) 382 417 (funcall-function.arg function.arg code)) 383 ((<= code #x0 3ff)418 ((<= code #x07ff) 384 419 (funcall-function.arg function.arg (logior #b11000000 (lsh code -6))) 385 420 (funcall-function.arg function.arg (logior #b10000000 (logand code #b00111111)))) … … 540 575 (stream-reader stream) 541 576 (values #'(lambda (function.arg) 542 (declare (ftype (function () fixnum) read-byte-code) 543 (optimize (speed 3) (safety 0))) 577 (declare (optimize (speed 3) (safety 0))) 544 578 (block read-utf-16-datum 545 579 (flet ((read-byte-code (&aux byte) … … 547 581 (if (integerp byte) byte 548 582 (return-from read-utf-16-datum nil)))) 583 (declare (ftype (function () fixnum) read-byte-code)) 549 584 (+ (lsh (read-byte-code) 8) (read-byte-code))))) 550 585 (cons function arg) … … 573 608 (stream-reader stream) 574 609 (values #'(lambda (function.arg) 575 (declare (ftype (function () fixnum) read-byte-code) 576 (optimize (speed 3) (safety 0))) 610 (declare (optimize (speed 3) (safety 0))) 577 611 (block read-utf-16-datum 578 612 (flet ((read-byte-code (&aux byte) … … 580 614 (if (integerp byte) byte 581 615 (return-from read-utf-16-datum nil)))) 616 (declare (ftype (function () fixnum) read-byte-code)) 582 617 (+ (read-byte-code) (lsh (read-byte-code) 8))))) 583 618 (cons function arg) … … 632 667 (#x3f 633 668 (when *xml-verbose* 634 (warn "assuming UTF-16- 21encoding for stream: ~s." stream))635 (setf encoding :UTF-16- 21to-reread "<?"))669 (warn "assuming UTF-16-12 encoding for stream: ~s." stream)) 670 (setf encoding :UTF-16-12 to-reread "<?")) 636 671 (t (|EC: Byte Order Mark| :data (list #x00 #x3c #x00 byte1)))) 637 672 (|EC: Byte Order Mark| :data (list #x00 #x3c byte1)))) … … 643 678 (#x00 644 679 (when *xml-verbose* 645 (warn "assuming UTF-16- 12encoding for stream: ~s." stream))646 (setf encoding :UTF-16- 12to-reread "<?"))680 (warn "assuming UTF-16-21 encoding for stream: ~s." stream)) 681 (setf encoding :UTF-16-21 to-reread "<?")) 647 682 (t 648 683 (|EC: Byte Order Mark| :data (list #x3c #x00 #x3f byte1))))) … … 691 726 #| 692 727 693 (defun utf-8-decoding (byte1 &optional (byte2 0) (byte3 0) (byte4 0)) 728 (defun utf-8-decoding (byte1 &optional (byte2 0) (byte3 0) (byte4 0) &rest ignore) 729 (declare (ignore ignore)) 694 730 (cond ((= 0 (logand #x80 byte1)) 695 byte1)731 (values byte1 1)) 696 732 ((= #xc0 (logand #xe0 byte1)) 697 (logior (lsh (logand byte1 #x1f) 6) (logand byte2 #x3f))) 733 (values (logior (lsh (logand byte1 #x1f) 6) (logand byte2 #x3f)) 734 2)) 698 735 ((= #xe0 (logand #xf0 byte1)) 699 (logior (logior (lsh (logand byte1 #x0f) 12) (lsh (logand byte2 #x3f) 6)) (logand byte3 #x3f))) 736 (values (logior (logior (lsh (logand byte1 #x0f) 12) (lsh (logand byte2 #x3f) 6)) (logand byte3 #x3f)) 737 3)) 700 738 ((= #xf0 (logand #xf8 byte1)) 701 (xml-error "unsupported unicode datum: ~s." 702 (list byte1 byte2 byte3 byte4))) 739 (values (logior (logior (lsh (logand byte1 #x03) 18) 740 (lsh (logand byte2 #x3f) 12)) 741 (logior (lsh (logand byte3 #x3f) 6) 742 (logand byte4 #x3f))) 743 4)) 703 744 (t 704 745 (xml-error "illegal UTF-8 data: #x~2,'0x." byte1)))) 746 705 747 (xml-namechar? (utf-8-decoding 194 183 0 0)) 706 748 (xml-space? (utf-8-decoding 239 187 191)) … … 715 757 condition)))) 716 758 717 (format nil "~{~8,'0x~^~%~}" (mapcar #'utf-8-decodingXstring 759 (format nil "~{~8,'0x~^~%~}" (mapcar #'(lambda (string) (multiple-value-list 760 (utf-8-decodingXstring string))) 718 761 '("í¿" "æ°" "î" "ᅵ" "ð"))) 762 763 (defGeneric decode-utf8 (source) 764 (:method ((string string)) (decode-utf8 (map 'list #'char-code string))) 765 (:method ((vector vector)) (decode-utf8 (map 'list #'identity vector))) 766 (:method ((source list)) 767 (let ((result nil)) 768 (loop (unless source (return (reverse result))) 769 (multiple-value-bind (decoded code-unit-count) 770 (apply #'utf-8-decoding source) 771 (push (cons (if (< decoded char-code-limit) 772 (cons (code-char decoded) (format nil "#u~4,'0x" decoded)) 773 (format nil "#u~4,'0x" decoded)) 774 code-unit-count) 775 result) 776 (setf source (nthcdr code-unit-count source))))))) 777 778 (decode-utf8 "asdfqwer") 779 (decode-utf8 "í¿") 780 (decode-utf8 #(#x3c #x42 #x69 #x72 #x74 #x68 #x43 #x69 #x74 #x79 #x3e #x4b #xef #xbf #xbd #x2f 781 #x42 #x69 #x72 #x74 #x68 #x43 #x69 #x74 #x79 #x3e)) 719 782 720 783 from the 3.1 unicode report
