CMUCL commit: src/pcl/simple-streams/external-formats (utf-8.lisp)

Raymond Toy rtoy at common-lisp.net
Fri Jul 2 18:36:01 CEST 2010


    Date: Friday, July 2, 2010 @ 12:36:01
  Author: rtoy
    Path: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats

Modified: utf-8.lisp

o Rearrange error calls so that error function can just return the
  desired replacement code.
o Generate different messages for the possible caes of invalid
  codepoint, overlong sequence, and surrogate characters in UTF8
  stream.


------------+
 utf-8.lisp |   64 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 26 deletions(-)


Index: src/pcl/simple-streams/external-formats/utf-8.lisp
diff -u src/pcl/simple-streams/external-formats/utf-8.lisp:1.7 src/pcl/simple-streams/external-formats/utf-8.lisp:1.8
--- src/pcl/simple-streams/external-formats/utf-8.lisp:1.7	Thu Jul  1 22:50:35 2010
+++ src/pcl/simple-streams/external-formats/utf-8.lisp	Fri Jul  2 12:36:01 2010
@@ -4,7 +4,7 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.7 2010-07-02 02:50:35 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.8 2010-07-02 16:36:01 rtoy Exp $")
 
 (in-package "STREAM")
 
@@ -34,41 +34,53 @@
 			  (progn
 			    (,unput 1)
 			    (return
-			      (if ,error
-				  (funcall ,error "Invalid utf8 byte #x~X at offset ~D" ,c (1+ ,j) ,unput)
-				  (values +replacement-character-code+ (1+ ,j))))))))))
+			      (values
+			       (if ,error
+				   (funcall ,error "Invalid utf8 octet #x~X at offset ~D"
+					    ,c (1+ ,j))
+				   +replacement-character-code+)
+			       (1+ ,j)))))))))
 	      (check (,n ,i)
-	       (declare (type (unsigned-byte 31) ,n)
-			(type (integer 1 5) ,i))
-	       ;; We check for overlong sequences (sequences that
-	       ;; encode to codepoints that don't need that long of a
-	       ;; sequence) and any surrogate values and any code
-	       ;; outside the 21-bit Unicode range.
-	       (if (or (> ,n #x10ffff)
-		       (<= ,n (the (member 127 1023 32767)
-				(svref #(127 1023 32767) (1- ,i)))) ; overlong
-		       (lisp::surrogatep ,n)) ; surrogate
-		   (progn
-		     (,unput ,i)
-		     (if ,error
-			 (funcall ,error "Overlong utf8 sequence" nil 1 ,unput)
-			 (values +replacement-character-code+ 1)))
-		   (values ,n (1+ ,i)))))
+		(declare (type (unsigned-byte 31) ,n)
+			 (type (integer 1 5) ,i))
+		;; We check for overlong sequences (sequences that
+		;; encode to codepoints that don't need that long of a
+		;; sequence) and any surrogate values and any code
+		;; outside the 21-bit Unicode range.
+		(if (or (> ,n #x10ffff)
+			(<= ,n (the (member 127 1023 32767)
+				 (svref #(127 1023 32767) (1- ,i)))) ; overlong
+			(lisp::surrogatep ,n)) ; surrogate
+		    (progn
+		      (,unput ,i)
+		      (values (if ,error
+				  (cond
+				    ((> ,n #x10ffff)
+				     (funcall ,error "Invalid codepoint" nil nil))
+				    ((lisp::surrogatep ,n)
+				     (funcall ,error "Invalid surrogate code #x~4,0X" ,n nil))
+				    (t
+				     (funcall ,error "Overlong utf8 sequence" nil nil)))
+				  +replacement-character-code+)
+			      1))
+		    (values ,n (1+ ,i)))))
       (let ((,c ,input))
 	(declare (optimize (ext:inhibit-warnings 3)))
 	(cond ((null ,c) (values nil 0))
 	      ((< ,c #b10000000) (values ,c 1))
 	      ((< ,c #b11000010)
-	       (if ,error
-		   (funcall ,error "Invalid second utf8 octet: #x~X" ,c 1 ,unput)
-		   (values +replacement-character-code+ 1)))
+	       (values (if ,error
+			   (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1)
+			   +replacement-character-code+)
+		       1))
 	      ((< ,c #b11100000) (utf8 ,c 1))
 	      ((< ,c #b11110000) (utf8 ,c 2))
 	      ((< ,c #b11111000) (utf8 ,c 3))
 	      (t
-	       (if ,error
-		   (funcall ,error "Invalid fourth utf8 octet: #x~X" ,c 1 ,unput)
-		   (values +replacement-character-code+ 1)))))))
+	       (values (if ,error
+			   (funcall ,error "Invalid initial utf8 octet: #x~X" ,c 1)
+			   +replacement-character-code+)
+		       1))))))
   (code-to-octets (code state output error i j n p init)
     `(flet ((utf8 (,n ,i)
           (let* ((,j (- 6 ,i))



More information about the cmucl-commit mailing list