CMUCL commit: src (5 files)

Raymond Toy rtoy at common-lisp.net
Fri Jul 2 04:50:35 CEST 2010


    Date: Thursday, July 1, 2010 @ 22:50:35
  Author: rtoy
    Path: /project/cmucl/cvsroot/src

Modified: code/extfmts.lisp code/fd-stream-extfmt.lisp code/fd-stream.lisp
          code/stream.lisp pcl/simple-streams/external-formats/utf-8.lisp

Implement more of the external format error handlers.

code/extfmts.lisp
o Call the error handler for iso8859-1 output.
o In OCTETS-TO-CODEPOINT and CODEPOINT-TO-OCTETS, call the external
  format with the error argument.
o In OCTETS-TO-CHAR
  - Call OCTETS-TO-CODEPOINT with the error handler.
  - For all of the error conditions, call the error handler if
    defined.
o Add error parameter to EF-STRING-TO-OCTETS and EF-ENCODE so we can
  handle errors.  Call CHAR-TO-OCTETS with the error handler.
o Add error parameter to STRING-TO-OCTETS and use it.
o Add error parameter to EF-OCTETS-TO-STRING and EF-DECODE so we can
  handle errors.  Call OCTETS-TO-CHAR with the error handler.
o Add error parameter to OCTETS-TO-STRING and use it.
o In STRING-ENCODE and STRING-DECODE, call the ef function with the
  error handler.
o Change STRING-ENCODE to use keyword args instead of optional args.
  Add error parameter and use it.

code/fd-stream-extfmt.lisp:
o Tell OCTETS-TO-STRING about the error handler stored in the
  fd-stream. 

code/fd-stream.lisp:
o OPEN, MAKE-FD-STREAM, and OPEN-FD-STREAM get DECODING-ERROR and
  ENCODING-ERROR keyword arguments for specifying how to handle
  decoding and encoding errors in external formats.

code/stream.lisp:
o Make sure the error handler is called in
  FAST-READ-CHAR-STRING-REFILL. 

pcl/simple-streams/external-formats/utf-8.lisp:
o Initial cut at calling the error handler for the various possible
  invalid octet streams for a utf-8 encoding.


------------------------------------------------+
 code/extfmts.lisp                              |   71 +++++++++++++++--------
 code/fd-stream-extfmt.lisp                     |    8 +-
 code/fd-stream.lisp                            |   36 ++++++++---
 code/stream.lisp                               |    5 -
 pcl/simple-streams/external-formats/utf-8.lisp |   53 ++++++++++-------
 5 files changed, 113 insertions(+), 60 deletions(-)


Index: src/code/extfmts.lisp
diff -u src/code/extfmts.lisp:1.26 src/code/extfmts.lisp:1.27
--- src/code/extfmts.lisp:1.26	Tue Jun 29 23:53:28 2010
+++ src/code/extfmts.lisp	Thu Jul  1 22:50:35 2010
@@ -5,7 +5,7 @@
 ;;; domain.
 ;;; 
 (ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.26 2010-06-30 03:53:28 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.27 2010-07-02 02:50:35 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -522,7 +522,12 @@
   (octets-to-code (state input unput error)
     `(values ,input 1))
   (code-to-octets (code state output error)
-    `(,output (if (> ,code 255) #x3F ,code))))
+    `(,output (if (> ,code 255)
+		  (if ,error
+		      (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream"
+			       ,code 1)
+		      #x3F)
+		  ,code))))
 
 ;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS  -- Semi-Public
 ;;;
@@ -535,13 +540,13 @@
   (let ((tmp1 (gensym)) (tmp2 (gensym))
 	(ef (find-external-format external-format)))
     `(multiple-value-bind (,tmp1 ,tmp2)
-	 ,(funcall (ef-octets-to-code ef) state input unput)
+	 ,(funcall (ef-octets-to-code ef) state input unput error)
        (setf ,count (the kernel:index ,tmp2))
        (the (or lisp:codepoint null) ,tmp1))))
 
 (defmacro codepoint-to-octets (external-format code state output &optional error)
   (let ((ef (find-external-format external-format)))
-    (funcall (ef-code-to-octets ef) code state output)))
+    (funcall (ef-code-to-octets ef) code state output error)))
 
 
 
@@ -617,8 +622,10 @@
 	     ;;@@ on non-Unicode builds, limit to 8-bit chars
 	     ;;@@ if unicode-bootstrap, can't use #\u+fffd
 	     (cond ((or (lisp::surrogatep code) (> code #x10FFFF))
-		    #-(and unicode (not unicode-bootstrap)) #\?
-		    #+(and unicode (not unicode-bootstrap)) #\U+FFFD)
+		    (if ,error
+			(funcall ,error "Cannot output codepoint #x~X" code)
+			#-(and unicode (not unicode-bootstrap)) #\?
+			#+(and unicode (not unicode-bootstrap)) #\U+FFFD))
 		   #+unicode
 		   ((> code #xFFFF)
 		    (multiple-value-bind (hi lo) (surrogates code)
@@ -646,12 +653,18 @@
 		     ;; the replacement character.
 		     (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
 				 (surrogates-to-codepoint (car ,nstate) ,nchar)
-				 +replacement-character-code+))
+				 (if ,error
+				     (funcall ,error "Cannot convert invalid surrogate #x~X to character"
+					      ,nchar)
+				     +replacement-character-code+)))
 		   (setf (car ,nstate) nil))
 		 ;; A lone trailing (low) surrogate gets replaced with
 		 ;; the replacement character.
 		 (,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
-			     +replacement-character-code+
+			     (if ,error
+				 (funcall ,error "Cannot convert lone trailing surrogate #x~X to character"
+					  ,nchar)
+				 +replacement-character-code+)
 			     (char-code ,nchar)))))))))
 
 (defmacro flush-state (external-format state output)
@@ -667,7 +680,7 @@
       (funcall f state))))
 
 (def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
-  `(lambda (string start end buffer &aux (ptr 0) (state nil))
+  `(lambda (string start end buffer &optional error &aux (ptr 0) (state nil))
      (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
 	      (type simple-string string)
 	      (type kernel:index start end ptr)
@@ -679,10 +692,12 @@
 		      (lambda (b)
 			(when (= ptr (length buffer))
 			  (setq buffer (adjust-array buffer (* 2 ptr))))
-			(setf (aref buffer (1- (incf ptr))) b))))))
+			(setf (aref buffer (1- (incf ptr))) b))
+		      error))))
 
 (defun string-to-octets (string &key (start 0) end (external-format :default)
-				     (buffer nil bufferp))
+				     (buffer nil bufferp)
+			             error)
   "Convert String to octets using the specified External-format.  The
    string is bounded by Start (defaulting to 0) and End (defaulting to
    the end of the string.  If Buffer is given, the octets are stored
@@ -696,11 +711,12 @@
     (multiple-value-bind (buffer ptr)
 	(lisp::with-array-data ((string string) (start start) (end end))
 	  (funcall (ef-string-to-octets external-format)
-		   string start end buffer))
+		   string start end buffer error))
       (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
 
 (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
-  `(lambda (octets ptr end state string s-start s-end &aux (pos s-start) (count 0) (last-octet 0))
+  `(lambda (octets ptr end state string s-start s-end &optional error
+	    &aux (pos s-start) (count 0) (last-octet 0))
      (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
 	      (type (simple-array (unsigned-byte 8) (*)) octets)
 	      (type kernel:index pos end count last-octet s-start s-end)
@@ -714,7 +730,8 @@
 				   (if (>= ptr end)
 				       (throw 'end-of-octets nil)
 				       (aref octets (incf ptr)))
-				   (lambda (n) (decf ptr n))))
+				   (lambda (n) (decf ptr n))
+				   error))
 	  (incf pos)
 	  (incf last-octet count)))
      (values string pos last-octet state)))
@@ -722,7 +739,8 @@
 (defun octets-to-string (octets &key (start 0) end (external-format :default)
 				     (string nil stringp)
 			             (s-start 0) (s-end nil s-end-p)
-			             (state nil))
+			             (state nil)
+			             error)
   "Octets-to-string converts an array of octets in Octets to a string
   according to the specified External-format.  The array of octets is
   bounded by Start (defaulting ot 0) and End (defaulting to the end of
@@ -751,13 +769,14 @@
 		 octets (1- start) (1- (or end (length octets)))
 		 state
 		 (or string (make-string (length octets)))
-		 s-start s-end)
+		 s-start s-end
+		 error)
       (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet new-state))))
 
 
 
 (def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
-  `(lambda (string start end result &aux (ptr 0) (state nil))
+  `(lambda (string start end result &optional error  &aux (ptr 0) (state nil))
      (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
 	      (type simple-string string)
 	      (type kernel:index start end ptr)
@@ -770,9 +789,10 @@
 			 (when (= ptr (length result))
 			   (setq result (adjust-array result (* 2 ptr))))
 			 (setf (aref result (1- (incf ptr)))
-			     (code-char b)))))))
+			       (code-char b)))
+		       error))))
 
-(defun string-encode (string external-format &optional (start 0) end)
+(defun string-encode (string external-format &key (start 0) end error)
   "Encode the given String using External-Format and return a new
   string.  The characters of the new string are the octets of the
   encoded result, with each octet converted to a character via
@@ -782,11 +802,12 @@
   (multiple-value-bind (result ptr)
       (lisp::with-array-data ((string string) (start start) (end end))
 	(funcall (ef-encode external-format) string start end
-		 (make-string (length string) :element-type 'base-char)))
+		 (make-string (length string) :element-type 'base-char)
+		 error))
     (lisp::shrink-vector result ptr)))
 
 (def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
-  `(lambda (string ptr end result &aux (pos -1) (count 0) (state nil))
+  `(lambda (string ptr end result &optional error &aux (pos -1) (count 0) (state nil))
      (declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
 	      (type simple-string string)
 	      (type kernel:index end count)
@@ -802,10 +823,11 @@
 			       (if (= (1+ ptr) (length string))
 				   nil
 				   (char-code (char string (incf ptr))))
-			       (lambda (n) (decf ptr n))))
+			       (lambda (n) (decf ptr n))
+			       error))
 	finally (return (values result (1+ pos))))))
 
-(defun string-decode (string external-format &optional (start 0) end)
+(defun string-decode (string external-format &key (start 0) end error)
   "Decode String using the given External-Format and return the new
   string.  The input string is treated as if it were an array of
   octets, where the char-code of each character is the octet.  This is
@@ -815,7 +837,8 @@
   (multiple-value-bind (result pos)
       (lisp::with-array-data ((string string) (start start) (end end))
 	(funcall (ef-decode external-format)
-		 string (1- start) (1- end) (make-string (length string))))
+		 string (1- start) (1- end) (make-string (length string))
+		 error))
     (lisp::shrink-vector result pos)))
 
 
Index: src/code/fd-stream-extfmt.lisp
diff -u src/code/fd-stream-extfmt.lisp:1.8 src/code/fd-stream-extfmt.lisp:1.9
--- src/code/fd-stream-extfmt.lisp:1.8	Tue Apr 20 13:57:44 2010
+++ src/code/fd-stream-extfmt.lisp	Thu Jul  1 22:50:35 2010
@@ -5,7 +5,7 @@
 ;;; domain.
 ;;; 
 (ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.8 2010-04-20 17:57:44 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.9 2010-07-02 02:50:35 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -94,7 +94,8 @@
 			      :start 0
 			      :external-format old-format
 			      :string (make-string sindex)
-			      :state state)
+			      :state state
+			      :error (fd-stream-octets-to-char-error stream))
 	  (declare (ignore s pos))
 	  (setf state new-state)
 	  (setf index count))
@@ -108,7 +109,8 @@
 			      :external-format (fd-stream-external-format stream)
 			      :string (lisp-stream-string-buffer stream)
 			      :s-start 1
-			      :state state)
+			      :state state
+			      :error (fd-stream-octets-to-char-error stream))
 	  (declare (ignore s))
 	  (setf (lisp-stream-string-index stream) 1)
 	  (setf (lisp-stream-string-buffer-len stream) pos)
Index: src/code/fd-stream.lisp
diff -u src/code/fd-stream.lisp:1.102 src/code/fd-stream.lisp:1.103
--- src/code/fd-stream.lisp:1.102	Tue Jun 29 20:52:15 2010
+++ src/code/fd-stream.lisp	Thu Jul  1 22:50:35 2010
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.102 2010-06-30 00:52:15 rtoy Exp $")
+  "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.103 2010-07-02 02:50:35 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -270,12 +270,19 @@
   ;; characters.  If NIL, then the external format should handle it
   ;; itself, doing whatever is deemed appropriate.  If non-NIL, this
   ;; should be a function (or symbol) that the external format can
-  ;; funcall to deal with the error.
+  ;; funcall to deal with the error.  The function should take 4
+  ;; arguments: a message string, the offending octet, the number of
+  ;; octets read so far in decoding, and a function to unput
+  ;; characters.  If the function returns, it should return the code
+  ;; of the desired replacement character and the number of octets
+  ;; read (the input parameter).
   #+unicode
   (octets-to-char-error nil)
   ;;
   ;; Like OCTETS-TO-CHAR-ERROR, but for converting characters to
-  ;; octets for output.
+  ;; octets for output.  The function takes two arguments: a message
+  ;; string and the codepoint that cannot be converted.  The function
+  ;; should return the octet that should be output.
   #+unicode
   (char-to-octets-error nil))
 
@@ -1782,7 +1789,9 @@
 				 (format nil "descriptor ~D" fd)))
 		       auto-close
 		       (external-format :default)
-		       binary-stream-p)
+		       binary-stream-p
+		       decoding-error
+		       encoding-error)
   (declare (type index fd) (type (or index null) timeout)
 	   (type (member :none :line :full) buffering))
   "Create a stream for the given unix file descriptor.
@@ -1808,14 +1817,16 @@
 					      :pathname pathname
 					      :buffering buffering
 					      :timeout timeout)
-		    (%make-fd-stream :fd fd
+b		    (%make-fd-stream :fd fd
 				     :name name
 				     :file file
 				     :original original
 				     :delete-original delete-original
 				     :pathname pathname
 				     :buffering buffering
-				     :timeout timeout))))
+				     :timeout timeout
+				     :char-to-octets-error encoding-error
+				     :octets-to-char-error decoding-error))))
     ;; FIXME: setting the external format here should be better
     ;; integrated into set-routines.  We do it before so that
     ;; set-routines can create an in-buffer if appropriate.  But we
@@ -2096,7 +2107,8 @@
 				(if-exists nil if-exists-given)
 				(if-does-not-exist nil if-does-not-exist-given)
 				(external-format :default)
-		                class)
+		                class
+		                decoding-error encoding-error)
   (declare (type pathname pathname)
            (type (member :input :output :io :probe) direction)
            (type (member :error :new-version :rename :rename-and-delete
@@ -2121,7 +2133,9 @@
 			 :input-buffer-p t
 			 :auto-close t
 			 :external-format external-format
-			 :binary-stream-p class))
+			 :binary-stream-p class
+			 :decoding-error decoding-error
+			 :encoding-error encoding-error))
 	(:probe
 	 (let ((stream (%make-fd-stream :name namestring :fd fd
 					:pathname pathname
@@ -2145,7 +2159,8 @@
 		      (options options)
 		      (direction direction)
 		      (if-does-not-exist if-does-not-exist)
-		      (if-exists if-exists))
+		      (if-exists if-exists)
+	              decoding-error encoding-error)
   "Return a stream which reads from or writes to Filename.
   Defined keywords:
    :direction - one of :input, :output, :io, or :probe
@@ -2155,7 +2170,8 @@
    :if-does-not-exist - one of :error, :create or nil
    :external-format - an external format name
   See the manual for details."
-  (declare (ignore element-type external-format input-handle output-handle))
+  (declare (ignore element-type external-format input-handle output-handle
+		   decoding-error encoding-error))
 
   ;; OPEN signals a file-error if the filename is wild.
   (when (wild-pathname-p filename)
Index: src/code/stream.lisp
diff -u src/code/stream.lisp:1.92 src/code/stream.lisp:1.93
--- src/code/stream.lisp:1.92	Tue Apr 20 13:57:45 2010
+++ src/code/stream.lisp	Thu Jul  1 22:50:35 2010
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.92 2010-04-20 17:57:45 rtoy Exp $")
+  "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.93 2010-07-02 02:50:35 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -781,7 +781,8 @@
 				     :state (fd-stream-oc-state stream)
 				     :string sbuf
 				     :s-start 1
-				     :external-format (fd-stream-external-format stream))
+				     :external-format (fd-stream-external-format stream)
+				     :error (fd-stream-octets-to-char-error stream))
 		 (declare (ignore s))
 		 (setf (fd-stream-oc-state stream) new-state)
 		 (setf (lisp-stream-string-buffer-len stream) char-count)
Index: src/pcl/simple-streams/external-formats/utf-8.lisp
diff -u src/pcl/simple-streams/external-formats/utf-8.lisp:1.6 src/pcl/simple-streams/external-formats/utf-8.lisp:1.7
--- src/pcl/simple-streams/external-formats/utf-8.lisp:1.6	Wed Jun 30 00:02:53 2010
+++ src/pcl/simple-streams/external-formats/utf-8.lisp	Thu Jul  1 22:50:35 2010
@@ -4,7 +4,7 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.6 2010-06-30 04:02:53 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.7 2010-07-02 02:50:35 rtoy Exp $")
 
 (in-package "STREAM")
 
@@ -17,23 +17,26 @@
   ()
   (octets-to-code (state input unput error c i j n)
     `(labels ((utf8 (,c ,i)
-	       (declare (type (unsigned-byte 8) ,c)
-			(type (integer 1 5) ,i))
-	       (let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c)
-			      (* 6 ,i))))
-		 (declare (type (unsigned-byte 31) ,n))
-		 (dotimes (,j ,i (check ,n ,i))
-		   (let ((,c ,input))
-		     ;; Following bytes must all have the form
-		     ;; #b10xxxxxx.  If not, put back the octet we
-		     ;; just read and return the replacement character
-		     ;; for the bad sequence.
-		     (if (< (logxor ,c #x80) #x40)
-			 (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n)
-			       (ldb (byte 6 0) ,c))
-			 (progn
-			   (,unput 1)
-			   (return (values +replacement-character-code+ (1+ ,j)))))))))
+		(declare (type (unsigned-byte 8) ,c)
+			 (type (integer 1 5) ,i))
+		(let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c)
+			       (* 6 ,i))))
+		  (declare (type (unsigned-byte 31) ,n))
+		  (dotimes (,j ,i (check ,n ,i))
+		    (let ((,c ,input))
+		      ;; Following bytes must all have the form
+		      ;; #b10xxxxxx.  If not, put back the octet we
+		      ;; just read and return the replacement character
+		      ;; for the bad sequence.
+		      (if (< (logxor ,c #x80) #x40)
+			  (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n)
+				(ldb (byte 6 0) ,c))
+			  (progn
+			    (,unput 1)
+			    (return
+			      (if ,error
+				  (funcall ,error "Invalid utf8 byte #x~X at offset ~D" ,c (1+ ,j) ,unput)
+				  (values +replacement-character-code+ (1+ ,j))))))))))
 	      (check (,n ,i)
 	       (declare (type (unsigned-byte 31) ,n)
 			(type (integer 1 5) ,i))
@@ -47,17 +50,25 @@
 		       (lisp::surrogatep ,n)) ; surrogate
 		   (progn
 		     (,unput ,i)
-		     (values +replacement-character-code+ 1))
+		     (if ,error
+			 (funcall ,error "Overlong utf8 sequence" nil 1 ,unput)
+			 (values +replacement-character-code+ 1)))
 		   (values ,n (1+ ,i)))))
       (let ((,c ,input))
 	(declare (optimize (ext:inhibit-warnings 3)))
 	(cond ((null ,c) (values nil 0))
 	      ((< ,c #b10000000) (values ,c 1))
-	      ((< ,c #b11000010) (values +replacement-character-code+ 1))
+	      ((< ,c #b11000010)
+	       (if ,error
+		   (funcall ,error "Invalid second utf8 octet: #x~X" ,c 1 ,unput)
+		   (values +replacement-character-code+ 1)))
 	      ((< ,c #b11100000) (utf8 ,c 1))
 	      ((< ,c #b11110000) (utf8 ,c 2))
 	      ((< ,c #b11111000) (utf8 ,c 3))
-	      (t (values +replacement-character-code+ 1))))))
+	      (t
+	       (if ,error
+		   (funcall ,error "Invalid fourth utf8 octet: #x~X" ,c 1 ,unput)
+		   (values +replacement-character-code+ 1)))))))
   (code-to-octets (code state output error i j n p init)
     `(flet ((utf8 (,n ,i)
           (let* ((,j (- 6 ,i))



More information about the cmucl-commit mailing list