CMUCL commit: src (5 files)
Raymond Toy
rtoy at common-lisp.net
Fri Jul 2 04:50:35 CEST 2010
Date: Thursday, July 1, 2010 @ 22:50:35
Author: rtoy
Path: /project/cmucl/cvsroot/src
Modified: code/extfmts.lisp code/fd-stream-extfmt.lisp code/fd-stream.lisp
code/stream.lisp pcl/simple-streams/external-formats/utf-8.lisp
Implement more of the external format error handlers.
code/extfmts.lisp
o Call the error handler for iso8859-1 output.
o In OCTETS-TO-CODEPOINT and CODEPOINT-TO-OCTETS, call the external
format with the error argument.
o In OCTETS-TO-CHAR
- Call OCTETS-TO-CODEPOINT with the error handler.
- For all of the error conditions, call the error handler if
defined.
o Add error parameter to EF-STRING-TO-OCTETS and EF-ENCODE so we can
handle errors. Call CHAR-TO-OCTETS with the error handler.
o Add error parameter to STRING-TO-OCTETS and use it.
o Add error parameter to EF-OCTETS-TO-STRING and EF-DECODE so we can
handle errors. Call OCTETS-TO-CHAR with the error handler.
o Add error parameter to OCTETS-TO-STRING and use it.
o In STRING-ENCODE and STRING-DECODE, call the ef function with the
error handler.
o Change STRING-ENCODE to use keyword args instead of optional args.
Add error parameter and use it.
code/fd-stream-extfmt.lisp:
o Tell OCTETS-TO-STRING about the error handler stored in the
fd-stream.
code/fd-stream.lisp:
o OPEN, MAKE-FD-STREAM, and OPEN-FD-STREAM get DECODING-ERROR and
ENCODING-ERROR keyword arguments for specifying how to handle
decoding and encoding errors in external formats.
code/stream.lisp:
o Make sure the error handler is called in
FAST-READ-CHAR-STRING-REFILL.
pcl/simple-streams/external-formats/utf-8.lisp:
o Initial cut at calling the error handler for the various possible
invalid octet streams for a utf-8 encoding.
------------------------------------------------+
code/extfmts.lisp | 71 +++++++++++++++--------
code/fd-stream-extfmt.lisp | 8 +-
code/fd-stream.lisp | 36 ++++++++---
code/stream.lisp | 5 -
pcl/simple-streams/external-formats/utf-8.lisp | 53 ++++++++++-------
5 files changed, 113 insertions(+), 60 deletions(-)
Index: src/code/extfmts.lisp
diff -u src/code/extfmts.lisp:1.26 src/code/extfmts.lisp:1.27
--- src/code/extfmts.lisp:1.26 Tue Jun 29 23:53:28 2010
+++ src/code/extfmts.lisp Thu Jul 1 22:50:35 2010
@@ -5,7 +5,7 @@
;;; domain.
;;;
(ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.26 2010-06-30 03:53:28 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.27 2010-07-02 02:50:35 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -522,7 +522,12 @@
(octets-to-code (state input unput error)
`(values ,input 1))
(code-to-octets (code state output error)
- `(,output (if (> ,code 255) #x3F ,code))))
+ `(,output (if (> ,code 255)
+ (if ,error
+ (funcall ,error "Cannot output codepoint #x~X to ISO8859-1 stream"
+ ,code 1)
+ #x3F)
+ ,code))))
;;; OCTETS-TO-CODEPOINT, CODEPOINT-TO-OCTETS -- Semi-Public
;;;
@@ -535,13 +540,13 @@
(let ((tmp1 (gensym)) (tmp2 (gensym))
(ef (find-external-format external-format)))
`(multiple-value-bind (,tmp1 ,tmp2)
- ,(funcall (ef-octets-to-code ef) state input unput)
+ ,(funcall (ef-octets-to-code ef) state input unput error)
(setf ,count (the kernel:index ,tmp2))
(the (or lisp:codepoint null) ,tmp1))))
(defmacro codepoint-to-octets (external-format code state output &optional error)
(let ((ef (find-external-format external-format)))
- (funcall (ef-code-to-octets ef) code state output)))
+ (funcall (ef-code-to-octets ef) code state output error)))
@@ -617,8 +622,10 @@
;;@@ on non-Unicode builds, limit to 8-bit chars
;;@@ if unicode-bootstrap, can't use #\u+fffd
(cond ((or (lisp::surrogatep code) (> code #x10FFFF))
- #-(and unicode (not unicode-bootstrap)) #\?
- #+(and unicode (not unicode-bootstrap)) #\U+FFFD)
+ (if ,error
+ (funcall ,error "Cannot output codepoint #x~X" code)
+ #-(and unicode (not unicode-bootstrap)) #\?
+ #+(and unicode (not unicode-bootstrap)) #\U+FFFD))
#+unicode
((> code #xFFFF)
(multiple-value-bind (hi lo) (surrogates code)
@@ -646,12 +653,18 @@
;; the replacement character.
(,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
(surrogates-to-codepoint (car ,nstate) ,nchar)
- +replacement-character-code+))
+ (if ,error
+ (funcall ,error "Cannot convert invalid surrogate #x~X to character"
+ ,nchar)
+ +replacement-character-code+)))
(setf (car ,nstate) nil))
;; A lone trailing (low) surrogate gets replaced with
;; the replacement character.
(,wryte (if (lisp::surrogatep (char-code ,nchar) :low)
- +replacement-character-code+
+ (if ,error
+ (funcall ,error "Cannot convert lone trailing surrogate #x~X to character"
+ ,nchar)
+ +replacement-character-code+)
(char-code ,nchar)))))))))
(defmacro flush-state (external-format state output)
@@ -667,7 +680,7 @@
(funcall f state))))
(def-ef-macro ef-string-to-octets (extfmt lisp::lisp +ef-max+ +ef-so+)
- `(lambda (string start end buffer &aux (ptr 0) (state nil))
+ `(lambda (string start end buffer &optional error &aux (ptr 0) (state nil))
(declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
(type simple-string string)
(type kernel:index start end ptr)
@@ -679,10 +692,12 @@
(lambda (b)
(when (= ptr (length buffer))
(setq buffer (adjust-array buffer (* 2 ptr))))
- (setf (aref buffer (1- (incf ptr))) b))))))
+ (setf (aref buffer (1- (incf ptr))) b))
+ error))))
(defun string-to-octets (string &key (start 0) end (external-format :default)
- (buffer nil bufferp))
+ (buffer nil bufferp)
+ error)
"Convert String to octets using the specified External-format. The
string is bounded by Start (defaulting to 0) and End (defaulting to
the end of the string. If Buffer is given, the octets are stored
@@ -696,11 +711,12 @@
(multiple-value-bind (buffer ptr)
(lisp::with-array-data ((string string) (start start) (end end))
(funcall (ef-string-to-octets external-format)
- string start end buffer))
+ string start end buffer error))
(values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
(def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
- `(lambda (octets ptr end state string s-start s-end &aux (pos s-start) (count 0) (last-octet 0))
+ `(lambda (octets ptr end state string s-start s-end &optional error
+ &aux (pos s-start) (count 0) (last-octet 0))
(declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
(type (simple-array (unsigned-byte 8) (*)) octets)
(type kernel:index pos end count last-octet s-start s-end)
@@ -714,7 +730,8 @@
(if (>= ptr end)
(throw 'end-of-octets nil)
(aref octets (incf ptr)))
- (lambda (n) (decf ptr n))))
+ (lambda (n) (decf ptr n))
+ error))
(incf pos)
(incf last-octet count)))
(values string pos last-octet state)))
@@ -722,7 +739,8 @@
(defun octets-to-string (octets &key (start 0) end (external-format :default)
(string nil stringp)
(s-start 0) (s-end nil s-end-p)
- (state nil))
+ (state nil)
+ error)
"Octets-to-string converts an array of octets in Octets to a string
according to the specified External-format. The array of octets is
bounded by Start (defaulting ot 0) and End (defaulting to the end of
@@ -751,13 +769,14 @@
octets (1- start) (1- (or end (length octets)))
state
(or string (make-string (length octets)))
- s-start s-end)
+ s-start s-end
+ error)
(values (if stringp string (lisp::shrink-vector string pos)) pos last-octet new-state))))
(def-ef-macro ef-encode (extfmt lisp::lisp +ef-max+ +ef-en+)
- `(lambda (string start end result &aux (ptr 0) (state nil))
+ `(lambda (string start end result &optional error &aux (ptr 0) (state nil))
(declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
(type simple-string string)
(type kernel:index start end ptr)
@@ -770,9 +789,10 @@
(when (= ptr (length result))
(setq result (adjust-array result (* 2 ptr))))
(setf (aref result (1- (incf ptr)))
- (code-char b)))))))
+ (code-char b)))
+ error))))
-(defun string-encode (string external-format &optional (start 0) end)
+(defun string-encode (string external-format &key (start 0) end error)
"Encode the given String using External-Format and return a new
string. The characters of the new string are the octets of the
encoded result, with each octet converted to a character via
@@ -782,11 +802,12 @@
(multiple-value-bind (result ptr)
(lisp::with-array-data ((string string) (start start) (end end))
(funcall (ef-encode external-format) string start end
- (make-string (length string) :element-type 'base-char)))
+ (make-string (length string) :element-type 'base-char)
+ error))
(lisp::shrink-vector result ptr)))
(def-ef-macro ef-decode (extfmt lisp::lisp +ef-max+ +ef-de+)
- `(lambda (string ptr end result &aux (pos -1) (count 0) (state nil))
+ `(lambda (string ptr end result &optional error &aux (pos -1) (count 0) (state nil))
(declare #|(optimize (speed 3) (safety 0) (space 0) (debug 0))|#
(type simple-string string)
(type kernel:index end count)
@@ -802,10 +823,11 @@
(if (= (1+ ptr) (length string))
nil
(char-code (char string (incf ptr))))
- (lambda (n) (decf ptr n))))
+ (lambda (n) (decf ptr n))
+ error))
finally (return (values result (1+ pos))))))
-(defun string-decode (string external-format &optional (start 0) end)
+(defun string-decode (string external-format &key (start 0) end error)
"Decode String using the given External-Format and return the new
string. The input string is treated as if it were an array of
octets, where the char-code of each character is the octet. This is
@@ -815,7 +837,8 @@
(multiple-value-bind (result pos)
(lisp::with-array-data ((string string) (start start) (end end))
(funcall (ef-decode external-format)
- string (1- start) (1- end) (make-string (length string))))
+ string (1- start) (1- end) (make-string (length string))
+ error))
(lisp::shrink-vector result pos)))
Index: src/code/fd-stream-extfmt.lisp
diff -u src/code/fd-stream-extfmt.lisp:1.8 src/code/fd-stream-extfmt.lisp:1.9
--- src/code/fd-stream-extfmt.lisp:1.8 Tue Apr 20 13:57:44 2010
+++ src/code/fd-stream-extfmt.lisp Thu Jul 1 22:50:35 2010
@@ -5,7 +5,7 @@
;;; domain.
;;;
(ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.8 2010-04-20 17:57:44 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.9 2010-07-02 02:50:35 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -94,7 +94,8 @@
:start 0
:external-format old-format
:string (make-string sindex)
- :state state)
+ :state state
+ :error (fd-stream-octets-to-char-error stream))
(declare (ignore s pos))
(setf state new-state)
(setf index count))
@@ -108,7 +109,8 @@
:external-format (fd-stream-external-format stream)
:string (lisp-stream-string-buffer stream)
:s-start 1
- :state state)
+ :state state
+ :error (fd-stream-octets-to-char-error stream))
(declare (ignore s))
(setf (lisp-stream-string-index stream) 1)
(setf (lisp-stream-string-buffer-len stream) pos)
Index: src/code/fd-stream.lisp
diff -u src/code/fd-stream.lisp:1.102 src/code/fd-stream.lisp:1.103
--- src/code/fd-stream.lisp:1.102 Tue Jun 29 20:52:15 2010
+++ src/code/fd-stream.lisp Thu Jul 1 22:50:35 2010
@@ -5,7 +5,7 @@
;;; Carnegie Mellon University, and has been placed in the public domain.
;;;
(ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.102 2010-06-30 00:52:15 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.103 2010-07-02 02:50:35 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -270,12 +270,19 @@
;; characters. If NIL, then the external format should handle it
;; itself, doing whatever is deemed appropriate. If non-NIL, this
;; should be a function (or symbol) that the external format can
- ;; funcall to deal with the error.
+ ;; funcall to deal with the error. The function should take 4
+ ;; arguments: a message string, the offending octet, the number of
+ ;; octets read so far in decoding, and a function to unput
+ ;; characters. If the function returns, it should return the code
+ ;; of the desired replacement character and the number of octets
+ ;; read (the input parameter).
#+unicode
(octets-to-char-error nil)
;;
;; Like OCTETS-TO-CHAR-ERROR, but for converting characters to
- ;; octets for output.
+ ;; octets for output. The function takes two arguments: a message
+ ;; string and the codepoint that cannot be converted. The function
+ ;; should return the octet that should be output.
#+unicode
(char-to-octets-error nil))
@@ -1782,7 +1789,9 @@
(format nil "descriptor ~D" fd)))
auto-close
(external-format :default)
- binary-stream-p)
+ binary-stream-p
+ decoding-error
+ encoding-error)
(declare (type index fd) (type (or index null) timeout)
(type (member :none :line :full) buffering))
"Create a stream for the given unix file descriptor.
@@ -1808,14 +1817,16 @@
:pathname pathname
:buffering buffering
:timeout timeout)
- (%make-fd-stream :fd fd
+b (%make-fd-stream :fd fd
:name name
:file file
:original original
:delete-original delete-original
:pathname pathname
:buffering buffering
- :timeout timeout))))
+ :timeout timeout
+ :char-to-octets-error encoding-error
+ :octets-to-char-error decoding-error))))
;; FIXME: setting the external format here should be better
;; integrated into set-routines. We do it before so that
;; set-routines can create an in-buffer if appropriate. But we
@@ -2096,7 +2107,8 @@
(if-exists nil if-exists-given)
(if-does-not-exist nil if-does-not-exist-given)
(external-format :default)
- class)
+ class
+ decoding-error encoding-error)
(declare (type pathname pathname)
(type (member :input :output :io :probe) direction)
(type (member :error :new-version :rename :rename-and-delete
@@ -2121,7 +2133,9 @@
:input-buffer-p t
:auto-close t
:external-format external-format
- :binary-stream-p class))
+ :binary-stream-p class
+ :decoding-error decoding-error
+ :encoding-error encoding-error))
(:probe
(let ((stream (%make-fd-stream :name namestring :fd fd
:pathname pathname
@@ -2145,7 +2159,8 @@
(options options)
(direction direction)
(if-does-not-exist if-does-not-exist)
- (if-exists if-exists))
+ (if-exists if-exists)
+ decoding-error encoding-error)
"Return a stream which reads from or writes to Filename.
Defined keywords:
:direction - one of :input, :output, :io, or :probe
@@ -2155,7 +2170,8 @@
:if-does-not-exist - one of :error, :create or nil
:external-format - an external format name
See the manual for details."
- (declare (ignore element-type external-format input-handle output-handle))
+ (declare (ignore element-type external-format input-handle output-handle
+ decoding-error encoding-error))
;; OPEN signals a file-error if the filename is wild.
(when (wild-pathname-p filename)
Index: src/code/stream.lisp
diff -u src/code/stream.lisp:1.92 src/code/stream.lisp:1.93
--- src/code/stream.lisp:1.92 Tue Apr 20 13:57:45 2010
+++ src/code/stream.lisp Thu Jul 1 22:50:35 2010
@@ -5,7 +5,7 @@
;;; Carnegie Mellon University, and has been placed in the public domain.
;;;
(ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.92 2010-04-20 17:57:45 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.93 2010-07-02 02:50:35 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -781,7 +781,8 @@
:state (fd-stream-oc-state stream)
:string sbuf
:s-start 1
- :external-format (fd-stream-external-format stream))
+ :external-format (fd-stream-external-format stream)
+ :error (fd-stream-octets-to-char-error stream))
(declare (ignore s))
(setf (fd-stream-oc-state stream) new-state)
(setf (lisp-stream-string-buffer-len stream) char-count)
Index: src/pcl/simple-streams/external-formats/utf-8.lisp
diff -u src/pcl/simple-streams/external-formats/utf-8.lisp:1.6 src/pcl/simple-streams/external-formats/utf-8.lisp:1.7
--- src/pcl/simple-streams/external-formats/utf-8.lisp:1.6 Wed Jun 30 00:02:53 2010
+++ src/pcl/simple-streams/external-formats/utf-8.lisp Thu Jul 1 22:50:35 2010
@@ -4,7 +4,7 @@
;;; This code was written by Paul Foley and has been placed in the public
;;; domain.
;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.6 2010-06-30 04:02:53 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-8.lisp,v 1.7 2010-07-02 02:50:35 rtoy Exp $")
(in-package "STREAM")
@@ -17,23 +17,26 @@
()
(octets-to-code (state input unput error c i j n)
`(labels ((utf8 (,c ,i)
- (declare (type (unsigned-byte 8) ,c)
- (type (integer 1 5) ,i))
- (let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c)
- (* 6 ,i))))
- (declare (type (unsigned-byte 31) ,n))
- (dotimes (,j ,i (check ,n ,i))
- (let ((,c ,input))
- ;; Following bytes must all have the form
- ;; #b10xxxxxx. If not, put back the octet we
- ;; just read and return the replacement character
- ;; for the bad sequence.
- (if (< (logxor ,c #x80) #x40)
- (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n)
- (ldb (byte 6 0) ,c))
- (progn
- (,unput 1)
- (return (values +replacement-character-code+ (1+ ,j)))))))))
+ (declare (type (unsigned-byte 8) ,c)
+ (type (integer 1 5) ,i))
+ (let ((,n (ash (ldb (byte (- 6 ,i) 0) ,c)
+ (* 6 ,i))))
+ (declare (type (unsigned-byte 31) ,n))
+ (dotimes (,j ,i (check ,n ,i))
+ (let ((,c ,input))
+ ;; Following bytes must all have the form
+ ;; #b10xxxxxx. If not, put back the octet we
+ ;; just read and return the replacement character
+ ;; for the bad sequence.
+ (if (< (logxor ,c #x80) #x40)
+ (setf (ldb (byte 6 (* 6 (- ,i ,j 1))) ,n)
+ (ldb (byte 6 0) ,c))
+ (progn
+ (,unput 1)
+ (return
+ (if ,error
+ (funcall ,error "Invalid utf8 byte #x~X at offset ~D" ,c (1+ ,j) ,unput)
+ (values +replacement-character-code+ (1+ ,j))))))))))
(check (,n ,i)
(declare (type (unsigned-byte 31) ,n)
(type (integer 1 5) ,i))
@@ -47,17 +50,25 @@
(lisp::surrogatep ,n)) ; surrogate
(progn
(,unput ,i)
- (values +replacement-character-code+ 1))
+ (if ,error
+ (funcall ,error "Overlong utf8 sequence" nil 1 ,unput)
+ (values +replacement-character-code+ 1)))
(values ,n (1+ ,i)))))
(let ((,c ,input))
(declare (optimize (ext:inhibit-warnings 3)))
(cond ((null ,c) (values nil 0))
((< ,c #b10000000) (values ,c 1))
- ((< ,c #b11000010) (values +replacement-character-code+ 1))
+ ((< ,c #b11000010)
+ (if ,error
+ (funcall ,error "Invalid second utf8 octet: #x~X" ,c 1 ,unput)
+ (values +replacement-character-code+ 1)))
((< ,c #b11100000) (utf8 ,c 1))
((< ,c #b11110000) (utf8 ,c 2))
((< ,c #b11111000) (utf8 ,c 3))
- (t (values +replacement-character-code+ 1))))))
+ (t
+ (if ,error
+ (funcall ,error "Invalid fourth utf8 octet: #x~X" ,c 1 ,unput)
+ (values +replacement-character-code+ 1)))))))
(code-to-octets (code state output error i j n p init)
`(flet ((utf8 (,n ,i)
(let* ((,j (- 6 ,i))
More information about the cmucl-commit
mailing list