CMUCL commit: src (14 files)

Raymond Toy rtoy at common-lisp.net
Sun Oct 18 16:21:24 CEST 2009


    Date: Sunday, October 18, 2009 @ 10:21:24
  Author: rtoy
    Path: /project/cmucl/cvsroot/src

Modified: bootfiles/20a/boot-2009-10-01.lisp
          bootfiles/20a/boot-2009-10-1-cross.lisp code/extfmts.lisp
          code/fd-stream-extfmt.lisp code/fd-stream.lisp code/stream.lisp
          code/string.lisp code/struct.lisp code/sysmacs.lisp
          general-info/release-20b.txt
          pcl/simple-streams/external-formats/utf-16-be.lisp
          pcl/simple-streams/external-formats/utf-16-le.lisp
          pcl/simple-streams/external-formats/utf-16.lisp
          pcl/simple-streams/external-formats/utf-32.lisp

Merge changes from unicode-string-buffer-impl-branch which gives
faster reads on external-formats.  This is done by adding an
additional buffer to streams so we can convert the entire in-buffer
into characters all at once.

To build this change, you need to do a cross-compile using
boot-2009-10-1-cross.lisp.  Using that build, do a normal build with
these sources.

For a non-unicode build use boot-2009-10-01.lisp with a 20a
non-unicode build.

code/extfmts.lisp:
o Add another slot to the extfmts for copying the state.
o Modify EF-OCTETS-TO-STRING and OCTETS-TO-STRING to support the
  necesssary changes for fast formats.  This is incompatible with the
  previous version because the string is not grown if needed.

code/fd-stream-extfmt.lisp:
o Set *enable-stream-buffer-p* to T so we have fast streams. 

code/fd-stream.lisp:
o Add new slots to support fast strams.
o In SET-ROUTINES, initialize the new slots appropriately.
o Update UNREAD-CHAR to be able to back up in the string buffer to
  unread. 
o Add implementation to copy the state of an external format.

code/stream.lisp:
o Change %SET-FD-STREAM-EXTERNAL-FORMAT to be able to change formats
  even if we've already converted the buffer with a different format.
  We reconvert the buffer with the old format until we reach the
  current character.  Then the remaining octets are converted using
  the new format and stored in the string buffer.
o Add FAST-READ-CHAR-STRING-REFILL to refill the string buffer, like
  FAST-READ-CHAR-REFILL does for the octet in-buffer.

code/struct.lisp:
o Add new slots to hold the string buffer, the current index, and
  length.  These are needed for the fast formats.

code/sysmacs.lisp:
o Update PREPARE-FOR-FAST-READ-CHAR, DONE-WITH-FAST-READ-CHAR, and
  FAST-READ-CHAR to support the string buffer.

code/string.lisp:
o Microoptimization of SURROGATEP to reduce the number of branchs.

general-info/release-20b.txt:
o Update with these changes

pcl/simple-streams/external-formats/utf-16-be.lisp:
pcl/simple-streams/external-formats/utf-16-le.lisp:
pcl/simple-streams/external-formats/utf-16.lisp:
o These formats actually have state, so update them to take a handle
  an initial state.  These are needed if the string buffer ends with a
  leading surrogate and the next string buffer starts with a trailing
  surrogate.  The conversion needs to combine the surrogates together.


----------------------------------------------------+
 bootfiles/20a/boot-2009-10-01.lisp                 |   22 ++
 bootfiles/20a/boot-2009-10-1-cross.lisp            |    8 
 code/extfmts.lisp                                  |   90 ++++++----
 code/fd-stream-extfmt.lisp                         |    5 
 code/fd-stream.lisp                                |   70 ++++++--
 code/stream.lisp                                   |  156 +++++++++++++++++--
 code/string.lisp                                   |   14 -
 code/struct.lisp                                   |   19 ++
 code/sysmacs.lisp                                  |   50 ++++--
 general-info/release-20b.txt                       |    9 -
 pcl/simple-streams/external-formats/utf-16-be.lisp |   26 ++-
 pcl/simple-streams/external-formats/utf-16-le.lisp |   30 +++
 pcl/simple-streams/external-formats/utf-16.lisp    |   38 +++-
 pcl/simple-streams/external-formats/utf-32.lisp    |    4 
 14 files changed, 428 insertions(+), 113 deletions(-)


Index: src/bootfiles/20a/boot-2009-10-01.lisp
diff -u /dev/null src/bootfiles/20a/boot-2009-10-01.lisp:1.2
--- /dev/null	Sun Oct 18 10:21:24 2009
+++ src/bootfiles/20a/boot-2009-10-01.lisp	Sun Oct 18 10:21:23 2009
@@ -0,0 +1,22 @@
+;; Bootstrap for non-unicode build.  Just updating the +ef-foo+ values.
+
+(in-package "STREAM")
+
+(handler-bind ((error #'(lambda (c)
+                          (declare (ignore c))
+                          (invoke-restart 'kernel::continue))))
+(vm::defenum (:prefix "+EF-" :suffix "+" :start 1)
+  str					; string length
+  cin					; input a character
+  cout					; output a character
+  sin					; input string
+  sout					; output string
+  os					; octets to string
+  so					; string to octets
+  en					; encode
+  de					; decode
+  flush					; flush state
+  copy-state				; copy state
+  max)
+)
+
Index: src/bootfiles/20a/boot-2009-10-1-cross.lisp
diff -u /dev/null src/bootfiles/20a/boot-2009-10-1-cross.lisp:1.2
--- /dev/null	Sun Oct 18 10:21:24 2009
+++ src/bootfiles/20a/boot-2009-10-1-cross.lisp	Sun Oct 18 10:21:23 2009
@@ -0,0 +1,8 @@
+;;; Cross-compile script to add new slots to lisp-stream so we can
+;;; have faster external formats.
+
+;; Nothing special needed; use standard scripts
+
+(load #+x86 "target:tools/cross-scripts/cross-x86-x86"
+      #+sparc "target:tools/cross-scripts/cross-sparc-sparc"
+      #+ppc "target:tools/cross-scripts/cross-ppc-ppc-darwin")
Index: src/code/extfmts.lisp
diff -u src/code/extfmts.lisp:1.19 src/code/extfmts.lisp:1.20
--- src/code/extfmts.lisp:1.19	Fri Oct  2 16:15:04 2009
+++ src/code/extfmts.lisp	Sun Oct 18 10:21:23 2009
@@ -5,7 +5,7 @@
 ;;; domain.
 ;;; 
 (ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.19 2009-10-02 20:15:04 rtoy Exp $")
+ "$Header: /project/cmucl/cvsroot/src/code/extfmts.lisp,v 1.20 2009-10-18 14:21:23 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -33,6 +33,7 @@
   en					; encode
   de					; decode
   flush					; flush state
+  copy-state				; copy state
   max)
 
 ;; Unicode replacement character U+FFFD
@@ -567,18 +568,18 @@
 			    `(+ (ensure-cache ,tmp1 ',id ,reqd) ,idx))))
 	   (funcall (or (aref (ef-cache ,tmp1) ,tmp2)
 			(setf (aref (ef-cache ,tmp1) ,tmp2)
-			    (let ((*compile-print* nil)
-				  ;; Set default format when we compile so we
-				  ;; can see compiler messages.  If we don't,
-				  ;; we run into a problem that we might be
-				  ;; changing the default format while we're
-				  ;; compiling, and we don't know how to output
-				  ;; the compiler messages.
-				  #|(*default-external-format* :iso8859-1)|#)
-			      (compile nil `(lambda (%slots%)
-					      (declare (ignorable %slots%))
-					      (block ,',blknm
-						,,body))))))
+			      (let ((*compile-print* nil)
+				    ;; Set default format when we compile so we
+				    ;; can see compiler messages.  If we don't,
+				    ;; we run into a problem that we might be
+				    ;; changing the default format while we're
+				    ;; compiling, and we don't know how to output
+				    ;; the compiler messages.
+				    #|(*default-external-format* :iso8859-1)|#)
+				(compile nil `(lambda (%slots%)
+					       (declare (ignorable %slots%))
+					       (block ,',blknm
+						 ,,body))))))
 		    (ef-slots ,tmp1))))
        (declaim (inline ,name))
        (defun ,name (,tmp1)
@@ -694,46 +695,59 @@
       (values (if bufferp buffer (lisp::shrink-vector buffer ptr)) ptr))))
 
 (def-ef-macro ef-octets-to-string (extfmt lisp::lisp +ef-max+ +ef-os+)
-  `(lambda (octets ptr end string &aux (pos 0) (count 0) (state nil) (last-octet 0))
-     (declare (optimize (speed 3) #|(safety 0) (space 0) (debug 0)|#)
+  `(lambda (octets ptr end state string s-start s-end &aux (pos s-start) (count 0) (last-octet 0))
+     (declare (optimize (speed 3) (safety 0) #|(space 0) (debug 0)|#)
 	      (type (simple-array (unsigned-byte 8) (*)) octets)
-	      (type kernel:index pos end count last-octet)
+	      (type kernel:index pos end count last-octet s-start s-end)
 	      (type (integer -1 (#.array-dimension-limit)) ptr)
 	      (type simple-string string)
 	      (ignorable state))
      (catch 'end-of-octets
-       (loop
-	  (when (= pos (length string))
-	    (setq string (adjust-array string (* 2 pos))))
-	  (setf (schar string pos)
-		(octets-to-char ,extfmt state count
-				(if (>= ptr end)
-				    (throw 'end-of-octets nil)
-				    (aref octets (incf ptr)))
-				(lambda (n) (decf ptr n))))
+       (loop while (< pos s-end)
+	  do (setf (schar string pos)
+		   (octets-to-char ,extfmt state count
+				   (if (>= ptr end)
+				       (throw 'end-of-octets nil)
+				       (aref octets (incf ptr)))
+				   (lambda (n) (decf ptr n))))
 	  (incf pos)
 	  (incf last-octet count)))
-     (values string pos last-octet)))
+     (values string pos last-octet state)))
 
 (defun octets-to-string (octets &key (start 0) end (external-format :default)
-				     (string nil stringp))
+				     (string nil stringp)
+			             (s-start 0) (s-end nil s-end-p)
+			             (state nil))
   "Octets-to-string converts an array of octets in Octets to a string
   according to the specified External-format.  The array of octets is
   bounded by Start (defaulting ot 0) and End (defaulting to the end of
-  the array.  If String is given, the string is stored there.  If
-  String is too short to hold all of the characters, it will be
-  adjusted (via adjust-array).  If String is not given, a new string
-  is created.  Three values are returned: the string, the number of
-  characters read, and the number of octets consumed."
+  the array.  If String is not given, a new string is created.  If
+  String is given, the converted octets are stored in String, starting
+  at S-Start (defaulting to the 0) and ending at S-End (defaulting to
+  the length of String).  If the string is not large enough to hold
+  all of characters, then some octets will not be converted.  A State
+  may also be specified; this is used as the state of the external
+  format.
+
+  Four values are returned: the string, the number of characters read,
+  the number of octets actually consumed and the new state of the
+  external format."
   (declare (type (simple-array (unsigned-byte 8) (*)) octets)
-	   (type kernel:index start)
+	   (type kernel:index start s-start)
 	   (type (or kernel:index null) end)
 	   (type (or simple-string null) string))
-  (multiple-value-bind (string pos last-octet)
-      (funcall (ef-octets-to-string external-format)
-	       octets (1- start) (1- (or end (length octets)))
-	       (or string (make-string (length octets))))
-    (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet)))
+  (let ((s-end (if s-end-p
+		   s-end
+		   (if stringp
+		       (length string)
+		       (length octets)))))
+    (multiple-value-bind (string pos last-octet new-state)
+	(funcall (ef-octets-to-string external-format)
+		 octets (1- start) (1- (or end (length octets)))
+		 state
+		 (or string (make-string (length octets)))
+		 s-start s-end)
+      (values (if stringp string (lisp::shrink-vector string pos)) pos last-octet new-state))))
 
 
 
Index: src/code/fd-stream-extfmt.lisp
diff -u src/code/fd-stream-extfmt.lisp:1.3 src/code/fd-stream-extfmt.lisp:1.4
--- src/code/fd-stream-extfmt.lisp:1.3	Mon Aug 10 12:47:41 2009
+++ src/code/fd-stream-extfmt.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; domain.
 ;;; 
 (ext:file-comment
- "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.3 2009-08-10 16:47:41 rtoy Rel $")
+ "$Header: /project/cmucl/cvsroot/src/code/fd-stream-extfmt.lisp,v 1.4 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -66,7 +66,10 @@
 (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-cin+)
 (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-cout+)
 (stream::precompile-ef-slot :iso8859-1 #.stream::+ef-sout+)
+(stream::precompile-ef-slot :iso8859-1 #.stream::+ef-os+)
 
 
 
 ;(set-terminal-coding-system :iso8859-1)
+
+(setf lisp::*enable-stream-buffer-p* t)
Index: src/code/fd-stream.lisp
diff -u src/code/fd-stream.lisp:1.91 src/code/fd-stream.lisp:1.92
--- src/code/fd-stream.lisp:1.91	Wed Sep  9 11:51:27 2009
+++ src/code/fd-stream.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.91 2009-09-09 15:51:27 rtoy Rel $")
+  "$Header: /project/cmucl/cvsroot/src/code/fd-stream.lisp,v 1.92 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -41,6 +41,8 @@
   "List of available buffers.  Each buffer is an sap pointing to
   bytes-per-buffer of memory.")
 
+(defvar lisp::*enable-stream-buffer-p* nil)
+
 (defconstant bytes-per-buffer (* 4 1024)
   "Number of bytes per buffer.")
 
@@ -250,7 +252,17 @@
   #+unicode
   (co-state nil)
   #+unicode
-  (last-char-read-size 0 :type index))
+  (last-char-read-size 0 :type index)
+  ;; Saved state needed for (setf stream-external-format) when the
+  ;; fast string-buffer is used.
+  #+unicode
+  (saved-oc-state nil)
+  ;;
+  ;; The number of octets in in-buffer.  Normally equal to
+  ;; in-buffer-length, but could be less if we reached the
+  ;; end-of-file.
+  #+unicode
+  (in-length 0 :type index))
 
 (defun %print-fd-stream (fd-stream stream depth)
   (declare (ignore depth) (stream stream))
@@ -1378,18 +1390,32 @@
 		      ;; since we already have size = 1.
 		      (or (eq 'unsigned-byte (and (consp type) (car type)))
 			  (eq type :default))
-		      ;; Character streams with :iso8859-1
-		      (and (eq type 'character)
-			   #+unicode
-			   (eql :iso8859-1 (fd-stream-external-format stream)))))
+		      (eq type 'character)))
 	    ;; We only create this buffer for streams of type
 	    ;; (unsigned-byte 8) or character streams with an external
 	    ;; format of :iso8859-1.  Because there's no buffer, the
 	    ;; other element-types will dispatch to the appropriate
 	    ;; input (output) routine in fast-read-byte/fast-read-char.
-	    (setf (lisp-stream-in-buffer stream)
-		  (make-array in-buffer-length
-			      :element-type '(unsigned-byte 8)))))
+	    (when *enable-stream-buffer-p*
+	      (setf (lisp-stream-in-buffer stream)
+		    (make-array in-buffer-length
+				:element-type '(unsigned-byte 8)))
+	      #+unicode
+	      (when (and (eq type 'character)
+			 (not (eq :iso8859-1 (fd-stream-external-format stream))))
+		;; For character streams, we create the string-buffer so
+		;; we can convert all available octets at once instead
+		;; of for each character.  The string is one element
+		;; longer than in-buffer-length to leave room for
+		;; unreading.
+		;;
+		;; For ISO8859-1, we don't want this because it's very
+		;; easy and quick to convert octets to iso8859-1.  (See
+		;; FAST-READ-CHAR.)
+		(setf (lisp-stream-string-buffer stream)
+		      (make-string (1+ in-buffer-length)))
+		(setf (lisp-stream-string-buffer-len stream) 0)
+		(setf (lisp-stream-string-index stream) 0)))))
 	(setf input-size size)
 	(setf input-type type)))
 
@@ -1496,10 +1522,15 @@
      #-unicode
      (setf (fd-stream-unread stream) arg1)
      #+unicode
-     (if (zerop (fd-stream-last-char-read-size stream))
-	 (setf (fd-stream-unread stream) arg1)
-	 (decf (fd-stream-ibuf-head stream)
-	       (fd-stream-last-char-read-size stream)))
+     (cond ((lisp-stream-string-buffer stream)
+	    (if (zerop (lisp-stream-string-index stream))
+		(setf (fd-stream-unread stream) arg1)
+		(decf (lisp-stream-string-index stream))))
+	   (t
+	    (if (zerop (fd-stream-last-char-read-size stream))
+		(setf (fd-stream-unread stream) arg1)
+		(decf (fd-stream-ibuf-head stream)
+		      (fd-stream-last-char-read-size stream)))))
      ;; Paul says:
      ;; 
      ;; Not needed for unicode when unreading is implemented by backing up in
@@ -1753,11 +1784,13 @@
     ;;
     ;;#-unicode-bootstrap ; fails in stream-reinit otherwise
     #+(and unicode (not unicode-bootstrap))
-    (setf (stream-external-format stream) external-format)
+    (when lisp::*enable-stream-buffer-p*
+      (%set-fd-stream-external-format stream external-format nil))
     (set-routines stream element-type input output input-buffer-p
 		  :binary-stream-p binary-stream-p)
     #+(and unicode (not unicode-bootstrap))
-    (setf (stream-external-format stream) external-format)
+    (when lisp::*enable-stream-buffer-p*
+      (%set-fd-stream-external-format stream external-format nil))
     (when (and auto-close (fboundp 'finalize))
       (finalize stream
 		#'(lambda ()
@@ -2290,3 +2323,10 @@
     (t (etypecase object
 	 (character 1)
 	 (string (length object))))))
+
+#+unicode
+(stream::def-ef-macro ef-copy-state (extfmt lisp stream::+ef-max+ stream::+ef-copy-state+)
+  ;; Return a copy of the state of an external format.
+  `(lambda (state)
+     (declare (ignorable state))
+     (stream::copy-state ,extfmt state)))
Index: src/code/stream.lisp
diff -u src/code/stream.lisp:1.87 src/code/stream.lisp:1.88
--- src/code/stream.lisp:1.87	Mon Aug 10 12:47:41 2009
+++ src/code/stream.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.87 2009-08-10 16:47:41 rtoy Rel $")
+  "$Header: /project/cmucl/cvsroot/src/code/stream.lisp,v 1.88 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -298,19 +298,59 @@
     ;; fundamental-stream
     :default))
 
-(defun %set-fd-stream-external-format (stream extfmt)
+#+unicode
+(defun %set-fd-stream-external-format (stream extfmt &optional (updatep t))
   (declare (type fd-stream stream))
-  (setf (fd-stream-external-format stream)
-      (stream::ef-name (stream::find-external-format extfmt))
-	(fd-stream-oc-state stream) nil
-	(fd-stream-co-state stream) nil)
-  (when (fd-stream-ibuf-sap stream) ; input stream
-    (setf (fd-stream-in stream) (ef-cin extfmt)))
-  (when (fd-stream-obuf-sap stream) ; output stream
-    (setf (fd-stream-out stream) (ef-cout extfmt)
-	  ;;@@ (fd-stream-sout stream) (ef-sout extfmt)
-	  ))
-  extfmt)
+  (let ((old-format (fd-stream-external-format stream)))
+    (setf (fd-stream-external-format stream)
+	  (stream::ef-name (stream::find-external-format extfmt))
+	  (fd-stream-oc-state stream) nil
+	  (fd-stream-co-state stream) nil)
+    (when (fd-stream-ibuf-sap stream)	; input stream
+      (setf (fd-stream-in stream) (ef-cin extfmt)))
+    (when (fd-stream-obuf-sap stream)	; output stream
+      (setf (fd-stream-out stream) (ef-cout extfmt)
+	    ;;@@ (fd-stream-sout stream) (ef-sout extfmt)
+	    ))
+    (when (and lisp::*enable-stream-buffer-p* updatep
+	       (lisp-stream-string-buffer stream))
+      ;; We want to reconvert any octets that haven't been converted
+      ;; yet.  So, we need to figure out which octet to start with.
+      ;; This is done by converting (the previously converted) octets
+      ;; until we've converted the right number of characters.
+      (let ((ibuf (lisp-stream-in-buffer stream))
+	    (sindex (1- (lisp-stream-string-index stream)))
+	    (index 0)
+	    (state (fd-stream-saved-oc-state stream)))
+	;; Reconvert all the octets we've already converted and read.
+	;; We don't know how many octets that is, but do know how many
+	;; characters there are.
+	(multiple-value-bind (s pos count new-state)
+	    (octets-to-string ibuf
+			      :start 0
+			      :external-format old-format
+			      :string (make-string sindex)
+			      :state state)
+	  (declare (ignore s pos))
+	  (setf state new-state)
+	  (setf index count))
+	
+	;; We now know the last octet that was used.  Now convert the
+	;; rest of the octets using the new format.
+	(multiple-value-bind (s pos count new-state)
+	    (octets-to-string ibuf
+			      :start index
+			      :end (fd-stream-in-length stream)
+			      :external-format (fd-stream-external-format stream)
+			      :string (lisp-stream-string-buffer stream)
+			      :s-start 1
+			      :state state)
+	  (declare (ignore s))
+	  (setf (lisp-stream-string-index stream) 1)
+	  (setf (lisp-stream-string-buffer-len stream) pos)
+	  (setf (lisp-stream-in-index stream) (+ index count))
+	  (setf (fd-stream-oc-state stream) new-state))))
+    extfmt))
 
 ;; This is only used while building; it's reimplemented in
 ;; fd-stream-extfmt.lisp
@@ -459,6 +499,7 @@
       ;; simple-stream
       (stream::%unread-char stream character)
       ;; lisp-stream
+      #-unicode
       (let ((index (1- (lisp-stream-in-index stream)))
 	    (buffer (lisp-stream-in-buffer stream)))
 	(declare (fixnum index))
@@ -469,6 +510,25 @@
 	      (t
 	       (funcall (lisp-stream-misc stream) stream 
 			:unread character))))
+      #+unicode
+      (let ((sbuf (lisp-stream-string-buffer stream))
+	    (ibuf (lisp-stream-in-buffer stream)))
+	(cond (sbuf
+	       (let ((index (1- (lisp-stream-string-index stream))))
+		 (when (minusp index)
+		   (error "Nothing to unread."))
+		 (setf (aref sbuf index) character)
+		 (setf (lisp-stream-string-index stream) index)))
+	      (ibuf
+	       (let ((index (1- (lisp-stream-in-index stream))))
+		 (when (minusp index)
+		   (error "Nothing to unread."))
+		 ;; This only works for iso8859-1!
+		 (setf (aref ibuf index) (char-code character))
+		 (setf (lisp-stream-in-index stream) index)))
+	      (t
+	       (funcall (lisp-stream-misc stream) stream 
+			:unread character))))
       ;; fundamental-stream
       (stream-unread-char stream character)))
   nil)
@@ -703,6 +763,76 @@
 	   (setf (lisp-stream-in-index stream) (1+ start))
 	   (code-char (aref ibuf start))))))
 
+#+unicode
+(defun fast-read-char-string-refill (stream eof-errorp eof-value)
+  ;; Like fast-read-char-refill, but we don't need or want the
+  ;; in-buffer-extra.
+  (let* ((ibuf (lisp-stream-in-buffer stream))
+	 (index (lisp-stream-in-index stream)))
+    (declare (type (integer 0 #.in-buffer-length) index))
+
+    ;; Copy the stuff we haven't read from in-buffer to the beginning
+    ;; of the buffer.
+    (replace ibuf ibuf
+	     :start1 0
+	     :start2 index :end2 in-buffer-length)
+    
+    (let ((count (funcall (lisp-stream-n-bin stream) stream
+			  ibuf (- in-buffer-length index)
+			  index
+			  nil)))
+      (declare (type (integer 0 #.in-buffer-length) count))
+
+      (cond ((zerop count)
+	     ;; Nothing left in the stream, so update our pointers to
+	     ;; indicate we've read everything and call the stream-in
+	     ;; function so that we do the right thing for eof.
+	     (setf (lisp-stream-in-index stream) in-buffer-length)
+	     (setf (lisp-stream-string-index stream)
+		   (lisp-stream-string-buffer-len stream))
+	     (funcall (lisp-stream-in stream) stream eof-errorp eof-value))
+	    (t
+	     (let ((sbuf (lisp-stream-string-buffer stream))
+		   (slen (lisp-stream-string-buffer-len stream)))
+	       (declare (simple-string sbuf)
+			(type (integer 0 #.(1+ in-buffer-length)) slen)
+			(optimize (speed 3)))
+
+	       ;; Update in-length and saved-oc-state.  These are
+	       ;; needed if we change the external-format of the
+	       ;; stream because we need to know how many octets are
+	       ;; valid (in case end-of-file was reached), and what
+	       ;; the state was when originally converting the octets
+	       ;; to characters.
+	       (setf (fd-stream-in-length stream) (+ count (- in-buffer-length index)))
+	       (let ((state (fd-stream-oc-state stream)))
+		 (setf (fd-stream-saved-oc-state stream)
+		       (cons (car state)
+			     (funcall (ef-copy-state (fd-stream-external-format stream))
+				      (cdr state)))))
+
+	       ;; Copy the last read character to the beginning of the
+	       ;; buffer to support unreading.
+	       (when (plusp slen)
+		 (setf (schar sbuf 0) (schar sbuf (1- slen))))
+
+
+	       ;; Convert all the octets, including the ones that we
+	       ;; haven't processed yet and the ones we just read in.
+	       (multiple-value-bind (s char-count octet-count new-state)
+		   (octets-to-string ibuf
+				     :start 0
+				     :end (+ count (- in-buffer-length index))
+				     :state (fd-stream-oc-state stream)
+				     :string sbuf
+				     :s-start 1
+				     :external-format (fd-stream-external-format stream))
+		 (declare (ignore s))
+		 (setf (fd-stream-oc-state stream) new-state)
+		 (setf (lisp-stream-string-buffer-len stream) char-count)
+		 (setf (lisp-stream-string-index stream) 2)
+		 (setf (lisp-stream-in-index stream) octet-count)
+		 (schar sbuf 1))))))))
 
 ;;; FAST-READ-BYTE-REFILL  --  Interface
 ;;;
Index: src/code/string.lisp
diff -u src/code/string.lisp:1.20 src/code/string.lisp:1.21
--- src/code/string.lisp:1.20	Tue Sep 15 11:52:43 2009
+++ src/code/string.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/string.lisp,v 1.20 2009-09-15 15:52:43 rtoy Rel $")
+  "$Header: /project/cmucl/cvsroot/src/code/string.lisp,v 1.21 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -44,14 +44,14 @@
 		  char-or-code)))
     (ecase surrogate-type
       ((:high :leading)
-       ;; Test for high surrogate
-       (<= #xD800 code #xDBFF))
+       ;; Test for high surrogate (#xD800 to #xDBFF)
+       (= #b110110 (ash code -10)))
       ((:low :trailing)
-       ;; Test for low surrogate
-       (<= #xDC00 code #xDFFF))
+       ;; Test for low surrogate (#xDC00 to #xDFFF)
+       (= #b110111 (ash code -10)))
       ((:any nil)
-       ;; Test for any surrogate
-       (<= #xD800 code #xDFFF)))))
+       ;; Test for any surrogate (#xD800 to #xDFFF)
+       (= #b11011 (ash code -11))))))
 
 (defun surrogates-to-codepoint (hi-surrogate-char lo-surrogate-char)
   "Convert the given Hi and Lo surrogate characters to the
Index: src/code/struct.lisp
diff -u src/code/struct.lisp:1.21 src/code/struct.lisp:1.22
--- src/code/struct.lisp:1.21	Mon Feb  4 12:22:15 2002
+++ src/code/struct.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/struct.lisp,v 1.21 2002-02-04 17:22:15 toy Rel $")
+  "$Header: /project/cmucl/cvsroot/src/code/struct.lisp,v 1.22 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -37,7 +37,22 @@
   (out #'ill-out :type function)		; Write-Char function
   (bout #'ill-bout :type function)		; Byte output function
   (sout #'ill-out :type function)		; String output function
-  (misc #'do-nothing :type function))		; Less used methods
+  (misc #'do-nothing :type function)		; Less used methods
+  ;;
+  ;; A string to hold characters that have been converted from
+  ;; in-buffer.
+  #+unicode
+  (string-buffer nil :type (or null simple-string))
+  ;;
+  ;; Index into string-buffer where the next character should be read from
+  #+unicode
+  (string-index 0 :type index)
+  ;;
+  ;; Number of characters in string-buffer.  (This isn't the length of
+  ;; string-buffer, but the number of characters in the buffer, since
+  ;; many octets may be consumed to produce one character.)
+  #+unicode
+  (string-buffer-len 0 :type index))
 
 (declaim (inline streamp))
 (defun streamp (stream)
Index: src/code/sysmacs.lisp
diff -u src/code/sysmacs.lisp:1.30 src/code/sysmacs.lisp:1.31
--- src/code/sysmacs.lisp:1.30	Fri Feb  3 08:51:28 2006
+++ src/code/sysmacs.lisp	Sun Oct 18 10:21:24 2009
@@ -5,7 +5,7 @@
 ;;; Carnegie Mellon University, and has been placed in the public domain.
 ;;;
 (ext:file-comment
-  "$Header: /project/cmucl/cvsroot/src/code/sysmacs.lisp,v 1.30 2006-02-03 13:51:28 rtoy Rel $")
+  "$Header: /project/cmucl/cvsroot/src/code/sysmacs.lisp,v 1.31 2009-10-18 14:21:24 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -150,8 +150,17 @@
   `(let* ((%frc-stream% ,stream)
 	  (%frc-method% (lisp-stream-in %frc-stream%))
 	  (%frc-buffer% (lisp-stream-in-buffer %frc-stream%))
-	  (%frc-index% (lisp-stream-in-index %frc-stream%)))
-     (declare (type index %frc-index%)
+	  (%frc-index% (lisp-stream-in-index %frc-stream%))
+	  #+unicode
+	  (%frc-string-buffer% (lisp-stream-string-buffer %frc-stream%))
+	  #+unicode
+	  (%frc-string-index% (lisp-stream-string-index %frc-stream%))
+	  #+unicode
+	  (%frc-string-length% (lisp-stream-string-buffer-len %frc-stream%)))
+     (declare #+unicode
+	      (type index %frc-string-index% %frc-string-length%)
+	      #+unicode
+	      (type (or null simple-string) %frc-string-buffer%)
 	      (type lisp-stream %frc-stream%))
      , at forms))
 
@@ -161,7 +170,10 @@
 ;;; inside it's scope to decache the lisp-stream-in-index.
 ;;;
 (defmacro done-with-fast-read-char ()
-  `(setf (lisp-stream-in-index %frc-stream%) %frc-index%))
+  `(progn
+     (setf (lisp-stream-in-index %frc-stream%) %frc-index%)
+     #+unicode
+     (setf (lisp-stream-string-index %frc-stream%) %frc-string-index%)))
 
 ;;; Fast-Read-Char  --  Internal
 ;;;
@@ -170,14 +182,25 @@
 ;;;
 (defmacro fast-read-char (&optional (eof-errorp t) (eof-value ()))
   `(cond
-    ((not %frc-buffer%)
-     (funcall %frc-method% %frc-stream% ,eof-errorp ,eof-value))
-    ((= %frc-index% in-buffer-length)
-     (prog1 (fast-read-char-refill %frc-stream% ,eof-errorp ,eof-value)
-	    (setq %frc-index% (lisp-stream-in-index %frc-stream%))))
-    (t
-     (prog1 (code-char (aref %frc-buffer% %frc-index%))
-	    (incf %frc-index%)))))
+     #+unicode
+     (%frc-string-buffer%
+      (cond ((>= %frc-string-index% %frc-string-length%)
+	     (prog1 (fast-read-char-string-refill %frc-stream% ,eof-errorp ,eof-value)
+	       (setf %frc-string-index% (lisp-stream-string-index %frc-stream%))
+	       (setf %frc-string-length% (lisp-stream-string-buffer-len %frc-stream%))))
+	    (t
+	     (prog1 (aref %frc-string-buffer% %frc-string-index%)
+	       (incf %frc-string-index%)))))
+     (%frc-buffer%
+      (cond ((= %frc-index% in-buffer-length)
+	     (prog1 (fast-read-char-refill %frc-stream% ,eof-errorp ,eof-value)
+	       (setq %frc-index% (lisp-stream-in-index %frc-stream%))))
+	    (t
+	     ;; This only works correctly for :iso8859-1!
+	     (prog1 (code-char (aref %frc-buffer% %frc-index%))
+	       (incf %frc-index%)))))
+     (t
+      (funcall %frc-method% %frc-stream% ,eof-errorp ,eof-value))))
 
 ;;;; And these for the fasloader...
 
@@ -215,4 +238,5 @@
 	(incf %frc-index%))))))
 ;;;
 (defmacro done-with-fast-read-byte ()
-  `(done-with-fast-read-char))
+  `(progn
+     (setf (lisp-stream-in-index %frc-stream%) %frc-index%)))
Index: src/general-info/release-20b.txt
diff -u src/general-info/release-20b.txt:1.1 src/general-info/release-20b.txt:1.2
--- src/general-info/release-20b.txt:1.1	Tue Sep 15 12:51:38 2009
+++ src/general-info/release-20b.txt	Sun Oct 18 10:21:24 2009
@@ -21,12 +21,19 @@
   * Known issues:
 
   * Feature enhancements:
+    - Read operations using an external format of utf8 are now much
+      faster.  Some simple tests on Linux shows that read-line now
+      only takes 40% as much time.  Sparc shows 60% as much time.
 
   * ANSI compliance fixes:
 
   * Bugfixes:
-
+    - On Unicode builds, printing of '|\|| and '|`| was incorrect
+      because no escapes were printed.  This is fixed now.
   * Trac Tickets:
+    #33: get-dispatch-macro-character doesn't signal errors in
+      compiled code
+      Fixed.
 
   * Other changes:
 
Index: src/pcl/simple-streams/external-formats/utf-16-be.lisp
diff -u src/pcl/simple-streams/external-formats/utf-16-be.lisp:1.3 src/pcl/simple-streams/external-formats/utf-16-be.lisp:1.4
--- src/pcl/simple-streams/external-formats/utf-16-be.lisp:1.3	Wed Sep 30 12:12:41 2009
+++ src/pcl/simple-streams/external-formats/utf-16-be.lisp	Sun Oct 18 10:21:24 2009
@@ -4,7 +4,7 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16-be.lisp,v 1.3 2009-09-30 16:12:41 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16-be.lisp,v 1.4 2009-10-18 14:21:24 rtoy Exp $")
 
 (in-package "STREAM")
 
@@ -19,11 +19,23 @@
 	    (,code (+ (* 256 ,c1) ,c2)))
        (declare (type lisp:codepoint ,code))
        (cond ((lisp::surrogatep ,code :low)
-	      (setf ,code +replacement-character-code+))
+	      ;; Got low surrogate.  Combine with the state (high
+	      ;; surrogate), if we have it.  Otherwise, use the
+	      ;; replacement code.
+	      (if ,state
+		  (setf ,code (+ (ash (- (the (integer #xd800 #xdbff) ,state) #xD800) 10)
+				 ,code #x2400)
+			,state nil)
+		  (setf ,code +replacement-character-code+)))
 	     ((lisp::surrogatep ,code :high)
+	      ;; Remember the high surrogate in case we bail out
+	      ;; reading the low surrogate (for octets-to-string.)
+	      (setf ,state ,code)
 	      (let* ((,c1 ,input)
 		     (,c2 ,input)
 		     (,next (+ (* 256 ,c1) ,c2)))
+		;; We read the trailing code, so clear the state.
+		(setf ,state nil)
 		;; If we don't have a high and low surrogate,
 		;; replace with REPLACEMENT CHARACTER.  Possibly
 		;; unput 2 so it'll be read as another character
@@ -33,7 +45,8 @@
 		    (setf ,code +replacement-character-code+))))
 	     ((= ,code #xFFFE)
 	      ;; Replace with REPLACEMENT CHARACTER.  
-	      (setf ,code +replacement-character-code+)))
+	      (setf ,code +replacement-character-code+))
+	     (t (setf ,state nil)))
        (values ,code 2)))
   (code-to-octets (code state output c c1 c2)
     `(flet ((output (code)
@@ -48,4 +61,9 @@
 		(output (logior ,c1 #xD800))
 		(output (logior ,c2 #xDC00))))
 	     (t
-	      (output +replacement-character-code+))))))
+	      (output +replacement-character-code+)))))
+  nil
+  (copy-state (state)
+    ;; The state is either NIL or a codepoint, so nothing really
+    ;; special is needed to copy it.
+    `(progn ,state)))
Index: src/pcl/simple-streams/external-formats/utf-16-le.lisp
diff -u src/pcl/simple-streams/external-formats/utf-16-le.lisp:1.3 src/pcl/simple-streams/external-formats/utf-16-le.lisp:1.4
--- src/pcl/simple-streams/external-formats/utf-16-le.lisp:1.3	Wed Sep 30 12:12:41 2009
+++ src/pcl/simple-streams/external-formats/utf-16-le.lisp	Sun Oct 18 10:21:24 2009
@@ -4,11 +4,13 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16-le.lisp,v 1.3 2009-09-30 16:12:41 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16-le.lisp,v 1.4 2009-10-18 14:21:24 rtoy Exp $")
 
 (in-package "STREAM")
 
 ;; UTF-16LE.  BOM is not recognized, and is never output.
+;;
+;; The state is either NIL or a codepoint.
 (define-external-format :utf-16-le (:size 2)
   ()
 
@@ -18,12 +20,24 @@
 	    (,code (+ (* 256 ,c2) ,c1)))
        (declare (type lisp:codepoint ,code))
        (cond ((lisp::surrogatep ,code :low)
-	      ;; Replace with REPLACEMENT CHARACTER.
-	      (setf ,code +replacement-character-code+))
+	      ;; If possible combine this low surrogate with the
+	      ;; high surrogate in the state.  Otherwise, we have
+	      ;; a bare low surrogate so return the replacement
+	      ;; character.
+	      (if ,state
+		  (setf ,code (+ (ash (- (the (integer #xd800 #xdbff) ,state) #xD800) 10)
+				 ,code #x2400)
+			,state nil)
+		  (setf ,code +replacement-character-code+)))
 	     ((lisp::surrogatep ,code :high)
+	      ;; Remember the high surrogate in case we bail out
+	      ;; reading the low surrogate (for octets-to-string.)
+	      (setf ,state ,code)
 	      (let* ((,c1 ,input)
 		     (,c2 ,input)
 		     (,next (+ (* 256 ,c2) ,c1)))
+		;; We read the trailing code, so clear the state.
+		(setf ,state nil)
 		;; Replace with REPLACEMENT CHARACTER.  Possibly
 		;; unput 2 so it'll be read as another character
 		;; next time around?
@@ -32,7 +46,8 @@
 		    (setq ,code +replacement-character-code+))))
 	     ((= ,code #xFFFE)
 	      ;; replace with REPLACEMENT CHARACTER ?
-	      (error "Illegal character U+FFFE in UTF-16 sequence.")))
+	      (error "Illegal character U+FFFE in UTF-16 sequence."))
+	     (t (setf ,state nil)))
       (values ,code 2)))
   (code-to-octets (code state output c c1 c2)
     `(flet ((output (code)
@@ -47,4 +62,9 @@
 		(output (logior ,c1 #xD800))
 		(output (logior ,c2 #xDC00))))
 	     (t
-	      (output +replacement-character-code+))))))
+	      (output +replacement-character-code+)))))
+  nil
+  (copy-state (state)
+    ;; The state is either NIL or a codepoint, so nothing really
+    ;; special is needed.
+    `(progn ,state)))
Index: src/pcl/simple-streams/external-formats/utf-16.lisp
diff -u src/pcl/simple-streams/external-formats/utf-16.lisp:1.4 src/pcl/simple-streams/external-formats/utf-16.lisp:1.5
--- src/pcl/simple-streams/external-formats/utf-16.lisp:1.4	Wed Sep 30 12:12:41 2009
+++ src/pcl/simple-streams/external-formats/utf-16.lisp	Sun Oct 18 10:21:24 2009
@@ -1,7 +1,7 @@
 ;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Package: STREAM -*-
 ;;;
 ;;; **********************************************************************
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16.lisp,v 1.4 2009-09-30 16:12:41 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-16.lisp,v 1.5 2009-10-18 14:21:24 rtoy Exp $")
 
 (in-package "STREAM")
 
@@ -11,15 +11,14 @@
 ;; UTF-16BE, the input stream shouldn't have a BOM.  But we allow for
 ;; one here, anyway.  This should be compatible with the Unicode spec.
 
-;; make state an integer:
-;;  or (or state 0) to cope with NIL case
+;; The state is a cons.  The car is an integer:
 ;;  0 = initial state, nothing has been read yet
 ;;  1 = BOM has been read, little-endian
 ;;  2 = BOM has been read, big-endian, or non-BOM char has been read
 ;;
-;; (oddp state) = little-endian
-;; (evenp state) = big-endian
-;; (zerop state) = #xFEFF/#xFFFE is BOM (to be skipped)
+;; The cdr is either NIL or a codepoint which is used for converting
+;; surrogate pairs into codepoints.  If the cdr is non-NIL, then it is
+;; the leading (high) surrogate of a surrogate pair.
 ;;
 ;; When writing, never output a BOM.
 
@@ -28,10 +27,10 @@
 
   (octets-to-code (state input unput c1 c2 code wd next st)
     `(block nil
-       (when (null ,state) (setf ,state 0))
+       (when (null ,state) (setf ,state (cons 0 nil)))
        (tagbody
 	:again
-	  (let* ((,st ,state)
+	  (let* ((,st (car ,state))
 		 (,c1 ,input)
 		 (,c2 ,input)
 		 (,code (if (oddp ,st)
@@ -47,14 +46,29 @@
 	    ;; indicates that BOM has been seen, so that would result in
 	    ;; the BOM being reread as a character
 	    (cond ((lisp::surrogatep ,code :low)
-		   ;; replace with REPLACEMENT CHARACTER ?
-		   (setf ,code +replacement-character-code+))
+		   ;; If possible combine this low surrogate with the
+		   ;; high surrogate in the state.  Otherwise, we have
+		   ;; a bare low surrogate so return the replacement
+		   ;; character.
+		   (if (cdr ,state)
+		       (setf ,code (+ (ash (- (the (integer #xd800 #xdbff) (cdr ,state)) #xD800)
+					   10)
+				      ,code #x2400)
+			     ,state nil)
+		       (setf ,code +replacement-character-code+)))
 		  ((lisp::surrogatep ,code :high)
+		   ;; Save the high (leading) code in the state, in
+		   ;; case we fail to read the low (trailing)
+		   ;; surrogate.  (This should only happen when we're
+		   ;; doing octets-to-string.)
+		   (setf (cdr ,state) ,code)
 		   (let* ((,c1 ,input)
 			  (,c2 ,input)
 			  (,next (if (oddp ,st)
 				     (+ (* 256 ,c2) ,c1)
 				     (+ (* 256 ,c1) ,c2))))
+		     ;; We read the trailing surrogate, so clear the state.
+		     (setf (cdr ,state) nil)
 		     ;; If we don't have a high and low surrogate,
 		     ;; replace with REPLACEMENT CHARACTER.  Possibly
 		     ;; unput 2 so it'll be read as another character
@@ -91,5 +105,5 @@
 	      (output +replacement-character-code+)))))
   nil
   (copy-state (state)
-    ;; The state is either NIL or T, so we can just return that.
-    state))
+    ;; The state is list. Copy it
+    `(copy-list ,state)))
Index: src/pcl/simple-streams/external-formats/utf-32.lisp
diff -u src/pcl/simple-streams/external-formats/utf-32.lisp:1.4 src/pcl/simple-streams/external-formats/utf-32.lisp:1.5
--- src/pcl/simple-streams/external-formats/utf-32.lisp:1.4	Mon Sep 28 14:12:59 2009
+++ src/pcl/simple-streams/external-formats/utf-32.lisp	Sun Oct 18 10:21:24 2009
@@ -4,7 +4,7 @@
 ;;; This code was written by Raymond Toy and has been placed in the public
 ;;; domain.
 ;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-32.lisp,v 1.4 2009-09-28 18:12:59 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/pcl/simple-streams/external-formats/utf-32.lisp,v 1.5 2009-10-18 14:21:24 rtoy Exp $")
 
 (in-package "STREAM")
 
@@ -83,4 +83,4 @@
   nil
   (copy-state (state)
     ;; The state is either NIL or T, so we can just return that.
-    state))
+    `(progn ,state)))



More information about the cmucl-commit mailing list