CMUCL commit: src (code/unidata.lisp tools/build-unidata.lisp)

Sat Sep 18 22:47:51 CEST 2010

Date: Saturday, September 18, 2010 @ 16:47:51
  Author: rtoy
    Path: /project/cmucl/cvsroot/src

Modified: code/unidata.lisp tools/build-unidata.lisp

code/unidata.lisp:
o Just add some comments on why we don't put the dictionaries in
  unidata.bin.
o Print out some messages when building the hangul and cjk
  dictionaries so the user knows what's happening.

tools/build-unidata.lisp:
o Add some comments on the various parts of unidata.bin.


--------------------------+
 code/unidata.lisp        |   25 +++++++++++++++++++++----
 tools/build-unidata.lisp |   41 +++++++++++++++++++++++------------------
 2 files changed, 44 insertions(+), 22 deletions(-)


Index: src/code/unidata.lisp
diff -u src/code/unidata.lisp:1.14 src/code/unidata.lisp:1.15

--- src/code/unidata.lisp:1.14	Fri Sep 17 19:29:01 2010
+++ src/code/unidata.lisp	Sat Sep 18 16:47:51 2010
@@ -4,7 +4,7 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;; 
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.14 2010-09-17 23:29:01 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.15 2010-09-18 20:47:51 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -18,7 +18,7 @@
 
 (defconstant +unidata-path+ #p"ext-formats:unidata.bin")
 
-(defvar *unidata-version* "$Revision: 1.14 $")
+(defvar *unidata-version* "$Revision: 1.15 $")
 
 (defstruct unidata
   range
@@ -1116,6 +1116,17 @@
 ;; Code written by Paul Foley, with some modifications by Raymond Toy.
 ;;
 
+;; These hold dictionaries for the Hangul syllables and the CJK
+;; unified ideographs.  Note that these could be stored in
+;; unidata.bin, but that adds almost a megabyte to the size of
+;; unidata.bin.  That seems way to much bloat for something that is
+;; probably not used that much.  However, this incurs a runtime cost
+;; the first time it needs to be accessed.  On a 450 MHz sparc, it
+;; takes 55 sec for the cjk dictionary and 9 sec for the Hangul
+;; dictionary.  A bit long but not too bad.  On a 2 GHz mac mini, it
+;; takes 5 sec and .8 sec, respectively.  This seems reasonable,
+;; especially since the intent is for character completion, which
+;; doesn't have to be too fast.
 (defvar *hangul-syllable-dictionary* nil
   "Dictionary of Hangul syllables")
 (defvar *cjk-unified-ideograph-dictionary* nil
@@ -1322,6 +1333,7 @@
 
 (defun build-hangul-syllable-dictionary ()
   "Build the dictionary for Hangul syllables"
+  (format t "~&Building Hangul Syllable dictionary.  Please wait...~%")
   (initialize-reverse-hangul-tables)
   (let ((hangul-codebook
 	 (map 'vector #'car
@@ -1344,10 +1356,13 @@
 			 names)
 		   (incf k))))
     (setf *hangul-syllable-dictionary*
-	  (build-dictionary hangul-codebook (nreverse names)))))
+	  (build-dictionary hangul-codebook (nreverse names)))
+    (format t "~&Done.~%")
+    (values)))
 
 (defun build-cjk-unified-ideograph-dictionary ()
   "Build the dictionary for CJK Unified Ideographs"
+  (format t "~&Building CJK Unified Ideographs dictionary.  Please wait...~%")
   (let ((codebook (coerce (loop for k from 0 to 15
 			     collect (format nil "~X" k))
 			  'vector))
@@ -1356,7 +1371,9 @@
 		  collect (cons (format nil "~X" codepoint)
 				codepoint))))
     (setf *cjk-unified-ideograph-dictionary*
-	  (build-dictionary codebook names))))
+	  (build-dictionary codebook names))
+    (format t "~&Done.~%")
+    (values)))
 
 ;; The definitions of BUILD-DICTIONARY, NAME-LOOKUP, and ENCODE-NAME
 ;; were taken from build-unidata.lisp.
Index: src/tools/build-unidata.lisp
diff -u src/tools/build-unidata.lisp:1.5 src/tools/build-unidata.lisp:1.6
--- src/tools/build-unidata.lisp:1.5	Wed Sep 15 17:06:39 2010
+++ src/tools/build-unidata.lisp	Sat Sep 18 16:47:51 2010
@@ -4,7 +4,7 @@
 ;;; This code was written by Paul Foley and has been placed in the public
 ;;; domain.
 ;;; 
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.5 2010-09-15 21:06:39 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.6 2010-09-18 20:47:51 rtoy Exp $")
 ;;;
 ;;; **********************************************************************
 ;;;
@@ -466,6 +466,8 @@
 		 (error "Index array too short for the data being written")))))
     (with-open-file (stm path :direction :io :if-exists :rename-and-delete
 			 :element-type '(unsigned-byte 8))
+      ;; The length of the index array is the number of sections to be
+      ;; written.  See below for each section.
       (let ((index (make-array 19 :fill-pointer 0)))
 	;; File header
 	(write32 +unicode-magic-number+ stm)	; identification "magic"
@@ -478,12 +480,12 @@
 	(dotimes (i (array-dimension index 0))
 	  (write32 0 stm))		; space for indices
 	(write32 0 stm)			; end marker
-	;; Range data
+	;; 0. Range data
 	(let ((data (unidata-range *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write32 (length (range-codes data)) stm)
 	  (write-vector (range-codes data) stm :endian-swap :network-order))
-	;; Character name data
+	;; 1. Character name data
 	(let ((data (unidata-name+ *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-byte (1- (length (dictionary-cdbk data))) stm)
@@ -499,41 +501,41 @@
 	  (write-vector (dictionary-codev data) stm :endian-swap :network-order)
 	  (write-vector (dictionary-nextv data) stm :endian-swap :network-order)
 	  (write-vector (dictionary-namev data) stm :endian-swap :network-order))
-	;; Codepoint-to-name mapping
+	;; 2. Codepoint-to-name mapping
 	(let ((data (unidata-name *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm))
-	;; Codepoint-to-category table
+	;; 3. Codepoint-to-category table
 	(let ((data (unidata-category *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie8 data stm))
-	;; Simple case mapping table
+	;; 4. Simple case mapping table
 	(let ((data (unidata-scase *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm)
 	  (write-byte (length (scase-svec data)) stm)
 	  (write-vector (scase-svec data) stm :endian-swap :network-order))
-	;; Numeric data
+	;; 5. Numeric data
 	(let ((data (unidata-numeric *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm))
-	;; Decomposition data
+	;; 6. Decomposition data
 	(let ((data (unidata-decomp *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm)
 	  (write16 (length (decomp-tabl data)) stm)
 	  (write-vector (decomp-tabl data) stm :endian-swap :network-order))
-	;; Combining classes
+	;; 7. Combining classes
 	(let ((data (unidata-combining *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie8 data stm))
-	;; Bidi data
+	;; 8. Bidi data
 	(let ((data (unidata-bidi *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie16 data stm)
 	  (write-byte (length (bidi-tabl data)) stm)
 	  (write-vector (bidi-tabl data) stm :endian-swap :network-order))
-	;; Unicode 1.0 names
+	;; 9. Unicode 1.0 names
 	(let ((data (unidata-name1+ *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-byte (1- (length (dictionary-cdbk data))) stm)
@@ -549,10 +551,11 @@
 	  (write-vector (dictionary-codev data) stm :endian-swap :network-order)
 	  (write-vector (dictionary-nextv data) stm :endian-swap :network-order)
 	  (write-vector (dictionary-namev data) stm :endian-swap :network-order))
+	;; 10. Codepoint to unicode-1.0 name
 	(let ((data (unidata-name1 *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm))
-	;; Normalization quick-check data
+	;; 11. Normalization quick-check data
 	(update-index (file-position stm) index)
 	(let ((data (unidata-qc-nfd *unicode-data*)))
 	  (write-ntrie1 data stm))
@@ -562,35 +565,37 @@
 	  (write-ntrie2 data stm))
 	(let ((data (unidata-qc-nfkc *unicode-data*)))
 	  (write-ntrie2 data stm))
-	;; Write composition exclusion table
+	;; 12. Write composition exclusion table
 	(let ((data (unidata-comp-exclusions *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write16 (length data) stm)
 	  (write-vector data stm :endian-swap :network-order))
-	;; Write full-case lower data
 	(flet ((dump-full-case (data)
 		 (update-index (file-position stm) index)
 		 (write-ntrie32 data stm)
 		 (write16 (length (full-case-tabl data)) stm)
 		 (write-vector (full-case-tabl data) stm :endian-swap :network-order)))
+	  ;; 13. Write full-case lower data
 	  (dump-full-case (unidata-full-case-lower *unicode-data*))
+	  ;; 14. Write full-case title data
 	  (dump-full-case (unidata-full-case-title *unicode-data*))
+	  ;; 15. Write full-case upper data
 	  (dump-full-case (unidata-full-case-upper *unicode-data*)))
-	;; Write case folding data
+	;; 16. Write case-folding simple data
 	(let ((data (unidata-case-fold-simple *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm))
-	;; case-folding full
+	;; 17. Write case-folding full data
 	(let ((data (unidata-case-fold-full *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie32 data stm)
 	  (write16 (length (case-folding-tabl data)) stm)
 	  (write-vector (case-folding-tabl data) stm :endian-swap :network-order))
-	;; Word-break
+	;; 18. Word-break
 	(let ((data (unidata-word-break *unicode-data*)))
 	  (update-index (file-position stm) index)
 	  (write-ntrie4 data stm))
-	;; Patch up index
+	;; All components saved. Patch up index table now.
 	(file-position stm 8)
 	(dotimes (i (length index))
 	  (write32 (aref index i) stm)))))