CMUCL commit: src (code/unidata.lisp tools/build-unidata.lisp)
Raymond Toy
rtoy at common-lisp.net
Sat Sep 18 22:47:51 CEST 2010
Date: Saturday, September 18, 2010 @ 16:47:51
Author: rtoy
Path: /project/cmucl/cvsroot/src
Modified: code/unidata.lisp tools/build-unidata.lisp
code/unidata.lisp:
o Just add some comments on why we don't put the dictionaries in
unidata.bin.
o Print out some messages when building the hangul and cjk
dictionaries so the user knows what's happening.
tools/build-unidata.lisp:
o Add some comments on the various parts of unidata.bin.
--------------------------+
code/unidata.lisp | 25 +++++++++++++++++++++----
tools/build-unidata.lisp | 41 +++++++++++++++++++++++------------------
2 files changed, 44 insertions(+), 22 deletions(-)
Index: src/code/unidata.lisp
diff -u src/code/unidata.lisp:1.14 src/code/unidata.lisp:1.15
--- src/code/unidata.lisp:1.14 Fri Sep 17 19:29:01 2010
+++ src/code/unidata.lisp Sat Sep 18 16:47:51 2010
@@ -4,7 +4,7 @@
;;; This code was written by Paul Foley and has been placed in the public
;;; domain.
;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.14 2010-09-17 23:29:01 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/code/unidata.lisp,v 1.15 2010-09-18 20:47:51 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -18,7 +18,7 @@
(defconstant +unidata-path+ #p"ext-formats:unidata.bin")
-(defvar *unidata-version* "$Revision: 1.14 $")
+(defvar *unidata-version* "$Revision: 1.15 $")
(defstruct unidata
range
@@ -1116,6 +1116,17 @@
;; Code written by Paul Foley, with some modifications by Raymond Toy.
;;
+;; These hold dictionaries for the Hangul syllables and the CJK
+;; unified ideographs. Note that these could be stored in
+;; unidata.bin, but that adds almost a megabyte to the size of
+;; unidata.bin. That seems way to much bloat for something that is
+;; probably not used that much. However, this incurs a runtime cost
+;; the first time it needs to be accessed. On a 450 MHz sparc, it
+;; takes 55 sec for the cjk dictionary and 9 sec for the Hangul
+;; dictionary. A bit long but not too bad. On a 2 GHz mac mini, it
+;; takes 5 sec and .8 sec, respectively. This seems reasonable,
+;; especially since the intent is for character completion, which
+;; doesn't have to be too fast.
(defvar *hangul-syllable-dictionary* nil
"Dictionary of Hangul syllables")
(defvar *cjk-unified-ideograph-dictionary* nil
@@ -1322,6 +1333,7 @@
(defun build-hangul-syllable-dictionary ()
"Build the dictionary for Hangul syllables"
+ (format t "~&Building Hangul Syllable dictionary. Please wait...~%")
(initialize-reverse-hangul-tables)
(let ((hangul-codebook
(map 'vector #'car
@@ -1344,10 +1356,13 @@
names)
(incf k))))
(setf *hangul-syllable-dictionary*
- (build-dictionary hangul-codebook (nreverse names)))))
+ (build-dictionary hangul-codebook (nreverse names)))
+ (format t "~&Done.~%")
+ (values)))
(defun build-cjk-unified-ideograph-dictionary ()
"Build the dictionary for CJK Unified Ideographs"
+ (format t "~&Building CJK Unified Ideographs dictionary. Please wait...~%")
(let ((codebook (coerce (loop for k from 0 to 15
collect (format nil "~X" k))
'vector))
@@ -1356,7 +1371,9 @@
collect (cons (format nil "~X" codepoint)
codepoint))))
(setf *cjk-unified-ideograph-dictionary*
- (build-dictionary codebook names))))
+ (build-dictionary codebook names))
+ (format t "~&Done.~%")
+ (values)))
;; The definitions of BUILD-DICTIONARY, NAME-LOOKUP, and ENCODE-NAME
;; were taken from build-unidata.lisp.
Index: src/tools/build-unidata.lisp
diff -u src/tools/build-unidata.lisp:1.5 src/tools/build-unidata.lisp:1.6
--- src/tools/build-unidata.lisp:1.5 Wed Sep 15 17:06:39 2010
+++ src/tools/build-unidata.lisp Sat Sep 18 16:47:51 2010
@@ -4,7 +4,7 @@
;;; This code was written by Paul Foley and has been placed in the public
;;; domain.
;;;
-(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.5 2010-09-15 21:06:39 rtoy Exp $")
+(ext:file-comment "$Header: /project/cmucl/cvsroot/src/tools/build-unidata.lisp,v 1.6 2010-09-18 20:47:51 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
@@ -466,6 +466,8 @@
(error "Index array too short for the data being written")))))
(with-open-file (stm path :direction :io :if-exists :rename-and-delete
:element-type '(unsigned-byte 8))
+ ;; The length of the index array is the number of sections to be
+ ;; written. See below for each section.
(let ((index (make-array 19 :fill-pointer 0)))
;; File header
(write32 +unicode-magic-number+ stm) ; identification "magic"
@@ -478,12 +480,12 @@
(dotimes (i (array-dimension index 0))
(write32 0 stm)) ; space for indices
(write32 0 stm) ; end marker
- ;; Range data
+ ;; 0. Range data
(let ((data (unidata-range *unicode-data*)))
(update-index (file-position stm) index)
(write32 (length (range-codes data)) stm)
(write-vector (range-codes data) stm :endian-swap :network-order))
- ;; Character name data
+ ;; 1. Character name data
(let ((data (unidata-name+ *unicode-data*)))
(update-index (file-position stm) index)
(write-byte (1- (length (dictionary-cdbk data))) stm)
@@ -499,41 +501,41 @@
(write-vector (dictionary-codev data) stm :endian-swap :network-order)
(write-vector (dictionary-nextv data) stm :endian-swap :network-order)
(write-vector (dictionary-namev data) stm :endian-swap :network-order))
- ;; Codepoint-to-name mapping
+ ;; 2. Codepoint-to-name mapping
(let ((data (unidata-name *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm))
- ;; Codepoint-to-category table
+ ;; 3. Codepoint-to-category table
(let ((data (unidata-category *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie8 data stm))
- ;; Simple case mapping table
+ ;; 4. Simple case mapping table
(let ((data (unidata-scase *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm)
(write-byte (length (scase-svec data)) stm)
(write-vector (scase-svec data) stm :endian-swap :network-order))
- ;; Numeric data
+ ;; 5. Numeric data
(let ((data (unidata-numeric *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm))
- ;; Decomposition data
+ ;; 6. Decomposition data
(let ((data (unidata-decomp *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm)
(write16 (length (decomp-tabl data)) stm)
(write-vector (decomp-tabl data) stm :endian-swap :network-order))
- ;; Combining classes
+ ;; 7. Combining classes
(let ((data (unidata-combining *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie8 data stm))
- ;; Bidi data
+ ;; 8. Bidi data
(let ((data (unidata-bidi *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie16 data stm)
(write-byte (length (bidi-tabl data)) stm)
(write-vector (bidi-tabl data) stm :endian-swap :network-order))
- ;; Unicode 1.0 names
+ ;; 9. Unicode 1.0 names
(let ((data (unidata-name1+ *unicode-data*)))
(update-index (file-position stm) index)
(write-byte (1- (length (dictionary-cdbk data))) stm)
@@ -549,10 +551,11 @@
(write-vector (dictionary-codev data) stm :endian-swap :network-order)
(write-vector (dictionary-nextv data) stm :endian-swap :network-order)
(write-vector (dictionary-namev data) stm :endian-swap :network-order))
+ ;; 10. Codepoint to unicode-1.0 name
(let ((data (unidata-name1 *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm))
- ;; Normalization quick-check data
+ ;; 11. Normalization quick-check data
(update-index (file-position stm) index)
(let ((data (unidata-qc-nfd *unicode-data*)))
(write-ntrie1 data stm))
@@ -562,35 +565,37 @@
(write-ntrie2 data stm))
(let ((data (unidata-qc-nfkc *unicode-data*)))
(write-ntrie2 data stm))
- ;; Write composition exclusion table
+ ;; 12. Write composition exclusion table
(let ((data (unidata-comp-exclusions *unicode-data*)))
(update-index (file-position stm) index)
(write16 (length data) stm)
(write-vector data stm :endian-swap :network-order))
- ;; Write full-case lower data
(flet ((dump-full-case (data)
(update-index (file-position stm) index)
(write-ntrie32 data stm)
(write16 (length (full-case-tabl data)) stm)
(write-vector (full-case-tabl data) stm :endian-swap :network-order)))
+ ;; 13. Write full-case lower data
(dump-full-case (unidata-full-case-lower *unicode-data*))
+ ;; 14. Write full-case title data
(dump-full-case (unidata-full-case-title *unicode-data*))
+ ;; 15. Write full-case upper data
(dump-full-case (unidata-full-case-upper *unicode-data*)))
- ;; Write case folding data
+ ;; 16. Write case-folding simple data
(let ((data (unidata-case-fold-simple *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm))
- ;; case-folding full
+ ;; 17. Write case-folding full data
(let ((data (unidata-case-fold-full *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie32 data stm)
(write16 (length (case-folding-tabl data)) stm)
(write-vector (case-folding-tabl data) stm :endian-swap :network-order))
- ;; Word-break
+ ;; 18. Word-break
(let ((data (unidata-word-break *unicode-data*)))
(update-index (file-position stm) index)
(write-ntrie4 data stm))
- ;; Patch up index
+ ;; All components saved. Patch up index table now.
(file-position stm 8)
(dotimes (i (length index))
(write32 (aref index i) stm)))))
More information about the cmucl-commit
mailing list