[MLton-commit] r4252
Wesley Terpstra
MLton@mlton.org
Wed, 23 Nov 2005 21:49:51 -0800
pieces of the charset generator
----------------------------------------------------------------------
A mlton/branches/unicode/basis-library/text/gen-hash.sml
A mlton/branches/unicode/basis-library/text/gen-lists.sml
----------------------------------------------------------------------
Added: mlton/branches/unicode/basis-library/text/gen-hash.sml
===================================================================
--- mlton/branches/unicode/basis-library/text/gen-hash.sml 2005-11-24 00:59:21 UTC (rev 4251)
+++ mlton/branches/unicode/basis-library/text/gen-hash.sml 2005-11-24 01:08:59 UTC (rev 4252)
@@ -0,0 +1,70 @@
+(* parse the entire standard input *)
+fun parse s = Vector.fromList (String.fields (fn c => c = #";") s)
+fun scanin h =
+ case TextIO.inputLine h of
+ NONE => []
+ | SOME s => s :: scanin h
+
+fun hex2int s = valOf (StringCvt.scanString (Int.scan StringCvt.HEX) s)
+val UnicodeData = List.map parse (scanin (TextIO.openIn "UnicodeData.txt"))
+val CodePoints = Vector.fromList (List.map (fn v => hex2int (Vector.sub (v, 0))) UnicodeData)
+
+fun hash (s, f, n) =
+ let
+ val a = Array.tabulate (n, fn _ => 0)
+ val clashes = ref 0
+
+ val sw = Word32.fromInt s
+ val fw = Word32.fromInt f
+ val nw = Word32.fromInt n
+
+ fun inject x =
+ let
+ val xw = Word32.fromInt x
+ val k = (Word32.>> (xw, sw) * fw) + xw (* 6..15 *)
+ val k = Word32.toInt (Word32.mod (k, nw))
+ val v = Unsafe.Array.sub (a, k) + 1
+ in
+ if v > 1 then clashes := !clashes + 1 else ();
+ Unsafe.Array.update (a, k, v)
+ end
+ in
+ Vector.app inject CodePoints;
+ !clashes
+ end
+
+(* 21 bits of data, 14 bits used, 15+5 used *)
+
+(* 10, 5919, 32768 -> 52, 2 *)
+(* 12, 5537, 32768 -> 51, 2 *)
+(* 14, 16837, 32768 -> 8, 2 *)
+(* 14, 6162, 33739 -> 5, 2 *)
+(* 13, 10941, 52007 -> 0 *)
+
+fun better (best, clashes) = best < clashes
+
+fun loopshift (n, f) (s, best) =
+ if s = 17 then best else
+ let val trial = hash (s, f, n) in
+ if trial > best then loopshift (n, f) (s + 1, best) else
+ (print (Int.toString s ^ ", " ^ Int.toString f ^ ", " ^ Int.toString n ^ " -> " ^ Int.toString best ^ "\n");
+ loopshift (n, f) (s + 1, trial)) end
+
+fun loopfact n (f, best) =
+ let val best = loopshift (n, f) (10, best) in
+ if f = n then best else loopfact n (f + 1, best) end
+
+fun loopmod best =
+ let
+ val n = Word.toInt (MLton.Random.rand () mod 0w31500) + 10000
+ val best = loopfact n (1, best)
+ in
+ loopmod best
+ end
+
+val () = MLton.Random.srand (valOf (MLton.Random.useed ()))
+(*
+val best = loopfact 32768 (1, 5000) (* a useful size due to bit arith *)
+val best = loopfact 65536 (1, best) (* a useful size due to bit arith *)
+*)
+val () = loopmod 0
Added: mlton/branches/unicode/basis-library/text/gen-lists.sml
===================================================================
--- mlton/branches/unicode/basis-library/text/gen-lists.sml 2005-11-24 00:59:21 UTC (rev 4251)
+++ mlton/branches/unicode/basis-library/text/gen-lists.sml 2005-11-24 01:08:59 UTC (rev 4252)
@@ -0,0 +1,45 @@
+(* parse the entire standard input *)
+fun parse s = Vector.fromList (String.fields (fn c => c = #";") s)
+fun scanin h =
+ case TextIO.inputLine h of
+ NONE => []
+ | SOME s => s :: scanin h
+
+val UnicodeData = List.map parse (scanin (TextIO.openIn "UnicodeData.txt"))
+val PropList = List.map parse (scanin (TextIO.openIn "PropList.txt"))
+fun hex2int s = valOf (StringCvt.scanString (Int.scan StringCvt.HEX) s)
+
+(* There are four tables of data we must provide for the basis:
+ * category: c => letter | numeral | control | punct
+ * case: c => lower | upper | space | other
+ * Because Unicode stores clumps characters of similar type together,
+ * we record the data as [ <val0>, <count0>, <val1>, <count1>, ... <valN> ]
+ * val0 is the value of the field, count is the repitition of that value
+ * toupper: c => c
+ * tolower: c => c
+ *)
+
+
+(* select characters by their category *)
+fun category select =
+ List.mapPartial (fn v => if select (Vector.sub (v, 2))
+ then hex2int (Vector.sub (v, 0))
+ else NONE)
+ records
+
+(* convert a alist of integers into closed integer ranges *)
+fun compress [] = []
+ | compress (x :: r) =
+ let
+ fun helper ((l, u), []) = [(l, u)]
+ | helper ((l, u), n :: r) =
+ if u+1 = n then helper ((l, n), r) else
+ (l, u) :: helper ((n, n), r)
+ in
+ helper ((x, x), r)
+ end
+
+fun dump [] = ()
+ | dump ((l, u) :: r) = (
+ print (Int.fmt StringCvt.HEX l ^ "..." ^ Int.fmt StringCvt.HEX u ^ " = " ^ Int.toString (u - l) ^ "\n");
+ dump r)