[MLton-commit] r4253
Wesley Terpstra
MLton@mlton.org
Thu, 24 Nov 2005 07:45:22 -0800
The converter signature and beginnings of converter implementation.
----------------------------------------------------------------------
A mlton/branches/unicode/basis-library/i18n/
A mlton/branches/unicode/basis-library/i18n/converter.sig
A mlton/branches/unicode/basis-library/i18n/converter.sml
A mlton/branches/unicode/basis-library/i18n.mlb
----------------------------------------------------------------------
Added: mlton/branches/unicode/basis-library/i18n/converter.sig
===================================================================
--- mlton/branches/unicode/basis-library/i18n/converter.sig 2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n/converter.sig 2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,70 @@
+signature CHARSET_CONVERTER =
+ sig
+ structure Encoding:
+ sig
+ type t
+
+ val equals: t * t -> bool
+
+ val fromName: string -> t option
+ val toName: t -> string
+
+ (* also needed for bare minimum support (in order of usefulness):
+ val punycode: t
+ val utf7: t
+ val gb18030: t
+ val cesu8: t
+ val scsu: t
+ *)
+ val utf8: t
+ (* the "be" and "le" control endian in the absense of FFFE *)
+ val utf16be: t
+ val utf16le: t
+ val utf32be: t
+ val utf32le: t
+ end
+
+ (* Unfortunately, unlike all of the StringCvt methods provided in the
+ * basis, charset encodings can be stateful. For example, consider a
+ * fictituous charset consisting of letters A-Z and a 'uppercase'
+ * and 'lowercase' char (u & l respectively). Then "BlBBuCB" = "BbbCB".
+ * For this reason, decoders need to keep a state in addition to the
+ * stream position. Encoders also need to be 'flush'ed at the end of
+ * encoding to restore a stateful output stream to the initial state.
+ *)
+ type state
+ val embed: unit -> ('a -> state) * (state -> 'a option)
+
+ type 'a decoder = {
+ initial: state,
+ decoder: (Word8.word, 'a) reader -> (WideChar.char, 'a * state) reader
+ }
+ val decoder: Encoding.t -> 'a decoder
+
+ (* The encoder will only write up to the first unicode character which
+ * cannot be represented in the output charset.
+ *)
+ type encoder = {
+ initial: state * Word8Vector.vector,
+ encoder: state * WideSubtring.substring ->
+ state * WideSubstring.substring * Word8Vector.vector,
+ flush: state -> Word8Vector.vector
+ }
+ val encoder: Encoding.t -> encoder
+
+ (* Convenience functions *)
+ val decode: Encoding.t * Word8VectorSlice.vector_slice -> WideString.string option
+ val encode: Encoding.t * WideSubstring.substring -> Word8Vector.vector option
+
+ (* The register method allows you to add support for new encodings.
+ * The name is used case insensitively.
+ * The decoder has concrete type "state decoder" to work around SML's
+ * lack of higher order types. However, you must not peek inside it.
+ *)
+ type user_coder = {
+ name: string,
+ decoder: state decoder,
+ encoder: encoder
+ }
+ val register: user_coder -> unit
+ end
Added: mlton/branches/unicode/basis-library/i18n/converter.sml
===================================================================
--- mlton/branches/unicode/basis-library/i18n/converter.sml 2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n/converter.sml 2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,94 @@
+structure CharsetConverter :> CHARSET_CONVERTER =
+ struct
+ (* http://mlton.org/UniversalType *)
+ type state = exn
+ fun 'a embed () =
+ let
+ exception E of 'a
+ fun project (e: t): 'a option =
+ case e of
+ E a => SOME a
+ | _ => NONE
+ in
+ (E, project)
+ end
+
+ type 'a decoder = {
+ initial: state,
+ decoder: (Word8.word, 'a) reader -> (WideChar.char, 'a * state) reader
+ }
+
+ type encoder = {
+ initial: state * Word8Vector.vector,
+ encoder: state * WideSubtring.substring ->
+ state * WideSubstring.substring * Word8Vector.vector,
+ flush: state -> Word8Vector.vector
+ }
+
+ type user_coder = {
+ name: string,
+ decoder: state decoder,
+ encoder: encoder
+ }
+
+ val coders : user_coder list ref = ref []
+ fun register x = coders := x :: (!coders)
+
+ structure Encoding =
+ struct
+ datatype t =
+ UTF8 | UTF16BE | UTF16LE | UTF32BE | UTF32LE |
+ USER of user_coder
+
+ val equals = op =
+
+ fun canonName s =
+ String.translate
+ (fn #"_" => ""
+ | #"-" => ""
+ | x => String.str (Char.toUpper x)) s
+
+ fun fromName s = case canonName s of
+ "UTF8" => SOME UTF8
+ | "UCS2" => SOME UTF16LE
+ | "UCS2LE" => SOME UTF16LE
+ | "UCS2BE" => SOME UTF16BE
+ | "UTF16" => SOME UTF16LE (* guess little-endian for now *)
+ | "UTF16LE" => SOME UTF16LE
+ | "UTF16BE" => SOME UTF16BE
+ | "UCS4" => SOME UTF32LE
+ | "UCS4LE" => SOME UTF32LE
+ | "UCS4BE" => SOME UTF32BE
+ | "UTF32" => SOME UTF32LE (* guess little-endian for now *)
+ | "UTF32LE" => SOME UTF32LE
+ | "UTF32BE" => SOME UTF32BE
+ | s =>
+ case List.find (fn {name, ...} => name = s) (!coders) of
+ NONE => NONE
+ | SOME x => SOME (USER x)
+
+ fun toName UTF8 = "UTF-8"
+ | toName UTF16BE = "UTF-16BE"
+ | toName UTF16LE = "UTF-16LE"
+ | toName UTF32BE = "UTF-32BE"
+ | toName UTF32LE = "UTF-32LE"
+ | toName (User {name, ...}) => name
+
+ val utf8 = UTF8
+ val utf16be = UTF16BE
+ val utf16le = UTF16LE
+ val utf32be = UTF32BE
+ val utf32le = UTF32LE
+ end
+
+ fun decode (e, vs) =
+ let
+ val { initial, decoder } = decoder e
+ fun get vs =
+ if Word8VectorSlice.length vs = 0 then NONE else
+ SOME (Word8VectorSlice.sub (vs, 0),
+ Word8VectorSlice.subslice (vs, 1, NONE))
+ in
+ ()
+ end
+ end
Added: mlton/branches/unicode/basis-library/i18n.mlb
===================================================================
--- mlton/branches/unicode/basis-library/i18n.mlb 2005-11-24 01:08:59 UTC (rev 4252)
+++ mlton/branches/unicode/basis-library/i18n.mlb 2005-11-24 15:45:21 UTC (rev 4253)
@@ -0,0 +1,50 @@
+(* Copyright (C) 2004-2005 Henry Cejtin, Matthew Fluet, Suresh
+ * Jagannathan, and Stephen Weeks.
+ *
+ * MLton is released under a BSD-style license.
+ * See the file MLton-LICENSE for details.
+ *)
+
+ann
+ "deadCode true"
+ "sequenceNonUnit warn"
+ "nonexhaustiveMatch warn" "redundantMatch warn"
+ "warnUnused true" "forceUsed"
+in
+ local
+ libs/basis-extra/basis-extra.mlb
+ in
+ structure Char4 : CHAR
+ structure Char4Array : MONO_ARRAY
+ structure Char4Array2 : MONO_ARRAY2
+ structure Char4ArraySlice : MONO_ARRAY_SLICE
+ structure Char4Vector : MONO_VECTOR
+ structure Char4VectorSlice : MONO_VECTOR_SLICE
+ structure String4 : STRING
+ structure Substring4 : SUBSTRING
+ structure Text4 : TEXT
+
+ structure Char2 : CHAR
+ structure Char2Array : MONO_ARRAY
+ structure Char2Array2 : MONO_ARRAY2
+ structure Char2ArraySlice : MONO_ARRAY_SLICE
+ structure Char2Vector : MONO_VECTOR
+ structure Char2VectorSlice : MONO_VECTOR_SLICE
+ structure String2 : STRING
+ structure Substring2 : SUBSTRING
+ structure Text2 : TEXT
+
+ structure Char1 : CHAR
+ structure Char1Array : MONO_ARRAY
+ structure Char1Array2 : MONO_ARRAY2
+ structure Char1ArraySlice : MONO_ARRAY_SLICE
+ structure Char1Vector : MONO_VECTOR
+ structure Char1VectorSlice : MONO_VECTOR_SLICE
+ structure String1 : STRING
+ structure Substring1 : SUBSTRING
+ structure Text1 : TEXT
+
+ signature CHARSET_CONVERTER
+ structure CharsetConverter : CHARSET_CONVERTER
+ end
+end