adam@1592
|
1 (* Copyright (c) 2011, Adam Chlipala
|
adam@1592
|
2 * All rights reserved.
|
adam@1592
|
3 *
|
adam@1592
|
4 * Redistribution and use in source and binary forms, with or without
|
adam@1592
|
5 * modification, are permitted provided that the following conditions are met:
|
adam@1592
|
6 *
|
adam@1592
|
7 * - Redistributions of source code must retain the above copyright notice,
|
adam@1592
|
8 * this list of conditions and the following disclaimer.
|
adam@1592
|
9 * - Redistributions in binary form must reproduce the above copyright notice,
|
adam@1592
|
10 * this list of conditions and the following disclaimer in the documentation
|
adam@1592
|
11 * and/or other materials provided with the distribution.
|
adam@1592
|
12 * - The names of contributors may not be used to endorse or promote products
|
adam@1592
|
13 * derived from this software without specific prior written permission.
|
adam@1592
|
14 *
|
adam@1592
|
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
adam@1592
|
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
adam@1592
|
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
adam@1592
|
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
adam@1592
|
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
adam@1592
|
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
adam@1592
|
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
adam@1592
|
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
adam@1592
|
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
adam@1592
|
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
adam@1592
|
25 * POSSIBILITY OF SUCH DAMAGE.
|
adam@1592
|
26 *)
|
adam@1592
|
27
|
adam@1592
|
28 (* UTF-8 conversion *)
|
adam@1592
|
29
|
adam@1592
|
structure Utf8 :> UTF8 = struct

(* Turn a word into a one-character string.  The word must be in the range
 * 0..255 (every caller below guarantees this); otherwise chr raises Chr. *)
fun byte n = str (chr (Word.toInt n))

(* Lead byte of a multi-byte sequence: the length marker bits or'ed with
 * whatever bits of w remain above position shift. *)
fun lead (marker, w, shift) =
    byte (Word.orb (marker, Word.>> (w, shift)))

(* Continuation byte (binary 10xxxxxx) carrying the six bits of w that start
 * at bit position shift. *)
fun continuation (w, shift) =
    byte (Word.orb (0wx80, Word.andb (Word.>> (w, shift), 0wx3F)))

(* encode n: return the UTF-8 byte sequence, as a string, for code point n.
 *
 *   0x01    .. 0x7F      -> 1 byte
 *   0x80    .. 0x7FF     -> 2 bytes
 *   0x800   .. 0xFFFF    -> 3 bytes
 *   0x10000 .. 0x10FFFF  -> 4 bytes
 *
 * Raises Fail if n <= 0 (NUL and negative values are rejected) or if
 * n > 0x10FFFF (beyond the last Unicode code point).
 *
 * NOTE(review): surrogate code points 0xD800..0xDFFF are still encoded as
 * three bytes, as before, although RFC 3629 forbids them; left unchanged so
 * that existing callers see no new exception. *)
fun encode n =
    if n <= 0 then
        raise Fail "Invalid character to UTF-8-encode"
    else if n <= 0x7F then
        (* ASCII maps to itself. *)
        str (chr n)
    else if n > 0x10FFFF then
        raise Fail "Exceeded supported range for UTF-8 characters"
    else
        let
            val w = Word.fromInt n
        in
            if n <= 0x7FF then
                (* 110xxxxx 10xxxxxx *)
                lead (0wxC0, w, 0w6)
                ^ continuation (w, 0w0)
            else if n <= 0xFFFF then
                (* 1110xxxx 10xxxxxx 10xxxxxx *)
                lead (0wxE0, w, 0w12)
                ^ continuation (w, 0w6)
                ^ continuation (w, 0w0)
            else
                (* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *)
                lead (0wxF0, w, 0w18)
                ^ continuation (w, 0w12)
                ^ continuation (w, 0w6)
                ^ continuation (w, 0w0)
        end

end
|