comparison src/utf8.sml @ 1592:1c9f8f06c1d6

Support the full set of XHTML character entities
author Adam Chlipala <adam@chlipala.net>
date Sat, 05 Nov 2011 15:05:13 -0400
parents
children
comparison
equal deleted inserted replaced
1591:20f898c29525 1592:1c9f8f06c1d6
1 (* Copyright (c) 2011, Adam Chlipala
2 * All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * - Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * - Redistributions in binary form must reproduce the above copyright notice,
10 * this list of conditions and the following disclaimer in the documentation
11 * and/or other materials provided with the distribution.
12 * - The names of contributors may not be used to endorse or promote products
13 * derived from this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 *)
27
28 (* UTF-8 conversion *)
29
structure Utf8 :> UTF8 = struct

(* Turn a word holding a single byte value into a one-character string.
 * Relies on [chr], so the value must be in the range accepted by [chr]. *)
fun byte n = str (chr (Word.toInt n))

(* Leading byte of a multi-byte sequence: the length marker bits [marker]
 * combined with the bits of [w] above position [shift].  Callers guarantee,
 * through the range checks in [encode], that those high bits fit in the
 * payload area of the leading byte, so no mask is applied here. *)
fun leader (marker, w, shift) =
    byte (Word.orb (marker, Word.>> (w, Word.fromInt shift)))

(* Continuation byte (binary 10xxxxxx) carrying the six bits of [w] that
 * start at bit position [shift]. *)
fun continuation (w, shift) =
    byte (Word.orb (0wx80, Word.andb (Word.>> (w, Word.fromInt shift), 0wx3F)))

(* Encode the code point [n] as a UTF-8 byte string.
 *
 *   1 ..     0x7F  -> 1 byte
 *   0x80 ..  0x7FF -> 2 bytes
 *   0x800 .. 0xFFFF -> 3 bytes
 *   0x10000 .. 0x10FFFF -> 4 bytes
 *
 * Raises [Fail] when [n] is zero or negative, and when [n] is above
 * 0x10FFFF (the upper limit of the four-byte form).
 *
 * NOTE(review): values in the surrogate range 0xD800-0xDFFF are still encoded
 * with the three-byte form, exactly as before; confirm whether callers want
 * those rejected. *)
fun encode n =
    if n <= 0 then
        raise Fail "Invalid character to UTF-8-encode"
    else if n <= 0x7F then
        str (chr n)
    else
        let
            val w = Word.fromInt n
        in
            if n <= 0x7FF then
                leader (0wxC0, w, 6)
                ^ continuation (w, 0)
            else if n <= 0xFFFF then
                leader (0wxE0, w, 12)
                ^ continuation (w, 6)
                ^ continuation (w, 0)
            else if n <= 0x10FFFF then
                (* Supplementary planes: previously rejected, now encoded
                 * with the standard four-byte form. *)
                leader (0wxF0, w, 18)
                ^ continuation (w, 12)
                ^ continuation (w, 6)
                ^ continuation (w, 0)
            else
                raise Fail "Exceeded supported range for UTF-8 characters"
        end

end