Mercurial > urweb
comparison src/utf8.sml @ 1592:1c9f8f06c1d6
Support the full set of XHTML character entities
author | Adam Chlipala <adam@chlipala.net> |
---|---|
date | Sat, 05 Nov 2011 15:05:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1591:20f898c29525 | 1592:1c9f8f06c1d6 |
---|---|
1 (* Copyright (c) 2011, Adam Chlipala | |
2 * All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions are met: | |
6 * | |
7 * - Redistributions of source code must retain the above copyright notice, | |
8 * this list of conditions and the following disclaimer. | |
9 * - Redistributions in binary form must reproduce the above copyright notice, | |
10 * this list of conditions and the following disclaimer in the documentation | |
11 * and/or other materials provided with the distribution. | |
12 * - The names of contributors may not be used to endorse or promote products | |
13 * derived from this software without specific prior written permission. | |
14 * | |
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
25 * POSSIBILITY OF SUCH DAMAGE. | |
26 *) | |
27 | |
28 (* UTF-8 conversion *) | |
29 | |
structure Utf8 :> UTF8 = struct

(* Render the byte value held in word [n] as a one-character string. *)
fun byte n = str (chr (Word.toInt n))

(* [encode n] returns the UTF-8 encoding of the Unicode code point [n]
 * as a string of one to four bytes:
 *
 *   0x0001   .. 0x007F    -> 1 byte   0xxxxxxx
 *   0x0080   .. 0x07FF    -> 2 bytes  110xxxxx 10xxxxxx
 *   0x0800   .. 0xFFFF    -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
 *   0x10000  .. 0x10FFFF  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Raises [Fail] when [n] is zero or negative, when [n] lies in the
 * surrogate range 0xD800 .. 0xDFFF (those values are not characters and
 * have no well-formed UTF-8 encoding), or when [n] is above 0x10FFFF,
 * the last Unicode code point.
 *)
fun encode n =
    let
        (* Leading byte: the length marker [tag] or-ed with the bits of
         * [w] that lie above bit position [shift]. *)
        fun lead (tag, w, shift) = byte (Word.orb (tag, Word.>> (w, shift)))

        (* Continuation byte: 10xxxxxx carrying the six bits of [w] that
         * start at bit position [shift]. *)
        fun cont (w, shift) =
            byte (Word.orb (0wx80, Word.andb (Word.>> (w, shift), 0wx3F)))
    in
        if n <= 0 then
            raise Fail "Invalid character to UTF-8-encode"
        else if n <= 0x7F then
            (* Plain ASCII is its own encoding. *)
            str (chr n)
        else if n <= 0x7FF then
            let
                val w = Word.fromInt n
            in
                lead (0wxC0, w, 0w6) ^ cont (w, 0w0)
            end
        else if 0xD800 <= n andalso n <= 0xDFFF then
            (* Checked before the 3-byte case, which would otherwise emit
             * an ill-formed sequence for these values. *)
            raise Fail "Surrogate code point cannot be UTF-8-encoded"
        else if n <= 0xFFFF then
            let
                val w = Word.fromInt n
            in
                lead (0wxE0, w, 0w12) ^ cont (w, 0w6) ^ cont (w, 0w0)
            end
        else if n <= 0x10FFFF then
            let
                val w = Word.fromInt n
            in
                lead (0wxF0, w, 0w18) ^ cont (w, 0w12) ^ cont (w, 0w6) ^ cont (w, 0w0)
            end
        else
            raise Fail "Exceeded supported range for UTF-8 characters"
    end

end