changeset 1053:4eb1c4a1b057

Escaping UTF-8 in MonoOpt
author Adam Chlipala <adamc@hcoop.net>
date Thu, 03 Dec 2009 11:20:13 -0500
parents 38411c2cd363
children b06a2a65e670
files src/mono_opt.sml src/prim.sml tests/cyrillic.ur tests/cyrillic.urp tests/cyrillic.urs
diffstat 5 files changed, 49 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/src/mono_opt.sml	Wed Nov 25 09:48:23 2009 -0500
+++ b/src/mono_opt.sml	Thu Dec 03 11:20:13 2009 -0500
@@ -61,14 +61,37 @@
 
 val htmlifyInt = attrifyInt
 val htmlifyFloat = attrifyFloat
-val htmlifyString = String.translate (fn ch => case ch of
-                                                   #"<" => "&lt;"
-                                                 | #"&" => "&amp;"
-                                                 | _ =>   
-                                                   if Char.isPrint ch orelse Char.isSpace ch then
-                                                       str ch
-                                                   else
-                                                       "&#" ^ Int.toString (ord ch) ^ ";")
+
+fun htmlifyString s =
+    let
+        fun hs (pos, acc) =
+            if pos >= size s then
+                String.concat (rev acc)
+            else
+                case String.sub (s, pos) of
+                    #"<" => hs (pos+1, "&lt;" :: acc)
+                  | #"&" => hs (pos+1, "&amp;" :: acc)
+                  | ch =>
+                    let
+                        val n = ord ch
+                        fun isCont k = pos + k < size s
+                                       andalso ord (String.sub (s, pos + k)) div 64 = 2
+                        fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
+                    in
+                        if Char.isPrint ch orelse Char.isSpace ch then
+                            hs (pos+1, str ch :: acc)
+                        else if n div 32 = 6 andalso isCont 1 then
+                            unicode 1
+                        else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then
+                            unicode 2
+                        else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then
+                            unicode 3
+                        else
+                            hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc)
+                    end
+    in
+        hs (0, [])
+    end
 
 fun hexIt ch =
     let
--- a/src/prim.sml	Wed Nov 25 09:48:23 2009 -0500
+++ b/src/prim.sml	Thu Dec 03 11:20:13 2009 -0500
@@ -68,12 +68,23 @@
       | String s => s
       | Char ch => str ch
 
+fun pad (n, ch, s) =
+    if size s >= n then
+        s
+    else
+        str ch ^ pad (n-1, ch, s)
+
+val gccify = String.translate (fn ch => if Char.isPrint ch then
+                                            str ch
+                                        else
+                                            "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))
+
 fun p_t_GCC t =
     case t of
         Int n => string (int2s n)
       | Float n => string (float2s n)
-      | String s => box [string "\"", string (String.toString s), string "\""]
-      | Char ch => box [string "'", string (String.toString (str ch)), string "'"]
+      | String s => box [string "\"", string (gccify s), string "\""]
+      | Char ch => box [string "'", string (gccify (str ch)), string "'"]
 
 fun equal x =
     case x of
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/cyrillic.ur	Thu Dec 03 11:20:13 2009 -0500
@@ -0,0 +1,2 @@
+fun main () = return <xml><body>одел
+Hi!</body></xml>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/cyrillic.urp	Thu Dec 03 11:20:13 2009 -0500
@@ -0,0 +1,2 @@
+
+cyrillic
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/cyrillic.urs	Thu Dec 03 11:20:13 2009 -0500
@@ -0,0 +1,1 @@
+val main : unit -> transaction page