changeset 1054:b06a2a65e670

UTF-8 in dynamic escaping
author Adam Chlipala <adamc@hcoop.net>
date Thu, 03 Dec 2009 11:50:51 -0500
parents 4eb1c4a1b057
children 118a5a08a881
files src/c/urweb.c src/mono_opt.sml src/prim.sml
diffstat 3 files changed, 114 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/src/c/urweb.c	Thu Dec 03 11:20:13 2009 -0500
+++ b/src/c/urweb.c	Thu Dec 03 11:50:51 2009 -0500
@@ -1410,6 +1410,10 @@
   return result;
 }
 
+static int isCont(unsigned char ch) {  /* nonzero iff ch is in 0x80..0xBF (bits 10xxxxxx, the UTF-8 continuation-byte pattern) */
+  return ch / 64 == 2;
+}
+
 char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
   int len = strlen(s);
   char *result, *p;
@@ -1418,7 +1422,7 @@
   result = p = ctx->heap.front;
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"') {
       strcpy(p, "&quot;");
@@ -1429,7 +1433,19 @@
     }
     else if (isprint(c))
       *p++ = c;
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      memcpy(p, s, 2);
+      p += 2;
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      memcpy(p, s, 3);
+      p += 3;
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      memcpy(p, s, 4);
+      p += 4;
+      s += 3;
+    } else {
       int len2;
       sprintf(p, "&#%d;%n", c, &len2);
       p += len2;
@@ -1499,7 +1515,7 @@
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"')
       uw_write_unsafe(ctx, "&quot;");
@@ -1507,7 +1523,22 @@
       uw_write_unsafe(ctx, "&amp;");
     else if (isprint(c))
       uw_writec_unsafe(ctx, c);
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      uw_writec_unsafe(ctx, s[3]);
+      s += 3;
+    } else {
       uw_write_unsafe(ctx, "&#");
       uw_Basis_attrifyInt_w_unsafe(ctx, c);
       uw_writec_unsafe(ctx, ';');
@@ -1847,7 +1878,7 @@
   uw_check_heap(ctx, strlen(s) * 5 + 1);
 
   for (r = s2 = ctx->heap.front; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1859,9 +1890,21 @@
       s2 += 5;
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         *s2++ = c;
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        memcpy(s2, s, 2);
+        s2 += 2;
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        memcpy(s2, s, 3);
+        s2 += 3;
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        memcpy(s2, s, 4);
+        s2 += 4;
+        s += 3;
+      } else {
         int len2;
         sprintf(s2, "&#%d;%n", c, &len2);
         s2 += len2;
@@ -1878,7 +1921,7 @@
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1888,9 +1931,24 @@
       uw_write_unsafe(ctx, "&amp;");
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         uw_writec_unsafe(ctx, c);
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        uw_writec_unsafe(ctx, s[3]);
+        s += 3;
+      } else {
         uw_write_unsafe(ctx, "&#");
         uw_Basis_attrifyInt_w_unsafe(ctx, c);
         uw_writec_unsafe(ctx, ';');
--- a/src/mono_opt.sml	Thu Dec 03 11:20:13 2009 -0500
+++ b/src/mono_opt.sml	Thu Dec 03 11:50:51 2009 -0500
@@ -45,6 +45,37 @@
     else
         Real.toString n
 
+fun attrifyString s =  (* Escape s: double quote and ampersand become entities; multi-byte sequences shaped like UTF-8 are copied as-is. *)
+    let
+        fun hs (pos, acc) =  (* pos: index of the next char to scan; acc: output pieces, most recent first *)
+            if pos >= size s then
+                String.concat (rev acc)  (* acc was built in reverse, so flip it before joining *)
+            else
+                case String.sub (s, pos) of
+                    #"\"" => hs (pos+1, "&quot;" :: acc)
+                  | #"&" => hs (pos+1, "&amp;" :: acc)
+                  | ch =>
+                    let
+                        val n = ord ch
+                        fun isCont k = pos + k < size s  (* true iff the char k places ahead exists and its code is in 128..191 *)
+                                       andalso ord (String.sub (s, pos + k)) div 64 = 2
+                        fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)  (* copy k+1 chars verbatim and skip past them *)
+                    in
+                        if Char.isPrint ch orelse Char.isSpace ch then
+                            hs (pos+1, str ch :: acc)
+                        else if n div 32 = 6 andalso isCont 1 then  (* n in 192..223: lead-byte pattern 110xxxxx *)
+                            unicode 1
+                        else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then  (* n in 224..239: lead-byte pattern 1110xxxx *)
+                            unicode 2
+                        else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then  (* n in 240..247: lead-byte pattern 11110xxx *)
+                            unicode 3
+                        else
+                            hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc)  (* anything else: decimal character reference for this one char *)
+                    end
+    in
+        hs (0, [])
+    end
+
 fun attrifyChar ch =
     case ch of
         #"\"" => "&quot;"
@@ -54,8 +85,6 @@
               else
                   "&#" ^ Int.toString (ord ch) ^ ";"
 
-val attrifyString = String.translate attrifyChar
-
 val urlifyInt = attrifyInt
 val urlifyFloat = attrifyFloat
 
@@ -78,7 +107,7 @@
                                        andalso ord (String.sub (s, pos + k)) div 64 = 2
                         fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc)
                     in
-                        if Char.isPrint ch orelse Char.isSpace ch then
+                        if Char.isPrint ch then
                             hs (pos+1, str ch :: acc)
                         else if n div 32 = 6 andalso isCont 1 then
                             unicode 1
--- a/src/prim.sml	Thu Dec 03 11:20:13 2009 -0500
+++ b/src/prim.sml	Thu Dec 03 11:50:51 2009 -0500
@@ -74,10 +74,20 @@
     else
         str ch ^ pad (n-1, ch, s)
 
-val gccify = String.translate (fn ch => if Char.isPrint ch then
-                                            str ch
-                                        else
-                                            "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))
+val gccify = String.translate (fn ch =>  (* per-char escaping; presumably for text emitted inside a C string literal - confirm at call sites *)
+                                  case ch of
+                                      #"\"" => "\\\""
+                                    | #"\\" => "\\\\"
+                                    | #"'" => "\\'"
+                                    | #"\n" => "\\n"
+                                    | #"\r" => "\\r"
+                                    | #"\t" => "\\t"
+                                    | #" " => " "
+                                    | _ =>
+                                      if Char.isPrint ch then
+                                          str ch
+                                      else
+                                          "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch)))  (* backslash + octal char code; assumes pad left-pads with zeros to 3 digits - confirm against pad *)
 
 fun p_t_GCC t =
     case t of