diff src/c/urweb.c @ 1054:b06a2a65e670

UTF-8 in dynamic escaping
author Adam Chlipala <adamc@hcoop.net>
date Thu, 03 Dec 2009 11:50:51 -0500
parents a8a825861397
children 118a5a08a881
line wrap: on
line diff
--- a/src/c/urweb.c	Thu Dec 03 11:20:13 2009 -0500
+++ b/src/c/urweb.c	Thu Dec 03 11:50:51 2009 -0500
@@ -1410,6 +1410,10 @@
   return result;
 }
 
+static int isCont(unsigned char ch) {  /* Returns nonzero iff ch lies in 128..191, i.e. has the bit pattern 10xxxxxx that UTF-8 uses for continuation bytes. */
+  return ch / 64 == 2;  /* for an 8-bit byte, ch / 64 is the value of its top two bits; 2 is binary 10 */
+}
+
 char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) {
   int len = strlen(s);
   char *result, *p;
@@ -1418,7 +1422,7 @@
   result = p = ctx->heap.front;
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"') {
       strcpy(p, "&quot;");
@@ -1429,7 +1433,19 @@
     }
     else if (isprint(c))
       *p++ = c;
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      memcpy(p, s, 2);
+      p += 2;
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      memcpy(p, s, 3);
+      p += 3;
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      memcpy(p, s, 4);
+      p += 4;
+      s += 3;
+    } else {
       int len2;
       sprintf(p, "&#%d;%n", c, &len2);
       p += len2;
@@ -1499,7 +1515,7 @@
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     if (c == '"')
       uw_write_unsafe(ctx, "&quot;");
@@ -1507,7 +1523,22 @@
       uw_write_unsafe(ctx, "&amp;");
     else if (isprint(c))
       uw_writec_unsafe(ctx, c);
-    else {
+    else if (c / 32 == 6 && isCont(s[1])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      ++s;
+    } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      s += 2;
+    } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+      uw_writec_unsafe(ctx, c);
+      uw_writec_unsafe(ctx, s[1]);
+      uw_writec_unsafe(ctx, s[2]);
+      uw_writec_unsafe(ctx, s[3]);
+      s += 3;
+    } else {
       uw_write_unsafe(ctx, "&#");
       uw_Basis_attrifyInt_w_unsafe(ctx, c);
       uw_writec_unsafe(ctx, ';');
@@ -1847,7 +1878,7 @@
   uw_check_heap(ctx, strlen(s) * 5 + 1);
 
   for (r = s2 = ctx->heap.front; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1859,9 +1890,21 @@
       s2 += 5;
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         *s2++ = c;
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        memcpy(s2, s, 2);
+        s2 += 2;
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        memcpy(s2, s, 3);
+        s2 += 3;
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        memcpy(s2, s, 4);
+        s2 += 4;
+        s += 3;
+      } else {
         int len2;
         sprintf(s2, "&#%d;%n", c, &len2);
         s2 += len2;
@@ -1878,7 +1921,7 @@
   uw_check(ctx, strlen(s) * 6);
 
   for (; *s; s++) {
-    char c = *s;
+    unsigned char c = *s;
 
     switch (c) {
     case '<':
@@ -1888,9 +1931,24 @@
       uw_write_unsafe(ctx, "&amp;");
       break;
     default:
-      if (isprint(c))
+      if (isprint(c) || isspace(c))
         uw_writec_unsafe(ctx, c);
-      else {
+      else if (c / 32 == 6 && isCont(s[1])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        ++s;
+      } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        s += 2;
+      } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) {
+        uw_writec_unsafe(ctx, c);
+        uw_writec_unsafe(ctx, s[1]);
+        uw_writec_unsafe(ctx, s[2]);
+        uw_writec_unsafe(ctx, s[3]);
+        s += 3;
+      } else {
         uw_write_unsafe(ctx, "&#");
         uw_Basis_attrifyInt_w_unsafe(ctx, c);
         uw_writec_unsafe(ctx, ';');