# HG changeset patch # User Adam Chlipala # Date 1259859051 18000 # Node ID b06a2a65e670230a442be43c60368615ad1dcd40 # Parent 4eb1c4a1b05735ad61d1e242daa1653c88600d86 UTF-8 in dynamic escaping diff -r 4eb1c4a1b057 -r b06a2a65e670 src/c/urweb.c --- a/src/c/urweb.c Thu Dec 03 11:20:13 2009 -0500 +++ b/src/c/urweb.c Thu Dec 03 11:50:51 2009 -0500 @@ -1410,6 +1410,10 @@ return result; } +static int isCont(unsigned char ch) { + return ch / 64 == 2; +} + char *uw_Basis_attrifyString(uw_context ctx, uw_Basis_string s) { int len = strlen(s); char *result, *p; @@ -1418,7 +1422,7 @@ result = p = ctx->heap.front; for (; *s; s++) { - char c = *s; + unsigned char c = *s; if (c == '"') { strcpy(p, """); @@ -1429,7 +1433,19 @@ } else if (isprint(c)) *p++ = c; - else { + else if (c / 32 == 6 && isCont(s[1])) { + memcpy(p, s, 2); + p += 2; + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + memcpy(p, s, 3); + p += 3; + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + memcpy(p, s, 4); + p += 4; + s += 3; + } else { int len2; sprintf(p, "&#%d;%n", c, &len2); p += len2; @@ -1499,7 +1515,7 @@ uw_check(ctx, strlen(s) * 6); for (; *s; s++) { - char c = *s; + unsigned char c = *s; if (c == '"') uw_write_unsafe(ctx, """); @@ -1507,7 +1523,22 @@ uw_write_unsafe(ctx, "&"); else if (isprint(c)) uw_writec_unsafe(ctx, c); - else { + else if (c / 32 == 6 && isCont(s[1])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + uw_writec_unsafe(ctx, s[3]); + s += 3; + } else { uw_write_unsafe(ctx, "&#"); uw_Basis_attrifyInt_w_unsafe(ctx, c); uw_writec_unsafe(ctx, ';'); @@ -1847,7 +1878,7 @@ uw_check_heap(ctx, strlen(s) * 5 + 1); for (r = s2 = ctx->heap.front; *s; s++) { - char c = *s; + unsigned char c = *s; switch (c) { case '<': @@ -1859,9 +1890,21 @@ s2 += 5; break; default: - if (isprint(c)) + if (isprint(c) || isspace(c)) *s2++ = c; - else { + else if (c / 32 == 6 && isCont(s[1])) { + memcpy(s2, s, 2); + s2 += 2; + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + memcpy(s2, s, 3); + s2 += 3; + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + memcpy(s2, s, 4); + s2 += 4; + s += 3; + } else { int len2; sprintf(s2, "&#%d;%n", c, &len2); s2 += len2; @@ -1878,7 +1921,7 @@ uw_check(ctx, strlen(s) * 6); for (; *s; s++) { - char c = *s; + unsigned char c = *s; switch (c) { case '<': @@ -1888,9 +1931,24 @@ uw_write_unsafe(ctx, "&"); break; default: - if (isprint(c)) + if (isprint(c) || isspace(c)) uw_writec_unsafe(ctx, c); - else { + else if (c / 32 == 6 && isCont(s[1])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + ++s; + } else if (c / 16 == 14 && isCont(s[1]) && isCont(s[2])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + s += 2; + } else if (c / 8 == 30 && isCont(s[1]) && isCont(s[2]) && isCont(s[3])) { + uw_writec_unsafe(ctx, c); + uw_writec_unsafe(ctx, s[1]); + uw_writec_unsafe(ctx, s[2]); + uw_writec_unsafe(ctx, s[3]); + s += 3; + } else { uw_write_unsafe(ctx, "&#"); uw_Basis_attrifyInt_w_unsafe(ctx, c); uw_writec_unsafe(ctx, ';'); diff -r 4eb1c4a1b057 -r b06a2a65e670 src/mono_opt.sml --- a/src/mono_opt.sml Thu Dec 03 11:20:13 2009 -0500 +++ b/src/mono_opt.sml Thu Dec 03 11:50:51 2009 -0500 @@ -45,6 +45,37 @@ else Real.toString n +fun attrifyString s = + let + fun hs (pos, acc) = + if pos >= size s then + String.concat (rev acc) + else + case String.sub (s, pos) of + #"\"" => hs (pos+1, """ :: acc) + | #"&" => hs (pos+1, "&" :: acc) + | ch => + let + val n = ord ch + fun isCont k = pos + k < size s + andalso ord (String.sub (s, pos + k)) div 64 = 2 + fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc) + in + if Char.isPrint ch orelse Char.isSpace ch then + hs (pos+1, str ch :: acc) + else if n div 32 = 6 andalso isCont 1 then + unicode 1 + else if n div 16 = 14 andalso isCont 1 andalso isCont 2 then + unicode 2 + else if n div 8 = 30 andalso isCont 1 andalso isCont 2 andalso isCont 3 then + unicode 3 + else + hs (pos+1, "&#" ^ Int.toString (ord ch) ^ ";" :: acc) + end + in + hs (0, []) + end + fun attrifyChar ch = case ch of #"\"" => """ @@ -54,8 +85,6 @@ else "&#" ^ Int.toString (ord ch) ^ ";" -val attrifyString = String.translate attrifyChar - val urlifyInt = attrifyInt val urlifyFloat = attrifyFloat @@ -78,7 +107,7 @@ andalso ord (String.sub (s, pos + k)) div 64 = 2 fun unicode k = hs (pos+k+1, String.substring (s, pos, k+1) :: acc) in - if Char.isPrint ch orelse Char.isSpace ch then + if Char.isPrint ch then hs (pos+1, str ch :: acc) else if n div 32 = 6 andalso isCont 1 then unicode 1 diff -r 4eb1c4a1b057 -r b06a2a65e670 src/prim.sml --- a/src/prim.sml Thu Dec 03 11:20:13 2009 -0500 +++ b/src/prim.sml Thu Dec 03 11:50:51 2009 -0500 @@ -74,10 +74,20 @@ else str ch ^ pad (n-1, ch, s) -val gccify = String.translate (fn ch => if Char.isPrint ch then - str ch - else - "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch))) +val gccify = String.translate (fn ch => + case ch of + #"\"" => "\\\"" + | #"\\" => "\\\\" + | #"'" => "\\'" + | #"\n" => "\\n" + | #"\r" => "\\r" + | #"\t" => "\\t" + | #" " => " " + | _ => + if Char.isPrint ch then + str ch + else + "\\" ^ pad (3, #"0", Int.fmt StringCvt.OCT (ord ch))) fun p_t_GCC t = case t of