changeset 763:af41ec2f302a

Lexing character entities
author Adam Chlipala <adamc@hcoop.net>
date Thu, 30 Apr 2009 16:25:27 -0400 (2009-04-30)
parents 9021d44ba6b2
children 7f653298dd66
files src/urweb.lex tests/ent.ur tests/ent.urp tests/ent.urs
diffstat 4 files changed, 60 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/urweb.lex	Thu Apr 30 15:10:13 2009 -0400
+++ b/src/urweb.lex	Thu Apr 30 16:25:27 2009 -0400
@@ -1,4 +1,6 @@
-(* Copyright (c) 2008, Adam Chlipala
+(* -*- mode: sml-lex -*- *)
+
+(* Copyright (c) 2008-2009, Adam Chlipala
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -106,6 +108,55 @@
 		     xmlString := false)
 
 
+fun unescape loc s =
+    let
+        fun process (s, acc) =
+            let
+                val (befor, after) = Substring.splitl (fn ch => ch <> #"&") s
+            in
+                if Substring.size after = 0 then
+                    Substring.concat (rev (s :: acc))
+                else
+                    let
+                        val after = Substring.slice (after, 1, NONE)
+                        val (befor', after') = Substring.splitl (fn ch => ch <> #";") after
+                    in
+                        if Substring.size after' = 0 then
+                            (ErrorMsg.errorAt' loc "Missing ';' after '&'";
+                             "")
+                        else
+                            let
+                                val pre = befor
+                                val code = befor'
+                                val s = Substring.slice (after', 1, NONE)
+
+                                val special =
+                                    if Substring.size code > 0 andalso Substring.sub (code, 0) = #"#"
+                                       andalso CharVectorSlice.all Char.isDigit (Substring.slice (code, 1, NONE)) then
+                                        let
+                                            val code = Substring.string (Substring.slice (code, 1, NONE))
+                                        in
+                                            Option.map chr (Int.fromString code)
+                                        end
+                                    else case Substring.string code of
+                                             "amp" => SOME #"&"
+                                           | "lt" => SOME #"<"
+                                           | "gt" => SOME #">"
+                                           | "quot" => SOME #"\""
+                                           | _ => NONE
+                            in
+                                case special of
+                                    NONE => (ErrorMsg.errorAt' loc ("Unsupported XML character entity "
+                                                                        ^ Substring.string code);
+                                             "")
+                                  | SOME ch => process (s, Substring.full (String.str ch) :: pre :: acc)
+                            end
+                    end
+            end
+    in
+        process (Substring.full s, [])
+    end
+
 %%
 %header (functor UrwebLexFn(structure Tokens : Urweb_TOKENS));
 %full
@@ -231,7 +282,7 @@
 			  pushLevel (fn () => YYBEGIN XML);
 			  Tokens.LBRACE (yypos, yypos + 1));
 
-<XML> {notags}        => (Tokens.NOTAGS (yytext, yypos, yypos + size yytext));
+<XML> {notags}        => (Tokens.NOTAGS (unescape (yypos, yypos + size yytext) yytext, yypos, yypos + size yytext));
 
 <XML> .               => (ErrorMsg.errorAt' (yypos, yypos)
                           ("illegal XML character: \"" ^ yytext ^ "\"");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/ent.ur	Thu Apr 30 16:25:27 2009 -0400
@@ -0,0 +1,3 @@
+fun main () = return <xml><body>
+  &lt;Whoa-hoa!&gt;  A&#66;CD!
+</body></xml>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/ent.urp	Thu Apr 30 16:25:27 2009 -0400
@@ -0,0 +1,3 @@
+debug
+
+ent
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/ent.urs	Thu Apr 30 16:25:27 2009 -0400
@@ -0,0 +1,1 @@
+val main : unit -> transaction page