Mercurial > urweb
comparison src/urweb.lex @ 763:af41ec2f302a
Lexing character entities
author | Adam Chlipala <adamc@hcoop.net> |
---|---|
date | Thu, 30 Apr 2009 16:25:27 -0400 |
parents | 8688e01ae469 |
children | 395a5d450cc0 |
comparison
equal
deleted
inserted
replaced
762:9021d44ba6b2 | 763:af41ec2f302a |
---|---|
1 (* Copyright (c) 2008, Adam Chlipala | 1 (* -*- mode: sml-lex -*- *) |
2 | |
3 (* Copyright (c) 2008-2009, Adam Chlipala | |
2 * All rights reserved. | 4 * All rights reserved. |
3 * | 5 * |
4 * Redistribution and use in source and binary forms, with or without | 6 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions are met: | 7 * modification, are permitted provided that the following conditions are met: |
6 * | 8 * |
103 | _ => () | 105 | _ => () |
104 | 106 |
105 fun initialize () = (xmlTag := []; | 107 fun initialize () = (xmlTag := []; |
106 xmlString := false) | 108 xmlString := false) |
107 | 109 |
110 | |
111 fun unescape loc s = | |
112 let | |
113 fun process (s, acc) = | |
114 let | |
115 val (befor, after) = Substring.splitl (fn ch => ch <> #"&") s | |
116 in | |
117 if Substring.size after = 0 then | |
118 Substring.concat (rev (s :: acc)) | |
119 else | |
120 let | |
121 val after = Substring.slice (after, 1, NONE) | |
122 val (befor', after') = Substring.splitl (fn ch => ch <> #";") after | |
123 in | |
124 if Substring.size after' = 0 then | |
125 (ErrorMsg.errorAt' loc "Missing ';' after '&'"; | |
126 "") | |
127 else | |
128 let | |
129 val pre = befor | |
130 val code = befor' | |
131 val s = Substring.slice (after', 1, NONE) | |
132 | |
133 val special = | |
134 if Substring.size code > 0 andalso Substring.sub (code, 0) = #"#" | |
135 andalso CharVectorSlice.all Char.isDigit (Substring.slice (code, 1, NONE)) then | |
136 let | |
137 val code = Substring.string (Substring.slice (code, 1, NONE)) | |
138 in | |
139 Option.map chr (Int.fromString code) | |
140 end | |
141 else case Substring.string code of | |
142 "amp" => SOME #"&" | |
143 | "lt" => SOME #"<" | |
144 | "gt" => SOME #">" | |
145 | "quot" => SOME #"\"" | |
146 | _ => NONE | |
147 in | |
148 case special of | |
149 NONE => (ErrorMsg.errorAt' loc ("Unsupported XML character entity " | |
150 ^ Substring.string code); | |
151 "") | |
152 | SOME ch => process (s, Substring.full (String.str ch) :: pre :: acc) | |
153 end | |
154 end | |
155 end | |
156 in | |
157 process (Substring.full s, []) | |
158 end | |
108 | 159 |
109 %% | 160 %% |
110 %header (functor UrwebLexFn(structure Tokens : Urweb_TOKENS)); | 161 %header (functor UrwebLexFn(structure Tokens : Urweb_TOKENS)); |
111 %full | 162 %full |
112 %s COMMENT STRING XML XMLTAG; | 163 %s COMMENT STRING XML XMLTAG; |
229 | 280 |
230 <XML> "{" => (YYBEGIN INITIAL; | 281 <XML> "{" => (YYBEGIN INITIAL; |
231 pushLevel (fn () => YYBEGIN XML); | 282 pushLevel (fn () => YYBEGIN XML); |
232 Tokens.LBRACE (yypos, yypos + 1)); | 283 Tokens.LBRACE (yypos, yypos + 1)); |
233 | 284 |
234 <XML> {notags} => (Tokens.NOTAGS (yytext, yypos, yypos + size yytext)); | 285 <XML> {notags} => (Tokens.NOTAGS (unescape (yypos, yypos + size yytext) yytext, yypos, yypos + size yytext)); |
235 | 286 |
236 <XML> . => (ErrorMsg.errorAt' (yypos, yypos) | 287 <XML> . => (ErrorMsg.errorAt' (yypos, yypos) |
237 ("illegal XML character: \"" ^ yytext ^ "\""); | 288 ("illegal XML character: \"" ^ yytext ^ "\""); |
238 continue ()); | 289 continue ()); |
239 | 290 |