# HG changeset patch # User Adam Chlipala # Date 1292423266 18000 # Node ID 8eaaca74a64c300099c13bc867809297a3fd3842 # Parent ec4de8f848f8c78d84765dc80bf72c35357f367e Import HTML parser diff -r ec4de8f848f8 -r 8eaaca74a64c html.ur --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html.ur Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,144 @@ +open Parse + +con attribute = fn t => {Nam : string, + Parse : string -> option t} + +con tag = fn ts => {Nam : string, + Attributes : $(map attribute ts), + Folder : folder ts, + Construct : ctx ::: {Unit} -> [[Body] ~ ctx] => $ts + -> xml ([Body] ++ ctx) [] [] -> xml ([Body] ++ ctx) [] []} + +fun tag [use] [ignore] [use ~ ignore] (fl : folder use) (name : string) (attrs : $(map attribute use)) + (construct : ctx ::: {Unit} -> [[Body] ~ ctx] => Basis.tag (use ++ ignore) ([Body] ++ ctx) ([Body] ++ ctx) [] []) = + {Nam = name, + Attributes = attrs, + Folder = fl, + Construct = fn [ctx] [[Body] ~ ctx] (ats : $use) (inner : xml ([Body] ++ ctx) [] []) => + Basis.tag None ats construct inner} + +fun simpleTag [ignore] name (bt : bodyTag ignore) : tag [] = + @@tag [[]] [ignore] ! _ name {} (fn [ctx] [[Body] ~ ctx] => bt ()) + +fun simpleTag' [use] [ignore] [use ~ ignore] (fl : folder use) + name (bt : bodyTag (use ++ ignore)) (ats : $(map attribute use)) : tag use = + @@tag [use] [ignore] ! fl name ats (fn [ctx] [[Body] ~ ctx] => bt ()) + +fun url name = {Nam = name, + Parse = checkUrl} + +datatype error a = + Good of a + | Bad of string + +fun format [tags] (fl : folder tags) (tags : $(map tag tags)) [ctx] [[Body] ~ ctx] s = + let + fun loop s : error (xml ([Body] ++ ctx) [] [] * string) = + case String.msplit {Haystack = s, Needle = "&<"} of + None => Good (cdata s, "") + | Some (pre, ch, post) => + case ch of + #"&" => + (case String.split post #";" of + None => Bad "No ';' after '&'" + | Some (code, post) => + let + val xml = + case code of + "lt" => < + | "gt" => > + | "amp" => & + | _ => + in + case loop post of + Good (after, post) => Good ({[pre]}{xml}{after}, post) + | x => x + end) + | _ => + if String.length post > 0 && String.sub post 0 = #"/" then + case String.split post #"\x3E" of + None => Bad "No '>' after ' Good ({[pre]}, post) + else + case String.msplit {Haystack = post, Needle = " >"} of + None => Bad "No '>' after '<'" + | Some (tname, ch, post) => + @foldR [tag] [fn _ => unit -> error (xml ([Body] ++ ctx) [] [] * string)] + (fn [nm :: Name] [ts :: {Type}] [r :: {{Type}}] [[nm] ~ r] (meta : tag ts) acc () => + if meta.Nam = tname then + let + fun doAttrs (ch, post, ats : $(map option ts)) = + if String.length post > 0 && Char.isSpace (String.sub post 0) then + doAttrs (ch, String.substring post {Start = 1, + Len = String.length post - 1}, + ats) + else + case ch of + #"\x3E" => Good (ats, post) + | _ => + case String.split post #"=" of + None => + (case String.split post #"\x3E" of + None => Bad "No tag ender '\x3E'" + | Some (_, post) => Good (ats, post)) + | Some (aname, post) => + if String.length post >= 1 && String.sub post 0 = #"\"" then + case String.split (String.substring post + {Start = 1, + Len = String.length post + - 1}) + #"\"" of + None => Bad "No '\"' to end attribute value" + | Some (aval, post) => + let + val ats = + @map2 [attribute] [option] [option] + (fn [t] meta v => + if aname = meta.Nam then + meta.Parse aval + else + v) + meta.Folder meta.Attributes ats + in + doAttrs (#" ", post, ats) + end + else + Bad "Attribute value doesn't begin with quote" + in + case doAttrs (ch, post, @map0 [option] (fn [t :: Type] => None) + meta.Folder) of + Good (ats, post) => + let + val ats = + @map2 [attribute] [option] [id] + (fn [t] meta v => + case v of + None => error Missing attribute {[meta.Nam]} + for {[tname]} + | Some v => v) + meta.Folder meta.Attributes ats + in + case loop post of + Good (inner, post) => + (case loop post of + Good (after, post) => + Good ({[pre]}{meta.Construct [ctx] ! + ats inner}{after}, post) + | x => x) + | x => x + end + | Bad s => Bad s + end + else + acc ()) + (fn () => Bad ("Unknown HTML tag " ^ tname)) fl tags () + in + case loop s of + Bad msg => Failure msg + | Good (xml, _) => Success xml + end + +val b = simpleTag "b" @@b +val i = simpleTag "i" @@i +val a = simpleTag' "a" @@a {Href = url "href"} + diff -r ec4de8f848f8 -r 8eaaca74a64c html.urs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html.urs Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,29 @@ +(** Safe HTML parsing *) + +con attribute = fn t => {Nam : string, + Parse : string -> option t} + +con tag = fn ts => {Nam : string, + Attributes : $(map attribute ts), + Folder : folder ts, + Construct : ctx ::: {Unit} -> [[Body] ~ ctx] => $ts + -> xml ([Body] ++ ctx) [] [] -> xml ([Body] ++ ctx) [] []} + +val tag : use ::: {Type} -> ignore ::: {Type} -> [use ~ ignore] => folder use -> string + -> $(map attribute use) + -> (ctx ::: {Unit} -> [[Body] ~ ctx] => Basis.tag (use ++ ignore) ([Body] ++ ctx) ([Body] ++ ctx) [] []) + -> tag use + +val simpleTag : ignore ::: {Type} -> string -> bodyTag ignore -> tag [] +val simpleTag' : use ::: {Type} -> ignore ::: {Type} -> [use ~ ignore] => folder use + -> string -> bodyTag (use ++ ignore) -> $(map attribute use) -> tag use + +val url : string -> attribute url + +val format : tags ::: {{Type}} -> folder tags -> $(map tag tags) + -> ctx ::: {Unit} -> [[Body] ~ ctx] => string + -> Parse.parse (xml ([Body] ++ ctx) [] []) + +val b : tag [] +val i : tag [] +val a : tag [Href = url] diff -r ec4de8f848f8 -r 8eaaca74a64c lib.urp --- a/lib.urp Tue Dec 14 10:54:26 2010 -0500 +++ b/lib.urp Wed Dec 15 09:27:46 2010 -0500 @@ -7,3 +7,5 @@ eq variant sql +parse +html diff -r ec4de8f848f8 -r 8eaaca74a64c parse.ur --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parse.ur Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,5 @@ +(** Datatypes for describing parse results *) + +datatype parse a = + Success of a + | Failure of string diff -r ec4de8f848f8 -r 8eaaca74a64c tests/testHtml.ur --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/testHtml.ur Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,17 @@ +open Parse +open Html + +val parser = format (b, i, a) + +fun parse r = + case parser r.Source of + Failure s => error Bad HTML: {[s]} + | Success x => return +

Here it is:

+ + {x} +
+ +fun main () = return +
+
diff -r ec4de8f848f8 -r 8eaaca74a64c tests/testHtml.urp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/testHtml.urp Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,5 @@ +library .. +rewrite all TestHtml/* +allow url http://en.wikipedia.org/* + +testHtml diff -r ec4de8f848f8 -r 8eaaca74a64c tests/testHtml.urs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/testHtml.urs Wed Dec 15 09:27:46 2010 -0500 @@ -0,0 +1,1 @@ +val main : {} -> transaction page