isabelle-hacks/fxp/src/Parser/entities.sml

571 lines
27 KiB
Standard ML

(*--------------------------------------------------------------------------*)
(* Structure: Entities *)
(* *)
(* Exceptions raised by functions in this structure: *)
(* closeAll : none *)
(* getChar : none *)
(* getEntId : none *)
(* getPos : none *)
(* inInternalSubset : none *)
(* isOpenEntity : none *)
(* isSpecialEnd : none *)
(* Position2String : none *)
(* pushDummy : none *)
(* pushExtern : NoSuchFile *)
(* pushIntern : none *)
(* pushSpecial : NoSuchFile *)
(* statePos : none *)
(*--------------------------------------------------------------------------*)
(* This module maintains the entity stack. For each open entity it holds a *)
(* buffer to read characters from. When the buffer is exceeded, it gets re- *)
(* filled with new characters, depending on the entity's encoding. *)
(* *)
(****************************************************************************)
(* End-of-line handling as specified in 2.11 is performed: *)
(* in XML 1.0 ***************************************************************)
(****************************************************************************)
(* ... To simplify the tasks of applications, wherever an external parsed *)
(* entity or the literal entity value of an internal parsed entity *)
(* contains either the literal two-character sequence "#xD#xA" or a *)
(* standalone literal #xD, an XML processor must pass to the application *)
(* the single character #xA. *)
(* (This behavior can conveniently be produced by normalizing all line *)
(* breaks to #xA on input, before parsing.) *)
(****************************************************************************)
(* in XML 1.1 ***************************************************************)
(****************************************************************************)
(* To simplify the tasks of applications, the XML processor MUST behave *)
(* as if it normalized all line breaks in external parsed entities *)
(* (including the document entity) on input, before parsing, *)
(* by translating all of the following to a single #xA character: *)
(* 1. the two-character sequence #xD #xA *)
(* 2. the two-character sequence #xD #x85 *)
(* 3. the single character #x85 *)
(* 4. the single character #x2028 *)
(* 5. any #xD character that is not immediately followed by #xA or #x85.*)
(* *)
(* The characters #x85 and #x2028 cannot be reliably recognized and *)
(* translated until an entity's encoding declaration (if present) has *)
(* been read. TheUnsynchronized.refore, it is a fatal error to use them within the XML *)
(* declaration or text declaration. *)
(****************************************************************************)
(* *)
(* It also checks for illegal characters, cf. 2.2: *)
(* *)
(* [2] Char ::= #x9 | #xA | #xD /* any Unicode character, *)
(* | [#x20-#xD7FF] excluding the surrogate *)
(* | [#xE000-#xFFFD] blocks, FFFE, and FFFF. */ *)
(* | [#x10000-#x10FFFF] *)
(* *)
(* More precisely, it assumes that all decoded characters are valid Unicode *)
(* characters. It thus only checks for control characters other than #x9, *)
(* #xA or #xD. *)
(*--------------------------------------------------------------------------*)
signature Entities =
sig
include Hooks
type State
eqtype EntId
datatype Special = DOC_ENTITY | EXT_SUBSET
exception CantOpenFile of (string * string) * AppData
val pushIntern : State * int * bool * UniChar.Vector -> State
val pushExtern : State * int * bool * Uri.Uri -> State * Encoding.Encoding
val pushSpecial : Special * Uri.Uri option -> State * Encoding.Encoding
val closeAll : State -> unit
val commitAuto : AppData * State -> AppData * State
val changeAuto : AppData * State * string -> AppData * State * Encoding.Encoding
val getEntId : State -> EntId
val getPos : State -> Errors.Position
val getUri : State -> Uri.Uri
val getChar : AppData * State -> UniChar.Char * AppData * State
val getChar11 : AppData * State -> UniChar.Char * AppData * State
val getCharRef : (AppData * State -> UniChar.Char * AppData * State) Unsynchronized.ref
val ungetChars : State * UniChar.Data -> State
val isOpen : int * bool * State -> bool
val isSpecial : State -> bool
val inDocEntity : State -> bool
end
functor Entities (structure Hooks : Hooks) : Entities =
struct
(*
structure Entities0=Entities0(structure Hooks = Hooks)
open Entities0
*)
open UniChar Decode Decode.Error Errors Hooks Uri UtilError
val THIS_MODULE = "Entities"
val BUFSIZE = 1024
type CharBuffer = UniChar.Char array
(*--------------------------------------------------------------------*)
(* A special entity can not be popped from the stack by getChar, so *)
(* it must be popped explicitly. This is for the document entity and *)
(* the external subset. *)
(*--------------------------------------------------------------------*)
datatype Special = DOC_ENTITY | EXT_SUBSET
(*--------------------------------------------------------------------*)
(* In order to distinguish a general entity from a paramter entity, *)
(* entity idxs are marked with this datatype. *)
(*--------------------------------------------------------------------*)
datatype EntId = GENERAL of int | PARAMETER of int
(*--------------------------------------------------------------------*)
(* Make an EntId from the entity's index. *)
(*--------------------------------------------------------------------*)
fun makeEntId(idx,isParam) =
if isParam then PARAMETER idx else GENERAL idx
(*--------------------------------------------------------------------*)
(* A non-empty stack is: *)
(* *)
(* an internal entity INT(buf,size,idx,(id,other)): *)
(* - (vec,idx,size) is a buffer,current index and its size; *)
(* - id is the index of the entity's name in the entity table. *)
(* - other contains the underlying entities (the rest of the stack). *)
(* The components are nested according to access frequency. *)
(* *)
(* an external entity has three forms: *)
(* EXT2(buf,size,idx,line,col,break,(dec,err,typ)) *)
(* - (buf,size,idx) is a buffer, its size and current index; *)
(* - (line,col) are the line and column; *)
(* - break is a boolean indicating whether the last character was a *)
(* carriage return (0xD) (then a succeeding line feed (0xA) must be *)
(* supressed); *)
(* - err is an option: if it is SOME(f,ee,err) then it indicates that *)
(* the array was finished by a decoding error err, with the basic *)
(* file f; f was at end of file if ee is true. Otherwise there was *)
(* no error when loading the array. *)
(* - dec describies the encoding of the entity and thus, how more *)
(* data can be loaded; *)
(* - typ is either of the form SPECIAL spec indicating a special *)
(* entity; then this is the only entity on the stack. Otherwise it *)
(* is NORMAL(id,other) for a normal external entity, with: *)
(* + id is the index of the entity's name in the DTD; *)
(* + other is the underlying stack. *)
(* The components are nested according to access frequency. *)
(* *)
(* The second form of an external entity is *)
(* EXT1(dec,line,col,break,typ). This is an unbuffered *)
(* entity whose encoding declaration is being read. We may not load *)
(* an array of characters as a whole because the encoding might still *)
(* change. The components have the same meaning as for EXT2. *)
(* *)
(* A closed entity remains on the stack until the next getChar, for *)
(* purposes of error printing. A closed external entity has the form *)
(* CLOSED(dec,l,col,typ); components have the same meaning *)
(* as for open external entities. A closed internal entity has the *)
(* form ENDED(id,other) with components as above. *)
(* *)
(* Sometimes (for parsing xml/decl declarations) we need a lookahead. *)
(* LOOKING(cs,q) is a state remembering all chars cs looked ahead up *)
(* to state q, in reverse order. LOOKED(cs,q) is an undone lookahead, *)
(* the looked-ahead chars now in the right order. *)
(*--------------------------------------------------------------------*)
datatype ExtType = SPECIAL of Special | NORMAL of EntId * State
and State =
LOOKED of Data * State
| ENDED of EntId * State
| CLOSED of DecFile * int * int * ExtType
| INT of Vector * int * int * (EntId * State)
| EXT1 of DecFile * int * int * bool * ExtType
| EXT2 of CharBuffer * int * int * int * int * bool
* (DecFile * DecodeError option * ExtType)
exception CantOpenFile of (string * string) * AppData
(*--------------------------------------------------------------------*)
(* Extract the unique number from a state. *)
(*--------------------------------------------------------------------*)
fun getExtEntId extType =
case extType
of SPECIAL DOC_ENTITY => GENERAL 0
| SPECIAL EXT_SUBSET => PARAMETER 0
| NORMAL(id,_) => id
fun getEntId q =
case q
of LOOKED (_,q) => getEntId q
| ENDED(id,_) => id
| CLOSED(_,_,_,extType) => getExtEntId extType
| INT(_,_,_,(id,_)) => id
| EXT1(_,_,_,_,extType) => getExtEntId extType
| EXT2(_,_,_,_,_,_,(_,_,extType)) => getExtEntId extType
(*--------------------------------------------------------------------*)
(* Find the nearest enclosing external entity, and return its *)
(* filename, line and column number. *)
(*--------------------------------------------------------------------*)
fun getPos q =
case q
of ENDED(_,other) => getPos other
| INT(_,_,_,(_,other)) => getPos other
| CLOSED(dec,l,col,_) => (decName dec,l,col)
| EXT1(dec,l,col,_,_) => (decName dec,l,col)
| EXT2(_,_,_,l,col,_,(dec,_,_)) => (decName dec,l,col)
| LOOKED (cs,q) => let val (f,l,c) = getPos q
val k = length cs
in if c>=k then (f,l,c-k) else (f,l,0)
end
(*--------------------------------------------------------------------*)
(* get the path of the nearest enclosing external entity. *)
(*--------------------------------------------------------------------*)
fun getUri q =
case q
of LOOKED (_,q) => getUri q
| ENDED(_,other) => getUri other
| INT(_,_,_,(_,other)) => getUri other
| CLOSED(dec,l,col,_) => decUri dec
| EXT1(dec,l,col,_,_) => decUri dec
| EXT2(_,_,_,l,col,_,(dec,_,_)) => decUri dec
(*--------------------------------------------------------------------*)
(* close all files, return nothing. *)
(*--------------------------------------------------------------------*)
fun closeAll q =
case q
of LOOKED(_,other) => closeAll other
| ENDED(_,other) => closeAll other
| CLOSED(_,_,_,SPECIAL _) => ()
| CLOSED(_,_,_,NORMAL(_,other)) => closeAll other
| INT(_,_,_,(_,other)) => closeAll other
| EXT1(dec,_,_,_,SPECIAL _) => ignore(decClose dec)
| EXT1(dec,_,_,_,NORMAL(_,other)) => (ignore (decClose dec); closeAll other)
| EXT2(_,_,_,_,_,_,(dec,_,SPECIAL _)) => ignore(decClose dec)
| EXT2(_,_,_,_,_,_,(dec,_,NORMAL(_,other))) => (ignore (decClose dec); closeAll other)
(*--------------------------------------------------------------------*)
(* is this entity already on the stack? *)
(*--------------------------------------------------------------------*)
fun isOpen (idx,isParam,q) =
let val id = makeEntId(idx,isParam)
fun doit q =
case q
of LOOKED (_,other) => doit other
| ENDED(id',other) => id=id' orelse doit other
| CLOSED(_,_,_,SPECIAL _) => false
| CLOSED(_,_,_,NORMAL(id',other)) => id=id' orelse doit other
| INT(_,_,_,(id',other)) => id=id' orelse doit other
| EXT1(_,_,_,_,SPECIAL _) => false
| EXT1(_,_,_,_,NORMAL(id',other)) => id=id' orelse doit other
| EXT2(_,_,_,_,_,_,(_,_,SPECIAL _)) => false
| EXT2(_,_,_,_,_,_,(_,_,NORMAL(id',other))) => id=id' orelse doit other
in doit q
end
(*--------------------------------------------------------------------*)
(* are we in the internal subset, i.e., in the document entity? *)
(* The internal subset can only be in the document entity, since no *)
(* parameter entities are declared prior to it. The document entity *)
(* is then the only entity on the stack. *)
(*--------------------------------------------------------------------*)
fun inDocEntity q =
case q
of LOOKED (_,q) => inDocEntity q
| ENDED(_,other) => inDocEntity other
| INT(_,_,_,(_,other)) => inDocEntity other
| CLOSED(_,_,_,NORMAL _) => false
| CLOSED(_,_,_,SPECIAL what) => what=DOC_ENTITY
| EXT1(_,_,_,_,NORMAL _) => false
| EXT1(_,_,_,_,SPECIAL what) => what=DOC_ENTITY
| EXT2(_,_,_,_,_,_,(_,_,NORMAL _)) => false
| EXT2(_,_,_,_,_,_,(_,_,SPECIAL what)) => what=DOC_ENTITY
(*--------------------------------------------------------------------*)
(* is this state the document end, i.e., are all entities closed? *)
(*--------------------------------------------------------------------*)
fun isSpecial q =
case q
of LOOKED (_,q) => isSpecial q
| CLOSED(_,_,_,SPECIAL _) => true
| EXT1(_,_,_,_,SPECIAL _) => true
| EXT2(_,_,_,_,_,_,(_,_,SPECIAL _)) => true
| _ => false
(*--------------------------------------------------------------------*)
(* Initialize and load a new buffer when opening an external entity. *)
(*--------------------------------------------------------------------*)
fun initArray dec =
let
val arr = Array.array(BUFSIZE,0wx0)
val (n,dec1,err) = decGetArray dec arr
in (arr,n,dec1,err)
end
(*--------------------------------------------------------------------*)
(* Open an external/internal entity. *)
(*--------------------------------------------------------------------*)
fun pushIntern(q,id,isParam,vec) =
INT(vec,Vector.length vec,0,(makeEntId(id,isParam),q))
fun pushExtern(q,id,isParam,uri) =
let
val dec = decOpenXml (SOME uri)
val auto = decEncoding dec
val q1 = EXT1(dec,1,0,false,NORMAL(makeEntId(id,isParam),q))
in (q1,auto)
end
fun pushSpecial(what,uri) =
let
val dec = decOpenXml uri
val auto = decEncoding dec
val q = EXT1(dec,1,0,false,SPECIAL what)
in (q,auto)
end
(*--------------------------------------------------------------------*)
(* confirm the autodetected encoding of an external entity. *)
(*--------------------------------------------------------------------*)
fun commitAuto(a,q) =
case q
of EXT1(dec,l,col,brk,typ) =>
let
val a1 = a before decCommit dec
handle DecError(_,_,err)
=> hookError(a,(getPos q,ERR_DECODE_ERROR err))
val (arr,n,dec1,err) = initArray dec
in (a1,EXT2(arr,n,0,l,col,brk,(dec1,err,typ)))
end
(*
in (a1,EXT1(dec,l,col,brk,typ))
end
*)
| LOOKED(cs,q1) => let val (a1,q2) = commitAuto (a,q1)
in (a1,LOOKED(cs,q2))
end
| CLOSED _ => (a,q)
| _ => raise InternalError(THIS_MODULE,"commitAuto",
"entity is neither EXT1 nor CLOSED nor LOOKED")
(*--------------------------------------------------------------------*)
(* change from the autodetected encoding to the declared one. *)
(*--------------------------------------------------------------------*)
fun changeAuto (a,q,decl) =
case q
of EXT1(dec,l,col,brk,typ) =>
let
val dec1 = decSwitch(dec,decl)
handle DecError(dec,_,err)
=> let val a1 = hookError(a,(getPos q,ERR_DECODE_ERROR err))
val _ = decClose dec
val uri = decName dec
val msg = case err
of ERR_UNSUPPORTED_ENC _ => "Unsupported encoding"
| _ => "Declared encoding incompatible"
^"with auto-detected encoding"
in raise CantOpenFile ((uri,msg),a1)
end
val newEnc = decEncoding dec1
val (arr,n,dec2,err) = initArray dec1
in (a,EXT2(arr,n,0,l,col,brk,(dec2,err,typ)),newEnc)
end
(*
in (a,EXT1(dec1,l,col,brk,typ),newEnc)
end
*)
| LOOKED(cs,q1) => let val (a2,q2,enc2) = changeAuto(a,q1,decl)
in (a2,LOOKED(cs,q2),enc2)
end
| CLOSED(dec,_,_,_) => (a,q,decEncoding dec)
| _ => raise InternalError(THIS_MODULE,"changeAuto",
"entity is neither EXT1 nor CLOSED nor LOOKED")
(*--------------------------------------------------------------------*)
(* Get one character from the current entity. Possibly reload buffer. *)
(* Return 0wx0 at entity end. Otherwise check whether the character *)
(* is valid (cf. 2.2). If the last character was a carriage return *)
(* (0xD) supress a line feed (0xA). *)
(*--------------------------------------------------------------------*)
fun getChar (a,q) =
case q
of ENDED(_,other) => getChar(a,other)
| CLOSED(_,_,_,typ) =>
(case typ
of SPECIAL _ => raise InternalError (THIS_MODULE,"getChar",
"attempt to read beyond special entity end")
| NORMAL(_,other) => getChar(a,other))
| INT(vec,s,i,io) =>
if i>=s then (0wx0,a,ENDED io)
else (Vector.sub(vec,i),a,INT(vec,s,i+1,io))
| EXT1(dec,l,col,br,typ) =>
(let
val (c,dec1) = decGetChar dec
in
if (* c>=0wx20 orelse c=0wx09 *)
c>=0wx0020
andalso (c<=0wxD7FF
orelse c>=0wxE000 andalso (c<=0wxFFFD
orelse c>=0wx10000))
orelse c=0wx9
then (c,a,EXT1(dec1,l,col+1,false,typ))
else
if c=0wxA
then if br then getChar(a,EXT1(dec1,l,col,false,typ))
else (c,a,EXT1(dec1,l+1,0,false,typ))
else (if c=0wxD then (0wxA,a,EXT1(dec1,l+1,0,true,typ))
else let val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
in getChar(a1,EXT1(dec1,l,col+1,false,typ))
end)
end
handle DecEof dec => (0wx0,a,CLOSED(dec,l,col,typ))
| DecError(dec,eof,err) =>
let val err = ERR_DECODE_ERROR err
val a1 = hookError(a,(getPos q,err))
in if eof then (0wx0,a,CLOSED(dec,l,col,typ))
else getChar(a1,EXT1(dec,col,l,br,typ))
end)
| EXT2(arr,s,i,l,col,br,det) =>
if i<s
then let val c = Array.sub(arr,i)
in if (* c>=0wx20 orelse c=0wx09 *)
(* c>=0wx0020 andalso c<=0wxD7FF orelse c=0wx9 orelse *)
(* c>=0wxE000 andalso c<=0wxFFFD orelse c>=0wx10000 *)
c>=0wx0020
andalso (c<=0wxD7FF
orelse c>=0wxE000 andalso (c<=0wxFFFD
orelse c>=0wx10000))
orelse c=0wx9
then (c,a,EXT2(arr,s,i+1,l,col+1,false,det))
else if c=0wxA
then if br then getChar(a,EXT2(arr,s,i+1,l,col,false,det))
else (c,a,EXT2(arr,s,i+1,l+1,0,false,det))
else (if c=0wxD then (0wxA,a,EXT2(arr,s,i+1,l+1,0,true,det))
else let val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
in getChar(a1,EXT2(arr,s,i+1,l,col+1,false,det))
end)
end
else let val (dec,err,typ) = det
val (a1,(n,dec1,err1)) =
case err
of NONE => if s=BUFSIZE then (a,decGetArray dec arr)
else (a,(0,dec,NONE))
| SOME err => (hookError(a,(getPos q,ERR_DECODE_ERROR err)),
decGetArray dec arr)
in if n=0 andalso not (isSome err1)
then (0wx0,a1,CLOSED(dec1,l,col,typ))
else getChar(a1,EXT2(arr,n,0,l,col,br,(dec1,err1,typ)))
end
| LOOKED(nil,q) => getChar(a,q)
| LOOKED(c::cs,q) => (c,a,LOOKED(cs,q))
fun getChar11 (a,q) =
case q
of ENDED(_,other) => getChar11(a,other)
| CLOSED(_,_,_,typ) =>
(case typ
of SPECIAL _ => raise InternalError (THIS_MODULE,"getChar11",
"attempt to read beyond special entity end")
| NORMAL(_,other) => getChar11(a,other))
| INT(vec,s,i,io) =>
if i>=s then (0wx0,a,ENDED io)
else (Vector.sub(vec,i),a,INT(vec,s,i+1,io))
| EXT1(dec,l,col,br,typ) => (* br = whether the previous char was 0wx0D *)
(let
val (c,dec1) = decGetChar dec
in
(* cf 2.2 and 2.11 (end-of-line handling) *)
if c>=0wx1
andalso (c<=0wxD7FF orelse c>=0wxE000 andalso (c<=0wxFFFD orelse
c>=0wx10000 andalso c<=0wx10FFFF))
then
if c=0wx2028 then (0wxA,a,EXT1(dec1,l+1,0,false,typ))
else
if (c=0wxA orelse c=0wx85) then
if br then getChar11(a,EXT1(dec1,l,col,false,typ))
(* c and 0wxD was previously translated to 0wxA *)
else (0wxA,a,EXT1(dec1,l+1,0,false,typ))
else if c=0wxD then (* whatever follows a 0wxA must be produced (cf. 2.11) *)
(0wxA,a,EXT1(dec1,l+1,0,true,typ))
else
if c<0wx7F orelse c>0wx9F then (c,a,EXT1(dec1,l,col+1,false,typ))
else (* in XML 1.1 the control characters 0wx7F through 0wx9F must appear
only as chracter Unsynchronized.references *)
let
val a1 = hookError(a,(getPos q,ERR_MUST_CHARREF c))
in
getChar11(a1,EXT1(dec1,l,col+1,false,typ))
end
else
let
val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
in
getChar11(a1,EXT1(dec1,l,col+1,false,typ))
end
end
handle DecEof dec => (0wx0,a,CLOSED(dec,l,col,typ))
| DecError(dec,eof,err) =>
let val err = ERR_DECODE_ERROR err
val a1 = hookError(a,(getPos q,err))
in if eof then (0wx0,a,CLOSED(dec,l,col,typ))
else getChar11(a1,EXT1(dec,col,l,br,typ))
end)
| EXT2(arr,s,i,l,col,br,det) =>
if i<s then
let
val c = Array.sub(arr,i)
in
(* cf 2.2 and 2.11 *)
if c>=0wx1
andalso (c<=0wxD7FF orelse c>=0wxE000 andalso (c<=0wxFFFD orelse
c>=0wx10000 andalso c<=0wx10FFFF))
then
if c=0wx2028 then (0wxA,a,EXT2(arr,s,i+1,l+1,0,false,det))
else
if (c=0wxA orelse c=0wx85) then
if br then getChar11(a,EXT2(arr,s,i+1,l,col,false,det))
(* c and 0wxD was previously translated to 0wxA *)
else (0wxA,a,EXT2(arr,s,i+1,l+1,0,false,det))
else if c=0wxD then (* whatever follows a 0wxA must be produced (cf. 2.11) *)
(0wxA,a,EXT2(arr,s,i+1,l+1,0,true,det))
else
if c<0wx7F orelse c>0wx9F then (c,a,EXT2(arr,s,i+1,l,col+1,false,det))
else (* in XML 1.1 the control characters 0wx7F through 0wx9F must appear
only as chracter Unsynchronized.references *)
let
val a1 = hookError(a,(getPos q,ERR_MUST_CHARREF c))
in
getChar11(a1,EXT2(arr,s,i+1,l,col+1,false,det))
end
else
let
val a1 = hookError(a,(getPos q,ERR_NON_XML_CHAR c))
in
getChar(a1,EXT2(arr,s,i+1,l,col+1,false,det))
end
end
else let val (dec,err,typ) = det
val (a1,(n,dec1,err1)) =
case err
of NONE => if s=BUFSIZE then (a,decGetArray dec arr)
else (a,(0,dec,NONE))
| SOME err => (hookError(a,(getPos q,ERR_DECODE_ERROR err)),
decGetArray dec arr)
in if n=0 andalso not (isSome err1)
then (0wx0,a1,CLOSED(dec1,l,col,typ))
else getChar11(a1,EXT2(arr,n,0,l,col,br,(dec1,err1,typ)))
end
| LOOKED(nil,q) => getChar11(a,q)
| LOOKED(c::cs,q) => (c,a,LOOKED(cs,q))
val getCharRef = Unsynchronized.ref getChar
fun getChar x = !getCharRef x
(*--------------------------------------------------------------------*)
(* unget a list of characters. *)
(*--------------------------------------------------------------------*)
fun ungetChars (q,cs) = LOOKED(cs,q)
end