223 lines
10 KiB
Standard ML
223 lines
10 KiB
Standard ML
|
|
|
|
|
|
|
|
(*--------------------------------------------------------------------------*)
|
|
(* Structure: UniClasses *)
|
|
(* *)
|
|
(* Notes: *)
|
|
(* read CharClasses in order to understand how CharClasses are handled. *)
|
|
(* *)
|
|
(* Depends on: *)
|
|
(* UniChar *)
|
|
(* CharClasses *)
|
|
(* *)
|
|
(* Exceptions raised by functions in this structure: *)
|
|
(* decValue : none *)
|
|
(* hexValue : none *)
|
|
(* isAsciiLetter : none *)
|
|
(* isEnc : none *)
|
|
(* isEncS : none *)
|
|
(* isName : none *)
|
|
(* isNms : none *)
|
|
(* isPubid : none *)
|
|
(* isS : none *)
|
|
(* isXml : none *)
|
|
(* isUnicode : none *)
|
|
(* isVers : none *)
|
|
(*--------------------------------------------------------------------------*)
|
|
signature UniClasses =
|
|
sig
|
|
val isName : UniChar.Char -> bool
|
|
val isName11 : UniChar.Char -> bool
|
|
val isNameRef : (UniChar.Char -> bool) Unsynchronized.ref
|
|
val isNms : UniChar.Char -> bool
|
|
val isNms11 : UniChar.Char -> bool
|
|
val isNmsRef : (UniChar.Char -> bool) Unsynchronized.ref
|
|
val isXml : UniChar.Char -> bool
|
|
val isXml11 : UniChar.Char -> bool
|
|
val isXmlRef : (UniChar.Char -> bool) Unsynchronized.ref
|
|
val isPubid : UniChar.Char -> bool
|
|
val isS : UniChar.Char -> bool
|
|
val isEnc : UniChar.Char -> bool
|
|
val isEncS : UniChar.Char -> bool
|
|
val isVers : UniChar.Char -> bool
|
|
val isDec : UniChar.Char -> bool
|
|
val isHex : UniChar.Char -> bool
|
|
val isUnicode : UniChar.Char -> bool
|
|
|
|
val decValue : UniChar.Char -> UniChar.Char option
|
|
val hexValue : UniChar.Char -> UniChar.Char option
|
|
|
|
val isAsciiLetter : UniChar.Char -> bool
|
|
end
|
|
|
|
structure UniClasses : UniClasses =
|
|
struct
|
|
open UniChar CharClasses UniRanges
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* initialize the character classes. *)
|
|
(*--------------------------------------------------------------------*)
|
|
local
|
|
val nmsTemp = initialize(0wx0000,0wx3FFF)
|
|
val restNms = addCharRange(nmsTemp,0wx0000,0wx3FFF,nmsRange)
|
|
val _ = if restNms=[(0wxAC00,0wxD7A3), (* interval in BaseChar not between 0 and 3FFF *)
|
|
(0wx4E00,0wx9FA5) (* interval in Ideographic not between 0 and 3FFF *)
|
|
] then ()
|
|
else print ("Warning: extra characters after computing nms char class.\n")
|
|
|
|
val nms11Temp = initialize(0wx0000,0wxFFFF)
|
|
val restNms11 = addCharRange(nms11Temp,0wx0000,0wxFFFF,nameStartCharRange)
|
|
val _ = if restNms11=[(0wx10000,0wxEFFFF)] then ()
|
|
else print ("Warning: extra characters after computing nms11 char class.\n")
|
|
|
|
val nameTemp = initialize(0wx0000,0wxFFFF) (* should probably be enough 3FFF *)
|
|
val restName = addCharRange(nameTemp,0wx0000,0wx3FFF,nameRange)
|
|
val _ = if restName=[(0wxAC00,0wxD7A3),(0wx4E00,0wx9FA5)] then ()
|
|
else print ("Warning: extra characters after computing name char class.\n")
|
|
|
|
val name11Temp = initialize(0wx0000,0wxFFFF)
|
|
val restName11 = addCharRange(name11Temp,0wx0000,0wxFFFF,nameCharRange)
|
|
val _ = if restName11=[(0wx10000,0wxEFFFF)] then ()
|
|
else print ("Warning: extra characters after computing name11 char class.\n")
|
|
|
|
val pubTemp = initialize(0wx0000,0wx007F)
|
|
val restPubid = addCharRange(pubTemp,0wx0000,0wx007F,pubidRange)
|
|
val _ = if restPubid=nil then ()
|
|
else print ("Warning: extra characters after computing pubid char class.\n")
|
|
|
|
val encTemp = initialize(0wx0000,0wx007F)
|
|
val restEnc = addCharRange(encTemp,0wx0000,0wx007F,encRange)
|
|
val _ = if restEnc=nil then ()
|
|
else print ("Warning: extra characters after computing enc char class.\n")
|
|
in
|
|
val nmsClass = finalize nmsTemp
|
|
val nms11Class = finalize nms11Temp
|
|
val nameClass = finalize nameTemp
|
|
val name11Class = finalize name11Temp
|
|
val pubClass = finalize pubTemp
|
|
val encClass = finalize encTemp
|
|
end
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is a character a name start char? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isNms c = if c<0wx4000 then inCharClass(c,nmsClass)
|
|
else
|
|
c>=0wx4E00 andalso c<=0wx9FA5 orelse (* legal interval for nms but not in nmsClass *)
|
|
c>=0wxAC00 andalso c<=0wxD7A3 (* legal interval for nms but not in nmsClass *)
|
|
|
|
(* for XML 1.1 *)
|
|
fun isNms11 c = if c<=0wxFFFF then inCharClass(c,nms11Class)
|
|
else c>=0wx10000 andalso c<=0wxEFFFF
|
|
|
|
val isNmsRef = Unsynchronized.ref isNms
|
|
|
|
fun isNms x = (!isNmsRef) x
|
|
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is a character a name char? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isName c = if c<0wx4000 then inCharClass(c,nameClass)
|
|
else
|
|
c>=0wx4E00 andalso c<=0wx9FA5 orelse
|
|
c>=0wxAC00 andalso c<=0wxD7A3
|
|
|
|
(* for XML 1.1 *)
|
|
fun isName11 c = if c<=0wxFFFF then inCharClass(c,name11Class)
|
|
else c>=0wx10000 andalso c<=0wxEFFFF
|
|
|
|
val isNameRef = Unsynchronized.ref isName
|
|
|
|
fun isName x = (!isNameRef) x
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* XML characters if not checked for Unicode char in advance. *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isXml (c:UniChar.Char) =
|
|
c>=0wx0020 andalso c<=0wxD7FF orelse
|
|
c>=0wxE000 andalso c<=0wxFFFD orelse
|
|
c>=0wx10000 andalso c<=0wx10FFFF orelse
|
|
c=0wx9 orelse c=0wxA orelse c=0wxD
|
|
(**********************************************************************)
|
|
(* XML 1.1 *)
|
|
(* Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] [2] *)
|
|
(**********************************************************************)
|
|
fun isXml11 (c:UniChar.Char) =
|
|
c>=0wx1 andalso c<=0wxD7FF orelse
|
|
c>=0wxE000 andalso c<=0wxFFFD orelse
|
|
c>=0wx10000 andalso c<=0wx10FFFF
|
|
|
|
val isXmlRef = Unsynchronized.ref isXml
|
|
|
|
fun isXml x = (!isXmlRef) x
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is a character a pubid char? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isPubid c = c<0wx80 andalso inCharClass(c,pubClass)
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is a character valid in an encoding name, at its start, or in a *)
|
|
(* version number? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isEnc c =
|
|
c<0wx80 andalso inCharClass(c,encClass)
|
|
fun isEncS (c:UniChar.Char) =
|
|
c>=0wx41 andalso c<=0wx5A orelse
|
|
c>=0wx61 andalso c<=0wx7A
|
|
fun isVers c =
|
|
isEnc c orelse c=0wx3A (* #":" *)
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* these are the valid Unicode characters (including surrogates). *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isUnicode (c:UniChar.Char) = c<=0wx10FFFF
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* the frontend supresses 0wxD (carriage return), but its is still *)
|
|
(* present when encoding is recognized. *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isS (c:UniChar.Char) =
|
|
case c
|
|
of 0wx09 => true
|
|
| 0wx0A => true
|
|
| 0wx0D => true
|
|
| 0wx20 => true
|
|
| _ => false
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is this character an ascii decimal/hexadecimal digit? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isDec (c:UniChar.Char) =
|
|
c>=0wx30 andalso c<=0wx39
|
|
fun isHex (c:UniChar.Char) =
|
|
c>=0wx30 andalso c<=0wx39 orelse
|
|
c>=0wx41 andalso c<=0wx46 orelse
|
|
c>=0wx61 andalso c<=0wx66
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* calculate the decimal/hexadecimal value of an ascii (hex-)digit. *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun decValue (c:UniChar.Char) =
|
|
let val v = c-0wx30
|
|
in if v<=0wx9 then SOME v else NONE
|
|
end
|
|
fun hexValue (c:UniChar.Char) =
|
|
let val v = c-0wx30
|
|
in if v<=0wx9 then SOME v
|
|
else (if c>=0wx41 andalso c<=0wx46 then SOME(c-0wx37)
|
|
else if c>=0wx61 andalso c<=0wx66 then SOME(c-0wx57)
|
|
else NONE)
|
|
end
|
|
|
|
(*--------------------------------------------------------------------*)
|
|
(* is c in [a-z]+[A-Z]? *)
|
|
(*--------------------------------------------------------------------*)
|
|
fun isAsciiLetter (c:UniChar.Char) =
|
|
c>=0wx41 andalso c<=0wx5A orelse
|
|
c>=0wx61 andalso c<=0wx7A
|
|
end
|