Commit 00518123 authored by gerd's avatar gerd

Enhanced symmetry of tree representation ("solid XML") and

event-based representation ("liquid XML"): Every tree can be
transformed into a stream of events, and vice versa, without
loss of information. The type [event] was modified to make this
symmetry possible. The functions [liquefy] and [solidify] do the
transformations in the two directions.


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@696 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent 757d37d5
......@@ -5,7 +5,7 @@
# make distclean: remove any superflous files (recursively)
#----------------------------------------------------------------------
all: create_element modify strip
all: create_element modify strip symmetry
create_element: create_element.ml
ocamlfind ocamlc -labels -g -a -o create_element.cma -package pxp create_element.ml
......@@ -19,6 +19,10 @@ strip: strip.ml
ocamlfind ocamlc -g -a -o strip.cma -package pxp strip.ml
../create-wrapper strip
symmetry: symmetry.ml
ocamlfind ocamlc -g -a -o symmetry.cma -package pxp symmetry.ml
../create-wrapper symmetry
#----------------------------------------------------------------------
.PHONY: all
all:
......
......@@ -9,3 +9,5 @@ echo "modify:"
echo "strip:"
./strip
echo "symmetry:"
./symmetry
This diff is collapsed.
This diff is collapsed.
......@@ -2508,6 +2508,11 @@ class [ 'ext ] document :
method root : 'ext node
(* Returns the root element, or fails if there is not any. *)
method raw_root_name : string
(* The unprocessed name of the root element (second arg of
* init_root)
*)
method add_pinstr : proc_instruction -> unit
(* Adds a processing instruction to the document container.
* The parser does this for PIs occurring outside the DTD and outside
......@@ -2580,40 +2585,55 @@ val print_doc :
(* Experimental: event streams and node trees *)
(**********************************************************************)
exception Empty_tree
(* Nothing to return *)
exception Build_aborted
exception Error_event of exn
(* The event stream contains an E_error event *)
val build_node_tree :
config -> dtd -> 'ext spec -> (unit -> event option) -> 'ext node
type 'ext solid_xml =
[ `Node of 'ext node
| `Document of 'ext document
]
val solidify :
?dtd:dtd ->
config -> 'ext spec -> (unit -> event option) -> 'ext solid_xml
(* Reads the event stream by calling the unit->event function, and
* creates a node tree according to config, dtd, spec.
*
* The event stream may be either:
* - A document event stream (as generated by `Entry_document)
* - A content event stream (as generated by `Entry_content)
* - A document event stream (as generated by `Entry_document).
* In this case `Document d is returned.
* - A content event stream (as generated by `Entry_content).
* In this case `Node n is returned.
*
* Document streams contain a DTD. The found DTD is used for the
* node tree. Content streams, on the contrary, do not contain DTDs.
* In this case, an empty DTD is created (in well-formedness mode).
*
* The returned toplevel node is either the "super root node" (if
* configured), or the topmost element. In the latter case, comments
* and processing instructions at the top level are ignored, and it
* may even happen that the function raises Empty_tree because there
* is no topmost element.
* The [dtd] argument overrides any DTD, no matter whether found
* in the stream or freshly created.
*
* If the DTD allows validation, the returned tree is validated.
*
* The data nodes are not normalized unless the arriving data events
* are already normalized.
* are already normalized. To get this effect, filter the stream
* with Pxp_ev_parser.norm_cdata_filter before calling solidify.
*
* Ignorable whitespace is not automatically removed. To get this
* effect, filter the stream with
* Pxp_ev_parser.drop_ignorable_whitespace_filter before calling solidify.
*
* The uniqueness of ID attributes is not checked.
*)
(* TODO:
* build_document
* decompose_node_tree
* decompose_document
*)
val liquefy :
?omit_end: bool -> ?omit_positions:bool -> 'ext solid_xml ->
('a -> event option)
(* The converse of [solidify]: The passed node or document is transformed
* into an event stream.
*
* omit_end: If true, the E_end_of_stream event is omitted at the end.
* Useful to concatenate several streams. Default: false.
* omit_positions: If true, no E_position events are generated.
* Default:false.
*)
......@@ -117,6 +117,9 @@ object (self)
;;
let create_namespace_manager () = new namespace_manager;;
class type namespace_scope =
object
method namespace_manager : namespace_manager
......@@ -204,6 +207,10 @@ end
;;
let create_namespace_scope ?parent ?(decl = []) mng =
new namespace_scope_impl mng parent decl ;;
class dtd ?swarner the_warner init_encoding =
object (self)
val mutable root = (None : string option)
......@@ -1260,6 +1267,10 @@ object (self)
end
;;
let create_dtd ?swarner ?(warner = new drop_warnings) enc =
new dtd ?swarner warner enc ;;
type source =
Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
| ExtID of (ext_id * Pxp_reader.resolver)
......
......@@ -143,6 +143,10 @@ class namespace_manager :
;;
val create_namespace_manager : unit -> namespace_manager
(* Preferred way of creating a namespace_manager *)
(** The recursive class type [namespace_scope] represents the original
* namespace declarations found in the XML text. A single [namespace_scope]
* object contains a list of declared namespaces
......@@ -224,6 +228,14 @@ class namespace_scope_impl :
*)
val create_namespace_scope :
?parent:namespace_scope ->
?decl:(string * string) list ->
namespace_manager ->
namespace_scope
(* Preferred way of creating a namespace_scope *)
class dtd :
(* Creation:
* new dtd
......@@ -618,6 +630,23 @@ and proc_instruction : string -> string -> Pxp_core_types.rep_encoding ->
(*$-*)
(* ---------------------------------------------------------------------- *)
val create_dtd :
?swarner:Pxp_core_types.symbolic_warnings ->
?warner:Pxp_core_types.collect_warnings ->
Pxp_core_types.rep_encoding ->
dtd
(* Preferred way of creating a DTD. Example:
* let dtd = create_dtd
* ?swarner:config.swarner
* ~warner:config.warner
* config.encoding
*
* See also Pxp_dtd_parser.create_empty_dtd.
*)
(* ---------------------------------------------------------------------- *)
type source =
......
......@@ -9,10 +9,13 @@ open Pxp_lexers
open Pxp_lexer_types
open Pxp_entity_manager
open Pxp_dtd
open Pxp_document
open Pxp_core_parser
open Pxp_ev_parser
let create_empty_dtd config =
create_dtd ?swarner:config.swarner ~warner:config.warner config.encoding ;;
class dtd_parser init_dtd init_config =
object (self)
inherit core_parser init_dtd init_config (-1)
......@@ -97,7 +100,7 @@ let extract_dtd_from_document_entity cfg src =
let entry = `Entry_document [ `Val_mode_dtd; `Parse_xml_decl ] in
let handle ev =
match ev with
E_start_doc(_,_,dtd) -> raise(Return_DTD dtd)
E_start_doc(_,dtd) -> raise(Return_DTD dtd)
| E_error _ -> () (* ignore now, exception will be raised anyway *)
| E_position(_,_,_) -> ()
| _ -> assert false
......
......@@ -7,6 +7,9 @@
open Pxp_types
open Pxp_dtd
val create_empty_dtd : config -> dtd
(* Create an empty DTD *)
val parse_dtd_entity : config -> source -> dtd
(* Parse an entity containing a DTD (external subset), and return this DTD. *)
......
......@@ -18,7 +18,7 @@ open Pxp_aux
class any_entity_id = object end ;;
class event_parser init_dtd init_config init_event_handler
init_want_start_doc init_pull_counter =
init_want_start_doc init_pull_counter lit_root =
object (self)
inherit core_parser init_dtd init_config init_pull_counter
......@@ -46,10 +46,13 @@ object (self)
method private init_for_xml_body() =
if not init_done then begin
if config.recognize_standalone_declaration then
dtd # set_standalone_declaration xml_standalone;
if want_start_doc then
event_handler (E_start_doc(xml_version,
xml_standalone,
dtd));
if config.enable_super_root_node then
event_handler E_start_super;
(* Init namespace processing, if necessary: *)
( match config.enable_namespace_processing with
None -> ()
......@@ -72,8 +75,7 @@ object (self)
| Some "no" -> false
| _ -> raise (WF_error("Illegal 'standalone' declaration"))
in
if config.recognize_standalone_declaration then
xml_standalone <- v
xml_standalone <- v
method private event_start_tag position name attlist emptiness tag_beg_entid =
......@@ -85,6 +87,7 @@ object (self)
if ep_root_element_seen then
raise(WF_error("Document must consist of only one toplevel element"));
ep_root_element_seen <- true;
lit_root := name
end;
match config.enable_namespace_processing with
......@@ -92,29 +95,23 @@ object (self)
(* no namespaces *)
if not emptiness then
stack_push (position, name, "", tag_beg_entid) ep_elstack;
event_handler(E_start_tag(name,attlist,tag_beg_entid));
event_handler(E_start_tag(name,attlist,None,tag_beg_entid));
if emptiness then
event_handler(E_end_tag(name,tag_beg_entid))
| Some mng ->
(* enabled namespaces *)
let (src_prefix, localname, norm_name, norm_attlist) =
self # push_src_norm_mapping mng name attlist in
let scope = match ns_scope with Some s -> s | None -> assert false
in
let mixed_attlist =
let attlist' =
List.map (fun (orig_prefix, localname, norm_name, value) ->
(if orig_prefix = ""
then localname
else orig_prefix ^ ":" ^ localname),
norm_name,
value) norm_attlist in
(norm_name, value)) norm_attlist in
if not emptiness then
stack_push (position, name, norm_name, tag_beg_entid) ep_elstack;
event_handler(E_ns_start_tag
(name,norm_name,mixed_attlist,scope,tag_beg_entid));
event_handler(E_start_tag
(norm_name,attlist',ns_scope,tag_beg_entid));
if emptiness then (
self # pop_src_norm_mapping();
event_handler(E_ns_end_tag(name,norm_name,tag_beg_entid));
event_handler(E_end_tag(norm_name,tag_beg_entid));
)
......@@ -155,7 +152,7 @@ object (self)
| Some mng ->
(* namespaces *)
self # pop_src_norm_mapping();
event_handler(E_ns_end_tag(name,norm_name,tag_end_entid))
event_handler(E_end_tag(norm_name,tag_end_entid))
method private event_char_data data =
......@@ -163,7 +160,8 @@ object (self)
method private event_pinstr position target value =
if config.enable_pinstr_nodes then begin
if config.enable_pinstr_nodes &&
(n_tags_open > 0 || config.enable_super_root_node) then begin
let ev_list =
(match position with
Some(e,l,c) -> [ E_position(e,l,c) ]
......@@ -175,11 +173,19 @@ object (self)
else
ep_early_events <- ep_early_events @ ev_list
end
else begin
let ev = E_pinstr_member(target,value) in
if init_done then
event_handler ev
else
ep_early_events <- ep_early_events @ [ev]
end
method private event_comment position mat =
if config.enable_comment_nodes then begin
let ev_list =
if config.enable_comment_nodes &&
(n_tags_open > 0 || config.enable_super_root_node) then begin
let ev_list =
(match position with
Some(e,l,c) -> [ E_position(e,l,c) ]
| None -> []
......@@ -193,7 +199,7 @@ object (self)
method private sub_parser () =
let pobj = new event_parser dtd config event_handler false (-1) in
let pobj = new event_parser dtd config event_handler false (-1) (ref "") in
(pobj :> core_parser)
end
......@@ -226,6 +232,7 @@ let process_entity
let have_document_entry =
match entry with `Entry_document _ -> true | _ -> false in
let lit_root = ref "" in
let pobj =
new event_parser
mgr#dtd
......@@ -233,6 +240,7 @@ let process_entity
eh
have_document_entry
(-1)
lit_root
in
let resolver = mgr # current_resolver in
let init_lexer =
......@@ -250,7 +258,9 @@ let process_entity
pobj # parse context (entry : entry :> extended_entry);
if en # is_open then ignore(en # close_entity);
if have_document_entry then eh (E_end_doc);
if cfg.enable_super_root_node then
eh E_end_super;
if have_document_entry then eh (E_end_doc !lit_root);
eh E_end_of_stream;
with
| Failure "Invalid UTF-8 stream" ->
......@@ -276,7 +286,8 @@ let process_expr
?first_token
?following_token
cfg mgr eh =
let pobj = new event_parser mgr#dtd cfg eh false (-1) in
let lit_root = ref "" in
let pobj = new event_parser mgr#dtd cfg eh false (-1) lit_root in
let resolver = mgr # current_resolver in
let en = mgr # current_entity in
begin try
......@@ -322,6 +333,7 @@ let create_pull_parser
let have_document_entry =
match entry with `Entry_document _ -> true | _ -> false in
let lit_root = ref "" in
let pobj =
new event_parser
mgr#dtd
......@@ -329,6 +341,7 @@ let create_pull_parser
eh
have_document_entry
100 (* the number of loops until Interrupt_parsing *)
lit_root
in
let resolver = mgr # current_resolver in
let init_lexer =
......@@ -383,7 +396,8 @@ let create_pull_parser
(* If the [parse] method terminates, the end of the stream is reached!
*)
if en # is_open then ignore(en # close_entity);
if have_document_entry then eh (E_end_doc);
if cfg.enable_super_root_node then eh E_end_super;
if have_document_entry then eh (E_end_doc !lit_root);
eh E_end_of_stream;
pull_queue_eof := true;
with
......@@ -472,7 +486,7 @@ let drop_ignorable_whitespace_filter get_ev =
let rec get_ev' thing =
let ev = get_ev thing in
match ev with
Some(E_start_doc(_,_,dtd)) ->
Some(E_start_doc(_,dtd)) ->
if !found_dtd <> None then
failwith "Pxp_ev_parser.drop_ignorable_whitespace_filter: More than one E_start_doc event";
found_dtd := Some dtd;
......@@ -480,20 +494,13 @@ let drop_ignorable_whitespace_filter get_ev =
| Some(E_position(e,line,col)) ->
pos := (e,line,col);
ev
| Some(E_start_tag(name,_,_)) ->
let ign_ws = has_ignorable_ws name in
Stack.push ign_ws elements;
ev
| Some(E_ns_start_tag(_,name,_,_,_)) ->
| Some(E_start_tag(name,_,_,_)) ->
let ign_ws = has_ignorable_ws name in
Stack.push ign_ws elements;
ev
| Some(E_end_tag(name,_)) ->
pop();
ev
| Some(E_ns_end_tag(_,name,_)) ->
pop();
ev
| Some(E_char_data s) ->
let ign_ws = try Stack.top elements with Stack.Empty -> true in
if ign_ws then (
......
......@@ -75,6 +75,7 @@ val process_entity :
* - encoding
* - enable_pinstr_nodes
* - enable_comment_nodes
* - enable_super_root_node
* - store_element_positions
* - name_pool and all name pool options
* - enable_namespace_processing
......@@ -192,3 +193,4 @@ val drop_ignorable_whitespace_filter : 'a filter
*)
(* TODO: ID filter that creates an ID index *)
......@@ -217,17 +217,18 @@ type entry =
type event =
| E_start_doc of (string * bool * dtd)
| E_end_doc
| E_start_doc of (string * dtd)
| E_end_doc of string
| E_start_tag of (string * (string * string) list *
namespace_scope option *
Pxp_lexer_types.entity_id)
| E_ns_start_tag of (string * string * (string * string * string) list *
namespace_scope * Pxp_lexer_types.entity_id)
| E_end_tag of (string * Pxp_lexer_types.entity_id)
| E_ns_end_tag of (string * string * Pxp_lexer_types.entity_id)
| E_char_data of string
| E_pinstr of (string * string)
| E_pinstr_member of (string * string)
| E_comment of string
| E_start_super
| E_end_super
| E_position of (string * int * int)
| E_error of exn
| E_end_of_stream
......
......@@ -126,7 +126,6 @@ type config =
* T_element "-vr" instead of T_super_root.
* (2) The T_super_root node is created from the super root exemplar
* in your spec.
* (3) Event-based parser: no effect
*)
enable_comment_nodes : bool;
......@@ -637,53 +636,54 @@ type entry =
type event =
| E_start_doc of (string * bool * Pxp_dtd.dtd)
| E_end_doc
| E_start_doc of (string * Pxp_dtd.dtd)
| E_end_doc of string
| E_start_tag of (string * (string * string) list *
Pxp_dtd.namespace_scope option *
Pxp_lexer_types.entity_id)
| E_ns_start_tag of (string * string * (string * string * string) list *
Pxp_dtd.namespace_scope * Pxp_lexer_types.entity_id)
| E_end_tag of (string * Pxp_lexer_types.entity_id)
| E_ns_end_tag of (string * string * Pxp_lexer_types.entity_id)
| E_char_data of string
| E_pinstr of (string * string)
| E_pinstr_member of (string * string)
| E_comment of string
| E_start_super
| E_end_super
| E_position of (string * int * int)
| E_error of exn
| E_end_of_stream
(* may be extended in the future *)
(* The type of XML events:
* E_start_doc (xmlversion,standalone,dtd)
* E_end_doc
* E_start_doc (xmlversion,dtd)
* E_end_doc lit_name
* lit_name: The literal name of the root element
*
* E_start_tag (name, attlist, entid):
* E_start_tag (name, attlist, scope_opt, entid):
* <name attlist>
* only used in non-namespace mode
*
* E_ns_start_tag (orig_name, norm_name, attlist, scope, entid)
* only used in namespace mode; orig_name is the element as found
* in the XML text; norm_name is the normalized element name;
* attlist consists of triples (orig_name, norm_name, value).
* [scope] is the namespace scope object.
* scope_opt is None in non-namespace mode, and the
* namespace scope object in namespace mode.
*
* E_end_tag (name, entid):
* </name>
* only used in non-namespace mode
*
* E_ns_end_tag (orig_name, norm_name, entid):
* only used in namespace mode
*
* E_char_data data:
* The parser usually generates several E_char_data events for a
* longer section of character data.
*
* E_pinstr (target,value):
* <?target value?>
* <?target value?> as node
*
* E_pinstr_member (target,value):
* <?target value?> as member of the parent element (add_pinstr)
*
* E_comment value:
* <!--value-->
*
* E_start_super,
* E_end_super:
* Indicates where the "super root node" is. Only generated when
* enable_super_root_node is on.
*
* E_position(entity,line,col):
* these events are only created if the next event will be
* E_start_tag, E_pinstr, or E_comment, and if
......
......@@ -89,30 +89,3 @@ let parse_dtd_entity = Pxp_dtd_parser.parse_dtd_entity
let extract_dtd_from_document_entity =
Pxp_dtd_parser.extract_dtd_from_document_entity
type event = Pxp_types.event =
| E_start_doc of (string * bool * dtd)
| E_end_doc
| E_start_tag of (string * (string * string) list * Pxp_lexer_types.entity_id)
| E_ns_start_tag of (string * string * (string * string * string) list *
Pxp_dtd.namespace_scope * Pxp_lexer_types.entity_id)
| E_end_tag of (string * Pxp_lexer_types.entity_id)
| E_ns_end_tag of (string * string * Pxp_lexer_types.entity_id)
| E_char_data of string
| E_pinstr of (string * string)
| E_comment of string
| E_position of (string * int * int)
| E_error of exn
| E_end_of_stream
let create_entity_manager = Pxp_ev_parser.create_entity_manager
type entry = Pxp_types.entry
let process_entity = Pxp_ev_parser.process_entity
let process_expr = Pxp_ev_parser.process_expr
let create_pull_parser = Pxp_ev_parser.create_pull_parser
......@@ -148,53 +148,8 @@ val parse_dtd_entity : config -> source -> dtd
val extract_dtd_from_document_entity : config -> source -> dtd
(* now defined in Pxp_dtd_parser *)
type event = Pxp_types.event =
| E_start_doc of (string * bool * dtd)
| E_end_doc
| E_start_tag of (string * (string * string) list * Pxp_lexer_types.entity_id)
| E_ns_start_tag of (string * string * (string * string * string) list *
Pxp_dtd.namespace_scope * Pxp_lexer_types.entity_id)
| E_end_tag of (string * Pxp_lexer_types.entity_id)
| E_ns_end_tag of (string * string * Pxp_lexer_types.entity_id)
| E_char_data of string
| E_pinstr of (string * string)
| E_comment of string
| E_position of (string * int * int)
| E_error of exn
| E_end_of_stream
val create_entity_manager :
?is_document:bool ->
config ->
source ->
Pxp_entity_manager.entity_manager
(* now defined in Pxp_ev_parser *)
type entry = Pxp_types.entry
val process_entity :
config ->
entry ->
Pxp_entity_manager.entity_manager ->
(event -> unit) ->
unit
(* now defined in Pxp_ev_parser *)
val process_expr :
?first_token: Pxp_lexer_types.token ->
?following_token: Pxp_lexer_types.token ref ->
config ->
Pxp_entity_manager.entity_manager ->
(event -> unit) ->
unit
(* now defined in Pxp_ev_parser *)
val create_pull_parser :
config ->
entry ->
Pxp_entity_manager.entity_manager ->
('a -> event option)
(* now defined in Pxp_ev_parser *)
(* Event-based stuff now only in Pxp_ev_parser! *)
(*$-*)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment