Commit d5d3b0f7 authored by gerd's avatar gerd

Continued documentation.

Some minor additions to the library; unchecked whether they compile


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@737 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent aa9055f7
......@@ -50,6 +50,10 @@ for PXP; if you are looking for the stable distribution, please go
<sect1>
<title>Version History</title>
<ul>
<li>
<p><em>1.2.1:</em> Revised documentation</p>
<p>Addition: Pxp_event.unwrap_document</p>
</li>
<li><p><em>1.2.0test*:</em> New ~minimization option for the
[write] and [display] methods (user wish).</p>
<p>Improvement: better control what is printed as DTD for
......
......@@ -15,29 +15,8 @@ open Printf
(* dump_event: dumps a single parsing event *)
let dump_event =
function
E_start_doc(v,sa,dtd) ->
printf "E_start_doc version=%s standalone=%b\n" v sa
| E_end_doc ->
printf "E_end_doc\n"
| E_start_tag(name,attlist,_) ->
printf "E_start_tag %s %s\n" name
(String.concat " " (List.map (fun (n,v) -> n ^ "=" ^ v) attlist))
| E_end_tag(name,_) ->
printf "E_end_tag %s\n" name
| E_char_data data ->
printf "E_char_data %s\n" data
| E_pinstr(target,data) ->
printf "E_pinstr %s %s\n" target data
| E_comment data ->
printf "E_comment %s\n" data
| E_position(ent,line,col) ->
printf "E_position %s line=%d col=%d\n" ent line col
| E_error e ->
printf "E_error %s\n" (Printexc.to_string e)
| E_end_of_stream ->
printf "E_end_of_stream\n"
(* Writes the textual form of the event [e], as produced by
 * Pxp_event.string_of_event, to stdout followed by a newline.
 *)
let dump_event e =
  let line = Pxp_event.string_of_event e in
  print_endline line
;;
......
......@@ -21,7 +21,7 @@ DOC = pxp_types.mli pxp_document.mli pxp_dtd.mli pxp_tree_parser.mli \
pxp_marshal.mli pxp_yacc.mli pxp_reader.mli \
intro_trees.txt intro_extensions.txt intro_namespaces.txt \
intro_events.txt intro_resolution.txt intro_getting_started.txt \
intro_advanced.txt \
intro_advanced.txt intro_preprocessor.txt \
example_readme.txt
XOBJ = $(OBJ:.cmo=.cmx)
......
{1 The [readme] processor}
The task of the [readme] processor is to convert a document conforming
to the XML DTD "readme.dtd" to an HTML document or a text document.
to the XML DTD "readme.dtd" into an HTML document or a text document.
This example especially demonstrates how to use node extensions to add
custom methods to nodes (see {!Intro_extensions}), and how to use the
object-oriented feature of late binding so that every node type
behaves differently.
Note that the converter code dates back from 1999. Nowadays I would
have written it in a different style.
probably have written it as a purely functional transformer. This
task is now left to the reader...
{2 The [readme] DTD}
......@@ -320,6 +325,9 @@ let escape_html s =
s
]}
Note (of 2009): There is also the Ocamlnet function
[Netencoding.Html.encode] one can use. It has a special XML mode.
{3 The virtual class [shared]}
This virtual class is the abstract superclass of the extension classes
......@@ -358,10 +366,12 @@ For an introduction into extension classes see {!Intro_extensions}.
{3 The class [only_data]}
This class defines [to_html] such that the character data of
the current node is converted to HTML. Note that [self] is an
extension object, [self # node] is the node object, and
[self # node # data] returns the character data of the node.
This class defines [to_html] such that the character data of the
current node is converted to HTML. Note that [self] is an extension
object (of type {!Pxp_document.extension}), [self # node] is the node
object (of type {!Pxp_document.node}), and [self # node # data]
returns the character data of the node (see
{!Pxp_document.node.data}).
{[
class only_data =
......@@ -469,17 +479,19 @@ class readme =
]}
This class is an example how to access the value of an attribute: The
value is determined by invoking [self # node # attribute "title"]. As
this attribute has been declared as CDATA and as being required, the
value has always the form [Value s] where [s] is the string value of
the attribute.
value is determined by invoking [self # node # attribute "title"] (see
{!Pxp_document.node.attribute}). As this attribute has been declared
as CDATA and as being required, the value has always the form [Value
s] where [s] is the string value of the attribute. Attribute values
have type {!Pxp_types.att_value}.
You can also see how entity contents can be accessed. A parameter entity object
can be looked up by [self # node # dtd # par_entity "name"],
and by invoking [replacement_text] the value of the entity
is returned after inner parameter and character entities have been
processed. Note that you must use [gen_entity] instead of
[par_entity] to access general entities.
You can also see how entity contents can be accessed. A parameter
entity object can be looked up by [self # node # dtd # par_entity
"name"] (see {!Pxp_dtd.dtd.par_entity}), and by invoking
{!Pxp_dtd.Entity.replacement_text} the value of the entity is returned
after inner parameter and character entities have been processed. Note
that you must use {!Pxp_dtd.dtd.gen_entity} instead of [par_entity] to
access general entities.
{3 The classes [section], [sect1], [sect2], and [sect3]}
......@@ -518,22 +530,23 @@ class sect3 = section "h4"
]}
Section elements are converted to HTML by printing a headline and then
converting the contents of the element recursively. More precisely, the first
sub-element is always a [title] element, and the other
elements are the contents of the section. This structure is declared in the
DTD, and it is guaranteed that the document matches the DTD. Because of this
the title node can be separated from the rest without any checks.
converting the contents of the element recursively. More precisely,
the first sub-element is always a [title] element, and the other
elements are the contents of the section. This structure is declared
in the DTD, and it is guaranteed that the document matches the
DTD. Because of this the title node can be separated from the rest
without any checks.
Both the title node, and the body nodes are then converted to HTML by calling
[to_html] on them.
Both the title node, and the body nodes are then converted to HTML by
calling [to_html] on them.
{3 The classes [map_tag], [p], [em], [ul], and [li] }
Several element types are converted to HTML by simply mapping them to
corresponding HTML element types. The class [map_tag]
implements this, and the class argument [the_target_tag]
determines the tag name to map to. The output consists of the start tag, the
recursively converted inner elements, and the end tag.
corresponding HTML element types. The class [map_tag] implements this,
and the class argument [the_target_tag] determines the tag name to map
to. The output consists of the start tag, the recursively converted
inner elements, and the end tag.
{[
class map_tag the_target_tag =
......@@ -558,8 +571,8 @@ class li = map_tag "li"
{3 The class [br]}
Element of type [br] are mapped to the same HTML type. Note
that HTML forbids the end tag of [br].
Elements of type [br] are mapped to the same HTML type. Note that HTML
forbids the end tag of [br].
{[
class br =
......@@ -591,7 +604,7 @@ class code =
(* convert tabs *)
let l = String.length data in
let rec preprocess i column =
(* this is very ineffective but comprehensive: *)
(* this is very inefficient but comprehensible: *)
if i < l then
match data.[i] with
'\t' ->
......@@ -720,6 +733,8 @@ class footnote =
This code sets up the hash table that connects element types with the
exemplars of the extension classes that convert the elements to HTML.
See {!Intro_extensions.bindext} for comments, and
{!Pxp_document.make_spec_from_alist} for the function definition.
{[
let tag_map =
......
This diff is collapsed.
......@@ -61,7 +61,7 @@ of the extension objects has to be the same for all nodes in a tree.
It is not possible to e.g. use a different type for elements than for
data nodes.
{2 How to define an extension class}
{2:defext How to define an extension class}
At minimum, you must define the methods [clone], [node], and
[set_node] such that your class is compatible with the type
......@@ -148,7 +148,7 @@ sense, or it would violate some important condition), it is possible
to define the method and to always raise an exception when the method
is invoked (e.g. [assert false]).
{2 How to bind extension classes to element types}
{2:bindext How to bind extension classes to element types}
Once you have defined your extension classes, you can bind them to
element types. The simplest case is that you have only one class and
......@@ -202,7 +202,7 @@ let spec =
The extension object [c] is still used for all data nodes and
for all other element types.
{2 An example}
{2:example An example}
A complete example using extension objects is the [readme]
processor. The full source code is included in the PXP source tarball.
......
......@@ -45,17 +45,8 @@ computer. {!Pxp_reader.make_file_url} has to deal with character
encodings of file names. It assumes UTF-8 by default. By passing
arguments to this function, other assumptions about the encoding of
file names can be made. Unfortunately, there is no portable way of
determining the character encoding the system uses for file names.
Links:
- {{:http://library.gnome.org/devel/glib/stable/glib-Character-Set-Conversion.html#g-get-filename-charsets}How GLib treats the file name encoding problem}
- {{:http://developer.apple.com/technotes/tn/tn1150.html} OS X stores filenames on HFS+ volumes in a Unicode encoding}; the POSIX
functions like [open] expect file names in UTF-8 encoding.
- Current Windows versions store filenames in Unicode. The Win32 functions
are available in a Unicode and in a so-called ANSI version
(see {{:http://msdn.microsoft.com/en-us/library/dd317752(VS.85).aspx}
Code Pages}), and the O'Caml runtime calls the latter. This means file
names available to PXP are encoded in the active code page.
determining the character encoding the system uses for file names
(see the hyperlinks at the end of this section).
The returned [doc] object is of type {!classtype:Pxp_document.document}. This type
is used for all regular documents that exist independently. The root
......@@ -70,6 +61,16 @@ formal correctness called well-formedness. See below how to only the
check for well-formedness while parsing without doing the whole
validation.
Links about the file name encoding problem:
- {{:http://library.gnome.org/devel/glib/stable/glib-Character-Set-Conversion.html#g-get-filename-charsets}How GLib treats the file name encoding problem}
- {{:http://developer.apple.com/technotes/tn/tn1150.html} OS X stores filenames on HFS+ volumes in a Unicode encoding}; the POSIX
functions like [open] expect file names in UTF-8 encoding.
- Current Windows versions store filenames in Unicode. The Win32 functions
are available in a Unicode and in a so-called ANSI version
(see {{:http://msdn.microsoft.com/en-us/library/dd317752(VS.85).aspx}
Code Pages}), and the O'Caml runtime calls the latter. This means file
names available to PXP are encoded in the active code page.
{2:complink Compiling and linking}
It is strongly recommended to compile and link with the help of
......@@ -311,7 +312,7 @@ the executable. The lexers are available as separate findlib packages:
For the link command, see above: {!Intro_getting_started.complink}.
{3 Event parser (push/pull parsing)}
{3:evparser Event parser (push/pull parsing)}
It is sometimes not desirable to represent the parsed XML data as
tree. An important reason is that the amount of data would exceed the
......@@ -338,7 +339,7 @@ The events have type {!Pxp_types.event} [option].
More about event parsing can be found in {!Intro_events}.
{3 Low-profile trees}
{3:lowprofile Low-profile trees}
When the tree classes in {!Pxp_document} are too much overhead,
it is easily possible to define a specially crafted tree data type, and
......@@ -418,7 +419,7 @@ Of course, this all is only reasonable for the well-forermedness mode,
as PXP's validation routines depend on the built-in tree representation
of {!Pxp_document}.
{3 Choosing the node types to represent}
{3:nodetypes Choosing the node types to represent}
By default, PXP only represents element and data nodes (both in the
normal tree representation and in the event stream). It is possible
......@@ -453,7 +454,7 @@ let config =
]}
{3 Controlling whitespace}
{3:whitespace Controlling whitespace}
Depending on the mode, PXP applies some automatic whitespace rules. The
user can call functions to reduce whitespace even more.
......@@ -527,7 +528,7 @@ of whitespace in data nodes.
{3 Checking the [ID] consistency and looking up nodes by [ID]}
{3:idcheck Checking the [ID] consistency and looking up nodes by [ID]}
In XML it is possible to identify elements by giving them an [ID]
attribute. This requires a DTD, and could be done with declarations
......@@ -587,7 +588,7 @@ Note that the [id_index] is not automatically updated when the parsed
tree is modified.
{3 Finding nodes by element names}
{3:findelements Finding nodes by element names}
As we are at it: PXP does not maintain indexes of any kind. Unlike in
other tree representations, there is no index of elements that would
......@@ -750,7 +751,7 @@ opened when exactly the URL is referenced that is also mentioned in the
catalog.
{3 Embedding large constant XML in source code}
{3:codewriter Embedding large constant XML in source code}
Sometimes one needs to embed XML files into source code. For small files
this is no problem at all, just define them as string literals
......@@ -852,6 +853,8 @@ namespace is uniquely identified by a prefix. Note that this means
that the elements and attributes may be renamed by the parser.
For details how the prefix normalization works, see {!Intro_namespaces}.
Namespace processing can also be combined with event-oriented
parsing, see {!Intro_events.namespaces}.
{3:spec Specifying which classes implement nodes - the mysterious [spec] parameter}
......@@ -868,7 +871,7 @@ namespace-enabled parsing a different set of node classes is used:
let spec = Pxp_tree_parser.default_namespace_spec
]}
The mysterious [spec] parameter directs PXP which class it uses for
The mysterious [spec] parameter controls which class PXP uses for
which node type. In the source code of {!Pxp_tree_parser}, we find
{[
......@@ -917,9 +920,13 @@ XML it is not designed for:
- It is not possible to leave entities unresolved in the text. Whenever
there is an [&entity;] or [%entity;] PXP replaces it with the definition
of that entity.
of that entity. It is an error if the entity turns out to be undefined,
and parsing is stopped with an exception.
- It is not possible to figure out notational details of the XML text,
such as where CDATA sections are used
- It is not possible to parse a syntactically wrong document as much as
possible, and to return the parseable parts. PXP either parses the
document completely, or it fails completely.
Effectively, this makes it hard to use PXP for XML editing, but otherwise
does not limit its uses.
......
......@@ -9,6 +9,21 @@ of namespace-aware documents PXP applies a transformation to the document
which is called "prefix normalization". This transformation ensures that every
namespace prefix uniquely identifies a namespace throughout the whole document.
{3 Links to other documentation}
- {!Intro_getting_started.namespaces}
- {!classtype:Pxp_dtd.namespace_manager}
- {!Pxp_dtd.create_namespace_manager}
- {!classtype:Pxp_dtd.namespace_scope}
- {!Pxp_dtd.create_namespace_scope}
- Trees and namespaces: {!Intro_trees.access}, see the namespace subsection
- {!Intro_advanced.irrnodes}
- {!Intro_events.namespaces}
{2 Namespace URI's and prefixes}
A namespace is identified by a namespace URI (e.g. something like
"http://company.org/namespaces/project1" - note that this URI is simply
processed as string, and never looked up by an HTTP access). For
......@@ -41,7 +56,11 @@ instead of {!Pxp_tree_parser.default_spec}. A good starting point
to enable all that:
{[
let config = Pxp_types.default_namespace_config
let nsmng = Pxp_dtd.create_namespace_manager()
let config =
{ Pxp_types.default_config with
enable_namespace_processing = Some nsmng
}
let source = ...
let spec = Pxp_tree_parser.default_namespace_spec
let doc = Pxp_tree_parser.parse_document_entity config source spec
......@@ -131,7 +150,11 @@ moment, and allows the program to enter such special configurations
into the DTD object. The resulting program could look then like:
{[
let config = Pxp_types.default_namespace_config
let nsmng = Pxp_dtd.create_namespace_manager()
let config =
{ Pxp_types.default_config with
enable_namespace_processing = Some nsmng
}
let source = ...
let spec = Pxp_tree_parser.default_namespace_spec
let transform_dtd dtd =
......
This diff is collapsed.
......@@ -133,7 +133,7 @@ method of the element node. If this way of handling processing instructions
is not exact enough, the parser can optionally create processing instruction
nodes that are regular members of the document tree.
{2 Access methods}
{2:access Access methods}
An overview over some relevant access methods:
......
......@@ -7,9 +7,9 @@
(** DTD objects
The DTD object is a separate container for the formal requirements
of a document. The DTD object is always present in a document, even
when validation is turned off. See {!class: Pxp_dtd.dtd} for details about
the DTD object.
of a document. The DTD object is always present in a document,
even when validation is turned off. See {!classtype: Pxp_dtd.dtd}
for details about the DTD object.
There are a number of accompanying objects also defined in this
module (e.g. [namespace_manager] or [dtd_element]).
......@@ -34,35 +34,51 @@ type validation_record =
(**/**)
(** This class manages mappings from URIs to normalized prefixes. For every
* namespace a namespace_manager object contains a set of mappings
* namespace a namespace_manager object contains a set that maps
* various URI's to the same normalized prefix [np]:
* {[ uri1 |-> np, uri2 |-> np, ..., uriN |-> np ]}
* The normalized prefix [np] is characteristic of the namespace, and
* identifies the namespace uniquely.
* The first URI [uri1] is the primary URI, the other URIs are aliases.
*
* In order to create an empty namespace manager, call
* {!Pxp_dtd.create_namespace_manager}.
*
* See {!Intro_namespaces} for an introduction to namespaces and more
* links to other explanations about namespace managers.
*
* The following operations are supported:
* - [add_uri np uri]: The passed [uri] is added to the already existing
* namespace which is identified by the normprefix [np]. This means
* that the precondition is that there is already some mapping
* [uri' |-> np], and that there is no mapping for [uri]. Postcondition
* is that [uri |-> np] is a new mapping.
* [add_uri] thus adds a new alias URI for an existing namespace.
* - [add_namespace np uri]: Precondition is that neither [np] nor [uri]
* are used in the namespace_manager object. The effect is that the
* mapping [uri |-> np] is added.
* - [add_namespace np uri]: adds a new mapping [uri |-> np] to the
* manager. Neither [np] nor [uri] must already be part of another
* mapping in the manager.
* - [add_uri np uri]: adds a new alias [uri] for an existing namespace
* which is identified by the normprefix [np]. The normprefix [np]
* must already be part of a mapping which is then extended by this
* method.
* - [lookup_or_add_namespace p uri]: If there is already some mapping
* [uri |-> np], the normprefix [np] is simply returned ("lookup"). In this
* case [p] is ignored. Otherwise [uri] is not yet mapped, and in this
* case some unique [np] must be found such that [uri |-> np] can be
* added ([add_namespace]). First, the passed prefix [p] is tried.
* If [p] is free, it can be taken as new normprefix: [np = p]. Otherwise
* some number [n] is found such that the concatenation [p + n] is free:
* [np = p + n]. The operation returns [np].
* some number [n] is found such that the concatenation [p ^ n] is free:
* [np = p ^ n]. The operation returns [np].
*
* {b Encodings:} prefixes and URIs are always encoded in the default
* encoding of the document
*)
class namespace_manager :
object
method add_namespace : string -> string -> unit
(** [add_namespace np uri]: adds a new namespace to the object. The
* namespace is identified by the normprefix [np] and contains initially
* the primary URI [uri].
* The method fails ([Namespace_error]) if either [np] already identifies
* some namespace or if [uri] is already member of some namespace.
* Nothing happens if [uri] is the sole member of the namespace [np].
* It is required that [np <> ""].
*)
method add_uri : string -> string -> unit
(** [add_uri np uri]: adds [uri] as alias URI to the namespace identified
* by the normprefix [np] (see above for detailed semantics). The method
......@@ -76,16 +92,6 @@ class namespace_manager :
* instead of [Not_found].
*)
method add_namespace : string -> string -> unit
(** [add_namespace np uri]: adds a new namespace to the object. The
* namespace is identified by the normprefix [np] and contains initially
* the primary URI [uri].
* The method fails ([Namespace_error]) if either [np] already identifies
* some namespace or if [uri] is already member of some namespace.
* Nothing happens if [uri] is the sole member of the namespace [np].
* It is required that [np <> ""].
*)
method lookup_or_add_namespace : string -> string -> string
(** [lookup_or_add_namespace p uri]: first, the method looks up if
* the namespace for [uri] does already exist. If so, [p] is ignored,
......@@ -100,9 +106,6 @@ class namespace_manager :
(** Return the primary URI for a normprefix, or raises
* [Namespace_prefix_not_managed]. [get_uri ""] raises always this
* exception.
*
* {b Change in PXP 1.2}: Using exception [Namespace_prefix_not_managed]
* instead of [Not_found].
*)
method get_uri_list : string -> string list
......@@ -114,9 +117,6 @@ class namespace_manager :
method get_normprefix : string -> string
(** Return the normprefix for a URI, or raises
* [Namespace_not_managed].
*
* {b Change in PXP 1.2}: Using exception [Namespace_not_managed]
* instead of [Not_found].
*)
method iter_namespaces : (string -> unit) -> unit
......@@ -152,15 +152,24 @@ val create_namespace_manager : unit -> namespace_manager
*
* Furthermore, the [namespace_scope] object may have a parent
* [namespace_scope], representing the namespace declarations in the
* surrounding XML text.
* surrounding XML text. [namespace_scope] objects are intentionally
* immutable. When some XML subtree is cut out of a document
* and inserted into another document, the original [namespace_scope]
* declarations (including
* all parents) are still applied to the subtree when it is in the
* new document. Further changes in the old document cannot break this
* assertion because of the immutability.
*
* The [namespace_scope] objects are connected with the [namespace_manager]
* to allow translations from the namespace prefixes found in the XML
* text (also called "display prefixes" from now on) to the normalized
* prefixes stored in the [namespace_manager], and vice versa.
*
* The [namespace_scope] objects are intentionally immutable in order to
* allow memory sharing.
* Call {!Pxp_dtd.create_namespace_scope} to create a scope object using
* the default implementation.
*
* See {!Intro_namespaces} for an introduction to namespaces and more
* links to other explanations about scopes.
*)
class type namespace_scope =
object
......@@ -235,10 +244,15 @@ val create_namespace_scope :
(** Preferred way of creating a [namespace_scope] *)
(** DTD objects are used to keep global declarations that apply to the
whole XML document.
(** DTD objects have two purposes:
- They are containers for global declarations that apply to the
whole XML document. This includes the character set, the
standalone declaration, and all declarations that can appear
in the "DTD part" of a document.
- Also, they express formal constraints the document must fulfill
such as validity, or (less ambitious) well-formedness.
Normally, programmers need neither to create such objects, nor to
Normally, programmers neither need to create such objects, nor to
fill them with data, as the parser already does this. If it is required
to create a DTD object, the recommended function is
{!Pxp_dtd.create_dtd}.
......@@ -253,7 +267,11 @@ val create_namespace_scope :
- Whether the document is declared as standalone
A consequence of this is that even documents have a DTD object
for which only well-formedness parsing is enabled.
that only have to comply to the relatively weak well-formedness
constraints.
For some introductory words about well-formedness mode, see
{!Intro_getting_started.wfmode}.
*)
class dtd :
?swarner:Pxp_core_types.symbolic_warnings ->
......@@ -692,7 +710,8 @@ val create_dtd :
* ~warner:config.warner
* config.encoding]}
*
* See also {!Pxp_dtd_parser.create_empty_dtd}.
* See also {!Pxp_dtd_parser.create_empty_dtd}, which creates a DTD
* from a {!Pxp_types.config} record.
*)
......
......@@ -10,7 +10,9 @@ open Pxp_types
open Pxp_dtd
val create_empty_dtd : config -> dtd
(** Create an empty DTD *)
(** Create an empty DTD. See also {!Pxp_dtd.create_dtd} for a lower-level
DTD constructor not requiring a full [config] record.
*)
val parse_dtd_entity : config -> Pxp_types.source -> dtd
(** Parse an entity containing a DTD (external subset), and return this DTD. *)
......
......@@ -74,9 +74,11 @@ val process_entity :
* - [`Entry_content]:
* Only events for contents are generated. They are terminated
* by [E_end_of_stream].
* - [`Entry_declaration]:
* - [`Entry_declarations]:
* Currently not supported. (But see {!Pxp_dtd_parser} for functions
* parsing DTDs.)
* - [`Entry_expr]: Do not pass this entry point! There is the specially
* crafted function {!Pxp_ev_parser.parse_expr} for it.
*
* The entry points have options, see {!Pxp_types.entry} for explanations.
*
......@@ -152,6 +154,7 @@ val process_expr :
*)
val create_pull_parser :
?close:((unit -> unit) ref) ->
config ->
entry ->
Pxp_entity_manager.entity_manager ->
......@@ -177,4 +180,13 @@ val create_pull_parser :
* let next = create_pull_parser cfg entry mng in
* let stream = Stream.from(fun _ -> next())
* ]}
*
* The optional argument [close] may be set to a variable, and the
* pull parser sets this variable to a function that closes the event
* stream immediately when invoked. This implies that all resources
* of the operating system (like files) are closed. The token stream
* is immediately ended (i.e. [None] is returned). The [close] argument
* exists to allow users to stop parsing at any point. If you make sure
* that the event stream is read until [E_end_of_stream] or [E_error] is
* encountered, it is not necessary to take care of closing the parser engine.
*)
......@@ -7,6 +7,7 @@
open Pxp_types
open Pxp_dtd
open Pxp_aux
open Printf
let to_list get_ev =
(* This function must be tail-recursive! *)
......@@ -219,6 +220,56 @@ let drop_ignorable_whitespace_filter get_ev =
get_ev'
;;
(* unwrap_document: removes the document wrapping from the event stream
 * [pull] and returns the pair [(get_doc_details, pull')].
 *
 * - [pull'] is [pull] passed through [pfilter] with a predicate that
 *   rejects E_start_doc, E_end_doc, E_start_super, E_end_super and
 *   E_end_of_stream, and accepts all other events. When the predicate
 *   sees E_error, it raises the exception carried by the event.
 * - [get_doc_details()] returns the payload [(v,dtd)] of the E_start_doc
 *   event. If nothing has been read from [pull] so far, it reads the
 *   first event itself. An event read this way that is not E_start_doc
 *   is not lost: it is kept and handed to the filter on the next call
 *   of [pull']. The function fails with [Failure] when no E_start_doc
 *   event has been seen.
 *)
let unwrap_document pull =
  let doc_details = ref None in
  let first_event_done = ref false in
  (* Holds an event that get_doc_details has read ahead and that still
   * has to pass the filter below. [None] means nothing is buffered.
   *)
  let lookahead = ref None in
  let pull_src () =
    (* Any read from the source counts as "first event done", so that
     * get_doc_details never steals an event from the middle of the
     * stream.
     *)
    first_event_done := true;
    match !lookahead with
    | Some _ as ev ->
        lookahead := None;
        ev
    | None ->
        pull ()
  in
  let get_doc_details () =
    if not !first_event_done then begin
      first_event_done := true;
      (* [pull] returns an [event option], hence the [Some] pattern *)
      match pull () with
      | Some (E_start_doc (v, dtd)) ->
          (* The filter rejects E_start_doc anyway, so there is no need
           * to buffer it.
           *)
          doc_details := Some (v, dtd)
      | ev ->
          (* Not the expected event: keep it for pull'. The lookup
           * below fails in this case.
           *)
          lookahead := ev
    end;
    match !doc_details with
    | None ->
        failwith "Pxp_event.unwrap_document: No E_start_doc event found"
    | Some details ->
        details
  in
  let pull' =
    pfilter
      (function
        | E_start_doc (v, dtd) ->
            doc_details := Some (v, dtd);
            false
        | E_end_doc | E_start_super | E_end_super | E_end_of_stream ->
            false
        | E_error e ->
            raise e
        | _ ->
            true
      )
      pull_src
  in
  (get_doc_details, pull')
;;
(* namespace_split: alias of Pxp_aux.namespace_split, made available
 * under this module's name.
 *)
let namespace_split = Pxp_aux.namespace_split
;;
(* extract_prefix: alias of Pxp_aux.extract_prefix, made available
 * under this module's name.
 *)
let extract_prefix = Pxp_aux.extract_prefix
;;
type dtd_style =
[ `Ignore
| `Include
......@@ -554,3 +605,41 @@ let write_events ?default ?(dtd_style = `Include) ?(minimization=`None) =
wr_dsp false default dtd_style minimization ;;
let display_events ?(dtd_style = `Include) ?(minimization=`None) =
wr_dsp true None dtd_style minimization ;;
(* string_of_event: returns a single-line, human-readable rendering of
 * the event [e], intended for debugging output.
 *
 * Objects (DTD, namespace scope, entity ID) are shown as "<n>" where n
 * is the object ID returned by Oo.id. Character data and comments are
 * passed through String.escaped and put into double quotes. The
 * returned string never ends with a newline; callers add line breaks
 * themselves (e.g. by using print_endline).
 *)
let string_of_event e =
  match e with
  | E_start_doc(v,dtd) ->
      sprintf "E_start_doc(%s,<%d>)" v (Oo.id dtd)
  | E_end_doc ->
      "E_end_doc"
  | E_start_tag(name,attlist,scope_opt,entid) ->
      sprintf "E_start_tag(%s,%s,%s,<%d>)"
        name
        (* attributes as space-separated "name=value" pairs *)
        (String.concat " " (List.map (fun (n,v) -> n ^ "=" ^ v) attlist))
        (match scope_opt with
         | None -> "None"
         | Some scope -> sprintf "<%d>" (Oo.id scope)
        )
        (Oo.id entid)
  | E_end_tag(name,entid) ->
      sprintf "E_end_tag(%s,<%d>)" name (Oo.id entid)
  | E_start_super ->
      "E_start_super"
  | E_end_super ->
      "E_end_super"
  | E_char_data data ->
      sprintf "E_char_data(\"%s\")" (String.escaped data)
  | E_pinstr(target,data,entid) ->
      sprintf "E_pinstr(%s,%s,<%d>)" target data (Oo.id entid)
  | E_pinstr_member(target,data,entid) ->
      sprintf "E_pinstr_member(%s,%s,<%d>)" target data (Oo.id entid)
  | E_comment data ->
      sprintf "E_comment(\"%s\")" (String.escaped data)
  | E_position(ent,line,col) ->
      sprintf "E_position(%s,%d,%d)" ent line col
  | E_error e ->
      sprintf "E_error(%s)" (Pxp_types.string_of_exn e)
  | E_end_of_stream ->
      "E_end_of_stream"
;;
......@@ -76,7 +76,10 @@ val extract : event -> (unit -> event option) -> (unit -> event option)
(** {2 Filters} *)
type filter = (unit -> event option) -> (unit -> event option)
type pull_fn = unit -> event option
(** The result type of {!Pxp_ev_parser.create_pull_parser} *)
type filter = pull_fn -> pull_fn
(** A filter transforms a pull function into another pull function *)
val norm_cdata_filter : filter
......@@ -112,9 +115,50 @@ val pfilter : (event -> bool) -> filter
* {[ pfilter (function E_comment _ -> false | _ -> true) g ]}
*)
val unwrap_document : pull_fn -> ((unit -> (string * Pxp_dtd.dtd)) * pull_fn)
(** This filter removes the document wrapping from the stream
(see {!Intro_events.docs} for a definition what this is).