Commit e4438bae authored by gerd's avatar gerd

Fix: autodetection of the encoding


git-svn-id: https://godirepo.camlcity.org/svn/lib-pxp/trunk@742 dbe99aee-44db-0310-b2b3-d33182c8eb97
parent 287b0860
......@@ -63,6 +63,9 @@ for PXP; if you are looking for the stable distribution, please go
`Entry_element_content. A new `Entry_content with different
semantics has been added, now conforming to a standard production.
</p>
<p>Improvement: The parser also accepts a BOM encoded as a UTF-8 sequence.
Also, the autodetection of the encoding for UTF-16 has been
enhanced.</p>
</li>
<li><p><em>1.2.0test*:</em> New ~minimization option for the
[write] and [display] methods (user wish).</p>
......
......@@ -129,17 +129,35 @@ class virtual resolve_general
* "UTF-16-LE": UTF-16/UCS-2 encoding little endian
* "UTF-8": UTF-8 encoding
*
* Note: Four bytes are required for cases not yet handled
* (e.g. UTF-32).
* Returns the number of bytes to eat up in the buffer
*)
if String.sub s 0 2 = "\254\255" then
encoding <- `Enc_utf16
(* Note: Netconversion.recode will detect the big endianness, too *)
else if String.sub s 0 2 = "\255\254" then
encoding <- `Enc_utf16
(* Note: Netconversion.recode will detect the little endianness, too *)
else
encoding <- `Enc_utf8
if String.sub s 0 2 = "\254\255" then (
encoding <- `Enc_utf16_be;
2
)
else if String.sub s 0 4 = "\000\060\000\063" then (
encoding <- `Enc_utf16_be;
0
)
else if String.sub s 0 2 = "\255\254" then (
encoding <- `Enc_utf16_le;
2
)
else if String.sub s 0 4 = "\060\000\063\000" then (
encoding <- `Enc_utf16_le;
0
)
else if String.sub s 0 3 = "\239\187\191" then (
(* That's the unusual case of a byte order mark in UTF-8 encoding.
This is not mentioned in the XML standard, but Unicode allows it.
*)
encoding <- `Enc_utf8;
3
)
else (
encoding <- `Enc_utf8;
0
)
method private virtual next_string : string -> int -> int -> int
......@@ -242,7 +260,10 @@ class virtual resolve_general
* but we try to switch to direct reading later.
*)
refill();
if !buf_end >= 4 && not encoding_requested then self # autodetect !buf;
if !buf_end >= 4 && not encoding_requested then (
let n_skip = self # autodetect !buf in
buf_beg := !buf_beg + n_skip;
);
(* Ensure that [n >= 6], the longest UTF-8 character, so we can always
* put at least one character into [s]
......@@ -323,8 +344,10 @@ class virtual resolve_general
in
buf_eof := (n=0)
done;
if Netbuffer.length buf >= 4 then
self # autodetect (Netbuffer.contents buf);
if Netbuffer.length buf >= 4 then (
let n_skip = self # autodetect (Netbuffer.contents buf) in
Netbuffer.delete buf 0 n_skip
)
);
let lexbuf =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment