The following patch should make sure that HTML::Parser does not
produce badly encoded SVs. That avoids the problem demonstrated, but I
still need to track down why perl itself segfaulted because of this.
Regards,
Gisle
Index: util.c
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/util.c,v
retrieving revision 2.20
retrieving revision 2.21
diff -u -p -r2.20 -r2.21
--- util.c 8 Nov 2004 14:14:35 -0000 2.20
+++ util.c 10 Nov 2004 13:32:56 -0000 2.21
@@ -209,23 +209,21 @@ decode_entities(pTHX_ SV* sv, HV* entity
}
if (!SvUTF8(sv) && repl_utf8) {
- STRLEN len = t - SvPVX(sv);
- if (len) {
- /* need to upgrade the part that we have looked though */
- STRLEN old_len = len;
- char *ustr = bytes_to_utf8(SvPVX(sv), &len);
- STRLEN grow = len - old_len;
- if (grow) {
- /* XXX It might already be enough gap, so we don't need this,
- but it should not hurt either.
- */
- grow_gap(aTHX_ sv, grow, &t, &s, &end);
- Copy(ustr, SvPVX(sv), len, char);
- t = SvPVX(sv) + len;
- }
- Safefree(ustr);
- }
+ /* need to upgrade sv before we continue */
+ STRLEN before_gap_len = t - SvPVX(sv);
+ char *before_gap = bytes_to_utf8(SvPVX(sv), &before_gap_len);
+ STRLEN after_gap_len = end - s;
+ char *after_gap = bytes_to_utf8(s, &after_gap_len);
+
+ sv_setpvn(sv, before_gap, before_gap_len);
+ sv_catpvn(sv, after_gap, after_gap_len);
SvUTF8_on(sv);
+
+ Safefree(before_gap);
+ Safefree(after_gap);
+
+ s = t = SvPVX(sv) + before_gap_len;
+ end = SvPVX(sv) + before_gap_len + after_gap_len;
}
else if (SvUTF8(sv) && !repl_utf8) {
repl = bytes_to_utf8(repl, &repl_len);
Index: t/uentities.t
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/t/uentities.t,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -p -r1.8 -r1.9
--- t/uentities.t 8 Nov 2004 14:14:42 -0000 1.8
+++ t/uentities.t 10 Nov 2004 13:33:03 -0000 1.9
@@ -14,7 +14,7 @@ unless (&HTML::Entities::UNICODE_SUPPORT
exit;
}
-print "1..13\n";
+print "1..14\n";
print "not " unless decode_entities("&euro") eq "\x{20AC}";
print "ok 1\n";
@@ -90,3 +90,6 @@ print "ok 12\n";
print "not " unless decode_entities("�") eq chr(0xFFFD);
print "ok 13\n";
+
+print "not " unless decode_entities("\260&rsquo;\260") eq "\x{b0}\x{2019}\x{b0}";
+print "ok 14\n";
|