|
|
Choosing A Webhost: |
HTML::Entities and WinLatin1 NCRs [PATCH]: msg#00007lang.perl.modules.lwp
Hi -- I use the HTML::Entities module quite a bit and have really appreciated its support for Unicode characters > 256 with Perl 5.8. I do have one particular issue that crops up for me, and I thought it might affect others as well, so I'm including a crude set of patches with my "fix". In short, I have to support HTML documents authored by a wide variety of people, and over time they've accumulated numeric character references to the troublesome set of characters between 128 and 159, mostly due to authors working on Windows platforms. The same documents now may also have character references to the Unicode code points for those characters. Here's a simple example: "two &#151; em &#8212; dashes". Now, in my particular situation, I sometimes want to decode these entities to the same code point, so that, for example, I can match strings against each other. At first I thought I might get away with this: $a = Encode::encode('utf8', $a); # force no utf8 flag HTML::Entities::decode_entities($a); $a = Encode::decode('cp1252', $a) unless (Encode::is_utf8($a)); But while that will turn "&#151;" into U+2014, it turns "&#151;&#8212;" into U+0097 U+2014, which doesn't help. So, I whacked into place a decode_entities_cp1252() function that decodes any numeric character references in the 128-159 range (except for a couple of undefined ones) to the UTF-8 equivalents. I'm positive there are nicer, more elegant, and probably more flexible ways to do this, but lacking additional time to experiment, this is where I stopped. I pondered briefly trying to allow any character set mapping to be applied to these characters, but concluded that using WinLatin1 (a.k.a. Microsoft code page 1252) was actually sufficient to match what most/all modern browsers do with these character references. For example, this test page: http://www.fifi.org/doc/lynx/test/c1.html on my Linux Mozilla 1.7 browser displays matching columns of glyphs, so Mozilla seems to be mapping these WinLatin1 characters to their Unicode equivalents. 
Further, a test at our offices on a variety of Windows and Mac browsers and systems didn't find any that failed to display all these characters "properly". Here's another gratuitous link: http://home.earthlink.net/~bobbau/platforms/specialchars/ Well, here's my hacky patch for version 3.50, FWIW. Thanks for the great modules! Chris. ================================================ cp1252.patch --- lib/HTML/Entities.pm.orig 2006-03-06 12:18:12.272613000 -0500 +++ lib/HTML/Entities.pm 2006-03-06 12:18:42.950260000 -0500 @@ -127,7 +127,7 @@ require Exporter; @ISA = qw(Exporter); -@EXPORT = qw(encode_entities decode_entities _decode_entities); +@EXPORT = qw(encode_entities decode_entities decode_entities_cp1252 _decode_entities); @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); $VERSION = sprintf("%d.%02d", q$Revision: 1.32 $ =~ /(\d+)\.(\d+)/); --- MANIFEST.orig 2006-03-06 13:19:55.364120000 -0500 +++ MANIFEST 2006-03-06 13:20:13.055791000 -0500 @@ -40,6 +40,7 @@ t/dtext.t Test dtext decoding of entities t/entities.t Test encoding/decoding of entities t/entities2.t Test _decode_entities() +t/entities3.t Test decode_entities_cp1252() t/filter-methods.t Test ignore_tags, ignore_elements methods. t/filter.t Test HTML::Filter t/handler-eof.t Test invocation of $p->eof in handlers --- Parser.xs.orig 2006-03-06 11:53:43.401973000 -0500 +++ Parser.xs 2006-03-06 12:05:53.599678000 -0500 @@ -489,7 +489,24 @@ ST(i) = sv_2mortal(newSVsv(ST(i))); else if (SvREADONLY(ST(i))) croak("Can't inline decode readonly string"); - decode_entities(aTHX_ ST(i), entity2char, 0); + decode_entities(aTHX_ ST(i), entity2char, 0, 0); + } + SP += items; + +void +decode_entities_cp1252(...) 
+ PREINIT: + int i; + HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE); + PPCODE: + if (GIMME_V == G_SCALAR && items > 1) + items = 1; + for (i = 0; i < items; i++) { + if (GIMME_V != G_VOID) + ST(i) = sv_2mortal(newSVsv(ST(i))); + else if (SvREADONLY(ST(i))) + croak("Can't inline decode readonly string"); + decode_entities(aTHX_ ST(i), entity2char, 0, 1); } SP += items; @@ -514,7 +531,7 @@ } if (SvREADONLY(string)) croak("Can't inline decode readonly string"); - decode_entities(aTHX_ string, entities_hv, allow_unterminated); + decode_entities(aTHX_ string, entities_hv, allow_unterminated, 0); bool _probably_utf8_chunk(string) --- hparser.c.orig 2006-03-06 15:31:33.228418000 -0500 +++ hparser.c 2006-03-06 15:31:44.401579000 -0500 @@ -465,7 +465,7 @@ if (p_state->utf8_mode) sv_utf8_decode(attrval); #endif - decode_entities(aTHX_ attrval, p_state->entity2char, 0); + decode_entities(aTHX_ attrval, p_state->entity2char, 0, 0); if (p_state->utf8_mode) SvUTF8_off(attrval); } @@ -537,7 +537,7 @@ if (p_state->utf8_mode) sv_utf8_decode(arg); #endif - decode_entities(aTHX_ arg, p_state->entity2char, 1); + decode_entities(aTHX_ arg, p_state->entity2char, 1, 0); if (p_state->utf8_mode) SvUTF8_off(arg); } --- util.c.orig 2006-03-06 14:07:52.686794000 -0500 +++ util.c 2006-03-06 14:07:55.647755000 -0500 @@ -11,6 +11,37 @@ #endif +#ifdef UNICODE_HTML_PARSER +#define CP1252_MAX_LEN 3 + +static const int cp1252_len[32] = +{ + 3, 0, 3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 0, 2, 0, + 0, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 0, 2, 2 +}; + +static const unsigned char cp1252_utf8[32][CP1252_MAX_LEN] = +{ + { 0xE2, 0x82, 0xAC }, { 0, 0, 0 }, + { 0xE2, 0x80, 0x9A }, { 0xC6, 0x92, 0 }, + { 0xE2, 0x80, 0x9E }, { 0xE2, 0x80, 0xA6 }, + { 0xE2, 0x80, 0xA0 }, { 0xE2, 0x80, 0xA1 }, + { 0xCB, 0x86, 0 }, { 0xE2, 0x80, 0xB0 }, + { 0xC5, 0xA0, 0 }, { 0xE2, 0x80, 0xB9 }, + { 0xC5, 0x92, 0 }, { 0, 0, 0 }, + { 0xC5, 0xBD, 0 }, { 0, 0, 0 }, + { 0, 0, 0 }, { 0xE2, 0x80, 0x98 }, + { 0xE2, 0x80, 
0x99 }, { 0xE2, 0x80, 0x9C }, + { 0xE2, 0x80, 0x9D }, { 0xE2, 0x80, 0xA2 }, + { 0xE2, 0x80, 0x93 }, { 0xE2, 0x80, 0x94 }, + { 0xCB, 0x9C, 0 }, { 0xE2, 0x84, 0xA2 }, + { 0xC5, 0xA1, 0 }, { 0xE2, 0x80, 0xBA }, + { 0xC5, 0x93, 0 }, { 0, 0, 0 }, + { 0xC5, 0xBE, 0 }, { 0xC5, 0xB8, 0 } +}; +#endif + + EXTERN SV* sv_lower(pTHX_ SV* sv) { @@ -63,7 +94,7 @@ } EXTERN SV* -decode_entities(pTHX_ SV* sv, HV* entity2char, bool allow_unterminated) +decode_entities(pTHX_ SV* sv, HV* entity2char, bool allow_unterminated, bool cp1252) { STRLEN len; char *s = SvPV_force(sv, len); @@ -132,7 +163,12 @@ } if (ok) { #ifdef UNICODE_HTML_PARSER - if (!SvUTF8(sv) && num <= 255) { + if (cp1252 && num >= 128 && num < 160 && cp1252_len[num & 0x7F] > 0) { + repl = (char*) cp1252_utf8[num & 0x7F]; + repl_len = cp1252_len[num & 0x7F]; + repl_utf8 = 1; + } + else if (!SvUTF8(sv) && num <= 255) { buf[0] = (char) num; repl = buf; repl_len = 1; ================================================ t/entities3.t use HTML::Entities qw(decode_entities_cp1252 encode_entities encode_entities_numeric); use Test::More tests => 6; $a = "Våre norske tegn bør æres"; decode_entities_cp1252($a); is($a, "Våre norske tegn bør æres"); encode_entities($a); is($a, "Våre norske tegn bør æres"); decode_entities_cp1252($a); encode_entities_numeric($a); is($a, "Våre norske tegn bør æres"); # See how well it does against CP1252 $ent = $hexent = $plain = ""; while (<DATA>) { next unless /^(0x[0-9a-f]{2})\t(0x[0-9a-f]{4})?/i; $hexnum = hex($1); $ent .= "&#$hexnum;"; $hexent .= sprintf("&#x%x;", $hexnum); $plain .= defined($2) ? 
chr(hex($2)) : chr($hexnum); } $a = $ent; decode_entities_cp1252($a); is($a, $plain); $a = $hexent; decode_entities_cp1252($a); is($a, $plain); # Decoding of ' is(decode_entities_cp1252("'"), "'"); __END__ # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT 0x80 0x20AC #EURO SIGN 0x81 #UNDEFINED 0x82 0x201A #SINGLE LOW-9 QUOTATION MARK 0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK 0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK 0x85 0x2026 #HORIZONTAL ELLIPSIS 0x86 0x2020 #DAGGER 0x87 0x2021 #DOUBLE DAGGER 0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT 0x89 0x2030 #PER MILLE SIGN 0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON 0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK 0x8C 0x0152 #LATIN CAPITAL LIGATURE OE 0x8D #UNDEFINED 0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON 0x8F #UNDEFINED 0x90 #UNDEFINED 0x91 0x2018 #LEFT SINGLE QUOTATION MARK 0x92 0x2019 #RIGHT SINGLE QUOTATION MARK 0x93 0x201C #LEFT DOUBLE QUOTATION MARK 0x94 0x201D #RIGHT DOUBLE QUOTATION MARK 0x95 0x2022 #BULLET 0x96 0x2013 #EN DASH 0x97 0x2014 #EM DASH 0x98 0x02DC #SMALL TILDE 0x99 0x2122 #TRADE MARK SIGN 0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON 0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 0x9C 0x0153 #LATIN SMALL LIGATURE OE 0x9D #UNDEFINED 0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON 0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS ================================================ -- GPG Key ID: 366A375B GPG Key Fingerprint: 485E 5041 17E1 E2BB C263 E4DE C8E3 FA36 366A 375B
|
|
| <Prev in Thread] | Current Thread | [Next in Thread> |
|---|---|---|
| Previous by Date: | LWP::Protocol::http bug with SSL returning 500 Can't read entity body, Wilson Snyder |
|---|---|
| Next by Date: | Re: HTML::Entities and WinLatin1 NCRs [PATCH], Gisle Aas |
| Previous by Thread: | LWP::Protocol::http bug with SSL returning 500 Can't read entity body, Wilson Snyder |
| Next by Thread: | Re: HTML::Entities and WinLatin1 NCRs [PATCH], Gisle Aas |
| Indexes: | [Date] [Thread] [Top] [All Lists] |
Free Magazines Cisco News Receive a free quarterly e-newsletter with exclusive articles on how Cisco IT uses its own products and solutions to enable the business. subscribe Systems Management News, the newspaper for IT systems administration and data center managers! Each issue of Systems Management News is chock-full of news and analysis to help you understand what's happening in your field. subscribe The Enterprise Newsweekly eWeek is the essential technology information source for builders of e-business. subscribe Oracle Magazine Oracle Magazine contains technology strategy articles, sample code, tips, Oracle and partner news, how to articles for developers and DBAs, and more. Oracle (NASDAQ: ORCL) is the world's largest enterprise software company. subscribe Total Telecom Total Telecom is "The Economist of the communications industry". subscribe |