> You could run 'grep VERSION /usr/share/perl5/URI.pm' to get the
> version number out of the file.
[njh@njh ~]$ grep VERSION /usr/share/perl5/URI.pm
grep: /usr/share/perl5/URI.pm: No such file or directory
[njh@njh ~]$ grep VERSION /usr/lib/perl5/vendor_perl/5.8.6/URI.pm
use vars qw($VERSION);
$VERSION = "1.35"; # $Date: 2004/11/05 14:17:33 $
[njh@njh ~]$
It's taken 2 days of hard graft to nail down a small[ish]
program that will reproduce it. Probably it's a character
set decoding problem in my code. Here is is:
#!/usr/bin/perl -wT
use strict;
use HTML::SimpleLinkExtor;
use WWW::RobotRules::AnyDBM_File;
use LWP::RobotUA;
use LWP::Charset;
use Encode;
my $url = 'http://www5b.biglobe.ne.jp/~ubs/html/history.html';
my $rules =
WWW::RobotRules::AnyDBM_File->new('www.bandsman.co.uk/Spider',
'/tmp/robots.cache');
my $robot = LWP::RobotUA->new('www.bandsman.co.uk/Spider',
'njh@xxxxxxxxxxxxx', $rules);
$robot->timeout(20);
$robot->protocols_allowed(['http']); # disabling all others
$robot->env_proxy();
my $request = new HTTP::Request 'GET' => $url;
my $webdoc = $robot->simple_request($request);
my $content = $webdoc->content;
my $extor = HTML::SimpleLinkExtor->new($url);
unless($extor) {
die "Couldn't start extor\n";
}
my $charset = LWP::Charset::getCharset($webdoc);
if($charset) {
# print "$url: Charset is $charset\n";
if($charset =~ /(.+),/) {
$charset = $1;
}
if(Encode::resolve_alias($charset)) {
if($charset eq 'Shift_JIS') {
$content =
ShiftJIS::X0213::MapUTF::sjis2004_to_utf8($content);
$content = Encode::decode_utf8($content);
} elsif($charset ne 'us-ascii') {
$content = Encode::decode($charset, $content);
}
} else {
die "$url: Has an unknown character set: $charset\n";
}
}
$extor->parse($content);
URLLOOP: foreach ($extor->links) {
# print "Considering $_\n";
next URLLOOP if(/^(mailto|news|javascript|clsid):/i);
next URLLOOP if(/^(ftp:\/\/|\#.+)/i);
if(/^file:/i) {
# print "File protocol not supported since that does
not work over the Internet\n";
next URLLOOP;
}
# Remove any CGI arguments to get the bare page
# Watch the very broken
# http://www.mvlausen.ch/index.php
# Don't anchor - do the whole doc
my $page = $_;
$page =~ s/(\?|\#).*$//;
# Handle the equally broken
# http://watfordband.org.uk/~greg/band/news/
# which just keeps on scrolling back and back
if($page =~ /(.+\.php)\/.+/) {
$page = $1;
}
# Remove double slashes from the url. They
# are valid according to RFC2398, but they confuse us
# TODO: https
if($page =~ /^http:\/\/(.*\/\/)/) {
$page = $1;
$page =~ s/\/\//\//g;
$page = 'http://' . $page;
} elsif($page !~ /^http:\/\//) {
# Doesn't start http - we can remove double
# slash easily
if($page =~ /.*\/\/.*/) {
$page =~ s/\/\//\//g;
}
}
# print "Found: $page\n";
}
--
Nigel Horne. Arranger, Adjudicator, Band Trainer, Composer, Tutor,
Typesetter.
NJH Music, Barnsley, UK. ICQ#20252325
njh@xxxxxxxxxxxxxx http://www.bandsman.co.uk
|