Allen, could the memory leak you discovered in POE::Filter::Reference
be the cause of the memory leak in PoCo::Server::TCP?
I wrote a a spider based on Randal Schwartz's POE article. This version
of the spider uses PoCo::TCP::Server I've included the code below. The
spider works well but it has a big leak. The more URLs that the spider
looks at, the higher the RAM goes up. It reaches something like 31MB of
RAM after spidering several thousands of URLs and never frees up the
memory.
I'm cleaning up the $heap and I am not using any session aliases, but
that hasn't helped. I've included the code below to see if somebody can
help identify the leak.
#!/usr/bin/perl
use warnings;
use strict;
$|++;
use POE;
use POE::Component::Server::TCP;
use URI;
use Data::Dumper;
use constant ListenAddr => '127.0.0.1';
use constant ListenPort => '31399';
my $KIDMAX = 10;
my $MAX_DEPTH = 50; #Used only as default if max is not defined
my $MAX_CONCURENCY = -1;
# This Spider Handles:
# -Connection Timeout (25 secs)
# -Redirects (Follows as defined by driver / package)
# -Max File Size (550000 bytes)
# -Max Depth (Dependant on input, default 50)
#
# Could fall in spider trap: Sessions, https pages
POE::Component::Server::TCP->new(
Address => ListenAddr,
Port => ListenPort,
Concurrency => $MAX_CONCURENCY,
ClientConnected => sub {
my ( $kernel, $heap, $session ) = @_[
KERNEL, HEAP, SESSION];
},
ClientDisconnected => sub {
my ( $kernel, $heap, $session ) = @_[
KERNEL, HEAP, SESSION];
delete $heap->{TODO};
delete $heap->{KIDS};
delete $heap->{BASE};
delete $heap->{DONE};
delete $heap->{client};
my $dumper = Dumper($heap);
$kernel->post( $heap->{client_id} =>
"shutdown" );
print "\n\nDISCONNECTING: $dumper\n\n";
},
ClientInput => sub {
my ( $kernel, $heap, $session ) = @_[
KERNEL, HEAP, SESSION];
#$kernel->alias_set('Spider
API'.int(rand(time)));
my $url = '';
($url) = split(' ', $_[ARG0]);
$url ne '' ?
$kernel->yield('StartSpider', $url) : $kernel->yield('shutdown');
},
InlineStates => {
StartSpider => sub {
my ( $kernel, $heap, $session, $url) =
@_[ KERNEL, HEAP, SESSION, ARG0];
print('SESS:'.$session->ID.", QUERY:
'$url'\n");
#Caller's responsbility to specify
where to start
$url = scalar(make_canonical($url));
print("Canonicalized: $url\n");
push @{$heap->{TODO}}, $url;
$heap->{KIDS} = 0;
print("Initialized kid to 0\n");
my $uri = new URI($url);
my $domain_name = $uri->authority;
$domain_name =~ s/www\.//;
$heap->{BASE} =
$uri->scheme."://(www\\.)?$domain_name";
print("\$heap->{BASE} = $heap->{BASE}\n");
$heap->{DONE}{$url}++;
print('SESS:' . $session->ID . '
$heap->{DONE} has ' . keys(%{$heap->{DONE}}) ." elements\n");
$kernel->yield("ReadySpider", "initial");
print("Ready to spider: Yielding to
ReadySpider\n");
},
ReadySpider => sub {
my ( $kernel, $heap, $session ) = @_[
KERNEL, HEAP, SESSION ];
if( not exists $heap->{client})
{
$kernel->yield('shutdown');
return;
}
print("$heap->{KIDS} running / $KIDMAX max\n");
return if $heap->{KIDS} >= $KIDMAX;
print("Not enough kids are running, run
more!\n");
return unless my $url = shift @{$heap->{TODO}};
print("Check whether $url is valid\n");
$heap->{KIDS}++;
$kernel->yield('DownloadPage', $url);
$kernel->yield("ReadySpider", "looping");
print("Finishing ReadySpider, going to
DownloadPage with $url, and looping around for another url to
spider!\n");
},
DownloadPage => sub {
my ( $kernel, $heap, $session, $url) =
@_[ KERNEL, HEAP, SESSION, ARG0];
if( not exists $heap->{client})
{
$kernel->yield('shutdown');
return;
}
print("Entering DownloadPage state with
$url\n");
use LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->max_size(550000); # if file bigger
than max_size, will retrieve all bytes up to max_size
$ua->max_redirect(0); # do not follow
redirects
$ua->timeout(25); # allow up to 25
seconds to get connection
$ua->agent('Mozilla/4.0 (compatible;
MSIE 6.0; Windows NT 5.1; SV1)'); # latest user agent as of 01/20/06
my $response = $ua->get($url);
my $valid = 0;
my @links;
print("Got $url\n");
if ($response->is_success)
{
print("Successful response\n");
if ($response->base =~ m{^$heap->{BASE}})
{
print("Internal link: $url\n");
if ($response->content_type eq "text/html")
{
$valid = 1;
print("Valid URL: $url\n");
$heap->{client}->put("INT::$url");
require HTML::SimpleLinkExtor;
my $e =
HTML::SimpleLinkExtor->new($response->base);
$e->parse($response->content);
@links = grep m{^http:}, $e->links;
}
}
}
$kernel->yield('SpiderDone', [$url],
[\@links, $valid]);
print("Exiting DownloadPage towards
SpiderDone\n");
},
SpiderDone => sub {
my ( $kernel, $heap, $session ) = @_[
KERNEL, HEAP, SESSION ];
$heap->{KIDS}--;
if( not exists $heap->{client})
{
$kernel->yield('shutdown');
return;
}
# if client already disconnected, this
may happen if the max count or time has been reached,
# and the client has disconnected, but
there are still children processesrunning
print("Client still connected, let's
give them some links\n");
my ($request,$response) = @_[ARG0,ARG1];
my ($url) = @$request;
my @links = @{$response->[0]};
my $valid = $response->[1];
for (@links)
{
# processing links extracted from page.
If link is internal, push on to TODO heap, if link is external,
# feed back to client in EXT::url ->
ext_url format
my $new_url = scalar(make_canonical($_));
print("Processing $new_url for EXT or
INT\n");
push @{$heap->{TODO}}, $new_url if
(not $heap->{DONE}{$new_url}++) and ($new_url =~ m{^$heap->{BASE}} and
$valid);
$heap->{client}->put("EXT::$url ->
$new_url") if($new_url !~ m{^$heap->{BASE}} and $valid);
#output if not of the same host
}
if(not @{$heap->{TODO}} and not $heap->{KIDS})
{
$kernel->yield("shutdown");
print("Max reached or no more to spider\n");
return;
}
$kernel->yield("ReadySpider", "child done");
print("Exiting SpiderDone, going back
to ReadySpider for another kid\n");
},
}
);
sub make_canonical { # not a POE
my $uri = URI->new(shift);
$uri->canonical();
$uri->fragment(undef); # toss fragment
return $uri->canonical->as_string; # return value
}
POE::Kernel->run();
|