logo       
Google Custom Search
    AddThis Social Bookmark Button

Memory Leak in POE::Component::Server::TCP: msg#00057

Subject: Memory Leak in POE::Component::Server::TCP
Allen, could the memory leak you discovered in POE::Filter::Reference be the cause of the memory leak in PoCo::Server::TCP?

I wrote a a spider based on Randal Schwartz's POE article. This version of the spider uses PoCo::TCP::Server I've included the code below. The spider works well but it has a big leak. The more URLs that the spider looks at, the higher the RAM goes up. It reaches something like 31MB of RAM after spidering several thousands of URLs and never frees up the memory.

I'm cleaning up the $heap and I am not using any session aliases, but that hasn't helped. I've included the code below to see if somebody can help identify the leak.

#!/usr/bin/perl

use warnings;
use strict;
$|++;
use POE;
use POE::Component::Server::TCP;
use URI;
use Data::Dumper;
use constant ListenAddr => '127.0.0.1';
use constant ListenPort => '31399';

my $KIDMAX         = 10;
my $MAX_DEPTH      = 50; #Used only as default if max is not defined
my $MAX_CONCURENCY = -1;

# This Spider Handles:
# -Connection Timeout (25 secs)
# -Redirects (Follows as defined by driver / package)
# -Max File Size (550000 bytes)
# -Max Depth (Dependant on input, default 50)
#
# Could fall in spider trap: Sessions, https pages


POE::Component::Server::TCP->new(
 Address            => ListenAddr,
 Port               => ListenPort,
 Concurrency        => $MAX_CONCURENCY,
 ClientConnected    => sub {
my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION];
                           },
 ClientDisconnected => sub {
my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION];
                             delete $heap->{TODO};
                             delete $heap->{KIDS};
                             delete $heap->{BASE};
                             delete $heap->{DONE};
                             delete $heap->{client};
                             my $dumper = Dumper($heap);
$kernel->post( $heap->{client_id} => "shutdown" );
                             print "\n\nDISCONNECTING: $dumper\n\n";
                           },
ClientInput        => sub {
my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION];

#$kernel->alias_set('Spider API'.int(rand(time)));

                             my $url = '';

                             ($url) = split(' ', $_[ARG0]);

$url ne '' ? $kernel->yield('StartSpider', $url) : $kernel->yield('shutdown');
                           },
 InlineStates       => {
                             StartSpider => sub {
my ( $kernel, $heap, $session, $url) = @_[ KERNEL, HEAP, SESSION, ARG0]; print('SESS:'.$session->ID.", QUERY: '$url'\n"); #Caller's responsbility to specify where to start

                               $url = scalar(make_canonical($url));
                               print("Canonicalized: $url\n");

                               push @{$heap->{TODO}}, $url;

                               $heap->{KIDS}    = 0;
                               print("Initialized kid to 0\n");

                               my $uri         = new URI($url);
                               my $domain_name = $uri->authority;
                               $domain_name    =~ s/www\.//;
$heap->{BASE} = $uri->scheme."://(www\\.)?$domain_name";
                               print("\$heap->{BASE} = $heap->{BASE}\n");

                               $heap->{DONE}{$url}++;
print('SESS:' . $session->ID . ' $heap->{DONE} has ' . keys(%{$heap->{DONE}}) ." elements\n");

                               $kernel->yield("ReadySpider", "initial");
print("Ready to spider: Yielding to ReadySpider\n");
                             },

                             ReadySpider => sub {
my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];

                               if( not exists $heap->{client})
                               {
                                 $kernel->yield('shutdown');
                                 return;
                               }
                               print("$heap->{KIDS} running / $KIDMAX max\n");
                               return if $heap->{KIDS} >= $KIDMAX;
print("Not enough kids are running, run more!\n");
                               return unless my $url = shift @{$heap->{TODO}};
                               print("Check whether $url is valid\n");

                               $heap->{KIDS}++;

                               $kernel->yield('DownloadPage', $url);
                               $kernel->yield("ReadySpider", "looping");
print("Finishing ReadySpider, going to DownloadPage with $url, and looping around for another url to spider!\n");
                             },

                             DownloadPage => sub {
my ( $kernel, $heap, $session, $url) = @_[ KERNEL, HEAP, SESSION, ARG0];

                               if( not exists $heap->{client})
                               {
                                 $kernel->yield('shutdown');
                                 return;
                               }
print("Entering DownloadPage state with $url\n");

                               use LWP::UserAgent;
                               my $ua = LWP::UserAgent->new;
$ua->max_size(550000); # if file bigger than max_size, will retrieve all bytes up to max_size $ua->max_redirect(0); # do not follow redirects $ua->timeout(25); # allow up to 25 seconds to get connection $ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'); # latest user agent as of 01/20/06

                               my $response = $ua->get($url);
                               my $valid = 0;
                               my @links;
                               print("Got $url\n");

                              if ($response->is_success)
                               {
                                 print("Successful response\n");

                                 if ($response->base =~ m{^$heap->{BASE}})
                                 {
                                   print("Internal link: $url\n");

                                  if ($response->content_type eq "text/html")
                                  {
                                     $valid = 1;
                                     print("Valid URL: $url\n");
                                     $heap->{client}->put("INT::$url");
                                     require HTML::SimpleLinkExtor;
my $e = HTML::SimpleLinkExtor->new($response->base);
                                     $e->parse($response->content);
                                     @links = grep m{^http:}, $e->links;
                                  }
                                 }
                               }

$kernel->yield('SpiderDone', [$url], [\@links, $valid]); print("Exiting DownloadPage towards SpiderDone\n");
                             },

                             SpiderDone  => sub {
my ( $kernel, $heap, $session ) = @_[ KERNEL, HEAP, SESSION ];

                               $heap->{KIDS}--;
                               if( not exists $heap->{client})
                               {
                                 $kernel->yield('shutdown');
                                 return;
                               }
# if client already disconnected, this may happen if the max count or time has been reached, # and the client has disconnected, but there are still children processesrunning print("Client still connected, let's give them some links\n");

                               my ($request,$response) = @_[ARG0,ARG1];
                               my ($url) = @$request;
                               my @links = @{$response->[0]};
                               my $valid = $response->[1];

                               for (@links)
                               {
# processing links extracted from page. If link is internal, push on to TODO heap, if link is external, # feed back to client in EXT::url -> ext_url format
                                 my $new_url = scalar(make_canonical($_));
print("Processing $new_url for EXT or INT\n");

push @{$heap->{TODO}}, $new_url if (not $heap->{DONE}{$new_url}++) and ($new_url =~ m{^$heap->{BASE}} and $valid);

$heap->{client}->put("EXT::$url -> $new_url") if($new_url !~ m{^$heap->{BASE}} and $valid);
                                 #output if not of the same host
                               }

                               if(not @{$heap->{TODO}} and not $heap->{KIDS})
                               {
                                 $kernel->yield("shutdown");
                                 print("Max reached or no more to spider\n");
                                 return;
                               }

                               $kernel->yield("ReadySpider", "child done");
print("Exiting SpiderDone, going back to ReadySpider for another kid\n");
                             },
 }
);

sub make_canonical {          # not a POE
   my $uri = URI->new(shift);
   $uri->canonical();
   $uri->fragment(undef);      # toss fragment
   return $uri->canonical->as_string; # return value
}

POE::Kernel->run();





Try Searching:
servers, voip, java, networking, microsoft ...
<Prev in Thread] Current Thread [Next in Thread>