Hi,
I have a three-machine JBoss Cache cluster. I see
normal operation and state merge when one of the JBoss Cache instances is
shut down and restarted. However, when I simulate a network partition
by disabling the network connection and re-enabling it a little later, the partitioned
node is not able to rejoin the group.
When I start all the machines (172.16.15.130,
172.16.15.136, 172.16.15.137), they are able to find each other, and JBoss Cache
inserts are replicated across the cluster as expected. When I disable the
network connection for 172.16.15.137 and re-enable it a little later,
172.16.15.137 is not able to rejoin the rest of the group members.
172.16.15.137 keeps sending heartbeat messages to 172.16.15.130. It does
not get any response from 172.16.15.130, and 172.16.15.137 ends up transmitting
SUSPECT messages again and again.
I think that I am making a mistake in the configuration.
Can you point it out? I have tested using JGroups version 2.2.9.2 as
well as JGroups version 2.4. 172.16.15.130 is a Windows XP machine, while
the rest are VMware Workstation instances running Windows 2000
Professional.
Thanks,
Shan
---------------------- Configuration for machine
(172.16.15.137)
-----------------------------------------------------------------------
<attribute name="ClusterConfig">
  <config>
    <!-- Protocol stack for the node whose TCP transport binds to 172.16.15.137. -->
    <TCP bind_addr="172.16.15.137" start_port="17910"/>

    <!-- initial_hosts names the two other machines (.130 and .136), not this node. -->
    <TCPPING initial_hosts="172.16.15.130[17910],172.16.15.136[17910]"
             port_range="5"
             timeout="13000"
             num_initial_members="2"
             up_thread="true"
             down_thread="true"/>

    <MERGE2 min_interval="15000" max_interval="20000"/>

    <!-- Both FD and pbcast.GMS below are configured with shun="true". -->
    <FD timeout="5500" max_tries="5" shun="true"
        up_thread="true" down_thread="true"/>
    <VERIFY_SUSPECT timeout="1500"
                    up_thread="false" down_thread="false"/>

    <pbcast.NAKACK gc_lag="100" retransmit_timeout="13000"
                   up_thread="true" down_thread="true"/>
    <pbcast.STABLE desired_avg_gossip="20000"
                   up_thread="false" down_thread="false"/>
    <pbcast.GMS join_timeout="15000" join_retry_timeout="5000"
                shun="true" print_local_addr="false"
                up_thread="true" down_thread="true"/>
    <pbcast.STATE_TRANSFER up_thread="true" down_thread="true"/>
  </config>
</attribute>
---------------------- Configuration for machine
(172.16.15.136)
-----------------------------------------------------------------------
<attribute name="ClusterConfig">
  <config>
    <!-- Protocol stack for the node whose TCP transport binds to 172.16.15.136. -->
    <TCP bind_addr="172.16.15.136" start_port="17910"/>

    <!-- initial_hosts names the two other machines (.130 and .137), not this node. -->
    <TCPPING initial_hosts="172.16.15.130[17910],172.16.15.137[17910]"
             port_range="5"
             timeout="13000"
             num_initial_members="2"
             up_thread="true"
             down_thread="true"/>

    <MERGE2 min_interval="15000" max_interval="20000"/>

    <!-- Both FD and pbcast.GMS below are configured with shun="true". -->
    <FD timeout="5500" max_tries="5" shun="true"
        up_thread="true" down_thread="true"/>
    <VERIFY_SUSPECT timeout="1500"
                    up_thread="false" down_thread="false"/>

    <pbcast.NAKACK gc_lag="100" retransmit_timeout="13000"
                   up_thread="true" down_thread="true"/>
    <pbcast.STABLE desired_avg_gossip="20000"
                   up_thread="false" down_thread="false"/>
    <pbcast.GMS join_timeout="15000" join_retry_timeout="5000"
                shun="true" print_local_addr="false"
                up_thread="true" down_thread="true"/>
    <pbcast.STATE_TRANSFER up_thread="true" down_thread="true"/>
  </config>
</attribute>
---------------------- Configuration for machine
(172.16.15.130)
-----------------------------------------------------------------------
<attribute name="ClusterConfig">
  <config>
    <!-- Protocol stack for the node whose TCP transport binds to 172.16.15.130. -->
    <TCP bind_addr="172.16.15.130" start_port="17910"/>

    <!-- initial_hosts names the two other machines (.136 and .137), not this node. -->
    <TCPPING initial_hosts="172.16.15.136[17910],172.16.15.137[17910]"
             port_range="5"
             timeout="13000"
             num_initial_members="2"
             up_thread="true"
             down_thread="true"/>

    <MERGE2 min_interval="15000" max_interval="20000"/>

    <!-- Both FD and pbcast.GMS below are configured with shun="true". -->
    <FD timeout="5500" max_tries="5" shun="true"
        up_thread="true" down_thread="true"/>
    <VERIFY_SUSPECT timeout="1500"
                    up_thread="false" down_thread="false"/>

    <pbcast.NAKACK gc_lag="100" retransmit_timeout="13000"
                   up_thread="true" down_thread="true"/>
    <pbcast.STABLE desired_avg_gossip="20000"
                   up_thread="false" down_thread="false"/>
    <pbcast.GMS join_timeout="15000" join_retry_timeout="5000"
                shun="true" print_local_addr="false"
                up_thread="true" down_thread="true"/>
    <pbcast.STATE_TRANSFER up_thread="true" down_thread="true"/>
  </config>
</attribute>