/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.data.Stat;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category(LargeTests.class)
public class TestMasterFailover {
  private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);

  /**
   * Complex test of master failover that tests as many permutations of the
   * different possible states that regions in transition could be in within ZK.
   * <p>
   * This tests the proper handling of these states by the failed-over master
   * and includes a thorough testing of the timeout code as well.
   * <p>
   * Starts with a single master and three regionservers.
   * <p>
   * Creates two tables, enabledTable and disabledTable, each containing several
   * regions.  The disabledTable is then disabled.
   * <p>
   * After reaching steady-state, the master is killed.  We then mock several
   * states in ZK.
   * <p>
   * After mocking them, we will start up a new master which should become the
   * active master and also detect that it is a failover.  The primary test
   * passing condition will be that all regions of the enabled table are
   * assigned and all the regions of the disabled table are not assigned.
   * <p>
   * The different scenarios to be tested are below:
   * <p>
   * <b>ZK State:  OFFLINE</b>
   * <p>A node can get into OFFLINE state if</p>
   * <ul>
   * <li>An RS fails to open a region, so it reverts the state back to OFFLINE
   * <li>The Master is assigning the region to an RS before it sends the RPC
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Master has assigned an enabled region but the RS failed, so the region is
   *     not assigned anywhere and is sitting in ZK as OFFLINE</li>
   * <li>This scenario covers both of the causes above</li>
   * </ul>
   * <p>
   * <b>ZK State:  CLOSING</b>
   * <p>A node can get into CLOSING state if</p>
   * <ul>
   * <li>An RS has begun to close a region
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region of enabled table was being closed but did not complete
   * <li>Region of disabled table was being closed but did not complete
   * </ul>
   * <p>
   * <b>ZK State:  CLOSED</b>
   * <p>A node can get into CLOSED state if</p>
   * <ul>
   * <li>An RS has completed closing a region but it is not yet acknowledged by the master
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region of a table that should be enabled was closed on an RS
   * <li>Region of a table that should be disabled was closed on an RS
   * </ul>
   * <p>
   * <b>ZK State:  OPENING</b>
   * <p>A node can get into OPENING state if</p>
   * <ul>
   * <li>An RS has begun to open a region
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>RS was opening a region of enabled table but never finished
   * </ul>
   * <p>
   * <b>ZK State:  OPENED</b>
   * <p>A node can get into OPENED state if</p>
   * <ul>
   * <li>An RS has finished opening a region but it is not yet acknowledged by the master
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region of a table that should be enabled was opened on an RS
   * <li>Region of a table that should be disabled was opened on an RS
   * </ul>
   * @throws Exception
   */
  @Test (timeout=240000)
  public void testMasterFailoverWithMockedRIT() throws Exception {

    final int NUM_MASTERS = 1;
    final int NUM_RS = 3;

    // Create config to use for this cluster
    Configuration conf = HBaseConfiguration.create();

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    log("Cluster started");

    // Create a ZKW to use in the test
    ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);

    // get all the master threads
    List<MasterThread> masterThreads = cluster.getMasterThreads();
    assertEquals(1, masterThreads.size());

    // only one master thread, let's wait for it to be initialized
    assertTrue(cluster.waitForActiveAndReadyMaster());
    HMaster master = masterThreads.get(0).getMaster();
    assertTrue(master.isActiveMaster());
    assertTrue(master.isInitialized());

    // disable load balancing on this master
    master.balanceSwitch(false);

    // create two tables in META, each split into multiple regions by SPLIT_KEYS below
    byte [] FAMILY = Bytes.toBytes("family");
    byte [][] SPLIT_KEYS = new byte [][] {
        new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
        Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
        Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
        Bytes.toBytes("iii"), Bytes.toBytes("jjj")
    };

    byte [] enabledTable = Bytes.toBytes("enabledTable");
    HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
    htdEnabled.addFamily(new HColumnDescriptor(FAMILY));

    FileSystem filesystem = FileSystem.get(conf);
    Path rootdir = FSUtils.getRootDir(conf);
    FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
    // Write the .tableinfo
    fstd.createTableDescriptor(htdEnabled);

    HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
    createRegion(hriEnabled, rootdir, conf, htdEnabled);

    List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);

    TableName disabledTable = TableName.valueOf("disabledTable");
    HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
    htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
    // Write the .tableinfo
    fstd.createTableDescriptor(htdDisabled);
    HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
    createRegion(hriDisabled, rootdir, conf, htdDisabled);
    List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);

    TableName tableWithMergingRegions = TableName.valueOf("tableWithMergingRegions");
    TEST_UTIL.createTable(tableWithMergingRegions, FAMILY, new byte [][] {Bytes.toBytes("m")});

    log("Regions in hbase:meta and namespace have been created");

    // at this point we only expect 4 regions to be assigned out
    // (hbase:meta, the namespace region, and the 2 merging regions)
    assertEquals(4, cluster.countServedRegions());

    // Move merging regions to the same region server
    AssignmentManager am = master.getAssignmentManager();
    RegionStates regionStates = am.getRegionStates();
    List<HRegionInfo> mergingRegions = regionStates.getRegionsOfTable(tableWithMergingRegions);
    assertEquals(2, mergingRegions.size());
    HRegionInfo a = mergingRegions.get(0);
    HRegionInfo b = mergingRegions.get(1);
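    // Region info the merged region of a and b would get; used further down only to fake a
    // MERGING znode in ZK and to check MERGING_NEW state -- no actual merge is performed here.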
    HRegionInfo newRegion = RegionMergeTransaction.getMergedRegionInfo(a, b);
    ServerName mergingServer = regionStates.getRegionServerOfRegion(a);
    ServerName serverB = regionStates.getRegionServerOfRegion(b);
    if (!serverB.equals(mergingServer)) {
      RegionPlan plan = new RegionPlan(b, serverB, mergingServer);
      am.balance(plan);
      assertTrue(am.waitForAssignment(b));
    }

    // Let's just assign everything to the first RS
    HRegionServer hrs = cluster.getRegionServer(0);
    ServerName serverName = hrs.getServerName();
    HRegionInfo closingRegion = enabledRegions.remove(0);
    // we'll need some regions to already be assigned out properly on the live RS
    List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    enabledAndAssignedRegions.add(enabledRegions.remove(0));
    enabledAndAssignedRegions.add(enabledRegions.remove(0));
    enabledAndAssignedRegions.add(closingRegion);

    List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    disabledAndAssignedRegions.add(disabledRegions.remove(0));
    disabledAndAssignedRegions.add(disabledRegions.remove(0));

    // now actually assign them
    for (HRegionInfo hri : enabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, serverName));
      master.assignRegion(hri);
    }
    for (HRegionInfo hri : disabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, serverName));
      master.assignRegion(hri);
    }

    // wait for no more regions in transition (RIT)
    log("Waiting for assignment to finish");
    ZKAssign.blockUntilNoRIT(zkw);
    log("Assignment completed");

    // Stop the master
    log("Aborting master");
    cluster.abortMaster(0);
    cluster.waitOnMaster(0);
    log("Master has aborted");

    /*
     * Now, let's start mocking up some weird states as described in the method
     * javadoc.
     */

    List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
    List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();

    log("Beginning to mock scenarios");

    // Disable the disabledTable in ZK
    ZKTable zktable = new ZKTable(zkw);
    zktable.setDisabledTable(disabledTable);

    /*
     * ZK = OFFLINE
     */

    // Region that should be assigned but is not and is in ZK as OFFLINE
    // Cause: This can happen if the master crashed after creating the znode but before sending the
    // request to the region server
    HRegionInfo region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);

    /*
     * ZK = CLOSING
     */
    // Cause: Same as OFFLINE.
    regionsThatShouldBeOnline.add(closingRegion);
    ZKAssign.createNodeClosing(zkw, closingRegion, serverName);

    /*
     * ZK = CLOSED
     */

    // Region of enabled table closed but not acknowledged
    // Cause: Master was down while the region server updated the ZK status.
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    int version = ZKAssign.createNodeClosing(zkw, region, serverName);
    ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

    // Region of disabled table closed but not acknowledged
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    version = ZKAssign.createNodeClosing(zkw, region, serverName);
    ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

    /*
     * ZK = OPENED
     */

    // Region of enabled table was opened on the RS
    // Cause: Same as OFFLINE.
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);
    ProtobufUtil.openRegion(hrs, hrs.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        break;
      }
      Thread.sleep(100);
    }

    // Region of disabled table was opened on the RS
    // Cause: Master failed while updating the status for this region server.
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);
    ProtobufUtil.openRegion(hrs, hrs.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        break;
      }
      Thread.sleep(100);
    }

    /*
     * ZK = MERGING
     */

    // Regions of the table with merging regions
    // Cause: Master was down while the merge was going on
    RegionMergeTransaction.createNodeMerging(
      zkw, newRegion, mergingServer, a, b);

    /*
     * ZK = NONE
     */

    /*
     * DONE MOCKING
     */

    log("Done mocking data up in ZK");

    // Start up a new master
    log("Starting up a new master");
    master = cluster.startMaster().getMaster();
    log("Waiting for master to be ready");
    cluster.waitForActiveAndReadyMaster();
    log("Master is ready");

    // Get new region states since the master restarted
    regionStates = master.getAssignmentManager().getRegionStates();
    // Merging regions should remain merging
    assertTrue(regionStates.isRegionInState(a, State.MERGING));
    assertTrue(regionStates.isRegionInState(b, State.MERGING));
    assertTrue(regionStates.isRegionInState(newRegion, State.MERGING_NEW));
    // Now remove the faked merging znode; the merging regions should be
    // offlined automatically, otherwise it is a bug in the AM.
    ZKAssign.deleteNodeFailSilent(zkw, newRegion);

    // Failover should be completed, now wait for no RIT
    log("Waiting for no more RIT");
    ZKAssign.blockUntilNoRIT(zkw);
    log("No more RIT in ZK, now doing final test verification");

    // Grab all the regions that are online across RSs
    Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
    for (JVMClusterUtil.RegionServerThread rst :
      cluster.getRegionServerThreads()) {
      onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
    }

    // Now, everything that should be online should be online
    for (HRegionInfo hri : regionsThatShouldBeOnline) {
      assertTrue(onlineRegions.contains(hri));
    }

    // Everything that should be offline should not be online
    for (HRegionInfo hri : regionsThatShouldBeOffline) {
      if (onlineRegions.contains(hri)) {
        LOG.debug(hri);
      }
      assertFalse(onlineRegions.contains(hri));
    }

    log("Done with verification, all passed, shutting down cluster");

    // Done, shutdown the cluster
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Complex test of master failover that tests as many permutations of the
   * different possible states that regions in transition could be in within ZK
   * pointing to an RS that has died while no master is around to process it.
   * <p>
   * This tests the proper handling of these states by the failed-over master
   * and includes a thorough testing of the timeout code as well.
   * <p>
   * Starts with a single master and two regionservers.
   * <p>
   * Creates two tables, enabledTable and disabledTable, each containing several
   * regions.  The disabledTable is then disabled.
   * <p>
   * After reaching steady-state, the master is killed.  We then mock several
   * states in ZK, and one of the RSs is killed.
   * <p>
   * After mocking them and killing an RS, we will start up a new master which
   * should become the active master and also detect that it is a failover.  The
   * primary test passing condition will be that all regions of the enabled
   * table are assigned and all the regions of the disabled table are not
   * assigned.
   * <p>
   * The different scenarios to be tested are below:
   * <p>
   * <b>ZK State:  CLOSING</b>
   * <p>A node can get into CLOSING state if</p>
   * <ul>
   * <li>An RS has begun to close a region
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region was being closed but the RS died before finishing the close
   * </ul>
   * <b>ZK State:  OPENED</b>
   * <p>A node can get into OPENED state if</p>
   * <ul>
   * <li>An RS has finished opening a region but it is not yet acknowledged by the master
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region of a table that should be enabled was opened by a now-dead RS
   * <li>Region of a table that should be disabled was opened by a now-dead RS
   * </ul>
   * <p>
   * <b>ZK State:  NONE</b>
   * <p>A region could have no transition node if</p>
   * <ul>
   * <li>The server hosting the region died and no master processed it
   * </ul>
   * <p>We will mock the scenarios</p>
   * <ul>
   * <li>Region of enabled table was on a dead RS that was not yet processed
   * <li>Region of disabled table was on a dead RS that was not yet processed
   * </ul>
   * @throws Exception
   */
  @Test (timeout=180000)
  public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {

    final int NUM_MASTERS = 1;
    final int NUM_RS = 2;

    // Create and start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
    Configuration conf = TEST_UTIL.getConfiguration();

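    // Assumption: lowering these thresholds lets the master finish startup while waiting on
    // as few as one live region server, since the second RS is hard-killed later in this test.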
    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    log("Cluster started");

    // Create a ZKW to use in the test
    ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
        "unittest", new Abortable() {

          @Override
          public void abort(String why, Throwable e) {
            LOG.error("Fatal ZK Error: " + why, e);
            org.junit.Assert.assertFalse("Fatal ZK error", true);
          }

          @Override
          public boolean isAborted() {
            return false;
          }

    });

    // get all the master threads
    List<MasterThread> masterThreads = cluster.getMasterThreads();
    assertEquals(1, masterThreads.size());

    // only one master thread, let's wait for it to be initialized
    assertTrue(cluster.waitForActiveAndReadyMaster());
    HMaster master = masterThreads.get(0).getMaster();
    assertTrue(master.isActiveMaster());
    assertTrue(master.isInitialized());

    // disable load balancing on this master
    master.balanceSwitch(false);

    // create two tables in META, each with 30 regions
    byte [] FAMILY = Bytes.toBytes("family");
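    // Assumption: getRegionSplitStartKeys returns 30 region start keys spread between
    // "aaa" and "zzz", so each table below ends up with the 30 regions mentioned above.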
    byte[][] SPLIT_KEYS =
        TEST_UTIL.getRegionSplitStartKeys(Bytes.toBytes("aaa"), Bytes.toBytes("zzz"), 30);

    byte [] enabledTable = Bytes.toBytes("enabledTable");
    HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
    htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
    FileSystem filesystem = FileSystem.get(conf);
    Path rootdir = FSUtils.getRootDir(conf);
    FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
    // Write the .tableinfo
    fstd.createTableDescriptor(htdEnabled);
    HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(),
        null, null);
    createRegion(hriEnabled, rootdir, conf, htdEnabled);

    List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);

    TableName disabledTable =
        TableName.valueOf("disabledTable");
    HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
    htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
    // Write the .tableinfo
    fstd.createTableDescriptor(htdDisabled);
    HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
    createRegion(hriDisabled, rootdir, conf, htdDisabled);

    List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);

    log("Regions in hbase:meta and namespace have been created");

    // at this point we only expect 2 regions to be assigned out (hbase:meta and namespace)
    assertEquals(2, cluster.countServedRegions());

    // The first RS will stay online
    List<RegionServerThread> regionservers =
      cluster.getRegionServerThreads();
    HRegionServer hrs = regionservers.get(0).getRegionServer();

    // The second RS is going to be hard-killed
    RegionServerThread hrsDeadThread = regionservers.get(1);
    HRegionServer hrsDead = hrsDeadThread.getRegionServer();
    ServerName deadServerName = hrsDead.getServerName();

    // we'll need some regions to already be assigned out properly on the live RS
    List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    enabledAndAssignedRegions.addAll(enabledRegions.subList(0, 6));
    enabledRegions.removeAll(enabledAndAssignedRegions);
    List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    disabledAndAssignedRegions.addAll(disabledRegions.subList(0, 6));
    disabledRegions.removeAll(disabledAndAssignedRegions);

    // now actually assign them
    for (HRegionInfo hri : enabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, hrs.getServerName()));
      master.assignRegion(hri);
    }
    for (HRegionInfo hri : disabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, hrs.getServerName()));
      master.assignRegion(hri);
    }

    log("Waiting for assignment to finish");
    ZKAssign.blockUntilNoRIT(zkw);
    master.assignmentManager.waitUntilNoRegionsInTransition(60000);
    log("Assignment completed");

    assertTrue("Table must be enabled.", master.getAssignmentManager()
        .getZKTable().isEnabledTable(TableName.valueOf("enabledTable")));
    // we also need regions assigned out on the dead server
    List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
    enabledAndOnDeadRegions.addAll(enabledRegions.subList(0, 6));
    enabledRegions.removeAll(enabledAndOnDeadRegions);
    List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
    disabledAndOnDeadRegions.addAll(disabledRegions.subList(0, 6));
    disabledRegions.removeAll(disabledAndOnDeadRegions);

    // set the region plan to the server to be killed and trigger assignment
    for (HRegionInfo hri : enabledAndOnDeadRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, deadServerName));
      master.assignRegion(hri);
    }
    for (HRegionInfo hri : disabledAndOnDeadRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, deadServerName));
      master.assignRegion(hri);
    }

    // wait for no more RIT
    log("Waiting for assignment to finish");
    ZKAssign.blockUntilNoRIT(zkw);
    master.assignmentManager.waitUntilNoRegionsInTransition(60000);
    log("Assignment completed");

    // Because master.assignRegion(hri) can fail to assign a region to the specified RS,
    // we need to make sure the regions actually ended up on the expected RS
    verifyRegionLocation(hrs, enabledAndAssignedRegions);
    verifyRegionLocation(hrs, disabledAndAssignedRegions);
    verifyRegionLocation(hrsDead, enabledAndOnDeadRegions);
    verifyRegionLocation(hrsDead, disabledAndOnDeadRegions);

    assertTrue("Didn't get enough regions of enabledTable on live rs.",
      enabledAndAssignedRegions.size() >= 2);
    assertTrue("Didn't get enough regions of disabledTable on live rs.",
      disabledAndAssignedRegions.size() >= 2);
    assertTrue("Didn't get enough regions of enabledTable on dead rs.",
      enabledAndOnDeadRegions.size() >= 2);
    assertTrue("Didn't get enough regions of disabledTable on dead rs.",
      disabledAndOnDeadRegions.size() >= 2);

    // Stop the master
    log("Aborting master");
    cluster.abortMaster(0);
    cluster.waitOnMaster(0);
    log("Master has aborted");

    /*
     * Now, let's start mocking up some weird states as described in the method
     * javadoc.
     */

    List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
    List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();

    log("Beginning to mock scenarios");

    // Disable the disabledTable in ZK
    ZKTable zktable = new ZKTable(zkw);
    zktable.setDisabledTable(disabledTable);

    assertTrue("The enabled table should be identified on master failover.",
        zktable.isEnabledTable(TableName.valueOf("enabledTable")));

    /*
     * ZK = CLOSING
     */

    // Region of enabled table being closed on the dead RS but not finished
    HRegionInfo region = enabledAndOnDeadRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeClosing(zkw, region, deadServerName);
    LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
        region + "\n\n");

    // Region of disabled table being closed on the dead RS but not finished
    region = disabledAndOnDeadRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeClosing(zkw, region, deadServerName);
    LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
        region + "\n\n");

    /*
     * ZK = CLOSED
     */

    // Region of enabled table on the dead server gets closed but not ack'd by the master
    region = enabledAndOnDeadRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
    ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
    LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
        region + "\n\n");

    // Region of disabled table on the dead server gets closed but not ack'd by the master
    region = disabledAndOnDeadRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
    ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
    LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
        region + "\n\n");

    /*
     * ZK = OPENING
     */

    // RS was opening a region of enabled table then died
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
    LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
        region + "\n\n");

    // RS was opening a region of disabled table then died
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
    LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
        region + "\n\n");

    /*
     * ZK = OPENED
     */

    // Region of enabled table was opened on the dead RS
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        break;
      }
      Thread.sleep(100);
    }
    LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
        region + "\n\n");

    // Region of disabled table was opened on the dead RS
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        break;
      }
      Thread.sleep(100);
    }
    LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
        region + "\n\n");

    /*
     * ZK = NONE
     */

    // Region of enabled table was open at steady-state on the dead RS
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
        LOG.debug("DELETED " + rt);
        break;
      }
      Thread.sleep(100);
    }
    LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
        + "\n" + region + "\n\n");

    // Region of disabled table was open at steady-state on the dead RS
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeOffline(zkw, region, deadServerName);
    ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
        break;
      }
      Thread.sleep(100);
    }
    LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
      + "\n" + region + "\n\n");

    /*
     * DONE MOCKING
     */

    log("Done mocking data up in ZK");

    // Kill the RS that had a hard death
    log("Killing RS " + deadServerName);
    hrsDead.abort("Killing for unit test");
    log("RS " + deadServerName + " killed");

    // Start up a new master.  Wait until the regionserver is completely down
    // before starting the new master because of HBASE-4511.
    while (hrsDeadThread.isAlive()) {
      Threads.sleep(10);
    }
    log("Starting up a new master");
    master = cluster.startMaster().getMaster();
    log("Waiting for master to be ready");
    assertTrue(cluster.waitForActiveAndReadyMaster());
    log("Master is ready");

    // Wait until SSH processing completed for the dead server.
    while (master.getServerManager().areDeadServersInProgress()) {
      Thread.sleep(10);
    }

    // Failover should be completed, now wait for no RIT
    log("Waiting for no more RIT");
    ZKAssign.blockUntilNoRIT(zkw);
    log("No more RIT in ZK");
    long now = System.currentTimeMillis();
    long maxTime = 120000;
    boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
    if (!done) {
      RegionStates regionStates = master.getAssignmentManager().getRegionStates();
      LOG.info("rit=" + regionStates.getRegionsInTransition());
    }
    long elapsed = System.currentTimeMillis() - now;
    assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
      elapsed < maxTime);
    log("No more RIT in RIT map, doing final test verification");

    // Grab all the regions that are online across RSs
    Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
    now = System.currentTimeMillis();
    maxTime = 30000;
    for (JVMClusterUtil.RegionServerThread rst :
        cluster.getRegionServerThreads()) {
      try {
        HRegionServer rs = rst.getRegionServer();
        while (!rs.getRegionsInTransitionInRS().isEmpty()) {
          elapsed = System.currentTimeMillis() - now;
          assertTrue("Test timed out in getting online regions", elapsed < maxTime);
          if (rs.isAborted() || rs.isStopped()) {
            // This region server is stopped, skip it.
            break;
          }
          Thread.sleep(100);
        }
        onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rs));
      } catch (RegionServerStoppedException e) {
        LOG.info("Got RegionServerStoppedException", e);
      }
    }

    // Now, everything that should be online should be online
    for (HRegionInfo hri : regionsThatShouldBeOnline) {
      assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
        onlineRegions.contains(hri));
    }

    // Everything that should be offline should not be online
    for (HRegionInfo hri : regionsThatShouldBeOffline) {
      assertFalse(onlineRegions.contains(hri));
    }

    log("Done with verification, all passed, shutting down cluster");

    // Done, shutdown the cluster
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Filter the passed region list down to the regions that are actually online on the given
   * region server (master.assignRegion() does not guarantee the region plan is honored, so
   * callers assert afterwards that enough regions landed on the expected server).
   */
  private void verifyRegionLocation(HRegionServer hrs, List<HRegionInfo> regions)
      throws IOException {
    List<HRegionInfo> tmpOnlineRegions = ProtobufUtil.getOnlineRegions(hrs);
    Iterator<HRegionInfo> itr = regions.iterator();
    while (itr.hasNext()) {
      HRegionInfo tmp = itr.next();
      if (!tmpOnlineRegions.contains(tmp)) {
        itr.remove();
      }
    }
  }

  HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
      final HTableDescriptor htd)
      throws IOException {
    HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
    // The above call to create a region will create an hlog file.  Each log file
    // creation also starts a running thread to do syncing.  We need to close out
    // this log or we will have a running thread trying to sync the file system
    // continuously, which is ugly when dfs is taken away at the end of the test.
    HRegion.closeHRegion(r);
    return r;
  }

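  /**
   * Possible refactor (sketch only, not currently invoked): the polling loops above that wait
   * for an RS to move a region's znode to RS_ZK_REGION_OPENED could be factored out into a
   * helper like this one.  It only uses calls already exercised elsewhere in this test.
   */
  private void waitUntilRegionOpenedInZK(ZooKeeperWatcher zkw, HRegionInfo region)
      throws Exception {
    while (true) {
      byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
      RegionTransition rt = RegionTransition.parseFrom(bytes);
      if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
        break;
      }
      Thread.sleep(100);
    }
  }
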
  // TODO: Next test to add is with testing permutations of the RIT or the RS
  //       killed are hosting ROOT and hbase:meta regions.

  private void log(String string) {
    LOG.info("\n\n" + string + " \n\n");
  }

  @Test (timeout=180000)
  public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
      throws Exception {
    LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
    final int NUM_MASTERS = 1;
    final int NUM_RS = 2;

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
    Configuration conf = TEST_UTIL.getConfiguration();
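    // Assumption: -1 disables the master info web UI, presumably to avoid port clashes when
    // the mini cluster is restarted later in this test.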
    conf.setInt("hbase.master.info.port", -1);

    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

    // Find the regionserver carrying meta.
    List<RegionServerThread> regionServerThreads =
      cluster.getRegionServerThreads();
    int count = -1;
    HRegion metaRegion = null;
    for (RegionServerThread regionServerThread : regionServerThreads) {
      HRegionServer regionServer = regionServerThread.getRegionServer();
      metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
      count++;
      regionServer.abort("");
      if (null != metaRegion) break;
    }
    HRegionServer regionServer = cluster.getRegionServer(count);

    TEST_UTIL.shutdownMiniHBaseCluster();

    // Create a ZKW to use in the test
    ZooKeeperWatcher zkw =
      HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
          metaRegion, regionServer.getServerName());

    LOG.info("Starting cluster for second time");
    TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);

    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
    while (!master.isInitialized()) {
      Thread.sleep(100);
    }
    // Failover should be completed, now wait for no RIT
    log("Waiting for no more RIT");
    ZKAssign.blockUntilNoRIT(zkw);

    zkw.close();
    // Stop the cluster
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * This tests that a region in transition (RIT) in OFFLINE state gets re-assigned
   * after a master restart.
   */
  @Test(timeout=240000)
  public void testOfflineRegionReAssignedAfterMasterRestart() throws Exception {
    final TableName table = TableName.valueOf("testOfflineRegionReAssignedAfterMasterRestart");
    final int NUM_MASTERS = 1;
    final int NUM_RS = 2;

    // Create config to use for this cluster
    Configuration conf = HBaseConfiguration.create();

    // Start the cluster
    final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    log("Cluster started");

    TEST_UTIL.createTable(table, Bytes.toBytes("family"));
    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
    RegionStates regionStates = master.getAssignmentManager().getRegionStates();
    HRegionInfo hri = regionStates.getRegionsOfTable(table).get(0);
    ServerName serverName = regionStates.getRegionServerOfRegion(hri);
    TEST_UTIL.assertRegionOnServer(hri, serverName, 200);

    ServerName dstName = null;
    for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
      if (!tmpServer.equals(serverName)) {
        dstName = tmpServer;
        break;
      }
    }
    // make sure we found a different server
    assertTrue(dstName != null);
    // shutdown HBase cluster
    TEST_UTIL.shutdownMiniHBaseCluster();
    // create a RIT node in offline state
    ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
    ZKAssign.createNodeOffline(zkw, hri, dstName);
    Stat stat = new Stat();
    byte[] data =
        ZKAssign.getDataNoWatch(zkw, hri.getEncodedName(), stat);
    assertTrue(data != null);
    RegionTransition rt = RegionTransition.parseFrom(data);
    assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);

    LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
        + " and dst server=" + dstName);

    // start HBase cluster
    TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);

    while (true) {
      master = TEST_UTIL.getHBaseCluster().getMaster();
      if (master != null && master.isInitialized()) {
        ServerManager serverManager = master.getServerManager();
        if (!serverManager.areDeadServersInProgress()) {
          break;
        }
      }
      Thread.sleep(200);
    }

    // verify the region is assigned
    master = TEST_UTIL.getHBaseCluster().getMaster();
    master.getAssignmentManager().waitForAssignment(hri);
    regionStates = master.getAssignmentManager().getRegionStates();
    RegionState newState = regionStates.getRegionState(hri);
    assertTrue(newState.isOpened());
  }

  /**
   * Simple test of master failover.
   * <p>
   * Starts with three masters.  Kills a backup master.  Then kills the active
   * master.  Ensures the final master becomes active and we can still contact
   * the cluster.
   * @throws Exception
   */
  @Test (timeout=240000)
  public void testSimpleMasterFailover() throws Exception {

    final int NUM_MASTERS = 3;
    final int NUM_RS = 3;

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();

    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

    // get all the master threads
    List<MasterThread> masterThreads = cluster.getMasterThreads();

    // wait for each to come online
    for (MasterThread mt : masterThreads) {
      assertTrue(mt.isAlive());
    }

    // verify only one is the active master and we have the right number
    int numActive = 0;
    int activeIndex = -1;
    ServerName activeName = null;
    HMaster active = null;
    for (int i = 0; i < masterThreads.size(); i++) {
      if (masterThreads.get(i).getMaster().isActiveMaster()) {
        numActive++;
        activeIndex = i;
        active = masterThreads.get(activeIndex).getMaster();
        activeName = active.getServerName();
      }
    }
    assertEquals(1, numActive);
    assertEquals(NUM_MASTERS, masterThreads.size());
    LOG.info("Active master " + activeName);

    // Check that ClusterStatus reports the correct active and backup masters
    assertNotNull(active);
    ClusterStatus status = active.getClusterStatus();
    assertTrue(status.getMaster().equals(activeName));
    assertEquals(2, status.getBackupMastersSize());
    assertEquals(2, status.getBackupMasters().size());

    // attempt to stop one of the inactive masters
    int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
    HMaster master = cluster.getMaster(backupIndex);
    LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
    cluster.stopMaster(backupIndex, false);
    cluster.waitOnMaster(backupIndex);

    // Verify still one active master and it's the same
    for (int i = 0; i < masterThreads.size(); i++) {
      if (masterThreads.get(i).getMaster().isActiveMaster()) {
        assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
        activeIndex = i;
        active = masterThreads.get(activeIndex).getMaster();
      }
    }
    assertEquals(1, numActive);
    assertEquals(2, masterThreads.size());
    int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
    LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " region servers");
    assertEquals(3, rsCount);

    // Check that ClusterStatus reports the correct active and backup masters
    assertNotNull(active);
    status = active.getClusterStatus();
    assertTrue(status.getMaster().equals(activeName));
    assertEquals(1, status.getBackupMastersSize());
    assertEquals(1, status.getBackupMasters().size());

    // kill the active master
    LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
    cluster.stopMaster(activeIndex, false);
    cluster.waitOnMaster(activeIndex);

    // wait for an active master to show up and be ready
    assertTrue(cluster.waitForActiveAndReadyMaster());

    LOG.debug("\n\nVerifying backup master is now active\n");
    // should only have one master now
    assertEquals(1, masterThreads.size());

    // and it should be active
    active = masterThreads.get(0).getMaster();
    assertNotNull(active);
    status = active.getClusterStatus();
    ServerName mastername = status.getMaster();
    assertTrue(mastername.equals(active.getServerName()));
    assertTrue(active.isActiveMaster());
    assertEquals(0, status.getBackupMastersSize());
    assertEquals(0, status.getBackupMasters().size());
    int rss = status.getServersSize();
    LOG.info("Active master " + mastername.getServerName() + " managing " +
      rss + " region servers");
    assertEquals(3, rss);

    // Stop the cluster
    TEST_UTIL.shutdownMiniCluster();
  }
}