1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.master;
20
21 import static org.junit.Assert.assertEquals;
22 import static org.junit.Assert.assertFalse;
23 import static org.junit.Assert.assertNotNull;
24 import static org.junit.Assert.assertTrue;
25
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Set;
31 import java.util.TreeSet;
32
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.apache.hadoop.conf.Configuration;
36 import org.apache.hadoop.fs.FileSystem;
37 import org.apache.hadoop.fs.Path;
38 import org.apache.hadoop.hbase.Abortable;
39 import org.apache.hadoop.hbase.ClusterStatus;
40 import org.apache.hadoop.hbase.HBaseConfiguration;
41 import org.apache.hadoop.hbase.HBaseTestingUtility;
42 import org.apache.hadoop.hbase.HColumnDescriptor;
43 import org.apache.hadoop.hbase.HRegionInfo;
44 import org.apache.hadoop.hbase.HTableDescriptor;
45 import org.apache.hadoop.hbase.LargeTests;
46 import org.apache.hadoop.hbase.MiniHBaseCluster;
47 import org.apache.hadoop.hbase.RegionTransition;
48 import org.apache.hadoop.hbase.ServerName;
49 import org.apache.hadoop.hbase.TableName;
50 import org.apache.hadoop.hbase.executor.EventType;
51 import org.apache.hadoop.hbase.master.RegionState.State;
52 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
53 import org.apache.hadoop.hbase.regionserver.HRegion;
54 import org.apache.hadoop.hbase.regionserver.HRegionServer;
55 import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
56 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
57 import org.apache.hadoop.hbase.util.Bytes;
58 import org.apache.hadoop.hbase.util.FSTableDescriptors;
59 import org.apache.hadoop.hbase.util.FSUtils;
60 import org.apache.hadoop.hbase.util.JVMClusterUtil;
61 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
62 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
63 import org.apache.hadoop.hbase.util.Threads;
64 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
65 import org.apache.hadoop.hbase.zookeeper.ZKTable;
66 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
67 import org.apache.zookeeper.data.Stat;
68 import org.junit.Test;
69 import org.junit.experimental.categories.Category;
70
71 @Category(LargeTests.class)
72 public class TestMasterFailover {
73 private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154 @Test (timeout=240000)
155 public void testMasterFailoverWithMockedRIT() throws Exception {
156
157 final int NUM_MASTERS = 1;
158 final int NUM_RS = 3;
159
160
161 Configuration conf = HBaseConfiguration.create();
162
163
164 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
165 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
166 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
167 log("Cluster started");
168
169
170 ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
171
172
173 List<MasterThread> masterThreads = cluster.getMasterThreads();
174 assertEquals(1, masterThreads.size());
175
176
177 assertTrue(cluster.waitForActiveAndReadyMaster());
178 HMaster master = masterThreads.get(0).getMaster();
179 assertTrue(master.isActiveMaster());
180 assertTrue(master.isInitialized());
181
182
183 master.balanceSwitch(false);
184
185
186 byte [] FAMILY = Bytes.toBytes("family");
187 byte [][] SPLIT_KEYS = new byte [][] {
188 new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
189 Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
190 Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
191 Bytes.toBytes("iii"), Bytes.toBytes("jjj")
192 };
193
194 byte [] enabledTable = Bytes.toBytes("enabledTable");
195 HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
196 htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
197
198 FileSystem filesystem = FileSystem.get(conf);
199 Path rootdir = FSUtils.getRootDir(conf);
200 FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
201
202 fstd.createTableDescriptor(htdEnabled);
203
204 HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
205 createRegion(hriEnabled, rootdir, conf, htdEnabled);
206
207 List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
208 TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
209
210 TableName disabledTable = TableName.valueOf("disabledTable");
211 HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
212 htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
213
214 fstd.createTableDescriptor(htdDisabled);
215 HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
216 createRegion(hriDisabled, rootdir, conf, htdDisabled);
217 List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
218 TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
219
220 TableName tableWithMergingRegions = TableName.valueOf("tableWithMergingRegions");
221 TEST_UTIL.createTable(tableWithMergingRegions, FAMILY, new byte [][] {Bytes.toBytes("m")});
222
223 log("Regions in hbase:meta and namespace have been created");
224
225
226
227 assertEquals(4, cluster.countServedRegions());
228
229
230 AssignmentManager am = master.getAssignmentManager();
231 RegionStates regionStates = am.getRegionStates();
232 List<HRegionInfo> mergingRegions = regionStates.getRegionsOfTable(tableWithMergingRegions);
233 assertEquals(2, mergingRegions.size());
234 HRegionInfo a = mergingRegions.get(0);
235 HRegionInfo b = mergingRegions.get(1);
236 HRegionInfo newRegion = RegionMergeTransaction.getMergedRegionInfo(a, b);
237 ServerName mergingServer = regionStates.getRegionServerOfRegion(a);
238 ServerName serverB = regionStates.getRegionServerOfRegion(b);
239 if (!serverB.equals(mergingServer)) {
240 RegionPlan plan = new RegionPlan(b, serverB, mergingServer);
241 am.balance(plan);
242 assertTrue(am.waitForAssignment(b));
243 }
244
245
246 HRegionServer hrs = cluster.getRegionServer(0);
247 ServerName serverName = hrs.getServerName();
248 HRegionInfo closingRegion = enabledRegions.remove(0);
249
250 List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
251 enabledAndAssignedRegions.add(enabledRegions.remove(0));
252 enabledAndAssignedRegions.add(enabledRegions.remove(0));
253 enabledAndAssignedRegions.add(closingRegion);
254
255 List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
256 disabledAndAssignedRegions.add(disabledRegions.remove(0));
257 disabledAndAssignedRegions.add(disabledRegions.remove(0));
258
259
260 for (HRegionInfo hri : enabledAndAssignedRegions) {
261 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
262 new RegionPlan(hri, null, serverName));
263 master.assignRegion(hri);
264 }
265 for (HRegionInfo hri : disabledAndAssignedRegions) {
266 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
267 new RegionPlan(hri, null, serverName));
268 master.assignRegion(hri);
269 }
270
271
272 log("Waiting for assignment to finish");
273 ZKAssign.blockUntilNoRIT(zkw);
274 log("Assignment completed");
275
276
277 log("Aborting master");
278 cluster.abortMaster(0);
279 cluster.waitOnMaster(0);
280 log("Master has aborted");
281
282
283
284
285
286
287 List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
288 List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
289
290 log("Beginning to mock scenarios");
291
292
293 ZKTable zktable = new ZKTable(zkw);
294 zktable.setDisabledTable(disabledTable);
295
296
297
298
299
300
301
302
303 HRegionInfo region = enabledRegions.remove(0);
304 regionsThatShouldBeOnline.add(region);
305 ZKAssign.createNodeOffline(zkw, region, serverName);
306
307
308
309
310
311 regionsThatShouldBeOnline.add(closingRegion);
312 ZKAssign.createNodeClosing(zkw, closingRegion, serverName);
313
314
315
316
317
318
319
320 region = enabledRegions.remove(0);
321 regionsThatShouldBeOnline.add(region);
322 int version = ZKAssign.createNodeClosing(zkw, region, serverName);
323 ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
324
325
326 region = disabledRegions.remove(0);
327 regionsThatShouldBeOffline.add(region);
328 version = ZKAssign.createNodeClosing(zkw, region, serverName);
329 ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
330
331
332
333
334
335
336
337 region = enabledRegions.remove(0);
338 regionsThatShouldBeOnline.add(region);
339 ZKAssign.createNodeOffline(zkw, region, serverName);
340 ProtobufUtil.openRegion(hrs, hrs.getServerName(), region);
341 while (true) {
342 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
343 RegionTransition rt = RegionTransition.parseFrom(bytes);
344 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
345 break;
346 }
347 Thread.sleep(100);
348 }
349
350
351
352 region = disabledRegions.remove(0);
353 regionsThatShouldBeOffline.add(region);
354 ZKAssign.createNodeOffline(zkw, region, serverName);
355 ProtobufUtil.openRegion(hrs, hrs.getServerName(), region);
356 while (true) {
357 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
358 RegionTransition rt = RegionTransition.parseFrom(bytes);
359 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
360 break;
361 }
362 Thread.sleep(100);
363 }
364
365
366
367
368
369
370
371 RegionMergeTransaction.createNodeMerging(
372 zkw, newRegion, mergingServer, a, b);
373
374
375
376
377
378
379
380
381
382 log("Done mocking data up in ZK");
383
384
385 log("Starting up a new master");
386 master = cluster.startMaster().getMaster();
387 log("Waiting for master to be ready");
388 cluster.waitForActiveAndReadyMaster();
389 log("Master is ready");
390
391
392 regionStates = master.getAssignmentManager().getRegionStates();
393
394 assertTrue(regionStates.isRegionInState(a, State.MERGING));
395 assertTrue(regionStates.isRegionInState(b, State.MERGING));
396 assertTrue(regionStates.isRegionInState(newRegion, State.MERGING_NEW));
397
398
399 ZKAssign.deleteNodeFailSilent(zkw, newRegion);
400
401
402 log("Waiting for no more RIT");
403 ZKAssign.blockUntilNoRIT(zkw);
404 log("No more RIT in ZK, now doing final test verification");
405
406
407 Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
408 for (JVMClusterUtil.RegionServerThread rst :
409 cluster.getRegionServerThreads()) {
410 onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
411 }
412
413
414 for (HRegionInfo hri : regionsThatShouldBeOnline) {
415 assertTrue(onlineRegions.contains(hri));
416 }
417
418
419 for (HRegionInfo hri : regionsThatShouldBeOffline) {
420 if (onlineRegions.contains(hri)) {
421 LOG.debug(hri);
422 }
423 assertFalse(onlineRegions.contains(hri));
424 }
425
426 log("Done with verification, all passed, shutting down cluster");
427
428
429 TEST_UTIL.shutdownMiniCluster();
430 }
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488 @Test (timeout=180000)
489 public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {
490
491 final int NUM_MASTERS = 1;
492 final int NUM_RS = 2;
493
494
495 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
496 Configuration conf = TEST_UTIL.getConfiguration();
497
498 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
499 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
500 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
501 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
502 log("Cluster started");
503
504
505 ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
506 "unittest", new Abortable() {
507
508 @Override
509 public void abort(String why, Throwable e) {
510 LOG.error("Fatal ZK Error: " + why, e);
511 org.junit.Assert.assertFalse("Fatal ZK error", true);
512 }
513
514 @Override
515 public boolean isAborted() {
516 return false;
517 }
518
519 });
520
521
522 List<MasterThread> masterThreads = cluster.getMasterThreads();
523 assertEquals(1, masterThreads.size());
524
525
526 assertTrue(cluster.waitForActiveAndReadyMaster());
527 HMaster master = masterThreads.get(0).getMaster();
528 assertTrue(master.isActiveMaster());
529 assertTrue(master.isInitialized());
530
531
532 master.balanceSwitch(false);
533
534
535 byte [] FAMILY = Bytes.toBytes("family");
536 byte[][] SPLIT_KEYS =
537 TEST_UTIL.getRegionSplitStartKeys(Bytes.toBytes("aaa"), Bytes.toBytes("zzz"), 30);
538
539 byte [] enabledTable = Bytes.toBytes("enabledTable");
540 HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
541 htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
542 FileSystem filesystem = FileSystem.get(conf);
543 Path rootdir = FSUtils.getRootDir(conf);
544 FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
545
546 fstd.createTableDescriptor(htdEnabled);
547 HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(),
548 null, null);
549 createRegion(hriEnabled, rootdir, conf, htdEnabled);
550
551 List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
552 TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
553
554 TableName disabledTable =
555 TableName.valueOf("disabledTable");
556 HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
557 htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
558
559 fstd.createTableDescriptor(htdDisabled);
560 HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
561 createRegion(hriDisabled, rootdir, conf, htdDisabled);
562
563 List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
564 TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
565
566 log("Regions in hbase:meta and Namespace have been created");
567
568
569 assertEquals(2, cluster.countServedRegions());
570
571
572 List<RegionServerThread> regionservers =
573 cluster.getRegionServerThreads();
574 HRegionServer hrs = regionservers.get(0).getRegionServer();
575
576
577 RegionServerThread hrsDeadThread = regionservers.get(1);
578 HRegionServer hrsDead = hrsDeadThread.getRegionServer();
579 ServerName deadServerName = hrsDead.getServerName();
580
581
582 List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
583 enabledAndAssignedRegions.addAll(enabledRegions.subList(0, 6));
584 enabledRegions.removeAll(enabledAndAssignedRegions);
585 List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
586 disabledAndAssignedRegions.addAll(disabledRegions.subList(0, 6));
587 disabledRegions.removeAll(disabledAndAssignedRegions);
588
589
590 for (HRegionInfo hri : enabledAndAssignedRegions) {
591 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
592 new RegionPlan(hri, null, hrs.getServerName()));
593 master.assignRegion(hri);
594 }
595 for (HRegionInfo hri : disabledAndAssignedRegions) {
596 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
597 new RegionPlan(hri, null, hrs.getServerName()));
598 master.assignRegion(hri);
599 }
600
601 log("Waiting for assignment to finish");
602 ZKAssign.blockUntilNoRIT(zkw);
603 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
604 log("Assignment completed");
605
606 assertTrue(" Table must be enabled.", master.getAssignmentManager()
607 .getZKTable().isEnabledTable(TableName.valueOf("enabledTable")));
608
609 List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
610 enabledAndOnDeadRegions.addAll(enabledRegions.subList(0, 6));
611 enabledRegions.removeAll(enabledAndOnDeadRegions);
612 List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
613 disabledAndOnDeadRegions.addAll(disabledRegions.subList(0, 6));
614 disabledRegions.removeAll(disabledAndOnDeadRegions);
615
616
617 for (HRegionInfo hri : enabledAndOnDeadRegions) {
618 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
619 new RegionPlan(hri, null, deadServerName));
620 master.assignRegion(hri);
621 }
622 for (HRegionInfo hri : disabledAndOnDeadRegions) {
623 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
624 new RegionPlan(hri, null, deadServerName));
625 master.assignRegion(hri);
626 }
627
628
629 log("Waiting for assignment to finish");
630 ZKAssign.blockUntilNoRIT(zkw);
631 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
632 log("Assignment completed");
633
634
635
636 verifyRegionLocation(hrs, enabledAndAssignedRegions);
637 verifyRegionLocation(hrs, disabledAndAssignedRegions);
638 verifyRegionLocation(hrsDead, enabledAndOnDeadRegions);
639 verifyRegionLocation(hrsDead, disabledAndOnDeadRegions);
640
641 assertTrue(" Didn't get enough regions of enabledTalbe on live rs.",
642 enabledAndAssignedRegions.size() >= 2);
643 assertTrue(" Didn't get enough regions of disalbedTable on live rs.",
644 disabledAndAssignedRegions.size() >= 2);
645 assertTrue(" Didn't get enough regions of enabledTalbe on dead rs.",
646 enabledAndOnDeadRegions.size() >= 2);
647 assertTrue(" Didn't get enough regions of disalbedTable on dead rs.",
648 disabledAndOnDeadRegions.size() >= 2);
649
650
651 log("Aborting master");
652 cluster.abortMaster(0);
653 cluster.waitOnMaster(0);
654 log("Master has aborted");
655
656
657
658
659
660
661 List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
662 List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
663
664 log("Beginning to mock scenarios");
665
666
667 ZKTable zktable = new ZKTable(zkw);
668 zktable.setDisabledTable(disabledTable);
669
670 assertTrue(" The enabled table should be identified on master fail over.",
671 zktable.isEnabledTable(TableName.valueOf("enabledTable")));
672
673
674
675
676
677
678 HRegionInfo region = enabledAndOnDeadRegions.remove(0);
679 regionsThatShouldBeOnline.add(region);
680 ZKAssign.createNodeClosing(zkw, region, deadServerName);
681 LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
682 region + "\n\n");
683
684
685 region = disabledAndOnDeadRegions.remove(0);
686 regionsThatShouldBeOffline.add(region);
687 ZKAssign.createNodeClosing(zkw, region, deadServerName);
688 LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
689 region + "\n\n");
690
691
692
693
694
695
696 region = enabledAndOnDeadRegions.remove(0);
697 regionsThatShouldBeOnline.add(region);
698 int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
699 ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
700 LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
701 region + "\n\n");
702
703
704 region = disabledAndOnDeadRegions.remove(0);
705 regionsThatShouldBeOffline.add(region);
706 version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
707 ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
708 LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
709 region + "\n\n");
710
711
712
713
714
715
716 region = enabledRegions.remove(0);
717 regionsThatShouldBeOnline.add(region);
718 ZKAssign.createNodeOffline(zkw, region, deadServerName);
719 ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
720 LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
721 region + "\n\n");
722
723
724 region = disabledRegions.remove(0);
725 regionsThatShouldBeOffline.add(region);
726 ZKAssign.createNodeOffline(zkw, region, deadServerName);
727 ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
728 LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
729 region + "\n\n");
730
731
732
733
734
735
736 region = enabledRegions.remove(0);
737 regionsThatShouldBeOnline.add(region);
738 ZKAssign.createNodeOffline(zkw, region, deadServerName);
739 ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
740 while (true) {
741 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
742 RegionTransition rt = RegionTransition.parseFrom(bytes);
743 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
744 break;
745 }
746 Thread.sleep(100);
747 }
748 LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
749 region + "\n\n");
750
751
752 region = disabledRegions.remove(0);
753 regionsThatShouldBeOffline.add(region);
754 ZKAssign.createNodeOffline(zkw, region, deadServerName);
755 ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
756 while (true) {
757 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
758 RegionTransition rt = RegionTransition.parseFrom(bytes);
759 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
760 break;
761 }
762 Thread.sleep(100);
763 }
764 LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
765 region + "\n\n");
766
767
768
769
770
771
772 region = enabledRegions.remove(0);
773 regionsThatShouldBeOnline.add(region);
774 ZKAssign.createNodeOffline(zkw, region, deadServerName);
775 ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
776 while (true) {
777 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
778 RegionTransition rt = RegionTransition.parseFrom(bytes);
779 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
780 ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
781 LOG.debug("DELETED " + rt);
782 break;
783 }
784 Thread.sleep(100);
785 }
786 LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
787 + "\n" + region + "\n\n");
788
789
790 region = disabledRegions.remove(0);
791 regionsThatShouldBeOffline.add(region);
792 ZKAssign.createNodeOffline(zkw, region, deadServerName);
793 ProtobufUtil.openRegion(hrsDead, hrsDead.getServerName(), region);
794 while (true) {
795 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
796 RegionTransition rt = RegionTransition.parseFrom(bytes);
797 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
798 ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
799 break;
800 }
801 Thread.sleep(100);
802 }
803 LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
804 + "\n" + region + "\n\n");
805
806
807
808
809
810 log("Done mocking data up in ZK");
811
812
813 log("Killing RS " + deadServerName);
814 hrsDead.abort("Killing for unit test");
815 log("RS " + deadServerName + " killed");
816
817
818
819 while (hrsDeadThread.isAlive()) {
820 Threads.sleep(10);
821 }
822 log("Starting up a new master");
823 master = cluster.startMaster().getMaster();
824 log("Waiting for master to be ready");
825 assertTrue(cluster.waitForActiveAndReadyMaster());
826 log("Master is ready");
827
828
829 while (master.getServerManager().areDeadServersInProgress()) {
830 Thread.sleep(10);
831 }
832
833
834 log("Waiting for no more RIT");
835 ZKAssign.blockUntilNoRIT(zkw);
836 log("No more RIT in ZK");
837 long now = System.currentTimeMillis();
838 long maxTime = 120000;
839 boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
840 if (!done) {
841 RegionStates regionStates = master.getAssignmentManager().getRegionStates();
842 LOG.info("rit=" + regionStates.getRegionsInTransition());
843 }
844 long elapsed = System.currentTimeMillis() - now;
845 assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
846 elapsed < maxTime);
847 log("No more RIT in RIT map, doing final test verification");
848
849
850 Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
851 now = System.currentTimeMillis();
852 maxTime = 30000;
853 for (JVMClusterUtil.RegionServerThread rst :
854 cluster.getRegionServerThreads()) {
855 try {
856 HRegionServer rs = rst.getRegionServer();
857 while (!rs.getRegionsInTransitionInRS().isEmpty()) {
858 elapsed = System.currentTimeMillis() - now;
859 assertTrue("Test timed out in getting online regions", elapsed < maxTime);
860 if (rs.isAborted() || rs.isStopped()) {
861
862 break;
863 }
864 Thread.sleep(100);
865 }
866 onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rs));
867 } catch (RegionServerStoppedException e) {
868 LOG.info("Got RegionServerStoppedException", e);
869 }
870 }
871
872
873 for (HRegionInfo hri : regionsThatShouldBeOnline) {
874 assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
875 onlineRegions.contains(hri));
876 }
877
878
879 for (HRegionInfo hri : regionsThatShouldBeOffline) {
880 assertFalse(onlineRegions.contains(hri));
881 }
882
883 log("Done with verification, all passed, shutting down cluster");
884
885
886 TEST_UTIL.shutdownMiniCluster();
887 }
888
889
890
891
892 private void verifyRegionLocation(HRegionServer hrs, List<HRegionInfo> regions)
893 throws IOException {
894 List<HRegionInfo> tmpOnlineRegions = ProtobufUtil.getOnlineRegions(hrs);
895 Iterator<HRegionInfo> itr = regions.iterator();
896 while (itr.hasNext()) {
897 HRegionInfo tmp = itr.next();
898 if (!tmpOnlineRegions.contains(tmp)) {
899 itr.remove();
900 }
901 }
902 }
903
904 HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
905 final HTableDescriptor htd)
906 throws IOException {
907 HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
908
909
910
911
912
913 HRegion.closeHRegion(r);
914 return r;
915 }
916
917
918
919
920 private void log(String string) {
921 LOG.info("\n\n" + string + " \n\n");
922 }
923
924 @Test (timeout=180000)
925 public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
926 throws Exception {
927 LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
928 final int NUM_MASTERS = 1;
929 final int NUM_RS = 2;
930
931
932 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
933 Configuration conf = TEST_UTIL.getConfiguration();
934 conf.setInt("hbase.master.info.port", -1);
935
936 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
937 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
938
939
940 List<RegionServerThread> regionServerThreads =
941 cluster.getRegionServerThreads();
942 int count = -1;
943 HRegion metaRegion = null;
944 for (RegionServerThread regionServerThread : regionServerThreads) {
945 HRegionServer regionServer = regionServerThread.getRegionServer();
946 metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
947 count++;
948 regionServer.abort("");
949 if (null != metaRegion) break;
950 }
951 HRegionServer regionServer = cluster.getRegionServer(count);
952
953 TEST_UTIL.shutdownMiniHBaseCluster();
954
955
956 ZooKeeperWatcher zkw =
957 HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
958 metaRegion, regionServer.getServerName());
959
960 LOG.info("Staring cluster for second time");
961 TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
962
963 HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
964 while (!master.isInitialized()) {
965 Thread.sleep(100);
966 }
967
968 log("Waiting for no more RIT");
969 ZKAssign.blockUntilNoRIT(zkw);
970
971 zkw.close();
972
973 TEST_UTIL.shutdownMiniCluster();
974 }
975
976
977
978
979 @Test(timeout=240000)
980 public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
981 final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
982 final int NUM_MASTERS = 1;
983 final int NUM_RS = 2;
984
985
986 Configuration conf = HBaseConfiguration.create();
987
988
989 final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
990 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
991 log("Cluster started");
992
993 TEST_UTIL.createTable(table, Bytes.toBytes("family"));
994 HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
995 RegionStates regionStates = master.getAssignmentManager().getRegionStates();
996 HRegionInfo hri = regionStates.getRegionsOfTable(table).get(0);
997 ServerName serverName = regionStates.getRegionServerOfRegion(hri);
998 TEST_UTIL.assertRegionOnServer(hri, serverName, 200);
999
1000 ServerName dstName = null;
1001 for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
1002 if (!tmpServer.equals(serverName)) {
1003 dstName = tmpServer;
1004 break;
1005 }
1006 }
1007
1008 assertTrue(dstName != null);
1009
1010 TEST_UTIL.shutdownMiniHBaseCluster();
1011
1012 ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
1013 ZKAssign.createNodeOffline(zkw, hri, dstName);
1014 Stat stat = new Stat();
1015 byte[] data =
1016 ZKAssign.getDataNoWatch(zkw, hri.getEncodedName(), stat);
1017 assertTrue(data != null);
1018 RegionTransition rt = RegionTransition.parseFrom(data);
1019 assertTrue(rt.getEventType() == EventType.M_ZK_REGION_OFFLINE);
1020
1021 LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
1022 + " and dst server=" + dstName);
1023
1024
1025 TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
1026
1027 while (true) {
1028 master = TEST_UTIL.getHBaseCluster().getMaster();
1029 if (master != null && master.isInitialized()) {
1030 ServerManager serverManager = master.getServerManager();
1031 if (!serverManager.areDeadServersInProgress()) {
1032 break;
1033 }
1034 }
1035 Thread.sleep(200);
1036 }
1037
1038
1039 master = TEST_UTIL.getHBaseCluster().getMaster();
1040 master.getAssignmentManager().waitForAssignment(hri);
1041 regionStates = master.getAssignmentManager().getRegionStates();
1042 RegionState newState = regionStates.getRegionState(hri);
1043 assertTrue(newState.isOpened());
1044 }
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054 @Test (timeout=240000)
1055 public void testSimpleMasterFailover() throws Exception {
1056
1057 final int NUM_MASTERS = 3;
1058 final int NUM_RS = 3;
1059
1060
1061 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
1062
1063 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
1064 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1065
1066
1067 List<MasterThread> masterThreads = cluster.getMasterThreads();
1068
1069
1070 for (MasterThread mt : masterThreads) {
1071 assertTrue(mt.isAlive());
1072 }
1073
1074
1075 int numActive = 0;
1076 int activeIndex = -1;
1077 ServerName activeName = null;
1078 HMaster active = null;
1079 for (int i = 0; i < masterThreads.size(); i++) {
1080 if (masterThreads.get(i).getMaster().isActiveMaster()) {
1081 numActive++;
1082 activeIndex = i;
1083 active = masterThreads.get(activeIndex).getMaster();
1084 activeName = active.getServerName();
1085 }
1086 }
1087 assertEquals(1, numActive);
1088 assertEquals(NUM_MASTERS, masterThreads.size());
1089 LOG.info("Active master " + activeName);
1090
1091
1092 assertNotNull(active);
1093 ClusterStatus status = active.getClusterStatus();
1094 assertTrue(status.getMaster().equals(activeName));
1095 assertEquals(2, status.getBackupMastersSize());
1096 assertEquals(2, status.getBackupMasters().size());
1097
1098
1099 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
1100 HMaster master = cluster.getMaster(backupIndex);
1101 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
1102 cluster.stopMaster(backupIndex, false);
1103 cluster.waitOnMaster(backupIndex);
1104
1105
1106 for (int i = 0; i < masterThreads.size(); i++) {
1107 if (masterThreads.get(i).getMaster().isActiveMaster()) {
1108 assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
1109 activeIndex = i;
1110 active = masterThreads.get(activeIndex).getMaster();
1111 }
1112 }
1113 assertEquals(1, numActive);
1114 assertEquals(2, masterThreads.size());
1115 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
1116 LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
1117 assertEquals(3, rsCount);
1118
1119
1120 assertNotNull(active);
1121 status = active.getClusterStatus();
1122 assertTrue(status.getMaster().equals(activeName));
1123 assertEquals(1, status.getBackupMastersSize());
1124 assertEquals(1, status.getBackupMasters().size());
1125
1126
1127 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
1128 cluster.stopMaster(activeIndex, false);
1129 cluster.waitOnMaster(activeIndex);
1130
1131
1132 assertTrue(cluster.waitForActiveAndReadyMaster());
1133
1134 LOG.debug("\n\nVerifying backup master is now active\n");
1135
1136 assertEquals(1, masterThreads.size());
1137
1138
1139 active = masterThreads.get(0).getMaster();
1140 assertNotNull(active);
1141 status = active.getClusterStatus();
1142 ServerName mastername = status.getMaster();
1143 assertTrue(mastername.equals(active.getServerName()));
1144 assertTrue(active.isActiveMaster());
1145 assertEquals(0, status.getBackupMastersSize());
1146 assertEquals(0, status.getBackupMasters().size());
1147 int rss = status.getServersSize();
1148 LOG.info("Active master " + mastername.getServerName() + " managing " +
1149 rss + " region servers");
1150 assertEquals(3, rss);
1151
1152
1153 TEST_UTIL.shutdownMiniCluster();
1154 }
1155 }
1156