/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Test transitions of state across the master. Sets up the cluster once and
 * then runs a couple of tests.
 *
 * <p>All three test methods are currently annotated {@code @Ignore} and their
 * bodies (and the listener classes they relied on) are commented out; only the
 * cluster setup/teardown and the table-seeding helpers are live code.
 */
@Category(LargeTests.class)
public class TestMasterTransitions {
  private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final String TABLENAME = "master_transitions";
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * @throws Exception
   */
  @BeforeClass public static void beforeAllTests() throws Exception {
    TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families. This will assign a region.
    TableName tableName = TableName.valueOf(TABLENAME);
    TEST_UTIL.createTable(tableName, FAMILIES);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    try {
      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
      TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
      addToEachStartKey(countOfRegions);
    } finally {
      // Release the table handle even if region creation or seeding fails.
      t.close();
    }
  }

  /**
   * Shut down the mini cluster started in {@link #beforeAllTests()}.
   * @throws Exception
   */
  @AfterClass public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Before each test, ask the testing utility to ensure two region servers
   * are available.
   * @throws IOException
   */
  @Before public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if hbase:meta region is offline). In particular, listen
   * for the close of the 'metaServer' and when it comes in, requeue it with a
   * delay as though there were an issue processing the shutdown. As part of
   * the requeuing, send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  /*
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final HRegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final HRegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown and its of the meta server, then we want to
      // delay the processing of the shutdown and send off a close of a region on
      // the 'otherServer.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
              Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown processing.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false. This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }
*/
  /*
   * @param op
   * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
   * cast as a ProcessRegionClose.
   */
  /*
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }
*/
  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not serving the hbase:meta
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final HRegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while(!listener.metaShutdownReceived) Threads.sleep(100);
      while(!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiply by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /**
   * Test adding in a new server before old one on same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If confusion between old and new, purportedly meta never comes back. Test
   * that meta gets redeployed.
   */
  @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start new regionserver. It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until he's been given at least 3 regions before we go on to try
      // and count rows in table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }

  /**
   * HBase2482 is about outstanding region openings. If any are outstanding
   * when a regionserver goes down, then they'll never deploy. They'll be
   * stuck in the regions-in-transition list for ever. This listener looks
   * for a region opening HMsg and if its from the server passed on construction,
   * then we kill it. It also looks out for a close message on the victim
   * server because that signifies start of the fireworks.
   */
  /*
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on new server.
    private volatile boolean closed = false;
    // Copy of regions on new server
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted. Test
    // passes if this region comes back online successfully.
    private HRegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so can test later it came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now can start testing for when all regions are back online again
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }
*/
  /**
   * In 2482, a RS with an opening region on it dies. The said region is then
   * stuck in the master's regions-in-transition and never leaves it. This
   * test works by bringing up a new regionserver, waiting for the load
   * balancer to give it some regions. Then, we close all on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it can not process any more messages.
   * Meantime reopening of the just-closed regions is backed up on the new
   * server. Soon as master gets an opening region from the new regionserver,
   * we kill it. We then wait on all regions to come back on line. If bug
   * is fixed, this should happen soon as the processing of the killed server is
   * done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
  throws Exception {
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online. They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until has some regions before proceeding. Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // wait on all regions back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while(!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /*
   * @return Count of all non-catalog regions on the designated server
   */
/*
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
    final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final HRegionInfo hri)
  throws IOException {
    // Region should have an entry in its startkey because of addRowToEachRegion.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g =  new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }

  /*
   * @return Count of regions in meta table.
   * @throws IOException
   */
  /*
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }
*/
  /**
   * Writes one row into the test table for every region of {@link #TABLENAME}
   * listed in hbase:meta. The row key is the region's start key (or 'aaa' for
   * the region whose start key is empty, see {@link #getStartKey(HRegionInfo)})
   * and the cell value is the row key itself. Puts skip the WAL.
   *
   * <p>The scanner and both table handles are closed on every exit path,
   * including a failed count assertion.
   *
   * @param expected number of regions the test table is expected to have; the
   * method asserts that exactly this many rows were written
   * @return the number of rows written
   * @throws IOException if scanning hbase:meta or writing to the table fails
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    try {
      HTable meta = new HTable(TEST_UTIL.getConfiguration(),
          TableName.META_TABLE_NAME);
      try {
        int rows = 0;
        Scan scan = new Scan();
        scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
        ResultScanner s = meta.getScanner(scan);
        try {
          for (Result r = null; (r = s.next()) != null;) {
            HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
            // Stop at the first meta row that carries no region info.
            if (hri == null) break;
            // Only seed regions that belong to the test table.
            if (!hri.getTable().getNameAsString().equals(TABLENAME)) {
              continue;
            }
            // Row key is the region start key; 'aaa' stands in for the empty one.
            byte [] row = getStartKey(hri);
            Put p = new Put(row);
            p.setDurability(Durability.SKIP_WAL);
            p.add(getTestFamily(), getTestQualifier(), row);
            t.put(p);
            rows++;
          }
        } finally {
          s.close();
        }
        Assert.assertEquals(expected, rows);
        return rows;
      } finally {
        meta.close();
      }
    } finally {
      t.close();
    }
  }

  /**
   * @param hri region whose start key is wanted
   * @return Start key for hri (if start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final HRegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
        Bytes.toBytes("aaa"): hri.getStartKey();
  }

  /**
   * @return the column family used for test data: the first of {@link #FAMILIES}
   */
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  /**
   * @return the qualifier used for test data; same bytes as the test family
   */
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
}