View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.hadoop.hbase.TableName;
26  import org.apache.hadoop.hbase.HBaseTestingUtility;
27  import org.apache.hadoop.hbase.HConstants;
28  import org.apache.hadoop.hbase.HRegionInfo;
29  import org.apache.hadoop.hbase.LargeTests;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.Put;
32  import org.apache.hadoop.hbase.client.Result;
33  import org.apache.hadoop.hbase.client.ResultScanner;
34  import org.apache.hadoop.hbase.client.Scan;
35  import org.apache.hadoop.hbase.client.Durability;
36  import org.apache.hadoop.hbase.util.Bytes;
37  import org.junit.AfterClass;
38  import org.junit.Assert;
39  import org.junit.Before;
40  import org.junit.BeforeClass;
41  import org.junit.Ignore;
42  import org.junit.Test;
43  import org.junit.experimental.categories.Category;
44  
45  /**
46   * Test transitions of state across the master.  Sets up the cluster once and
47   * then runs a couple of tests.
48   */
49  @Category(LargeTests.class)
50  public class TestMasterTransitions {
51    private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
52    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
53    private static final String TABLENAME = "master_transitions";
54    private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
55      Bytes.toBytes("b"), Bytes.toBytes("c")};
56  
57    /**
58     * Start up a mini cluster and put a small table of many empty regions into it.
59     * @throws Exception
60     */
61    @BeforeClass public static void beforeAllTests() throws Exception {
62      TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
63      TEST_UTIL.startMiniCluster(2);
64      // Create a table of three families.  This will assign a region.
65      TableName tableName = TableName.valueOf(TABLENAME);
66      TEST_UTIL.createTable(tableName, FAMILIES);
67      HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
68      int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily());
69      TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
70      addToEachStartKey(countOfRegions);
71      t.close();
72    }
73  
74    @AfterClass public static void afterAllTests() throws Exception {
75      TEST_UTIL.shutdownMiniCluster();
76    }
77  
78    @Before public void setup() throws IOException {
79      TEST_UTIL.ensureSomeRegionServersAvailable(2);
80    }
81  
82    /**
83     * Listener for regionserver events testing hbase-2428 (Infinite loop of
84     * region closes if hbase:meta region is offline).  In particular, listen
85     * for the close of the 'metaServer' and when it comes in, requeue it with a
86     * delay as though there were an issue processing the shutdown.  As part of
87     * the requeuing,  send over a close of a region on 'otherServer' so it comes
88     * into a master that has its meta region marked as offline.
89     */
90    /*
91    static class HBase2428Listener implements RegionServerOperationListener {
92      // Map of what we've delayed so we don't do do repeated delays.
93      private final Set<RegionServerOperation> postponed =
94        new CopyOnWriteArraySet<RegionServerOperation>();
95      private boolean done = false;;
96      private boolean metaShutdownReceived = false;
97      private final HServerAddress metaAddress;
98      private final MiniHBaseCluster cluster;
99      private final int otherServerIndex;
100     private final HRegionInfo hri;
101     private int closeCount = 0;
102     static final int SERVER_DURATION = 3 * 1000;
103     static final int CLOSE_DURATION = 1 * 1000;
104 
105     HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
106         final HRegionInfo closingHRI, final int otherServerIndex) {
107       this.cluster = c;
108       this.metaAddress = metaAddress;
109       this.hri = closingHRI;
110       this.otherServerIndex = otherServerIndex;
111     }
112 
113     @Override
114     public boolean process(final RegionServerOperation op) throws IOException {
115       // If a regionserver shutdown and its of the meta server, then we want to
116       // delay the processing of the shutdown and send off a close of a region on
117       // the 'otherServer.
118       boolean result = true;
119       if (op instanceof ProcessServerShutdown) {
120         ProcessServerShutdown pss = (ProcessServerShutdown)op;
121         if (pss.getDeadServerAddress().equals(this.metaAddress)) {
122           // Don't postpone more than once.
123           if (!this.postponed.contains(pss)) {
124             // Close some region.
125             this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
126               new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
127               Bytes.toBytes("Forcing close in test")));
128             this.postponed.add(pss);
129             // Put off the processing of the regionserver shutdown processing.
130             pss.setDelay(SERVER_DURATION);
131             this.metaShutdownReceived = true;
132             // Return false.  This will add this op to the delayed queue.
133             result = false;
134           }
135         }
136       } else {
137         // Have the close run frequently.
138         if (isWantedCloseOperation(op) != null) {
139           op.setDelay(CLOSE_DURATION);
140           // Count how many times it comes through here.
141           this.closeCount++;
142         }
143       }
144       return result;
145     }
146 
147     public void processed(final RegionServerOperation op) {
148       if (isWantedCloseOperation(op) != null) return;
149       this.done = true;
150     }
151 */
152     /*
153      * @param op
154      * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
155      * cast as a ProcessRegionClose.
156      */
157   /*
158     private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
159       // Count every time we get a close operation.
160       if (op instanceof ProcessRegionClose) {
161         ProcessRegionClose c = (ProcessRegionClose)op;
162         if (c.regionInfo.equals(hri)) {
163           return c;
164         }
165       }
166       return null;
167     }
168 
169     boolean isDone() {
170       return this.done;
171     }
172 
173     boolean isMetaShutdownReceived() {
174       return metaShutdownReceived;
175     }
176 
177     int getCloseCount() {
178       return this.closeCount;
179     }
180 
181     @Override
182     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
183       return true;
184     }
185   }
186 */
187   /**
188    * In 2428, the meta region has just been set offline and then a close comes
189    * in.
190    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
191    */
192   @Ignore @Test  (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
193   throws Exception {
194     /*
195     LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
196     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
197     final HMaster master = cluster.getMaster();
198     int metaIndex = cluster.getServerWithMeta();
199     // Figure the index of the server that is not server the hbase:meta
200     int otherServerIndex = -1;
201     for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
202       if (i == metaIndex) continue;
203       otherServerIndex = i;
204       break;
205     }
206     final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
207     final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
208 
209     // Get a region out on the otherServer.
210     final HRegionInfo hri =
211       otherServer.getOnlineRegions().iterator().next().getRegionInfo();
212 
213     // Add our RegionServerOperationsListener
214     HBase2428Listener listener = new HBase2428Listener(cluster,
215       metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
216     master.getRegionServerOperationQueue().
217       registerRegionServerOperationListener(listener);
218     try {
219       // Now close the server carrying meta.
220       cluster.abortRegionServer(metaIndex);
221 
222       // First wait on receipt of meta server shutdown message.
223       while(!listener.metaShutdownReceived) Threads.sleep(100);
224       while(!listener.isDone()) Threads.sleep(10);
225       // We should not have retried the close more times than it took for the
226       // server shutdown message to exit the delay queue and get processed
227       // (Multiple by two to add in some slop in case of GC or something).
228       assertTrue(listener.getCloseCount() > 1);
229       assertTrue(listener.getCloseCount() <
230         ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
231 
232       // Assert the closed region came back online
233       assertRegionIsBackOnline(hri);
234     } finally {
235       master.getRegionServerOperationQueue().
236         unregisterRegionServerOperationListener(listener);
237     }
238     */
239   }
240 
241   /**
242    * Test adding in a new server before old one on same host+port is dead.
243    * Make the test more onerous by having the server under test carry the meta.
244    * If confusion between old and new, purportedly meta never comes back.  Test
245    * that meta gets redeployed.
246    */
247   @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
248   throws IOException {
249     /*
250     LOG.info("Running testAddingServerBeforeOldIsDead2413");
251     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
252     int count = count();
253     int metaIndex = cluster.getServerWithMeta();
254     MiniHBaseClusterRegionServer metaHRS =
255       (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
256     int port = metaHRS.getServerInfo().getServerAddress().getPort();
257     Configuration c = TEST_UTIL.getConfiguration();
258     String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
259     try {
260       LOG.info("KILLED=" + metaHRS);
261       metaHRS.kill();
262       c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
263       // Try and start new regionserver.  It might clash with the old
264       // regionserver port so keep trying to get past the BindException.
265       HRegionServer hrs = null;
266       while (true) {
267         try {
268           hrs = cluster.startRegionServer().getRegionServer();
269           break;
270         } catch (IOException e) {
271           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
272             InvocationTargetException ee = (InvocationTargetException)e.getCause();
273             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
274               LOG.info("BindException; retrying: " + e.toString());
275             }
276           }
277         }
278       }
279       LOG.info("STARTED=" + hrs);
280       // Wait until he's been given at least 3 regions before we go on to try
281       // and count rows in table.
282       while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
283       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
284         " regions");
285       assertEquals(count, count());
286     } finally {
287       c.set(HConstants.REGIONSERVER_PORT, oldPort);
288     }
289     */
290   }
291 
292   /**
293    * HBase2482 is about outstanding region openings.  If any are outstanding
294    * when a regionserver goes down, then they'll never deploy.  They'll be
295    * stuck in the regions-in-transition list for ever.  This listener looks
296    * for a region opening HMsg and if its from the server passed on construction,
297    * then we kill it.  It also looks out for a close message on the victim
298    * server because that signifies start of the fireworks.
299    */
300   /*
301   static class HBase2482Listener implements RegionServerOperationListener {
302     private final HRegionServer victim;
303     private boolean abortSent = false;
304     // We closed regions on new server.
305     private volatile boolean closed = false;
306     // Copy of regions on new server
307     private final Collection<HRegion> copyOfOnlineRegions;
308     // This is the region that was in transition on the server we aborted. Test
309     // passes if this region comes back online successfully.
310     private HRegionInfo regionToFind;
311 
312     HBase2482Listener(final HRegionServer victim) {
313       this.victim = victim;
314       // Copy regions currently open on this server so I can notice when
315       // there is a close.
316       this.copyOfOnlineRegions =
317         this.victim.getCopyOfOnlineRegionsSortedBySize().values();
318     }
319 
320     @Override
321     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
322       if (!victim.getServerInfo().equals(serverInfo) ||
323           this.abortSent || !this.closed) {
324         return true;
325       }
326       if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
327       // Save the region that is in transition so can test later it came back.
328       this.regionToFind = incomingMsg.getRegionInfo();
329       String msg = "ABORTING " + this.victim + " because got a " +
330         HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
331         incomingMsg.getRegionInfo().getRegionNameAsString();
332       this.victim.abort(msg);
333       this.abortSent = true;
334       return true;
335     }
336 
337     @Override
338     public boolean process(RegionServerOperation op) throws IOException {
339       return true;
340     }
341 
342     @Override
343     public void processed(RegionServerOperation op) {
344       if (this.closed || !(op instanceof ProcessRegionClose)) return;
345       ProcessRegionClose close = (ProcessRegionClose)op;
346       for (HRegion r: this.copyOfOnlineRegions) {
347         if (r.getRegionInfo().equals(close.regionInfo)) {
348           // We've closed one of the regions that was on the victim server.
349           // Now can start testing for when all regions are back online again
350           LOG.info("Found close of " +
351             r.getRegionInfo().getRegionNameAsString() +
352             "; setting close happened flag");
353           this.closed = true;
354           break;
355         }
356       }
357     }
358   }
359 */
360   /**
361    * In 2482, a RS with an opening region on it dies.  The said region is then
362    * stuck in the master's regions-in-transition and never leaves it.  This
363    * test works by bringing up a new regionserver, waiting for the load
364    * balancer to give it some regions.  Then, we close all on the new server.
365    * After sending all the close messages, we send the new regionserver the
366    * special blocking message so it can not process any more messages.
367    * Meantime reopening of the just-closed regions is backed up on the new
368    * server.  Soon as master gets an opening region from the new regionserver,
369    * we kill it.  We then wait on all regions to come back on line.  If bug
370    * is fixed, this should happen soon as the processing of the killed server is
371    * done.
372    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
373    */
374   @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
375   throws Exception {
376     /*
377     LOG.info("Running testKillRSWithOpeningRegion2482");
378     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
379     if (cluster.getLiveRegionServerThreads().size() < 2) {
380       // Need at least two servers.
381       cluster.startRegionServer();
382     }
383     // Count how many regions are online.  They need to be all back online for
384     // this test to succeed.
385     int countOfMetaRegions = countOfMetaRegions();
386     // Add a listener on the server.
387     HMaster m = cluster.getMaster();
388     // Start new regionserver.
389     MiniHBaseClusterRegionServer hrs =
390       (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
391     LOG.info("Started new regionserver: " + hrs.toString());
392     // Wait until has some regions before proceeding.  Balancer will give it some.
393     int minimumRegions =
394       countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
395     while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
396     // Set the listener only after some regions have been opened on new server.
397     HBase2482Listener listener = new HBase2482Listener(hrs);
398     m.getRegionServerOperationQueue().
399       registerRegionServerOperationListener(listener);
400     try {
401       // Go close all non-catalog regions on this new server
402       closeAllNonCatalogRegions(cluster, hrs);
403       // After all closes, add blocking message before the region opens start to
404       // come in.
405       cluster.addMessageToSendRegionServer(hrs,
406         new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
407       // Wait till one of the above close messages has an effect before we start
408       // wait on all regions back online.
409       while (!listener.closed) Threads.sleep(100);
410       LOG.info("Past close");
411       // Make sure the abort server message was sent.
412       while(!listener.abortSent) Threads.sleep(100);
413       LOG.info("Past abort send; waiting on all regions to redeploy");
414       // Now wait for regions to come back online.
415       assertRegionIsBackOnline(listener.regionToFind);
416     } finally {
417       m.getRegionServerOperationQueue().
418         unregisterRegionServerOperationListener(listener);
419     }
420     */
421   }
422 
423   /*
424    * @return Count of all non-catalog regions on the designated server
425    */
426 /*
427   private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
428     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
429   throws IOException {
430     int countOfRegions = 0;
431     for (HRegion r: hrs.getOnlineRegions()) {
432       if (r.getRegionInfo().isMetaRegion()) continue;
433       cluster.addMessageToSendRegionServer(hrs,
434         new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
435       LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
436         " on " + hrs.toString());
437       countOfRegions++;
438     }
439     return countOfRegions;
440   }
441 
442   private void assertRegionIsBackOnline(final HRegionInfo hri)
443   throws IOException {
444     // Region should have an entry in its startkey because of addRowToEachRegion.
445     byte [] row = getStartKey(hri);
446     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
447     Get g =  new Get(row);
448     assertTrue((t.get(g)).size() > 0);
449   }
450 
451   /*
452    * @return Count of regions in meta table.
453    * @throws IOException
454    */
455   /*
456   private static int countOfMetaRegions()
457   throws IOException {
458     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
459       HConstants.META_TABLE_NAME);
460     int rows = 0;
461     Scan scan = new Scan();
462     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
463     ResultScanner s = meta.getScanner(scan);
464     for (Result r = null; (r = s.next()) != null;) {
465       byte [] b =
466         r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
467       if (b == null || b.length <= 0) break;
468       rows++;
469     }
470     s.close();
471     return rows;
472   }
473 */
474   /*
475    * Add to each of the regions in hbase:meta a value.  Key is the startrow of the
476    * region (except its 'aaa' for first region).  Actual value is the row name.
477    * @param expected
478    * @return
479    * @throws IOException
480    */
481   private static int addToEachStartKey(final int expected) throws IOException {
482     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
483     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
484         TableName.META_TABLE_NAME);
485     int rows = 0;
486     Scan scan = new Scan();
487     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
488     ResultScanner s = meta.getScanner(scan);
489     for (Result r = null; (r = s.next()) != null;) {
490       HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
491       if (hri == null) break;
492       if (!hri.getTable().getNameAsString().equals(TABLENAME)) {
493         continue;
494       }
495 
496       // If start key, add 'aaa'.
497       if(!hri.getTable().getNameAsString().equals(TABLENAME)) {
498         continue;
499       }
500       byte [] row = getStartKey(hri);
501       Put p = new Put(row);
502       p.setDurability(Durability.SKIP_WAL);
503       p.add(getTestFamily(), getTestQualifier(), row);
504       t.put(p);
505       rows++;
506     }
507     s.close();
508     Assert.assertEquals(expected, rows);
509     t.close();
510     meta.close();
511     return rows;
512   }
513 
514   /*
515    * @param hri
516    * @return Start key for hri (If start key is '', then return 'aaa'.
517    */
518   private static byte [] getStartKey(final HRegionInfo hri) {
519     return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
520         Bytes.toBytes("aaa"): hri.getStartKey();
521   }
522 
523   private static byte [] getTestFamily() {
524     return FAMILIES[0];
525   }
526 
527   private static byte [] getTestQualifier() {
528     return getTestFamily();
529   }
530 }