View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.Queue;
26  
27  import org.apache.commons.lang.math.RandomUtils;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.ServerName;
31  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
32  
33  /**
34   * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
35   * server, or starts one, sleeping randomly (0-sleepTime) in between steps. The parameter maxDeadServers
36   * limits the maximum number of servers that can be down at the same time during rolling restarts.
37   */
38  public class RollingBatchRestartRsAction extends BatchRestartRsAction {
39    private static final Log LOG = LogFactory.getLog(RollingBatchRestartRsAction.class);
40    protected int maxDeadServers; // number of maximum dead servers at any given time. Defaults to 5
41  
42    public RollingBatchRestartRsAction(long sleepTime, float ratio) {
43      this(sleepTime, ratio, 5);
44    }
45  
46    public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
47      super(sleepTime, ratio);
48      this.maxDeadServers = maxDeadServers;
49    }
50  
51    enum KillOrStart {
52      KILL,
53      START
54    }
55  
56    @Override
57    public void perform() throws Exception {
58      LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
59          (int)(ratio * 100)));
60      List<ServerName> selectedServers = selectServers();
61  
62      Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
63      Queue<ServerName> deadServers = new LinkedList<ServerName>();
64  
65      // loop while there are servers to be killed or dead servers to be restarted
66      while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty())  && !context.isStopping()) {
67        KillOrStart action = KillOrStart.KILL;
68  
69        if (serversToBeKilled.isEmpty()) { // no more servers to kill
70          action = KillOrStart.START;
71        } else if (deadServers.isEmpty()) {
72          action = KillOrStart.KILL; // no more servers to start
73        } else if (deadServers.size() >= maxDeadServers) {
74          // we have too many dead servers. Don't kill any more
75          action = KillOrStart.START;
76        } else {
77          // do a coin toss
78          action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
79        }
80  
81        ServerName server;
82  
83        switch (action) {
84        case KILL:
85           server = serversToBeKilled.remove();
86          try {
87            killRs(server);
88          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
89            // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
90            // So, add to deadServers even if exception so the start gets called.
91            LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
92          }
93          deadServers.add(server);
94          break;
95        case START:
96          try {
97            server = deadServers.remove();
98            startRs(server);
99          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
100           // The start may fail but better to just keep going though we may lose server.
101           //
102           LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
103         }
104         break;
105       }
106 
107       sleep(RandomUtils.nextInt((int)sleepTime));
108     }
109   }
110 
111   protected List<ServerName> selectServers() throws IOException {
112     return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
113   }
114 
115   /**
116    * Small test to ensure the class basically works.
117    * @param args
118    * @throws Exception
119    */
120   public static void main(final String[] args) throws Exception {
121     RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
122       private int invocations = 0;
123       @Override
124       protected ServerName[] getCurrentServers() throws IOException {
125         final int count = 4;
126         List<ServerName> serverNames = new ArrayList<ServerName>(count);
127         for (int i = 0; i < 4; i++) {
128           serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
129         }
130         return serverNames.toArray(new ServerName[serverNames.size()]);
131       }
132 
133       @Override
134       protected void killRs(ServerName server) throws IOException {
135         LOG.info("Killed " + server);
136         if (this.invocations++ % 3 == 0) {
137           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
138         }
139       }
140 
141       @Override
142       protected void startRs(ServerName server) throws IOException {
143         LOG.info("Started " + server);
144         if (this.invocations++ % 3 == 0) {
145           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
146         }
147       }
148     };
149 
150     action.perform();
151   }
152 }