View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.LinkedList;
24  import java.util.List;
25  import java.util.Queue;
26  
27  import org.apache.commons.lang.math.RandomUtils;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.ServerName;
31  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
32  
33  /**
34   * Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
35   * server, or starts one, sleeping randomly (0-sleepTime) in between steps.
36   */
37  public class RollingBatchRestartRsAction extends BatchRestartRsAction {
38    private static Log LOG = LogFactory.getLog(RollingBatchRestartRsAction.class);
39  
40    public RollingBatchRestartRsAction(long sleepTime, float ratio) {
41      super(sleepTime, ratio);
42    }
43  
44    @Override
45    public void perform() throws Exception {
46      LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
47          (int)(ratio * 100)));
48      List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(),
49          ratio);
50  
51      Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
52      Queue<ServerName> deadServers = new LinkedList<ServerName>();
53  
54      //
55      while (!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) {
56        boolean action = true; //action true = kill server, false = start server
57  
58        if (serversToBeKilled.isEmpty() || deadServers.isEmpty()) {
59          action = deadServers.isEmpty();
60        } else {
61          action = RandomUtils.nextBoolean();
62        }
63  
64        if (action) {
65          ServerName server = serversToBeKilled.remove();
66          try {
67            killRs(server);
68          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
69            // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
70            // So, add to deadServers even if exception so the start gets called.
71            LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
72          }
73          deadServers.add(server);
74        } else {
75          try {
76            ServerName server = deadServers.remove();
77            startRs(server);
78          } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
79            // The start may fail but better to just keep going though we may lose server.
80            //
81            LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
82          }
83        }
84  
85        sleep(RandomUtils.nextInt((int)sleepTime));
86      }
87    }
88  
89    /**
90     * Small test to ensure the class basically works.
91     * @param args
92     * @throws Exception
93     */
94    public static void main(final String[] args) throws Exception {
95      RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
96        private int invocations = 0;
97        @Override
98        protected ServerName[] getCurrentServers() throws IOException {
99          final int count = 4;
100         List<ServerName> serverNames = new ArrayList<ServerName>(count);
101         for (int i = 0; i < 4; i++) {
102           serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
103         }
104         return serverNames.toArray(new ServerName [] {});
105       }
106 
107       @Override
108       protected void killRs(ServerName server) throws IOException {
109         LOG.info("Killed " + server);
110         if (this.invocations++ % 3 == 0) {
111           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
112         }
113       }
114 
115       @Override
116       protected void startRs(ServerName server) throws IOException {
117         LOG.info("Started " + server);
118         if (this.invocations++ % 3 == 0) {
119           throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
120         }
121       }
122     };
123 
124     action.perform();
125   }
126 }