YARN源码剖析(二):RM启动过程
RM启动代码流程
/**
 * Command-line entry point of the ResourceManager process.
 *
 * <p>When the only remaining argument is {@code -format-state-store} the RM
 * state store is deleted and nothing is started; otherwise a ResourceManager
 * is constructed, initialized and started. Any Throwable raised on the way is
 * logged as fatal and the JVM exits with status -1.
 *
 * @param argv raw command-line arguments; they are passed through
 *             GenericOptionsParser and only the remaining ones are inspected
 */
public static void main(String argv[]) {
  // Install the default handler that is invoked when a thread terminates
  // abruptly because of an uncaught exception.
  Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
  // Log a message when the server starts and when it shuts down
  // (per the original note, the shutdown message is emitted from a shutdown hook).
  StringUtils.startupShutdownMessage(ResourceManager.class, argv, LOG);
  try {
    // Creating the configuration loads core-default.xml, core-site.xml,
    // yarn-default.xml and yarn-site.xml.
    Configuration conf = new YarnConfiguration();
    GenericOptionsParser optionsParser = new GenericOptionsParser(conf, argv);
    argv = optionsParser.getRemainingArgs();

    // -format-state-store: delete the RMStateStore instead of starting up.
    boolean formatStateStore =
        argv.length == 1 && argv[0].equals("-format-state-store");
    if (formatStateStore) {
      deleteRMStateStore(conf);
      return;
    }

    // Normal startup path.
    ResourceManager rm = new ResourceManager();
    // Register the shutdown hook before init/start so the RM's resources
    // are released on JVM shutdown.
    ShutdownHookManager.get().addShutdownHook(
        new CompositeServiceShutdownHook(rm),
        SHUTDOWN_HOOK_PRIORITY);
    rm.init(conf);
    rm.start();
  } catch (Throwable t) {
    LOG.fatal("Error starting ResourceManager", t);
    System.exit(-1);
  }
}
接下来重点分析 RM初始化 和 RM启动
RM初始化
/**
 * Moves the service into the INITED state and runs {@code serviceInit}.
 *
 * <p>Calling this on a service that is already INITED is a no-op. If
 * {@code serviceInit} throws, the failure is recorded, the service is stopped
 * quietly and the exception is rethrown as a ServiceStateException.
 *
 * @param conf configuration to initialize with; must not be null
 * @throws ServiceStateException if {@code conf} is null or initialization fails
 */
public void init(Configuration conf) {
  if (conf == null) {
    throw new ServiceStateException("Cannot initialize service "
        + getName() + ": null configuration");
  }
  // Fast path: nothing to do once the service is already INITED.
  if (isInState(STATE.INITED)) {
    return;
  }
  synchronized (stateChangeLock) {
    // NOTE(review): enterState presumably returns the state held before the
    // call, so a result of INITED means no real transition happened - confirm.
    if (enterState(STATE.INITED) == STATE.INITED) {
      return;
    }
    setConfig(conf);
    try {
      // Uses the 'config' field rather than the 'conf' parameter.
      serviceInit(config);
      // Only notify the listeners if the service is still INITED after
      // serviceInit has run.
      if (isInState(STATE.INITED)) {
        notifyListeners();
      }
    } catch (Exception initFailure) {
      noteFailure(initFailure);
      ServiceOperations.stopQuietly(LOG, this);
      throw ServiceStateException.convert(initFailure);
    }
  }
}
/**
 * Initializes the ResourceManager: creates the RM context, loads core-site.xml
 * and yarn-site.xml through the configuration provider, validates the
 * configuration, applies the HA settings, performs the login, sets up the
 * dispatcher and the admin service, creates the active services, computes the
 * web-app bind address and finally delegates to the superclass.
 *
 * @param conf configuration used to initialize the service
 * @throws Exception if any initialization step fails
 */
@Override
protected void serviceInit(Configuration conf) throws Exception {
this.conf = conf;
this.rmContext = new RMContextImpl();
this.configurationProvider =
ConfigurationProviderFactory.getConfigurationProvider(conf);
this.configurationProvider.init(this.conf);
rmContext.setConfigurationProvider(configurationProvider);
// load core-site.xml
InputStream coreSiteXMLInputStream =
this.configurationProvider.getConfigurationInputStream(this.conf,
YarnConfiguration.CORE_SITE_CONFIGURATION_FILE);
if (coreSiteXMLInputStream != null) {
this.conf.addResource(coreSiteXMLInputStream);
}
// Do refreshUserToGroupsMappings with loaded core-site.xml
Groups.getUserToGroupsMappingServiceWithLoadedConfiguration(this.conf)
.refresh();
// Do refreshSuperUserGroupsConfiguration with loaded core-site.xml
// Or use RM specific configurations to overwrite the common ones first
// if they exist
RMServerUtils.processRMProxyUsersConf(conf);
ProxyUsers.refreshSuperUserGroupsConfiguration(this.conf);
// load yarn-site.xml
InputStream yarnSiteXMLInputStream =
this.configurationProvider.getConfigurationInputStream(this.conf,
YarnConfiguration.YARN_SITE_CONFIGURATION_FILE);
if (yarnSiteXMLInputStream != null) {
this.conf.addResource(yarnSiteXMLInputStream);
}
validateConfigs(this.conf);
// Set HA configuration should be done before login
this.rmContext.setHAEnabled(HAUtil.isHAEnabled(this.conf));
if (this.rmContext.isHAEnabled()) {
HAUtil.verifyAndSetConfiguration(this.conf);
}
// Set UGI and do login
// If security is enabled, use login user
// If security is not enabled, use current user
this.rmLoginUGI = UserGroupInformation.getCurrentUser();
try {
doSecureLogin();
} catch(IOException ie) {
throw new YarnRuntimeException("Failed to login", ie);
}
// Everything above only prepares the configuration: loading the configuration
// files and storing them in the context. It is not analysed further here.
// register the handlers for all AlwaysOn services using setupDispatcher().
// Per the original note: setupDispatcher() creates the dispatcher used for
// internal event handling (new AsyncDispatcher()) and registers one
// event-type -> handler pair on it:
// <RMFatalEventType.class, RMFatalEventDispatcher>
rmDispatcher = setupDispatcher();
// rmDispatcher is itself a service, so it is added to the RM's service list
addIfService(rmDispatcher);
rmContext.setDispatcher(rmDispatcher);
adminService = createAdminService();
addService(adminService);
rmContext.setRMAdminService(adminService);
rmContext.setYarnConfiguration(conf);
// This call initializes many of the important services and is the part that
// really deserves analysis. Per the original note it is implemented by the
// RM inner class RMActiveServices.
createAndInitActiveServices();
webAppAddress = WebAppUtils.getWebAppBindURL(this.conf,
YarnConfiguration.RM_BIND_HOST,
WebAppUtils.getRMWebAppURLWithoutScheme(this.conf));
super.serviceInit(this.conf);
}
createAndInitActiveServices() 中重点介绍一个关键类 ResourceTrackerService 的初始化:它是NM与RM保持心跳的关键类,其初始化方法同样只是完成一些配置参数的初始化。
// Create the resource tracker service, register it via addService() and
// store it in the RM context.
resourceTracker = createResourceTrackerService();
addService(resourceTracker);
rmContext.setResourceTrackerService(resourceTracker);
最后super.serviceInit(this.conf);就是init各个service
/**
 * Initializes every service returned by getServices() with the given
 * configuration, in list order, and then delegates to the superclass.
 *
 * @param conf configuration handed to each service's init
 * @throws Exception if a service fails to initialize
 */
protected void serviceInit(Configuration conf) throws Exception {
  final List<Service> children = getServices();
  if (LOG.isDebugEnabled()) {
    LOG.debug(getName() + ": initing services, size=" + children.size());
  }
  for (final Service child : children) {
    child.init(conf);
  }
  super.serviceInit(conf);
}
RM启动
/**
 * Moves the service into the STARTED state and runs {@code serviceStart}.
 *
 * <p>Calling this on a service that is already STARTED is a no-op. If
 * {@code serviceStart} throws, the failure is recorded, the service is
 * stopped quietly and the exception is rethrown as a ServiceStateException.
 */
@Override
public void start() {
  // Fast path: already started.
  if (isInState(STATE.STARTED)) {
    return;
  }
  synchronized (stateChangeLock) {
    // NOTE(review): enterState presumably returns the state held before the
    // call, so a result of STARTED means no real transition happened - confirm.
    if (stateModel.enterState(STATE.STARTED) == STATE.STARTED) {
      return;
    }
    try {
      startTime = System.currentTimeMillis();
      serviceStart();
      // Notify only if the service is still STARTED (and has not moved on
      // to a later state in the meantime).
      if (isInState(STATE.STARTED)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Service " + getName() + " is started");
        }
        notifyListeners();
      }
    } catch (Exception startFailure) {
      noteFailure(startFailure);
      ServiceOperations.stopQuietly(LOG, this);
      throw ServiceStateException.convert(startFailure);
    }
  }
}
/**
 * Starts the ResourceManager: transitions to standby (HA enabled) or to
 * active (HA disabled), starts the web application, records the web-app port
 * when running inside a mini YARN cluster, and then delegates to the
 * superclass.
 *
 * @throws Exception if any start step fails
 */
@Override
protected void serviceStart() throws Exception {
  if (!this.rmContext.isHAEnabled()) {
    transitionToActive();
  } else {
    transitionToStandby(true);
  }

  startWepApp();

  final boolean miniYarnCluster =
      getConfig().getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false);
  if (miniYarnCluster) {
    // Write the web-app port back into the configuration.
    WebAppUtils.setRMWebAppPort(conf, webApp.port());
  }

  super.serviceStart();
}
最终看super.serviceStart(),内部就是将每一个service启动
/**
 * Starts every service returned by getServices(), in list order, and then
 * delegates to the superclass.
 *
 * @throws Exception if a service fails to start
 */
protected void serviceStart() throws Exception {
  final List<Service> children = getServices();
  if (LOG.isDebugEnabled()) {
    LOG.debug(getName() + ": starting services, size=" + children.size());
  }
  // If starting a child fails, that service will be stopped and an
  // exception raised.
  for (final Service child : children) {
    child.start();
  }
  super.serviceStart();
}
其中第一个需要重点分析的service是AsyncDispatcher:它在启动时创建了一个循环线程,用来不断分发阻塞队列中的各个事件。
/**
 * Starts the dispatcher: delegates to the superclass first, then creates,
 * names and starts the thread that runs the Runnable from createThread().
 *
 * @throws Exception if the superclass start fails
 */
@Override
protected void serviceStart() throws Exception {
  // start all the components
  super.serviceStart();

  final Thread dispatcherThread = new Thread(createThread());
  eventHandlingThread = dispatcherThread;
  dispatcherThread.setName("AsyncDispatcher event handler");
  dispatcherThread.start();
}
/**
 * Builds the Runnable executed by the event-handling thread. It loops until
 * the dispatcher is stopped or the running thread is interrupted, taking one
 * event at a time from eventQueue and passing it to dispatch(event).
 *
 * @return the event-loop Runnable
 */
Runnable createThread() {
return new Runnable() {
@Override
public void run() {
while (!stopped && !Thread.currentThread().isInterrupted()) {
// Record whether the queue is empty at this moment.
drained = eventQueue.isEmpty();
// blockNewEvents is only set when dispatcher is draining to stop,
// adding this check is to avoid the overhead of acquiring the lock
// and calling notify every time in the normal run of the loop.
if (blockNewEvents) {
synchronized (waitForDrained) {
if (drained) {
// Wake up a thread waiting on waitForDrained.
waitForDrained.notify();
}
}
}
Event event;
try {
// Blocks until an event is available (eventQueue is a BlockingQueue).
event = eventQueue.take();
} catch(InterruptedException ie) {
// Only warn when the interrupt was not part of a stop; either way
// returning here ends the loop.
if (!stopped) {
LOG.warn("AsyncDispatcher thread interrupted", ie);
}
return;
}
if (event != null) {
dispatch(event);
}
}
}
};
}
第二个需要重点分析的service是ResourceTrackerService。
/**
 * Starts the resource tracker service: creates the RPC server for the
 * ResourceTracker protocol, optionally loads the policy file and refreshes
 * the service ACLs, starts the server and writes the actual listener address
 * back into the configuration.
 *
 * @throws Exception if the superclass start or the server setup fails
 */
@Override
protected void serviceStart() throws Exception {
  super.serviceStart();
  // ResourceTrackerServer authenticates NodeManager via Kerberos if
  // security is enabled, so no secretManager.
  final Configuration conf = getConfig();
  final YarnRPC rpcFactory = YarnRPC.create(conf);
  final int clientThreadCount = conf.getInt(
      YarnConfiguration.RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT,
      YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT);

  // Create the RM's RPC server.
  this.server = rpcFactory.getServer(ResourceTracker.class, this,
      resourceTrackerAddress, conf, null, clientThreadCount);

  // Enable service authorization?
  final boolean authorizationEnabled = conf.getBoolean(
      CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION,
      false);
  if (authorizationEnabled) {
    final InputStream policyStream =
        this.rmContext.getConfigurationProvider()
            .getConfigurationInputStream(conf,
                YarnConfiguration.HADOOP_POLICY_CONFIGURATION_FILE);
    if (policyStream != null) {
      conf.addResource(policyStream);
    }
    refreshServiceAcls(conf, RMPolicyProvider.getInstance());
  }

  // Start the RPC server. Per the original note this brings up the
  // selector-based networking components: the listener, reader, writer
  // (responder) and handler threads.
  this.server.start();
  conf.updateConnectAddr(YarnConfiguration.RM_BIND_HOST,
      YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS,
      YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_ADDRESS,
      server.getListenerAddress());
}
/**
 * Starts the service. Must be called before any calls will be handled.
 *
 * <p>Starts the responder, then the listener, then creates and starts
 * {@code handlerCount} handlers.
 */
public synchronized void start() {
  responder.start();
  listener.start();

  handlers = new Handler[handlerCount];
  int index = 0;
  while (index < handlerCount) {
    handlers[index] = new Handler(index);
    handlers[index].start();
    index++;
  }
}
至此RM的启动就完毕了。(除了上面分析的AsyncDispatcher和ResourceTrackerService之外,其他各个具体服务的启动会在后文用到的地方再做解释)
附录
hadoop工程的Configuration架构分析
父类Configuration的资源加载:
static{
//print deprecation warning if hadoop-site.xml is found in classpath
ClassLoader cL = Thread.currentThread().getContextClassLoader();
if (cL == null) {
cL = Configuration.class.getClassLoader();
}
if(cL.getResource("hadoop-site.xml")!=null) {
LOG.warn("DEPRECATED: hadoop-site.xml found in the classpath. " +
"Usage of hadoop-site.xml is deprecated. Instead use core-site.xml, "
+ "mapred-site.xml and hdfs-site.xml to override properties of " +
"core-default.xml, mapred-default.xml and hdfs-default.xml " +
"respectively");
}
addDefaultResource("core-default.xml");
addDefaultResource("core-site.xml");
}
子类YarnConfiguration的资源加载:
// Static initializer: calls addDeprecatedKeys(), then adds the YARN default
// and YARN site configuration files as default resources, in that order.
static {
addDeprecatedKeys();
Configuration.addDefaultResource(YARN_DEFAULT_CONFIGURATION_FILE);
Configuration.addDefaultResource(YARN_SITE_CONFIGURATION_FILE);
}
// NOTE(review): this initializer adds hdfs-default.xml / hdfs-site.xml, so it
// presumably belongs to the HDFS configuration subclass (HdfsConfiguration)
// rather than to YarnConfiguration - confirm against the Hadoop sources.
static {
addDeprecatedKeys();
// adds the default resources
Configuration.addDefaultResource("hdfs-default.xml");
Configuration.addDefaultResource("hdfs-site.xml");
}
子类JobConf的资源加载:
// Static initializer: delegates the registration of default resources to
// ConfigUtil.loadResources().
static{
ConfigUtil.loadResources();
}
/**
 * Adds all the deprecated keys, then registers mapred-default.xml,
 * mapred-site.xml, yarn-default.xml and yarn-site.xml, in that order, as
 * default resources.
 */
public static void loadResources() {
  addDeprecatedKeys();
  final String[] defaultResources = {
      "mapred-default.xml",
      "mapred-site.xml",
      "yarn-default.xml",
      "yarn-site.xml",
  };
  for (final String resource : defaultResources) {
    Configuration.addDefaultResource(resource);
  }
}
RM类继承关系
组件状态
// Excerpt of the Service interface: only the STATE enum constants are quoted.
public interface Service extends Closeable {
/**
* Service states. Each constant is declared with an integer value and a
* name string.
*/
public enum STATE {
/** Constructed but not initialized */
NOTINITED(0, "NOTINITED"),
/** Initialized but not started or stopped */
INITED(1, "INITED"),
/** started and not stopped */
STARTED(2, "STARTED"),
/** stopped. No further state transitions are permitted */
STOPPED(3, "STOPPED");
// (excerpt ends here - the rest of the enum and of the interface is not shown)
AsyncDispatcher分析
/**
- Dispatches {@link Event}s in a separate thread. Currently only single thread
- does that. Potentially there could be multiple channels for each event type
- class and a thread pool can be used to dispatch the events.
*/
核心类AsyncDispatcher:异步事件分发器(Yarn中最底层的总管道)
使用BlockingQueue存储事件,并维护一个事件类型和handler的对应map。
public class AsyncDispatcher extends AbstractService implements Dispatcher {
主要的属性
(1)事件队列: BlockingQueue<Event> eventQueue;
(2)事件分发器: Map<Class<? extends Enum>, EventHandler> eventDispatchers
(3)处理事件的线程 Thread eventHandlingThread
主要的方法
1、从eventQueue中取出事件以及处理事件
(1)createThread():返回一个Runnable对象,该线程类对象有一个while循环,不断从eventQueue中取出事件(RM启动之后),event = eventQueue.take();然后将事件分发出去dispatch(event)。
(2)dispatch(event):首先得到事件的类型,然后从eventDispatchers中根据事件类型得到相应的事件处理器EventHandler,然后EventHandler.handle(event)对事件进行处理。
2、向eventQueue中添加事件
AsyncDispatcher 的内部类GenericEventHandler implements EventHandler的handle(event)方法向eventQueue中添加事件eventQueue.put(event);
}
ResourceManager类中的Dispatcher类
ResourceManager类中定义了许多EventDispatcher内部类
RM中存在不同的事件,每种事件具有不同的类型,同一类型的事件交给一个XXXEventDispatcher,XXXEventDispatcher将事件交给真正的事件处理实体进行处理。
RM中定义的XXXEventDispatcher内部类如下:
|事件分发器|处理的事件|事件类型|事件处理实体|
|SchedulerEventDispatcher|SchedulerEvent|enum SchedulerEventType|ResourceScheduler|
|RMFatalEventDispatcher|RMFatalEvent|enum RMFatalEventType|System.exit(status)|
|ApplicationEventDispatcher|RMAppEvent|RMAppEventType|RMAppImpl|
|RMContainerPreemptEventDispatcher|ContainerPreemptEvent|ContainerPreemptEventType|CapacityScheduler|
|ApplicationAttemptEventDispatcher|RMAppAttemptEvent|RMAppAttemptEventType|RMAppAttemptImpl|
|NodeEventDispatcher|RMNodeEvent|RMNodeEventType|RMNodeImpl|
以上这些dispatcher的注册是在createAndInitActiveServices();中完成的
// Create the scheduler event dispatcher, add it as a service if it is one,
// and register it as the handler for SchedulerEventType events.
schedulerDispatcher = createSchedulerEventDispatcher();
addIfService(schedulerDispatcher);
rmDispatcher.register(SchedulerEventType.class, schedulerDispatcher);
// Register event handler for RmAppEvents
rmDispatcher.register(RMAppEventType.class,
new ApplicationEventDispatcher(rmContext));
// Register event handler for RmAppAttemptEvents
rmDispatcher.register(RMAppAttemptEventType.class,
new ApplicationAttemptEventDispatcher(rmContext));
// Register event handler for RmNodes
rmDispatcher.register(
RMNodeEventType.class, new NodeEventDispatcher(rmContext));