該圖是kubernetes的總體架構圖,設計到kubernetes中的一些重要模塊,除了 kubernetes api server 之外,其他的模塊都要和 api server 進行通訊來獲取須要的資源,原理是 list、watch 機制,當用戶建立pod資源後,此時的 pod 中 nodename 屬性值是空的,schedule 模塊會獲取到這樣的 pod併爲其選擇合適的運行節點,schedule爲pod選擇合適的運行節點是一個很複雜的過程,要考慮不少因素好比zone、node的 affinity、anti-afinity,主機的 cpu、內存、卷衝突、taint等。當爲pod獲取到合適的運行主機後,會將主機名設置給pod做爲一個屬性,存儲到持久化存儲也就是etcd中,kubelet模塊在監聽pod的nodename屬性有值的pod,獲取到後在當前主機上運行pod。node
這是官方的一張框架圖,這個是最新版本中實現的選擇node的流程,在以前的版本中存在不少自定義插件帶來的痛點,在以前的版本中插件是基於predicate、Prioritize的方式進行註冊,改成 framerwork後的註冊方式更加清晰而且擴展性更好,這些會在kubernetes extension中詳細說明,圖中每個位置都是一個可擴展的點。
基於predicate、Prioritize的註冊方式git
// NewLegacyRegistry returns a legacy algorithm registry of predicates and priorities. func NewLegacyRegistry() *LegacyRegistry { registry := &LegacyRegistry{ // MandatoryPredicates the set of keys for predicates that the scheduler will // be configured with all the time. MandatoryPredicates: sets.NewString( PodToleratesNodeTaintsPred, CheckNodeUnschedulablePred, ), // Used as the default set of predicates if Policy was specified, but predicates was nil. DefaultPredicates: sets.NewString( NoVolumeZoneConflictPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred, MaxAzureDiskVolumeCountPred, MaxCSIVolumeCountPred, MatchInterPodAffinityPred, NoDiskConflictPred, GeneralPred, PodToleratesNodeTaintsPred, CheckVolumeBindingPred, CheckNodeUnschedulablePred, ), // Used as the default set of predicates if Policy was specified, but priorities was nil. DefaultPriorities: map[string]int64{ SelectorSpreadPriority: 1, InterPodAffinityPriority: 1, LeastRequestedPriority: 1, BalancedResourceAllocation: 1, NodePreferAvoidPodsPriority: 10000, NodeAffinityPriority: 1, TaintTolerationPriority: 1, ImageLocalityPriority: 1, }, PredicateToConfigProducer: make(map[string]ConfigProducer), PriorityToConfigProducer: make(map[string]ConfigProducer), } registry.registerPredicateConfigProducer(GeneralPred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { // GeneralPredicate is a combination of predicates. plugins.Filter = appendToPluginSet(plugins.Filter, noderesources.FitName, nil) plugins.PreFilter = appendToPluginSet(plugins.PreFilter, noderesources.FitName, nil) if args.NodeResourcesFitArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(noderesources.FitName, args.NodeResourcesFitArgs)) } plugins.Filter = appendToPluginSet(plugins.Filter, nodename.Name, nil) plugins.Filter = appendToPluginSet(plugins.Filter, nodeports.Name, nil) plugins.PreFilter = appendToPluginSet(plugins.PreFilter, nodeports.Name, nil) plugins.Filter = appendToPluginSet(plugins.Filter, nodeaffinity.Name, nil) return }) registry.registerPredicateConfigProducer(PodToleratesNodeTaintsPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, tainttoleration.Name, nil) return }) registry.registerPredicateConfigProducer(PodFitsResourcesPred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, noderesources.FitName, nil) plugins.PreFilter = appendToPluginSet(plugins.PreFilter, noderesources.FitName, nil) if args.NodeResourcesFitArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(noderesources.FitName, args.NodeResourcesFitArgs)) } return }) registry.registerPredicateConfigProducer(HostNamePred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodename.Name, nil) return }) registry.registerPredicateConfigProducer(PodFitsHostPortsPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodeports.Name, nil) plugins.PreFilter = appendToPluginSet(plugins.PreFilter, nodeports.Name, nil) return }) registry.registerPredicateConfigProducer(MatchNodeSelectorPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodeaffinity.Name, nil) return }) registry.registerPredicateConfigProducer(CheckNodeUnschedulablePred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodeunschedulable.Name, nil) return }) registry.registerPredicateConfigProducer(CheckVolumeBindingPred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, volumebinding.Name, nil) return }) registry.registerPredicateConfigProducer(NoDiskConflictPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, volumerestrictions.Name, nil) return }) registry.registerPredicateConfigProducer(NoVolumeZoneConflictPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, volumezone.Name, nil) return }) registry.registerPredicateConfigProducer(MaxCSIVolumeCountPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodevolumelimits.CSIName, nil) return }) registry.registerPredicateConfigProducer(MaxEBSVolumeCountPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodevolumelimits.EBSName, nil) return }) registry.registerPredicateConfigProducer(MaxGCEPDVolumeCountPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodevolumelimits.GCEPDName, nil) return }) registry.registerPredicateConfigProducer(MaxAzureDiskVolumeCountPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodevolumelimits.AzureDiskName, nil) return }) registry.registerPredicateConfigProducer(MaxCinderVolumeCountPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodevolumelimits.CinderName, nil) return }) registry.registerPredicateConfigProducer(MatchInterPodAffinityPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, interpodaffinity.Name, nil) plugins.PreFilter = appendToPluginSet(plugins.PreFilter, interpodaffinity.Name, nil) return }) registry.registerPredicateConfigProducer(CheckNodeLabelPresencePred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, nodelabel.Name, nil) if args.NodeLabelArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(nodelabel.Name, args.NodeLabelArgs)) } return }) registry.registerPredicateConfigProducer(CheckServiceAffinityPred, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Filter = appendToPluginSet(plugins.Filter, serviceaffinity.Name, nil) if args.ServiceAffinityArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(serviceaffinity.Name, args.ServiceAffinityArgs)) } plugins.PreFilter = appendToPluginSet(plugins.PreFilter, serviceaffinity.Name, nil) return }) // Register Priorities. registry.registerPriorityConfigProducer(SelectorSpreadPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, defaultpodtopologyspread.Name, &args.Weight) plugins.PreScore = appendToPluginSet(plugins.PreScore, defaultpodtopologyspread.Name, nil) return }) registry.registerPriorityConfigProducer(TaintTolerationPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.PreScore = appendToPluginSet(plugins.PreScore, tainttoleration.Name, nil) plugins.Score = appendToPluginSet(plugins.Score, tainttoleration.Name, &args.Weight) return }) registry.registerPriorityConfigProducer(NodeAffinityPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, nodeaffinity.Name, &args.Weight) return }) registry.registerPriorityConfigProducer(ImageLocalityPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, imagelocality.Name, &args.Weight) return }) registry.registerPriorityConfigProducer(InterPodAffinityPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.PreScore = appendToPluginSet(plugins.PreScore, interpodaffinity.Name, nil) plugins.Score = appendToPluginSet(plugins.Score, interpodaffinity.Name, &args.Weight) if args.InterPodAffinityArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(interpodaffinity.Name, args.InterPodAffinityArgs)) } return }) registry.registerPriorityConfigProducer(NodePreferAvoidPodsPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, nodepreferavoidpods.Name, &args.Weight) return }) registry.registerPriorityConfigProducer(MostRequestedPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, noderesources.MostAllocatedName, &args.Weight) return }) registry.registerPriorityConfigProducer(BalancedResourceAllocation, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, noderesources.BalancedAllocationName, &args.Weight) return }) registry.registerPriorityConfigProducer(LeastRequestedPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, noderesources.LeastAllocatedName, &args.Weight) return }) registry.registerPriorityConfigProducer(noderesources.RequestedToCapacityRatioName, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.Score = appendToPluginSet(plugins.Score, noderesources.RequestedToCapacityRatioName, &args.Weight) if args.RequestedToCapacityRatioArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(noderesources.RequestedToCapacityRatioName, args.RequestedToCapacityRatioArgs)) } return }) registry.registerPriorityConfigProducer(nodelabel.Name, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { // If there are n LabelPreference priorities in the policy, the weight for the corresponding // score plugin is n*weight (note that the validation logic verifies that all LabelPreference // priorities specified in Policy have the same weight). weight := args.Weight * int32(len(args.NodeLabelArgs.PresentLabelsPreference)+len(args.NodeLabelArgs.AbsentLabelsPreference)) plugins.Score = appendToPluginSet(plugins.Score, nodelabel.Name, &weight) if args.NodeLabelArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(nodelabel.Name, args.NodeLabelArgs)) } return }) registry.registerPriorityConfigProducer(serviceaffinity.Name, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { // If there are n ServiceAffinity priorities in the policy, the weight for the corresponding // score plugin is n*weight (note that the validation logic verifies that all ServiceAffinity // priorities specified in Policy have the same weight). weight := args.Weight * int32(len(args.ServiceAffinityArgs.AntiAffinityLabelsPreference)) plugins.Score = appendToPluginSet(plugins.Score, serviceaffinity.Name, &weight) if args.ServiceAffinityArgs != nil { pluginConfig = append(pluginConfig, NewPluginConfig(serviceaffinity.Name, args.ServiceAffinityArgs)) } return }) // The following two features are the last ones to be supported as predicate/priority. // Once they graduate to GA, there will be no more checking for featue gates here. // Only register EvenPodsSpread predicate & priority if the feature is enabled if utilfeature.DefaultFeatureGate.Enabled(features.EvenPodsSpread) { klog.Infof("Registering EvenPodsSpread predicate and priority function") registry.registerPredicateConfigProducer(EvenPodsSpreadPred, func(_ ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.PreFilter = appendToPluginSet(plugins.PreFilter, podtopologyspread.Name, nil) plugins.Filter = appendToPluginSet(plugins.Filter, podtopologyspread.Name, nil) return }) registry.DefaultPredicates.Insert(EvenPodsSpreadPred) registry.registerPriorityConfigProducer(EvenPodsSpreadPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.PreScore = appendToPluginSet(plugins.PreScore, podtopologyspread.Name, nil) plugins.Score = appendToPluginSet(plugins.Score, podtopologyspread.Name, &args.Weight) return }) registry.DefaultPriorities[EvenPodsSpreadPriority] = 1 } // Prioritizes nodes that satisfy pod's resource limits if utilfeature.DefaultFeatureGate.Enabled(features.ResourceLimitsPriorityFunction) { klog.Infof("Registering resourcelimits priority function") registry.registerPriorityConfigProducer(ResourceLimitsPriority, func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) { plugins.PreScore = appendToPluginSet(plugins.PreScore, noderesources.ResourceLimitsName, nil) plugins.Score = appendToPluginSet(plugins.Score, noderesources.ResourceLimitsName, &args.Weight) return }) registry.DefaultPriorities[ResourceLimitsPriority] = 1 } return registry }
基於framerwork的註冊方式github
// ListAlgorithmProviders lists registered algorithm providers. func ListAlgorithmProviders() string { r := NewRegistry() var providers []string for k := range r { providers = append(providers, k) } sort.Strings(providers) return strings.Join(providers, " | ") } func getDefaultConfig() *schedulerapi.Plugins { return &schedulerapi.Plugins{ QueueSort: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: queuesort.Name}, }, }, PreFilter: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: noderesources.FitName}, {Name: nodeports.Name}, {Name: interpodaffinity.Name}, }, }, Filter: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: nodeunschedulable.Name}, {Name: noderesources.FitName}, {Name: nodename.Name}, {Name: nodeports.Name}, {Name: nodeaffinity.Name}, {Name: volumerestrictions.Name}, {Name: tainttoleration.Name}, {Name: nodevolumelimits.EBSName}, {Name: nodevolumelimits.GCEPDName}, {Name: nodevolumelimits.CSIName}, {Name: nodevolumelimits.AzureDiskName}, {Name: volumebinding.Name}, {Name: volumezone.Name}, {Name: interpodaffinity.Name}, }, }, PreScore: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: interpodaffinity.Name}, {Name: defaultpodtopologyspread.Name}, {Name: tainttoleration.Name}, }, }, Score: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: noderesources.BalancedAllocationName, Weight: 1}, {Name: imagelocality.Name, Weight: 1}, {Name: interpodaffinity.Name, Weight: 1}, {Name: noderesources.LeastAllocatedName, Weight: 1}, {Name: nodeaffinity.Name, Weight: 1}, {Name: nodepreferavoidpods.Name, Weight: 10000}, {Name: defaultpodtopologyspread.Name, Weight: 1}, {Name: tainttoleration.Name, Weight: 1}, }, }, Bind: &schedulerapi.PluginSet{ Enabled: []schedulerapi.Plugin{ {Name: defaultbinder.Name}, }, }, } }
在schedule模塊中原生的插件只是最基本的,在使用過程當中必定須要不少適合業務的一些插件須要被執行,新的framerwork框架爲用戶提供瞭如下幾種擴展實現方式api