mirror of
https://gitee.com/bianbu-linux/linux-6.6
synced 2025-07-22 01:43:37 -04:00
Commit 7d2b5dd0bc
("sched/numa: Allow a floating imbalance between NUMA
nodes") allowed an imbalance between NUMA nodes such that communicating
tasks would not be pulled apart by the load balancer. This works fine when
there is a 1:1 relationship between LLC and node but can be suboptimal
for multiple LLCs if independent tasks prematurely use CPUs sharing cache.
Zen* has multiple LLCs per node with local memory channels and due to
the allowed imbalance, it's far harder to tune some workloads to run
optimally than it is on hardware that has 1 LLC per node. This patch
allows an imbalance to exist up to the point where LLCs should be balanced
between nodes.
On a Zen3 machine running STREAM parallelised with OMP to have one instance
per LLC and without binding, the results are
5.17.0-rc0 5.17.0-rc0
vanilla sched-numaimb-v6
MB/sec copy-16 162596.94 ( 0.00%) 580559.74 ( 257.05%)
MB/sec scale-16 136901.28 ( 0.00%) 374450.52 ( 173.52%)
MB/sec add-16 157300.70 ( 0.00%) 564113.76 ( 258.62%)
MB/sec triad-16 151446.88 ( 0.00%) 564304.24 ( 272.61%)
STREAM can use directives to force the spread if the OpenMP is new
enough but that doesn't help if an application uses threads and
it's not known in advance how many threads will be created.
Coremark is a CPU and cache intensive benchmark parallelised with
threads. When running with 1 thread per core, the vanilla kernel
allows threads to contend on cache. With the patch:
5.17.0-rc0 5.17.0-rc0
vanilla sched-numaimb-v5
Min Score-16 368239.36 ( 0.00%) 389816.06 ( 5.86%)
Hmean Score-16 388607.33 ( 0.00%) 427877.08 * 10.11%*
Max Score-16 408945.69 ( 0.00%) 481022.17 ( 17.62%)
Stddev Score-16 15247.04 ( 0.00%) 24966.82 ( -63.75%)
CoeffVar Score-16 3.92 ( 0.00%) 5.82 ( -48.48%)
It can also make a big difference for semi-realistic workloads
like specjbb which can execute arbitrary numbers of threads without
advance knowledge of how they should be placed. Even in cases where
the average performance is neutral, the results are more stable.
5.17.0-rc0 5.17.0-rc0
vanilla sched-numaimb-v6
Hmean tput-1 71631.55 ( 0.00%) 73065.57 ( 2.00%)
Hmean tput-8 582758.78 ( 0.00%) 556777.23 ( -4.46%)
Hmean tput-16 1020372.75 ( 0.00%) 1009995.26 ( -1.02%)
Hmean tput-24 1416430.67 ( 0.00%) 1398700.11 ( -1.25%)
Hmean tput-32 1687702.72 ( 0.00%) 1671357.04 ( -0.97%)
Hmean tput-40 1798094.90 ( 0.00%) 2015616.46 * 12.10%*
Hmean tput-48 1972731.77 ( 0.00%) 2333233.72 ( 18.27%)
Hmean tput-56 2386872.38 ( 0.00%) 2759483.38 ( 15.61%)
Hmean tput-64 2909475.33 ( 0.00%) 2925074.69 ( 0.54%)
Hmean tput-72 2585071.36 ( 0.00%) 2962443.97 ( 14.60%)
Hmean tput-80 2994387.24 ( 0.00%) 3015980.59 ( 0.72%)
Hmean tput-88 3061408.57 ( 0.00%) 3010296.16 ( -1.67%)
Hmean tput-96 3052394.82 ( 0.00%) 2784743.41 ( -8.77%)
Hmean tput-104 2997814.76 ( 0.00%) 2758184.50 ( -7.99%)
Hmean tput-112 2955353.29 ( 0.00%) 2859705.09 ( -3.24%)
Hmean tput-120 2889770.71 ( 0.00%) 2764478.46 ( -4.34%)
Hmean tput-128 2871713.84 ( 0.00%) 2750136.73 ( -4.23%)
Stddev tput-1 5325.93 ( 0.00%) 2002.53 ( 62.40%)
Stddev tput-8 6630.54 ( 0.00%) 10905.00 ( -64.47%)
Stddev tput-16 25608.58 ( 0.00%) 6851.16 ( 73.25%)
Stddev tput-24 12117.69 ( 0.00%) 4227.79 ( 65.11%)
Stddev tput-32 27577.16 ( 0.00%) 8761.05 ( 68.23%)
Stddev tput-40 59505.86 ( 0.00%) 2048.49 ( 96.56%)
Stddev tput-48 168330.30 ( 0.00%) 93058.08 ( 44.72%)
Stddev tput-56 219540.39 ( 0.00%) 30687.02 ( 86.02%)
Stddev tput-64 121750.35 ( 0.00%) 9617.36 ( 92.10%)
Stddev tput-72 223387.05 ( 0.00%) 34081.13 ( 84.74%)
Stddev tput-80 128198.46 ( 0.00%) 22565.19 ( 82.40%)
Stddev tput-88 136665.36 ( 0.00%) 27905.97 ( 79.58%)
Stddev tput-96 111925.81 ( 0.00%) 99615.79 ( 11.00%)
Stddev tput-104 146455.96 ( 0.00%) 28861.98 ( 80.29%)
Stddev tput-112 88740.49 ( 0.00%) 58288.23 ( 34.32%)
Stddev tput-120 186384.86 ( 0.00%) 45812.03 ( 75.42%)
Stddev tput-128 78761.09 ( 0.00%) 57418.48 ( 27.10%)
Similarly, for embarrassingly parallel problems like NPB-ep, there are
improvements due to better spreading across LLC when the machine is not
fully utilised.
vanilla sched-numaimb-v6
Min ep.D 31.79 ( 0.00%) 26.11 ( 17.87%)
Amean ep.D 31.86 ( 0.00%) 26.17 * 17.86%*
Stddev ep.D 0.07 ( 0.00%) 0.05 ( 24.41%)
CoeffVar ep.D 0.22 ( 0.00%) 0.20 ( 7.97%)
Max ep.D 31.93 ( 0.00%) 26.21 ( 17.91%)
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20220208094334.16379-3-mgorman@techsingularity.net
282 lines
6.7 KiB
C
282 lines
6.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SCHED_TOPOLOGY_H
|
|
#define _LINUX_SCHED_TOPOLOGY_H
|
|
|
|
#include <linux/topology.h>
|
|
|
|
#include <linux/sched/idle.h>
|
|
|
|
/*
|
|
* sched-domains (multiprocessor balancing) declarations:
|
|
*/
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
 * X-macro generation of the SD_* scheduler-domain flags.
 *
 * <linux/sched/sd_flags.h> invokes SD_FLAG(name, mflags) once per flag;
 * it is included twice with two different SD_FLAG() definitions:
 * first to produce the index enum (__SD_SHARE_..., ..., __SD_FLAG_CNT),
 * then to produce the bit-value enum (SD_x = 1 << __SD_x).
 */

/* Generate SD flag indexes */
#define SD_FLAG(name, mflags) __##name,
enum {
#include <linux/sched/sd_flags.h>
	__SD_FLAG_CNT,	/* total number of SD flags */
};
#undef SD_FLAG
/* Generate SD flag bits */
#define SD_FLAG(name, mflags) name = 1 << __##name,
enum {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
/*
 * Debug description of one SD_* flag (CONFIG_SCHED_DEBUG only).
 * One entry per flag; the table itself is defined elsewhere.
 */
struct sd_flag_debug {
	unsigned int meta_flags;	/* presumably the SD_FLAG() mflags argument — verify */
	char *name;			/* human-readable flag name */
};
extern const struct sd_flag_debug sd_flag_debug[];
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
/*
 * SD flags for the SMT topology level: hardware siblings share both
 * CPU capacity and package resources (caches).
 */
static inline int cpu_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_CLUSTER
|
|
/* SD flags for the cluster topology level: CPUs share package resources. */
static inline int cpu_cluster_flags(void)
{
	return SD_SHARE_PKG_RESOURCES;
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
/* SD flags for the multi-core (MC) topology level: cores share the LLC. */
static inline int cpu_core_flags(void)
{
	return SD_SHARE_PKG_RESOURCES;
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/* SD flags for NUMA topology levels: mark the domain as crossing nodes. */
static inline int cpu_numa_flags(void)
{
	return SD_NUMA;
}
|
|
#endif
|
|
|
|
extern int arch_asym_cpu_priority(int cpu);
|
|
|
|
/*
 * Attributes attached to sched domains via partition_sched_domains*().
 * relax_domain_level: highest domain level at which idle/newidle balancing
 * is relaxed; -1 selects the system default (NOTE(review): exact semantics
 * live in kernel/sched — confirm).
 */
struct sched_domain_attr {
	int relax_domain_level;
};

#define SD_ATTR_INIT (struct sched_domain_attr) {	\
	.relax_domain_level = -1,			\
}
|
|
|
|
extern int sched_domain_level_max;
|
|
|
|
struct sched_group;
|
|
|
|
/*
 * State shared between sched domains; hung off sched_domain::shared and
 * reference-counted via 'ref' (see sd_data.sds for the backing storage).
 */
struct sched_domain_shared {
	atomic_t	ref;		/* users of this shared state */
	atomic_t	nr_busy_cpus;	/* count of busy (non-idle) CPUs — presumably in the span; verify */
	int		has_idle_cores;	/* hint: an idle core may exist; used by idle-core search */
};
|
|
|
|
/*
 * One node of the per-CPU scheduler-domain hierarchy.  Domains are
 * linked parent (wider span, e.g. NUMA) to child (narrower span,
 * e.g. SMT); each domain owns the sched groups it balances between.
 * The structure is variable length: span[] is allocated to fit the
 * booted CPU count (see the note at the bottom).
 */
struct sched_domain {
	/* These fields must be setup */
	struct sched_domain __rcu *parent;	/* top domain must be null terminated */
	struct sched_domain __rcu *child;	/* bottom domain must be null terminated */
	struct sched_group *groups;	/* the balancing groups of the domain */
	unsigned long min_interval;	/* Minimum balance interval ms */
	unsigned long max_interval;	/* Maximum balance interval ms */
	unsigned int busy_factor;	/* less balancing by factor if busy */
	unsigned int imbalance_pct;	/* No balance until over watermark */
	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
	unsigned int imb_numa_nr;	/* Nr running tasks that allows a NUMA imbalance */

	int nohz_idle;			/* NOHZ IDLE status */
	int flags;			/* See SD_* */
	int level;			/* depth within the domain hierarchy */

	/* Runtime fields. */
	unsigned long last_balance;	/* init to jiffies. units in jiffies */
	unsigned int balance_interval;	/* initialise to 1. units in ms. */
	unsigned int nr_balance_failed; /* initialise to 0 */

	/* idle_balance() stats */
	u64 max_newidle_lb_cost;
	unsigned long last_decay_max_lb_cost;

	u64 avg_scan_cost;		/* select_idle_sibling */

#ifdef CONFIG_SCHEDSTATS
	/* load_balance() stats, indexed by the CPU idle state at balance time */
	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

	/* Active load balancing */
	unsigned int alb_count;
	unsigned int alb_failed;
	unsigned int alb_pushed;

	/* SD_BALANCE_EXEC stats */
	unsigned int sbe_count;
	unsigned int sbe_balanced;
	unsigned int sbe_pushed;

	/* SD_BALANCE_FORK stats */
	unsigned int sbf_count;
	unsigned int sbf_balanced;
	unsigned int sbf_pushed;

	/* try_to_wake_up() stats */
	unsigned int ttwu_wake_remote;
	unsigned int ttwu_move_affine;
	unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
	char *name;			/* topology level name, for debug output */
#endif
	union {
		void *private;		/* used during construction */
		struct rcu_head rcu;	/* used during destruction */
	};
	struct sched_domain_shared *shared;	/* state shared with other domains; see above */

	unsigned int span_weight;	/* number of CPUs in span[] */
	/*
	 * Span of all CPUs in this domain.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long span[];
};
|
|
|
|
/* View the domain's trailing span[] bitmap as a struct cpumask. */
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
{
	return to_cpumask(sd->span);
}
|
|
|
|
/*
 * Rebuild the sched-domain hierarchy for 'ndoms_new' root-domain
 * partitions.  The _locked variant is for callers that already hold the
 * required lock (NOTE(review): the exact locking rule is documented at
 * the definition in kernel/sched/topology.c — confirm there).
 */
extern void partition_sched_domains_locked(int ndoms_new,
					   cpumask_var_t doms_new[],
					   struct sched_domain_attr *dattr_new);

extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new);

/* Allocate an array of sched domains, for partition_sched_domains(). */
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);

/* True if the two CPUs share a cache (the !SMP stub always returns true). */
bool cpus_share_cache(int this_cpu, int that_cpu);

/* Per-topology-level callbacks: the CPU mask of a level, and its SD_* flags. */
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);

#define SDTL_OVERLAP	0x01	/* topology level flag; see sched_domain_topology_level.flags */
|
|
|
|
/*
 * Per-CPU backing storage for one topology level, used while building
 * the domain hierarchy (see struct sched_domain_topology_level).
 */
struct sd_data {
	struct sched_domain *__percpu *sd;		/* the domain at this level */
	struct sched_domain_shared *__percpu *sds;	/* shared state; see sched_domain_shared */
	struct sched_group *__percpu *sg;		/* balancing groups */
	struct sched_group_capacity *__percpu *sgc;	/* group capacity */
};
|
|
|
|
/*
 * One row of the topology table installed via set_sched_topology();
 * each row describes a level of the machine (SMT, cluster, MC, NUMA...).
 */
struct sched_domain_topology_level {
	sched_domain_mask_f mask;	/* CPUs this level spans for a given CPU */
	sched_domain_flags_f sd_flags;	/* SD_* flags for domains at this level */
	int		    flags;	/* SDTL_* build flags */
	int		    numa_level;	/* NUMA distance level (NUMA rows only) */
	struct sd_data      data;	/* per-CPU construction storage */
#ifdef CONFIG_SCHED_DEBUG
	char                *name;	/* level name for debug output */
#endif
};
|
|
|
|
extern void set_sched_topology(struct sched_domain_topology_level *tl);
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
# define SD_INIT_NAME(type) .name = #type
|
|
#else
|
|
# define SD_INIT_NAME(type)
|
|
#endif
|
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
struct sched_domain_attr;
|
|
|
|
/* !CONFIG_SMP: no sched domains exist, so repartitioning is a no-op. */
static inline void
partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
			       struct sched_domain_attr *dattr_new)
{
}
|
|
|
|
/* !CONFIG_SMP: no sched domains exist, so repartitioning is a no-op. */
static inline void
partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			struct sched_domain_attr *dattr_new)
{
}
|
|
|
|
/* !CONFIG_SMP: there is only one CPU, so any "pair" trivially shares cache. */
static inline bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return true;
}
|
|
|
|
#endif /* !CONFIG_SMP */
|
|
|
|
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
|
extern void rebuild_sched_domains_energy(void);
|
|
#else
|
|
/* Energy-aware scheduling not configured: rebuilding is a no-op. */
static inline void rebuild_sched_domains_energy(void)
{
}
|
|
#endif
|
|
|
|
#ifndef arch_scale_cpu_capacity
|
|
/**
 * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
 * @cpu: the CPU in question.
 *
 * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
 *
 *             max_perf(cpu)
 *      ----------------------------- * SCHED_CAPACITY_SCALE
 *      max(max_perf(c) : c \in CPUs)
 *
 * This default assumes every CPU has the same maximum performance, so the
 * factor is always SCHED_CAPACITY_SCALE; asymmetric architectures provide
 * their own arch_scale_cpu_capacity (guarded by the #ifndef above).
 */
static __always_inline
unsigned long arch_scale_cpu_capacity(int cpu)
{
	return SCHED_CAPACITY_SCALE;
}
|
|
#endif
|
|
|
|
#ifndef arch_scale_thermal_pressure
|
|
/* Default: no thermal pressure reported; archs may override via the #ifndef. */
static __always_inline
unsigned long arch_scale_thermal_pressure(int cpu)
{
	return 0;
}
|
|
#endif
|
|
|
|
#ifndef arch_update_thermal_pressure
|
|
/* Default no-op; archs that track frequency capping override via the #ifndef. */
static __always_inline
void arch_update_thermal_pressure(const struct cpumask *cpus,
				  unsigned long capped_frequency)
{ }
|
|
#endif
|
|
|
|
/* Return the NUMA node of the CPU the task is currently placed on. */
static inline int task_node(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_to_node(cpu);
}
|
|
|
|
#endif /* _LINUX_SCHED_TOPOLOGY_H */
|