383 lines
9.0 KiB
C
383 lines
9.0 KiB
C
/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/cpumask.h>
|
|
#include <linux/init.h>
|
|
#include <linux/topology.h>
|
|
#include <linux/nodemask.h>
|
|
|
|
/*
 * Number of nodes (the home node included) whose CPUs are merged into one
 * node-level sched_domain span.  Also used as the multiplier in the check
 * that decides whether a CPU gets an additional all-nodes domain.
 */
#define SD_NODES_PER_DOMAIN 6
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/**
|
|
* find_next_best_node - find the next node to include in a sched_domain
|
|
* @node: node whose sched_domain we're building
|
|
* @used_nodes: nodes already in the sched_domain
|
|
*
|
|
* Find the next node to include in a given scheduling domain. Simply
|
|
* finds the closest node not already in the @used_nodes map.
|
|
*
|
|
* Should use nodemask_t.
|
|
*/
|
|
static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
|
|
{
|
|
int i, n, val, min_val, best_node = 0;
|
|
|
|
min_val = INT_MAX;
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
/* Start at @node */
|
|
n = (node + i) % MAX_NUMNODES;
|
|
|
|
if (!nr_cpus_node(n))
|
|
continue;
|
|
|
|
/* Skip already used nodes */
|
|
if (test_bit(n, used_nodes))
|
|
continue;
|
|
|
|
/* Simple min distance search */
|
|
val = node_distance(node, n);
|
|
|
|
if (val < min_val) {
|
|
min_val = val;
|
|
best_node = n;
|
|
}
|
|
}
|
|
|
|
set_bit(best_node, used_nodes);
|
|
return best_node;
|
|
}
|
|
|
|
/**
|
|
* sched_domain_node_span - get a cpumask for a node's sched_domain
|
|
* @node: node whose cpumask we're constructing
|
|
* @size: number of nodes to include in this span
|
|
*
|
|
* Given a node, construct a good cpumask for its sched_domain to span. It
|
|
* should be one that prevents unnecessary balancing, but also spreads tasks
|
|
* out optimally.
|
|
*/
|
|
static cpumask_t __devinit sched_domain_node_span(int node)
|
|
{
|
|
int i;
|
|
cpumask_t span, nodemask;
|
|
DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
|
|
|
|
cpus_clear(span);
|
|
bitmap_zero(used_nodes, MAX_NUMNODES);
|
|
|
|
nodemask = node_to_cpumask(node);
|
|
cpus_or(span, span, nodemask);
|
|
set_bit(node, used_nodes);
|
|
|
|
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
|
|
int next_node = find_next_best_node(node, used_nodes);
|
|
nodemask = node_to_cpumask(next_node);
|
|
cpus_or(span, span, nodemask);
|
|
}
|
|
|
|
return span;
|
|
}
|
|
#endif
|
|
|
|
/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
/* Sibling-level domain of each CPU and the group storage it points into */
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];

/* Index into sched_group_cpus[] for @cpu: every CPU is its own group */
static int __devinit cpu_to_cpu_group(int cpu)
{
	return cpu;
}
#endif
|
|
|
|
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
|
|
static struct sched_group sched_group_phys[NR_CPUS];
|
|
static int __devinit cpu_to_phys_group(int cpu)
|
|
{
|
|
#ifdef CONFIG_SCHED_SMT
|
|
return first_cpu(cpu_sibling_map[cpu]);
|
|
#else
|
|
return cpu;
|
|
#endif
|
|
}
|
|
|
|
#ifdef CONFIG_NUMA
/*
 * The init_sched_build_groups can't handle what we want to do with node
 * groups, so roll our own.  Now each node has its own list of groups which
 * gets dynamically allocated.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

/* Top-level domain of each CPU and its per-node group storage */
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

/* Index into sched_group_allnodes[] for @cpu: the node @cpu belongs to */
static int __devinit cpu_to_allnodes_group(int cpu)
{
	int group = cpu_to_node(cpu);

	return group;
}
#endif
|
|
|
|
/*
|
|
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
|
*/
|
|
void __devinit arch_init_sched_domains(void)
|
|
{
|
|
int i;
|
|
cpumask_t cpu_default_map;
|
|
|
|
/*
|
|
* Setup mask for cpus without special case scheduling requirements.
|
|
* For now this just excludes isolated cpus, but could be used to
|
|
* exclude other special cases in the future.
|
|
*/
|
|
cpus_complement(cpu_default_map, cpu_isolated_map);
|
|
cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
|
|
|
|
/*
|
|
* Set up domains. Isolated domains just stay on the dummy domain.
|
|
*/
|
|
for_each_cpu_mask(i, cpu_default_map) {
|
|
int group;
|
|
struct sched_domain *sd = NULL, *p;
|
|
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
|
|
|
|
cpus_and(nodemask, nodemask, cpu_default_map);
|
|
|
|
#ifdef CONFIG_NUMA
|
|
if (num_online_cpus()
|
|
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
|
|
sd = &per_cpu(allnodes_domains, i);
|
|
*sd = SD_ALLNODES_INIT;
|
|
sd->span = cpu_default_map;
|
|
group = cpu_to_allnodes_group(i);
|
|
sd->groups = &sched_group_allnodes[group];
|
|
p = sd;
|
|
} else
|
|
p = NULL;
|
|
|
|
sd = &per_cpu(node_domains, i);
|
|
*sd = SD_NODE_INIT;
|
|
sd->span = sched_domain_node_span(cpu_to_node(i));
|
|
sd->parent = p;
|
|
cpus_and(sd->span, sd->span, cpu_default_map);
|
|
#endif
|
|
|
|
p = sd;
|
|
sd = &per_cpu(phys_domains, i);
|
|
group = cpu_to_phys_group(i);
|
|
*sd = SD_CPU_INIT;
|
|
sd->span = nodemask;
|
|
sd->parent = p;
|
|
sd->groups = &sched_group_phys[group];
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
p = sd;
|
|
sd = &per_cpu(cpu_domains, i);
|
|
group = cpu_to_cpu_group(i);
|
|
*sd = SD_SIBLING_INIT;
|
|
sd->span = cpu_sibling_map[i];
|
|
cpus_and(sd->span, sd->span, cpu_default_map);
|
|
sd->parent = p;
|
|
sd->groups = &sched_group_cpus[group];
|
|
#endif
|
|
}
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
/* Set up CPU (sibling) groups */
|
|
for_each_cpu_mask(i, cpu_default_map) {
|
|
cpumask_t this_sibling_map = cpu_sibling_map[i];
|
|
cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
|
|
if (i != first_cpu(this_sibling_map))
|
|
continue;
|
|
|
|
init_sched_build_groups(sched_group_cpus, this_sibling_map,
|
|
&cpu_to_cpu_group);
|
|
}
|
|
#endif
|
|
|
|
/* Set up physical groups */
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
cpumask_t nodemask = node_to_cpumask(i);
|
|
|
|
cpus_and(nodemask, nodemask, cpu_default_map);
|
|
if (cpus_empty(nodemask))
|
|
continue;
|
|
|
|
init_sched_build_groups(sched_group_phys, nodemask,
|
|
&cpu_to_phys_group);
|
|
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
init_sched_build_groups(sched_group_allnodes, cpu_default_map,
|
|
&cpu_to_allnodes_group);
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
/* Set up node groups */
|
|
struct sched_group *sg, *prev;
|
|
cpumask_t nodemask = node_to_cpumask(i);
|
|
cpumask_t domainspan;
|
|
cpumask_t covered = CPU_MASK_NONE;
|
|
int j;
|
|
|
|
cpus_and(nodemask, nodemask, cpu_default_map);
|
|
if (cpus_empty(nodemask))
|
|
continue;
|
|
|
|
domainspan = sched_domain_node_span(i);
|
|
cpus_and(domainspan, domainspan, cpu_default_map);
|
|
|
|
sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
|
|
sched_group_nodes[i] = sg;
|
|
for_each_cpu_mask(j, nodemask) {
|
|
struct sched_domain *sd;
|
|
sd = &per_cpu(node_domains, j);
|
|
sd->groups = sg;
|
|
if (sd->groups == NULL) {
|
|
/* Turn off balancing if we have no groups */
|
|
sd->flags = 0;
|
|
}
|
|
}
|
|
if (!sg) {
|
|
printk(KERN_WARNING
|
|
"Can not alloc domain group for node %d\n", i);
|
|
continue;
|
|
}
|
|
sg->cpu_power = 0;
|
|
sg->cpumask = nodemask;
|
|
cpus_or(covered, covered, nodemask);
|
|
prev = sg;
|
|
|
|
for (j = 0; j < MAX_NUMNODES; j++) {
|
|
cpumask_t tmp, notcovered;
|
|
int n = (i + j) % MAX_NUMNODES;
|
|
|
|
cpus_complement(notcovered, covered);
|
|
cpus_and(tmp, notcovered, cpu_default_map);
|
|
cpus_and(tmp, tmp, domainspan);
|
|
if (cpus_empty(tmp))
|
|
break;
|
|
|
|
nodemask = node_to_cpumask(n);
|
|
cpus_and(tmp, tmp, nodemask);
|
|
if (cpus_empty(tmp))
|
|
continue;
|
|
|
|
sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
|
|
if (!sg) {
|
|
printk(KERN_WARNING
|
|
"Can not alloc domain group for node %d\n", j);
|
|
break;
|
|
}
|
|
sg->cpu_power = 0;
|
|
sg->cpumask = tmp;
|
|
cpus_or(covered, covered, tmp);
|
|
prev->next = sg;
|
|
prev = sg;
|
|
}
|
|
prev->next = sched_group_nodes[i];
|
|
}
|
|
#endif
|
|
|
|
/* Calculate CPU power for physical packages and nodes */
|
|
for_each_cpu_mask(i, cpu_default_map) {
|
|
int power;
|
|
struct sched_domain *sd;
|
|
#ifdef CONFIG_SCHED_SMT
|
|
sd = &per_cpu(cpu_domains, i);
|
|
power = SCHED_LOAD_SCALE;
|
|
sd->groups->cpu_power = power;
|
|
#endif
|
|
|
|
sd = &per_cpu(phys_domains, i);
|
|
power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
|
|
(cpus_weight(sd->groups->cpumask)-1) / 10;
|
|
sd->groups->cpu_power = power;
|
|
|
|
#ifdef CONFIG_NUMA
|
|
sd = &per_cpu(allnodes_domains, i);
|
|
if (sd->groups) {
|
|
power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
|
|
(cpus_weight(sd->groups->cpumask)-1) / 10;
|
|
sd->groups->cpu_power = power;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
struct sched_group *sg = sched_group_nodes[i];
|
|
int j;
|
|
|
|
if (sg == NULL)
|
|
continue;
|
|
next_sg:
|
|
for_each_cpu_mask(j, sg->cpumask) {
|
|
struct sched_domain *sd;
|
|
int power;
|
|
|
|
sd = &per_cpu(phys_domains, j);
|
|
if (j != first_cpu(sd->groups->cpumask)) {
|
|
/*
|
|
* Only add "power" once for each
|
|
* physical package.
|
|
*/
|
|
continue;
|
|
}
|
|
power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
|
|
(cpus_weight(sd->groups->cpumask)-1) / 10;
|
|
|
|
sg->cpu_power += power;
|
|
}
|
|
sg = sg->next;
|
|
if (sg != sched_group_nodes[i])
|
|
goto next_sg;
|
|
}
|
|
#endif
|
|
|
|
/* Attach the domains */
|
|
for_each_online_cpu(i) {
|
|
struct sched_domain *sd;
|
|
#ifdef CONFIG_SCHED_SMT
|
|
sd = &per_cpu(cpu_domains, i);
|
|
#else
|
|
sd = &per_cpu(phys_domains, i);
|
|
#endif
|
|
cpu_attach_domain(sd, i);
|
|
}
|
|
}
|
|
|
|
/*
 * Free the dynamically allocated node groups.
 *
 * With CONFIG_NUMA, every non-NULL sched_group_nodes[] entry is the head of
 * a circular list of kmalloc()ed groups; each list is freed and its head
 * pointer reset to NULL.  Without CONFIG_NUMA this function does nothing.
 */
void __devinit arch_destroy_sched_domains(void)
{
#ifdef CONFIG_NUMA
	int node;

	for (node = 0; node < MAX_NUMNODES; node++) {
		struct sched_group *head = sched_group_nodes[node];
		struct sched_group *cur, *victim;

		if (!head)
			continue;

		/*
		 * Start with the element after the head so that the head
		 * is the last one freed and ends the walk.
		 */
		cur = head->next;
		do {
			victim = cur;
			cur = cur->next;
			kfree(victim);
		} while (victim != head);

		sched_group_nodes[node] = NULL;
	}
#endif
}
|
|
|