Category: LINUX

2010-08-23 21:57:29

STEP 1: scheduler_tick() is called by the timer code with HZ frequency, with interrupts disabled. It is also called by the fork code when changing the parent's timeslices.

void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;

    sched_clock_tick();

    raw_spin_lock(&rq->lock);
    update_rq_clock(rq);
    update_cpu_load(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    raw_spin_unlock(&rq->lock);

    perf_event_task_tick(curr);

#ifdef CONFIG_SMP
    rq->idle_at_tick = idle_cpu(cpu);
    trigger_load_balance(rq, cpu);
#endif
}
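
For context, scheduler_tick() is reached from the per-CPU timer interrupt: the tick handler ends up in update_process_times() in kernel/timer.c, which calls scheduler_tick() once per tick. A trimmed sketch of that caller (the exact list of per-tick callbacks varies by kernel version):

void update_process_times(int user_tick)
{
    struct task_struct *p = current;

    account_process_tick(p, user_tick);  /* charge this tick to the current task */
    run_local_timers();                  /* expire per-CPU timers */
    /* ... RCU and other per-tick callbacks elided ... */
    scheduler_tick();                    /* the function shown above */
    run_posix_cpu_timers(p);
}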


STEP 2: trigger_load_balance() is called by scheduler_tick(); when it is time for periodic load balancing it raises the SCHED_SOFTIRQ softirq.

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 */

static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */

    if (rq->in_nohz_recently && !rq->idle_at_tick) {
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {
            int ilb = find_new_ilb(cpu);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */

    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
     cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
        resched_cpu(cpu);
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */

    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
     cpumask_test_cpu(cpu, nohz.cpu_mask))
        return;
#endif
    /* Don't need to rebalance while attached to NULL domain */
    if (time_after_eq(jiffies, rq->next_balance) &&
     likely(!on_null_domain(cpu)))
        raise_softirq(SCHED_SOFTIRQ);
}
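
raise_softirq(SCHED_SOFTIRQ) only marks the softirq pending; the actual work is done by whatever handler the scheduler registered for SCHED_SOFTIRQ at boot. In this kernel that registration sits in sched_init() and looks roughly like this:

#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

This is the link between STEP 2 (raising the softirq) and STEP 3 (handling it).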



STEP 3: handle the softirq, SCHED_SOFTIRQ. run_rebalance_domains() is the handler for the softirq raised by trigger_load_balance().

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */

static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ?
                        CPU_IDLE : CPU_NOT_IDLE;

    rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */

    if (this_rq->idle_at_tick &&
     atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */

            if (need_resched())
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}
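
Both trigger_load_balance() and run_rebalance_domains() consult the nohz bookkeeping (nohz.load_balancer, nohz.cpu_mask). For reference, this is a small file-scope structure in kernel/sched.c; a sketch showing just the fields used here (the real definition may carry extra fields):

static struct {
    atomic_t load_balancer;   /* cpu nominated as idle load balancer, -1 if none */
    cpumask_var_t cpu_mask;   /* cpus whose periodic tick is currently stopped   */
} nohz ____cacheline_aligned = {
    .load_balancer = ATOMIC_INIT(-1),
};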



/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */

static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;

    for_each_domain(cpu, sd) {
        if (!(sd->flags & SD_LOAD_BALANCE))
            continue;

        interval = sd->balance_interval;
        if (idle != CPU_IDLE)
            interval *= sd->busy_factor;

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {
            if (load_balance(cpu, rq, sd, idle, &balance)) {
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */

                idle = CPU_NOT_IDLE;
            }
            sd->last_balance = jiffies;
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */

        if (!balance)
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */

    if (likely(update_next_balance))
        rq->next_balance = next_balance;
}
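
To make the interval scaling above concrete, here is a hypothetical calculation (the parameter values are made up for illustration):

/*
 * Hypothetical example of the interval logic in rebalance_domains():
 * with sd->balance_interval = 64 (ms), sd->busy_factor = 32 and HZ = 1000,
 * a non-idle CPU checks this domain only every
 *     msecs_to_jiffies(64 * 32) = 2048 jiffies  (~2 seconds),
 * while an idle CPU uses the unscaled 64 ms.  Either value is clamped to
 * at most HZ*NR_CPUS/10 jiffies.
 */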


/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */

static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;
    struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

    cpumask_copy(cpus, cpu_active_mask);

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */

    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]);

redo:
    update_shares(sd);
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                 cpus, balance);

    if (*balance == 0)
        goto out_balanced;

    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */

        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                 imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */

        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (need_active_balance(sd, sd_idle, idle)) {
            raw_spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */

            if (!cpumask_test_cpu(this_cpu,
                     &busiest->curr->cpus_allowed)) {
                raw_spin_unlock_irqrestore(&busiest->lock,
                             flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */

            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */

        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}


/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */

static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
         unsigned long max_load_move,
         struct sched_domain *sd, enum cpu_idle_type idle,
         int *all_pinned)
{
    unsigned long total_load_moved = 0, load_moved;
    int this_best_prio = this_rq->curr->prio;

    do {
        load_moved = load_balance_fair(this_rq, this_cpu, busiest,
                max_load_move - total_load_moved,
                sd, idle, all_pinned, &this_best_prio);

        total_load_moved += load_moved;

#ifdef CONFIG_PREEMPT
        /*
         * NEWIDLE balancing is a source of latency, so preemptible
         * kernels will stop after the first task is pulled to minimize
         * the critical section.
         */

        if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
            break;

        if (raw_spin_is_contended(&this_rq->lock) ||
                raw_spin_is_contended(&busiest->lock))
            break;
#endif
    } while (load_moved && max_load_move > total_load_moved);

    return total_load_moved > 0;
}


static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         unsigned long max_load_move,
         struct sched_domain *sd, enum cpu_idle_type idle,
         int *all_pinned, int *this_best_prio)
{
    long rem_load_move = max_load_move;
    int busiest_cpu = cpu_of(busiest);
    struct task_group *tg;

    rcu_read_lock();
    update_h_load(busiest_cpu);

    list_for_each_entry_rcu(tg, &task_groups, list) {
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;
        u64 rem_load, moved_load;

        /*
         * empty group
         */

        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);

        moved_load = balance_tasks(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                busiest_cfs_rq);

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
}
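
The two div_u64() scalings in load_balance_fair() translate between root-level and group-local load: rem_load converts the remaining root-level budget into the group's own weight units before calling balance_tasks(), and moved_load converts what was actually pulled back into root-level units. A hypothetical example with made-up numbers:

/*
 * Purely illustrative numbers:
 * rem_load_move = 1024, busiest_weight (cfs_rq load.weight) = 2048,
 * busiest_h_load = 512.
 *
 *   rem_load   = 1024 * 2048 / (512 + 1)  ~= 4088   group-local weight
 *                                                   handed to balance_tasks()
 *
 * If balance_tasks() then pulls 2048 of that group-local weight:
 *
 *   moved_load = 2048 * 512 / (2048 + 1)  ~= 511    root-level weight,
 *                                                   subtracted from rem_load_move
 */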


static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
     unsigned long max_load_move, struct sched_domain *sd,
     enum cpu_idle_type idle, int *all_pinned,
     int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
{
    int loops = 0, pulled = 0, pinned = 0;
    long rem_load_move = max_load_move;
    struct task_struct *p, *n;

    if (max_load_move == 0)
        goto out;

    pinned = 1;

    list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
        if (loops++ > sysctl_sched_nr_migrate)
            break;

        if ((p->se.load.weight >> 1) > rem_load_move ||
         !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
            continue;

        pull_task(busiest, p, this_rq, this_cpu);
        pulled++;
        rem_load_move -= p->se.load.weight;

#ifdef CONFIG_PREEMPT
        /*
         * NEWIDLE balancing is a source of latency, so preemptible
         * kernels will stop after the first task is pulled to minimize
         * the critical section.
         */

        if (idle == CPU_NEWLY_IDLE)
            break;
#endif

        /*
         * We only want to steal up to the prescribed amount of
         * weighted load.
         */

        if (rem_load_move <= 0)
            break;

        if (p->prio < *this_best_prio)
            *this_best_prio = p->prio;
    }
out:
    /*
     * Right now, this is one of only two places pull_task() is called,
     * so we can safely collect pull_task() stats here rather than
     * inside pull_task().
     */

    schedstat_add(sd, lb_gained[idle], pulled);

    if (all_pinned)
        *all_pinned = pinned;

    return max_load_move - rem_load_move;
}


/**************************************************
 * Fair scheduling class load-balancing methods:
 */


/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */

static void pull_task(struct rq *src_rq, struct task_struct *p,
         struct rq *this_rq, int this_cpu)
{
    deactivate_task(src_rq, p, 0);
    set_task_cpu(p, this_cpu);
    activate_task(this_rq, p, 0);
    check_preempt_curr(this_rq, p, 0);
}
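
Putting the pieces together, the periodic load-balancing path traced in this post is:

scheduler_tick()                          /* timer tick, HZ frequency          */
  -> trigger_load_balance()               /* raises SCHED_SOFTIRQ when due     */
     -> run_rebalance_domains()           /* SCHED_SOFTIRQ handler             */
        -> rebalance_domains()            /* walk this cpu's sched_domains     */
           -> load_balance()              /* find busiest group and runqueue   */
              -> move_tasks()             /* pull weighted load until done     */
                 -> load_balance_fair()   /* per-task_group h_load scaling     */
                    -> balance_tasks()    /* pick migratable tasks             */
                       -> pull_task()     /* deactivate, set cpu, activate     */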

