// SPDX-License-Identifier: GPL-2.0
/*
 * kernel/sched/loadavg.c
 *
 * This file contains the magic bits required to compute the global loadavg
 * figure. It's a silly number but people think it's important. We go through
 * great pains to make it work on big machines and tickless kernels.
 */
#include "sched.h"

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
 *    serious number of CPUs, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true by definition -- we
 *    can simply take per-CPU deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    CPU to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
 *    this would add another cross-CPU cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever CPU the task ran
 *    when it went into uninterruptible state and decrement on whatever CPU
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all CPUs yields the correct result.
 *
 * This covers the NO_HZ=n code; for extra head-aches, see the comment below.
 */
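
/*
 * For reference, the per-sample EMA step used throughout this file is
 * calc_load() from include/linux/sched/loadavg.h; a sketch, assuming the
 * stock fixed-point definition (FSHIFT = 11, FIXED_1 = 1 << FSHIFT):
 *
 *	static inline unsigned long
 *	calc_load(unsigned long load, unsigned long exp, unsigned long active)
 *	{
 *		unsigned long newload;
 *
 *		newload = load * exp + active * (FIXED_1 - exp);
 *		if (active >= load)
 *			newload += FIXED_1 - 1;
 *
 *		return newload / FIXED_1;
 *	}
 *
 * E.g. starting from load == 0 with active == 2*FIXED_1 (two tasks) and
 * exp == EXP_1 (1884): newload = 2*2048*164 + 2047 = 673791, giving
 * 673791/2048 = 328, i.e. 328/2048 ~= 0.16 after one 5 s sample.
 */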

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}
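
/*
 * Typical use (a sketch, following fs/proc/loadavg.c): /proc/loadavg reads
 * the three averages with a +FIXED_1/200 offset for rounding and splits
 * each fixed-point value into integer and two-digit fractional parts:
 *
 *	unsigned long avnrun[3];
 *
 *	get_avenrun(avnrun, FIXED_1/200, 0);
 *	printf("%lu.%02lu", avnrun[0] >> FSHIFT,
 *	       ((avnrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT);
 */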

long calc_load_fold_active(struct rq *this_rq, long adjust)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running - adjust;
	nr_active += (long)this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}
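
/*
 * Worked example (hypothetical numbers): if this CPU last reported
 * calc_load_active == 1 and now has nr_running == 2 plus one task in
 * uninterruptible sleep, nr_active == 3, so the function returns
 * delta == 2 and records 3. Adding that delta to calc_load_tasks
 * replaces this CPU's old contribution with its new one without ever
 * summing across all CPUs.
 */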

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}
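
/*
 * Worked example: with frac_bits = 11 (FIXED_1 = 2048), x = 1024 (0.5) and
 * n = 3 (binary 11), the loop computes:
 *
 *	n=3, bit set: result = (2048 * 1024 + 1024) >> 11 = 1024; x -> 512 (0.25)
 *	n=1, bit set: result = (1024 *  512 + 1024) >> 11 =  256
 *
 * 256/2048 = 0.125 = 0.5^3, using only two fixed-point multiplies.
 */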

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 * ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))	[1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0           1 - x
 */
unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
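
/*
 * Sanity check (a sketch): folding calc_load() twice with the same inputs
 * should match calc_load_n() with n = 2, modulo fixed-point rounding:
 *
 *	l2a = calc_load(calc_load(l0, EXP_1, a), EXP_1, a);
 *	l2b = calc_load_n(l0, EXP_1, a, 2);	// |l2a - l2b| <= a few ulps
 */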

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-CPU sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two NO_HZ-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
 *    This ensures we'll fold the old NO_HZ contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
 *    intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
static atomic_long_t calc_load_nohz[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next NO_HZ-delta.
	 */
	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

void calc_load_nohz_start(void)
{
	struct rq *this_rq = this_rq();
	long delta;

	/*
	 * We're going into NO_HZ mode; if there's any pending delta, fold it
	 * into the pending NO_HZ delta.
	 */
	delta = calc_load_fold_active(this_rq, 0);
	if (delta) {
		int idx = calc_load_write_idx();

		atomic_long_add(delta, &calc_load_nohz[idx]);
	}
}

void calc_load_nohz_stop(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the pending sample window, we're done.
	 */
	this_rq->calc_load_update = READ_ONCE(calc_load_update);
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window; this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_nohz_fold(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

	if (atomic_long_read(&calc_load_nohz[idx]))
		delta = atomic_long_xchg(&calc_load_nohz[idx], 0);

	return delta;
}

/*
 * NO_HZ can leave us missing all per-CPU ticks calling
 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	unsigned long sample_window;
	long delta, active, n;

	sample_window = READ_ONCE(calc_load_update);
	if (!time_before(jiffies, sample_window + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - sample_window - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
	}

	/*
	 * Flip the NO_HZ index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index; this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}

#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_nohz_fold(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 *
 * Called from the global timer code.
 */
void calc_global_load(unsigned long ticks)
{
	unsigned long sample_window;
	long active, delta;

	sample_window = READ_ONCE(calc_load_update);
	if (time_before(jiffies, sample_window + 10))
		return;

	/*
	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
	 */
	delta = calc_load_nohz_fold();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);

	/*
	 * In case we went to NO_HZ for multiple LOAD_FREQ intervals,
	 * catch up in bulk.
	 */
	calc_global_nohz();
}
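
/*
 * Call-site sketch (assuming the classic do_timer() path in
 * kernel/time/timekeeping.c): the global timer advances jiffies and then
 * drives the load sampling from a single place:
 *
 *	void do_timer(unsigned long ticks)
 *	{
 *		jiffies_64 += ticks;
 *		calc_global_load(ticks);
 *	}
 */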

/*
 * Called from scheduler_tick() to periodically update this CPU's
 * active count.
 */
void calc_global_load_tick(struct rq *this_rq)
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta = calc_load_fold_active(this_rq, 0);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}
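
/*
 * Timing sketch (illustrative): with LOAD_FREQ = 5*HZ+1, each rq folds its
 * delta once jiffies reaches rq->calc_load_update, while calc_global_load()
 * only consumes calc_load_tasks at calc_load_update + 10 ticks; e.g. with
 * HZ = 100 and a window starting at tick 501:
 *
 *	tick 501:  every CPU folds its nr_active delta into calc_load_tasks
 *	tick 511:  calc_global_load() updates avenrun[] from the summed value
 *	tick 1002: next per-CPU fold window, and so on every ~5 s
 */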
