Print this page
6147 segop_getpolicy already checks for a NULL op
Reviewed by: Garrett D'Amore <garrett@damore.org>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/os/lgrp.c
+++ new/usr/src/uts/common/os/lgrp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 /*
27 27 * Basic NUMA support in terms of locality groups
28 28 *
29 29 * Solaris needs to know which CPUs, memory, etc. are near each other to
30 30 * provide good performance on NUMA machines by optimizing for locality.
31 31 * In order to do this, a new abstraction called a "locality group (lgroup)"
32 32 * has been introduced to keep track of which CPU-like and memory-like hardware
33 33 * resources are close to each other. Currently, latency is the only measure
34 34 * used to determine how to group hardware resources into lgroups, but this
35 35 * does not limit the groupings to be based solely on latency. Other factors
36 36 * may be used to determine the groupings in the future.
37 37 *
38 38 * Lgroups are organized into a hierarchy or topology that represents the
39 39 * latency topology of the machine. There is always at least a root lgroup in
40 40 * the system. It represents all the hardware resources in the machine at a
41 41 * latency big enough that any hardware resource can at least access any other
42 42 * hardware resource within that latency. A Uniform Memory Access (UMA)
43 43 * machine is represented with one lgroup (the root). In contrast, a NUMA
44 44 * machine is represented at least by the root lgroup and some number of leaf
45 45 * lgroups where the leaf lgroups contain the hardware resources within the
46 46 * least latency of each other and the root lgroup still contains all the
47 47 * resources in the machine. Some number of intermediate lgroups may exist
48 48 * which represent more levels of locality than just the local latency of the
49 49 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
50 50 * (eg. root and intermediate lgroups) contain the next nearest resources to
51 51 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
52 52 * to the root lgroup shows the hardware resources from closest to farthest
53 53 * from the leaf lgroup such that each successive ancestor lgroup contains
54 54 * the next nearest resources at the next level of locality from the previous.
55 55 *
56 56 * The kernel uses the lgroup abstraction to know how to allocate resources
57 57 * near a given process/thread. At fork() and lwp/thread_create() time, a
58 58 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
59 59 * with the lowest load average. Binding to a processor or processor set will
60 60 * change the home lgroup for a thread. The scheduler has been modified to try
61 61 * to dispatch a thread on a CPU in its home lgroup. Physical memory
62 62 * allocation is lgroup aware too, so memory will be allocated from the current
63 63 * thread's home lgroup if possible. If the desired resources are not
64 64 * available, the kernel traverses the lgroup hierarchy going to the parent
65 65 * lgroup to find resources at the next level of locality until it reaches the
66 66 * root lgroup.
67 67 */
68 68
69 69 #include <sys/lgrp.h>
70 70 #include <sys/lgrp_user.h>
71 71 #include <sys/types.h>
72 72 #include <sys/mman.h>
73 73 #include <sys/param.h>
74 74 #include <sys/var.h>
75 75 #include <sys/thread.h>
76 76 #include <sys/cpuvar.h>
77 77 #include <sys/cpupart.h>
78 78 #include <sys/kmem.h>
79 79 #include <vm/seg.h>
80 80 #include <vm/seg_kmem.h>
81 81 #include <vm/seg_spt.h>
82 82 #include <vm/seg_vn.h>
83 83 #include <vm/as.h>
84 84 #include <sys/atomic.h>
85 85 #include <sys/systm.h>
86 86 #include <sys/errno.h>
87 87 #include <sys/cmn_err.h>
88 88 #include <sys/kstat.h>
89 89 #include <sys/sysmacros.h>
90 90 #include <sys/pg.h>
91 91 #include <sys/promif.h>
92 92 #include <sys/sdt.h>
93 93
/*
 * Global lgroup bookkeeping shared by the routines in this file.
 *
 * lgrp_gen is bumped with atomic_inc_32() by lgrp_config() on most
 * (re)configuration events.  lgrp_alloc_hint stays at -1 until the first
 * lgroup is destroyed; after that lgrp_create() starts its search for a free
 * lgrp_table[] slot at the hint.
 */
lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t		*lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int		nlgrps;			/* number of lgroups in machine */
int		lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int		lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
100 100
/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to kstat framework. It is protected from parallel
 * modifications by lgrp_kstat_mutex. This may cause some contention when
 * several kstat commands run in parallel but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

/*
 * Kstat helpers private to this file.
 */
static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

/*
 * Named kstat template and the mutex guarding it; both are initialized by
 * lgrp_kstat_init().
 */
static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;
124 124
125 125
/*
 * max number of lgroups supported by the platform
 *
 * Set from lgrp_plat_max_lgrps() during LGRP_INIT_STAGE1 of lgrp_init();
 * asserted there to be no larger than NLGRPS_MAX.
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 *
 * NULL until lgrp_root_init() points it at the statically allocated lroot.
 */
lgrp_t		*lgrp_root = NULL;
136 136
/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;		/* set in lgrp_root_init() */
/* resource set and id-to-rset map shared by both bootstrap lpls */
static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)

/* static storage for the root lgroup; lgrp_root points here once set up */
static lgrp_t	lroot;
170 170
/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/*
 * the maximum effect that a single thread can have on its lgroup's load
 *
 * The macro divides the tunable lgrp_loadavg_max_effect by the CPU count
 * passed in; callers must pass a non-zero ncpu.
 */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 *
 * lgrp_main_init() resets an out-of-range value (or
 * LGRP_MEM_POLICY_NEXT_SEG) back to LGRP_MEM_POLICY_NEXT.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
206 206
207 207
/*
 * lgroup CPU event handlers
 *
 * lgrp_cpu_init() and lgrp_cpu_fini() are driven from lgrp_config() for the
 * CPU online/offline events and from lgrp_main_init() when cpu0 has to move.
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 *
 * Invoked from lgrp_config() for the LGRP_CONFIG_MEM_* events.
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

/*
 * lgroup framework initialization
 *
 * Each of these is reached through lgrp_init() at a particular boot stage
 * (lgrp_root_init() indirectly, via lgrp_setup()).
 */
static void	lgrp_main_init(void);
static void	lgrp_main_mp_init(void);
static void	lgrp_root_init(void);
static void	lgrp_setup(void);

/*
 * lpl topology
 */
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);
250 250
/*
 * defines for lpl topology verifier return codes
 *
 * lgrp_config() compares the result of lpl_topo_verify() against
 * LPL_TOPO_CORRECT and panics, printing the numeric code, on any other value.
 * NOTE(review): the precise condition behind each negative code is defined by
 * the verifier, which is not visible from this part of the file; the names
 * are the only description available here.
 */

#define	LPL_TOPO_CORRECT			0
#define	LPL_TOPO_PART_HAS_NO_LPL		-1
#define	LPL_TOPO_CPUS_NOT_EMPTY			-2
#define	LPL_TOPO_LGRP_MISMATCH			-3
#define	LPL_TOPO_MISSING_PARENT			-4
#define	LPL_TOPO_PARENT_MISMATCH		-5
#define	LPL_TOPO_BAD_CPUCNT			-6
#define	LPL_TOPO_RSET_MISMATCH			-7
#define	LPL_TOPO_LPL_ORPHANED			-8
#define	LPL_TOPO_LPL_BAD_NCPU			-9
#define	LPL_TOPO_RSET_MSSNG_LF			-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
#define	LPL_TOPO_NONLEAF_HAS_CPUS		-12
#define	LPL_TOPO_LGRP_NOT_LEAF			-13
#define	LPL_TOPO_BAD_RSETCNT			-14
270 270
271 271 /*
272 272 * Return whether lgroup optimizations should be enabled on this system
273 273 */
274 274 int
275 275 lgrp_optimizations(void)
276 276 {
277 277 /*
278 278 * System must have more than 2 lgroups to enable lgroup optimizations
279 279 *
280 280 * XXX This assumes that a 2 lgroup system has an empty root lgroup
281 281 * with one child lgroup containing all the resources. A 2 lgroup
282 282 * system with a root lgroup directly containing CPUs or memory might
283 283 * need lgroup optimizations with its child lgroup, but there
284 284 * isn't such a machine for now....
285 285 */
286 286 if (nlgrps > 2)
287 287 return (1);
288 288
289 289 return (0);
290 290 }
291 291
292 292 /*
293 293 * Setup root lgroup
294 294 */
295 295 static void
296 296 lgrp_root_init(void)
297 297 {
298 298 lgrp_handle_t hand;
299 299 int i;
300 300 lgrp_id_t id;
301 301
302 302 /*
303 303 * Create the "root" lgroup
304 304 */
305 305 ASSERT(nlgrps == 0);
306 306 id = nlgrps++;
307 307
308 308 lgrp_root = &lroot;
309 309
310 310 lgrp_root->lgrp_cpu = NULL;
311 311 lgrp_root->lgrp_mnodes = 0;
312 312 lgrp_root->lgrp_nmnodes = 0;
313 313 hand = lgrp_plat_root_hand();
314 314 lgrp_root->lgrp_plathand = hand;
315 315
316 316 lgrp_root->lgrp_id = id;
317 317 lgrp_root->lgrp_cpucnt = 0;
318 318 lgrp_root->lgrp_childcnt = 0;
319 319 klgrpset_clear(lgrp_root->lgrp_children);
320 320 klgrpset_clear(lgrp_root->lgrp_leaves);
321 321 lgrp_root->lgrp_parent = NULL;
322 322 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
323 323
324 324 for (i = 0; i < LGRP_RSRC_COUNT; i++)
325 325 klgrpset_clear(lgrp_root->lgrp_set[i]);
326 326
327 327 lgrp_root->lgrp_kstat = NULL;
328 328
329 329 lgrp_table[id] = lgrp_root;
330 330
331 331 /*
332 332 * Setup initial lpl list for CPU0 and initial t0 home.
333 333 * The only lpl space we have so far is lpl_bootstrap. It is used for
334 334 * all topology operations until cp_default is initialized at which
335 335 * point t0.t_lpl will be updated.
336 336 */
337 337 lpl_bootstrap = lpl_bootstrap_list;
338 338 t0.t_lpl = lpl_bootstrap;
339 339 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
340 340 lpl_bootstrap_list[1].lpl_lgrpid = 1;
341 341
342 342 /*
343 343 * Set up the bootstrap rset
344 344 * Since the bootstrap toplogy has just the root, and a leaf,
345 345 * the rset contains just the leaf, and both lpls can use the same rset
346 346 */
347 347 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
348 348 lpl_bootstrap_list[0].lpl_rset_sz = 1;
349 349 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
350 350 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
351 351
352 352 lpl_bootstrap_list[1].lpl_rset_sz = 1;
353 353 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
354 354 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
355 355
356 356 cp_default.cp_lgrploads = lpl_bootstrap;
357 357 }
358 358
359 359 /*
360 360 * Initialize the lgroup framework and allow the platform to do the same
361 361 *
362 362 * This happens in stages during boot and is all funnelled through this routine
363 363 * (see definition of lgrp_init_stages_t to see what happens at each stage and
364 364 * when)
365 365 */
366 366 void
367 367 lgrp_init(lgrp_init_stages_t stage)
368 368 {
369 369 /*
370 370 * Initialize the platform
371 371 */
372 372 lgrp_plat_init(stage);
373 373
374 374 switch (stage) {
375 375 case LGRP_INIT_STAGE1:
376 376 /*
377 377 * Set max number of lgroups supported on this platform which
378 378 * must be less than the max number of lgroups supported by the
379 379 * common lgroup framework (eg. NLGRPS_MAX is max elements in
380 380 * lgrp_table[], etc.)
381 381 */
382 382 nlgrpsmax = lgrp_plat_max_lgrps();
383 383 ASSERT(nlgrpsmax <= NLGRPS_MAX);
384 384 break;
385 385
386 386 case LGRP_INIT_STAGE2:
387 387 lgrp_setup();
388 388 break;
389 389
390 390 case LGRP_INIT_STAGE4:
391 391 lgrp_main_init();
392 392 break;
393 393
394 394 case LGRP_INIT_STAGE5:
395 395 lgrp_main_mp_init();
396 396 break;
397 397
398 398 default:
399 399 break;
400 400 }
401 401 }
402 402
403 403 /*
404 404 * Create the root and cpu0's lgroup, and set t0's home.
405 405 */
406 406 static void
407 407 lgrp_setup(void)
408 408 {
409 409 /*
410 410 * Setup the root lgroup
411 411 */
412 412 lgrp_root_init();
413 413
414 414 /*
415 415 * Add cpu0 to an lgroup
416 416 */
417 417 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
418 418 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
419 419 }
420 420
/*
 * true when lgrp initialization has been completed.
 *
 * Set to 1 at the end of lgrp_main_init().  Several routines in this file
 * only insist on cpu_lock being held once this flag is set.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 *
 * Set to 1 by lgrp_main_mp_init() after its lgrp_topo_update() call.
 */
int	lgrp_topo_initialized = 0;
430 430
431 431 /*
432 432 * Init routine called after startup(), /etc/system has been processed,
433 433 * and cpu0 has been added to an lgroup.
434 434 */
435 435 static void
436 436 lgrp_main_init(void)
437 437 {
438 438 cpu_t *cp = CPU;
439 439 lgrp_id_t lgrpid;
440 440 int i;
441 441 extern void pg_cpu0_reinit();
442 442
443 443 /*
444 444 * Enforce a valid lgrp_mem_default_policy
445 445 */
446 446 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
447 447 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
448 448 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
449 449 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
450 450
451 451 /*
452 452 * See if mpo should be disabled.
453 453 * This may happen in the case of null proc LPA on Starcat.
454 454 * The platform won't be able to detect null proc LPA until after
455 455 * cpu0 and memory have already been added to lgroups.
456 456 * When and if it is detected, the Starcat platform will return
457 457 * a different platform handle for cpu0 which is what we check for
458 458 * here. If mpo should be disabled move cpu0 to it's rightful place
459 459 * (the root), and destroy the remaining lgroups. This effectively
460 460 * provides an UMA lgroup topology.
461 461 */
462 462 lgrpid = cp->cpu_lpl->lpl_lgrpid;
463 463 if (lgrp_table[lgrpid]->lgrp_plathand !=
464 464 lgrp_plat_cpu_to_hand(cp->cpu_id)) {
465 465 lgrp_part_del_cpu(cp);
466 466 lgrp_cpu_fini(cp, lgrpid);
467 467
468 468 lgrp_cpu_init(cp);
469 469 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
470 470
471 471 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
472 472
473 473 /*
474 474 * Notify the PG subsystem that the CPU's lgrp
475 475 * association has changed
476 476 */
477 477 pg_cpu0_reinit();
478 478
479 479 /*
480 480 * Destroy all lgroups except for root
481 481 */
482 482 for (i = 0; i <= lgrp_alloc_max; i++) {
483 483 if (LGRP_EXISTS(lgrp_table[i]) &&
484 484 lgrp_table[i] != lgrp_root)
485 485 lgrp_destroy(lgrp_table[i]);
486 486 }
487 487
488 488 /*
489 489 * Fix up root to point at itself for leaves and resources
490 490 * and not have any children
491 491 */
492 492 lgrp_root->lgrp_childcnt = 0;
493 493 klgrpset_clear(lgrp_root->lgrp_children);
494 494 klgrpset_clear(lgrp_root->lgrp_leaves);
495 495 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
496 496 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
497 497 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
498 498 }
499 499
500 500 /*
501 501 * Initialize kstats framework.
502 502 */
503 503 lgrp_kstat_init();
504 504 /*
505 505 * cpu0 is finally where it should be, so create it's lgroup's kstats
506 506 */
507 507 mutex_enter(&cpu_lock);
508 508 lgrp_kstat_create(cp);
509 509 mutex_exit(&cpu_lock);
510 510
511 511 lgrp_initialized = 1;
512 512 }
513 513
514 514 /*
515 515 * Finish lgrp initialization after all CPUS are brought on-line.
516 516 * This routine is called after start_other_cpus().
517 517 */
518 518 static void
519 519 lgrp_main_mp_init(void)
520 520 {
521 521 klgrpset_t changed;
522 522
523 523 /*
524 524 * Update lgroup topology (if necessary)
525 525 */
526 526 klgrpset_clear(changed);
527 527 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
528 528 lgrp_topo_initialized = 1;
529 529 }
530 530
531 531 /*
532 532 * Change latency of lgroup with specified lgroup platform handle (if one is
533 533 * given) or change all lgroups with old latency to new latency
534 534 */
535 535 void
536 536 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
537 537 u_longlong_t newtime)
538 538 {
539 539 lgrp_t *lgrp;
540 540 int i;
541 541
542 542 for (i = 0; i <= lgrp_alloc_max; i++) {
543 543 lgrp = lgrp_table[i];
544 544
545 545 if (!LGRP_EXISTS(lgrp))
546 546 continue;
547 547
548 548 if ((hand == LGRP_NULL_HANDLE &&
549 549 lgrp->lgrp_latency == oldtime) ||
550 550 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
551 551 lgrp->lgrp_latency = (int)newtime;
552 552 }
553 553 }
554 554
555 555 /*
556 556 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
557 557 */
558 558 void
559 559 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
560 560 {
561 561 klgrpset_t changed;
562 562 cpu_t *cp;
563 563 lgrp_id_t id;
564 564 int rc;
565 565
566 566 switch (event) {
567 567 /*
568 568 * The following (re)configuration events are common code
569 569 * initiated. lgrp_plat_config() is called here to inform the
570 570 * platform of the reconfiguration event.
571 571 */
572 572 case LGRP_CONFIG_CPU_ADD:
573 573 cp = (cpu_t *)resource;
574 574
575 575 /*
576 576 * Initialize the new CPU's lgrp related next/prev
577 577 * links, and give it a bootstrap lpl so that it can
578 578 * survive should it need to enter the dispatcher.
579 579 */
580 580 cp->cpu_next_lpl = cp;
581 581 cp->cpu_prev_lpl = cp;
582 582 cp->cpu_next_lgrp = cp;
583 583 cp->cpu_prev_lgrp = cp;
584 584 cp->cpu_lpl = lpl_bootstrap;
585 585
586 586 lgrp_plat_config(event, resource);
587 587 atomic_inc_32(&lgrp_gen);
588 588
589 589 break;
590 590 case LGRP_CONFIG_CPU_DEL:
591 591 lgrp_plat_config(event, resource);
592 592 atomic_inc_32(&lgrp_gen);
593 593
594 594 break;
595 595 case LGRP_CONFIG_CPU_ONLINE:
596 596 cp = (cpu_t *)resource;
597 597 lgrp_cpu_init(cp);
598 598 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
599 599 rc = lpl_topo_verify(cp->cpu_part);
600 600 if (rc != LPL_TOPO_CORRECT) {
601 601 panic("lpl_topo_verify failed: %d", rc);
602 602 }
603 603 lgrp_plat_config(event, resource);
604 604 atomic_inc_32(&lgrp_gen);
605 605
606 606 break;
607 607 case LGRP_CONFIG_CPU_OFFLINE:
608 608 cp = (cpu_t *)resource;
609 609 id = cp->cpu_lpl->lpl_lgrpid;
610 610 lgrp_part_del_cpu(cp);
611 611 lgrp_cpu_fini(cp, id);
612 612 rc = lpl_topo_verify(cp->cpu_part);
613 613 if (rc != LPL_TOPO_CORRECT) {
614 614 panic("lpl_topo_verify failed: %d", rc);
615 615 }
616 616 lgrp_plat_config(event, resource);
617 617 atomic_inc_32(&lgrp_gen);
618 618
619 619 break;
620 620 case LGRP_CONFIG_CPUPART_ADD:
621 621 cp = (cpu_t *)resource;
622 622 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
623 623 rc = lpl_topo_verify(cp->cpu_part);
624 624 if (rc != LPL_TOPO_CORRECT) {
625 625 panic("lpl_topo_verify failed: %d", rc);
626 626 }
627 627 lgrp_plat_config(event, resource);
628 628
629 629 break;
630 630 case LGRP_CONFIG_CPUPART_DEL:
631 631 cp = (cpu_t *)resource;
632 632 lgrp_part_del_cpu((cpu_t *)resource);
633 633 rc = lpl_topo_verify(cp->cpu_part);
634 634 if (rc != LPL_TOPO_CORRECT) {
635 635 panic("lpl_topo_verify failed: %d", rc);
636 636 }
637 637 lgrp_plat_config(event, resource);
638 638
639 639 break;
640 640 /*
641 641 * The following events are initiated by the memnode
642 642 * subsystem.
643 643 */
644 644 case LGRP_CONFIG_MEM_ADD:
645 645 lgrp_mem_init((int)resource, where, B_FALSE);
646 646 atomic_inc_32(&lgrp_gen);
647 647
648 648 break;
649 649 case LGRP_CONFIG_MEM_DEL:
650 650 lgrp_mem_fini((int)resource, where, B_FALSE);
651 651 atomic_inc_32(&lgrp_gen);
652 652
653 653 break;
654 654 case LGRP_CONFIG_MEM_RENAME: {
655 655 lgrp_config_mem_rename_t *ren_arg =
656 656 (lgrp_config_mem_rename_t *)where;
657 657
658 658 lgrp_mem_rename((int)resource,
659 659 ren_arg->lmem_rename_from,
660 660 ren_arg->lmem_rename_to);
661 661 atomic_inc_32(&lgrp_gen);
662 662
663 663 break;
664 664 }
665 665 case LGRP_CONFIG_GEN_UPDATE:
666 666 atomic_inc_32(&lgrp_gen);
667 667
668 668 break;
669 669 case LGRP_CONFIG_FLATTEN:
670 670 if (where == 0)
671 671 lgrp_topo_levels = (int)resource;
672 672 else
673 673 (void) lgrp_topo_flatten(resource,
674 674 lgrp_table, lgrp_alloc_max, &changed);
675 675
676 676 break;
677 677 /*
678 678 * Update any lgroups with old latency to new latency
679 679 */
680 680 case LGRP_CONFIG_LAT_CHANGE_ALL:
681 681 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
682 682 (u_longlong_t)where);
683 683
684 684 break;
685 685 /*
686 686 * Update lgroup with specified lgroup platform handle to have
687 687 * new latency
688 688 */
689 689 case LGRP_CONFIG_LAT_CHANGE:
690 690 lgrp_latency_change((lgrp_handle_t)resource, 0,
691 691 (u_longlong_t)where);
692 692
693 693 break;
694 694 case LGRP_CONFIG_NOP:
695 695
696 696 break;
697 697 default:
698 698 break;
699 699 }
700 700
701 701 }
702 702
703 703 /*
704 704 * Called to add lgrp info into cpu structure from cpu_add_unit;
705 705 * do not assume cpu is in cpu[] yet!
706 706 *
707 707 * CPUs are brought online with all other CPUs paused so we can't
708 708 * allocate memory or we could deadlock the system, so we rely on
709 709 * the platform to statically allocate as much space as we need
710 710 * for the lgrp structs and stats.
711 711 */
712 712 static void
713 713 lgrp_cpu_init(struct cpu *cp)
714 714 {
715 715 klgrpset_t changed;
716 716 int count;
717 717 lgrp_handle_t hand;
718 718 int first_cpu;
719 719 lgrp_t *my_lgrp;
720 720 lgrp_id_t lgrpid;
721 721 struct cpu *cptr;
722 722
723 723 /*
724 724 * This is the first time through if the resource set
725 725 * for the root lgroup is empty. After cpu0 has been
726 726 * initially added to an lgroup, the root's CPU resource
727 727 * set can never be empty, since the system's last CPU
728 728 * cannot be offlined.
729 729 */
730 730 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
731 731 /*
732 732 * First time through.
733 733 */
734 734 first_cpu = 1;
735 735 } else {
736 736 /*
737 737 * If cpu0 needs to move lgroups, we may come
738 738 * through here again, at which time cpu_lock won't
739 739 * be held, and lgrp_initialized will be false.
740 740 */
741 741 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
742 742 ASSERT(cp->cpu_part != NULL);
743 743 first_cpu = 0;
744 744 }
745 745
746 746 hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
747 747 my_lgrp = lgrp_hand_to_lgrp(hand);
748 748
749 749 if (my_lgrp == NULL) {
750 750 /*
751 751 * Create new lgrp and add it to lgroup topology
752 752 */
753 753 my_lgrp = lgrp_create();
754 754 my_lgrp->lgrp_plathand = hand;
755 755 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
756 756 lgrpid = my_lgrp->lgrp_id;
757 757 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
758 758 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
759 759
760 760 count = 0;
761 761 klgrpset_clear(changed);
762 762 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
763 763 &changed);
764 764 /*
765 765 * May have added new intermediate lgroups, so need to add
766 766 * resources other than CPUs which are added below
767 767 */
768 768 (void) lgrp_mnode_update(changed, NULL);
769 769 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
770 770 > 0) {
771 771 /*
772 772 * Leaf lgroup was created, but latency wasn't available
773 773 * then. So, set latency for it and fill in rest of lgroup
774 774 * topology now that we know how far it is from other leaf
775 775 * lgroups.
776 776 */
777 777 lgrpid = my_lgrp->lgrp_id;
778 778 klgrpset_clear(changed);
779 779 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
780 780 lgrpid))
781 781 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
782 782 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
783 783 &changed);
784 784
785 785 /*
786 786 * May have added new intermediate lgroups, so need to add
787 787 * resources other than CPUs which are added below
788 788 */
789 789 (void) lgrp_mnode_update(changed, NULL);
790 790 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
791 791 my_lgrp->lgrp_id)) {
792 792 int i;
793 793
794 794 /*
795 795 * Update existing lgroup and lgroups containing it with CPU
796 796 * resource
797 797 */
798 798 lgrpid = my_lgrp->lgrp_id;
799 799 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
800 800 for (i = 0; i <= lgrp_alloc_max; i++) {
801 801 lgrp_t *lgrp;
802 802
803 803 lgrp = lgrp_table[i];
804 804 if (!LGRP_EXISTS(lgrp) ||
805 805 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
806 806 continue;
807 807
808 808 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
809 809 }
810 810 }
811 811
812 812 lgrpid = my_lgrp->lgrp_id;
813 813 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
814 814
815 815 /*
816 816 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
817 817 * end up in lpl for lgroup 0 whether it is supposed to be in there or
818 818 * not since none of lgroup IDs in the lpl's have been set yet.
819 819 */
820 820 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
821 821 cp->cpu_lpl->lpl_lgrpid = lgrpid;
822 822
823 823 /*
824 824 * link the CPU into the lgrp's CPU list
825 825 */
826 826 if (my_lgrp->lgrp_cpucnt == 0) {
827 827 my_lgrp->lgrp_cpu = cp;
828 828 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
829 829 } else {
830 830 cptr = my_lgrp->lgrp_cpu;
831 831 cp->cpu_next_lgrp = cptr;
832 832 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
833 833 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
834 834 cptr->cpu_prev_lgrp = cp;
835 835 }
836 836 my_lgrp->lgrp_cpucnt++;
837 837 }
838 838
839 839 lgrp_t *
840 840 lgrp_create(void)
841 841 {
842 842 lgrp_t *my_lgrp;
843 843 lgrp_id_t lgrpid;
844 844 int i;
845 845
846 846 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
847 847
848 848 /*
849 849 * Find an open slot in the lgroup table and recycle unused lgroup
850 850 * left there if any
851 851 */
852 852 my_lgrp = NULL;
853 853 if (lgrp_alloc_hint == -1)
854 854 /*
855 855 * Allocate from end when hint not set yet because no lgroups
856 856 * have been deleted yet
857 857 */
858 858 lgrpid = nlgrps++;
859 859 else {
860 860 /*
861 861 * Start looking for next open slot from hint and leave hint
862 862 * at slot allocated
863 863 */
864 864 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
865 865 my_lgrp = lgrp_table[i];
866 866 if (!LGRP_EXISTS(my_lgrp)) {
867 867 lgrpid = i;
868 868 nlgrps++;
869 869 break;
870 870 }
871 871 }
872 872 lgrp_alloc_hint = lgrpid;
873 873 }
874 874
875 875 /*
876 876 * Keep track of max lgroup ID allocated so far to cut down on searches
877 877 */
878 878 if (lgrpid > lgrp_alloc_max)
879 879 lgrp_alloc_max = lgrpid;
880 880
881 881 /*
882 882 * Need to allocate new lgroup if next open slot didn't have one
883 883 * for recycling
884 884 */
885 885 if (my_lgrp == NULL)
886 886 my_lgrp = lgrp_plat_alloc(lgrpid);
887 887
888 888 if (nlgrps > nlgrpsmax || my_lgrp == NULL)
889 889 panic("Too many lgrps for platform (%d)", nlgrps);
890 890
891 891 my_lgrp->lgrp_id = lgrpid;
892 892 my_lgrp->lgrp_latency = 0;
893 893 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
894 894 my_lgrp->lgrp_parent = NULL;
895 895 my_lgrp->lgrp_childcnt = 0;
896 896 my_lgrp->lgrp_mnodes = (mnodeset_t)0;
897 897 my_lgrp->lgrp_nmnodes = 0;
898 898 klgrpset_clear(my_lgrp->lgrp_children);
899 899 klgrpset_clear(my_lgrp->lgrp_leaves);
900 900 for (i = 0; i < LGRP_RSRC_COUNT; i++)
901 901 klgrpset_clear(my_lgrp->lgrp_set[i]);
902 902
903 903 my_lgrp->lgrp_cpu = NULL;
904 904 my_lgrp->lgrp_cpucnt = 0;
905 905
906 906 if (my_lgrp->lgrp_kstat != NULL)
907 907 lgrp_kstat_reset(lgrpid);
908 908
909 909 lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
910 910
911 911 return (my_lgrp);
912 912 }
913 913
914 914 void
915 915 lgrp_destroy(lgrp_t *lgrp)
916 916 {
917 917 int i;
918 918
919 919 /*
920 920 * Unless this lgroup is being destroyed on behalf of
921 921 * the boot CPU, cpu_lock must be held
922 922 */
923 923 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
924 924
925 925 if (nlgrps == 1)
926 926 cmn_err(CE_PANIC, "Can't destroy only lgroup!");
927 927
928 928 if (!LGRP_EXISTS(lgrp))
929 929 return;
930 930
931 931 /*
932 932 * Set hint to lgroup being deleted and try to keep lower numbered
933 933 * hints to facilitate finding empty slots
934 934 */
935 935 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
936 936 lgrp_alloc_hint = lgrp->lgrp_id;
937 937
938 938 /*
939 939 * Mark this lgroup to be recycled by setting its lgroup ID to
940 940 * LGRP_NONE and clear relevant fields
941 941 */
942 942 lgrp->lgrp_id = LGRP_NONE;
943 943 lgrp->lgrp_latency = 0;
944 944 lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
945 945 lgrp->lgrp_parent = NULL;
946 946 lgrp->lgrp_childcnt = 0;
947 947
948 948 klgrpset_clear(lgrp->lgrp_children);
949 949 klgrpset_clear(lgrp->lgrp_leaves);
950 950 for (i = 0; i < LGRP_RSRC_COUNT; i++)
951 951 klgrpset_clear(lgrp->lgrp_set[i]);
952 952
953 953 lgrp->lgrp_mnodes = (mnodeset_t)0;
954 954 lgrp->lgrp_nmnodes = 0;
955 955
956 956 lgrp->lgrp_cpu = NULL;
957 957 lgrp->lgrp_cpucnt = 0;
958 958
959 959 nlgrps--;
960 960 }
961 961
962 962 /*
963 963 * Initialize kstat data. Called from lgrp intialization code.
964 964 */
965 965 static void
966 966 lgrp_kstat_init(void)
967 967 {
968 968 lgrp_stat_t stat;
969 969
970 970 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
971 971
972 972 for (stat = 0; stat < LGRP_NUM_STATS; stat++)
973 973 kstat_named_init(&lgrp_kstat_data[stat],
974 974 lgrp_kstat_names[stat], KSTAT_DATA_INT64);
975 975 }
976 976
977 977 /*
978 978 * initialize an lgrp's kstats if needed
979 979 * called with cpu_lock held but not with cpus paused.
980 980 * we don't tear these down now because we don't know about
981 981 * memory leaving the lgrp yet...
982 982 */
983 983
984 984 void
985 985 lgrp_kstat_create(cpu_t *cp)
986 986 {
987 987 kstat_t *lgrp_kstat;
988 988 lgrp_id_t lgrpid;
989 989 lgrp_t *my_lgrp;
990 990
991 991 ASSERT(MUTEX_HELD(&cpu_lock));
992 992
993 993 lgrpid = cp->cpu_lpl->lpl_lgrpid;
994 994 my_lgrp = lgrp_table[lgrpid];
995 995
996 996 if (my_lgrp->lgrp_kstat != NULL)
997 997 return; /* already initialized */
998 998
999 999 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1000 1000 KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1001 1001 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1002 1002
1003 1003 if (lgrp_kstat != NULL) {
1004 1004 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1005 1005 lgrp_kstat->ks_private = my_lgrp;
1006 1006 lgrp_kstat->ks_data = &lgrp_kstat_data;
1007 1007 lgrp_kstat->ks_update = lgrp_kstat_extract;
1008 1008 my_lgrp->lgrp_kstat = lgrp_kstat;
1009 1009 kstat_install(lgrp_kstat);
1010 1010 }
1011 1011 }
1012 1012
1013 1013 /*
1014 1014 * this will do something when we manage to remove now unused lgrps
1015 1015 */
1016 1016
1017 1017 /* ARGSUSED */
1018 1018 void
1019 1019 lgrp_kstat_destroy(cpu_t *cp)
1020 1020 {
1021 1021 ASSERT(MUTEX_HELD(&cpu_lock));
1022 1022 }
1023 1023
1024 1024 /*
1025 1025 * Called when a CPU is off-lined.
1026 1026 */
1027 1027 static void
1028 1028 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
1029 1029 {
1030 1030 lgrp_t *my_lgrp;
1031 1031 struct cpu *prev;
1032 1032 struct cpu *next;
1033 1033
1034 1034 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
1035 1035
1036 1036 prev = cp->cpu_prev_lgrp;
1037 1037 next = cp->cpu_next_lgrp;
1038 1038
1039 1039 prev->cpu_next_lgrp = next;
1040 1040 next->cpu_prev_lgrp = prev;
1041 1041
1042 1042 /*
1043 1043 * just because I'm paranoid doesn't mean...
1044 1044 */
1045 1045
1046 1046 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1047 1047
1048 1048 my_lgrp = lgrp_table[lgrpid];
1049 1049 my_lgrp->lgrp_cpucnt--;
1050 1050
1051 1051 /*
1052 1052 * Removing last CPU in lgroup, so update lgroup topology
1053 1053 */
1054 1054 if (my_lgrp->lgrp_cpucnt == 0) {
1055 1055 klgrpset_t changed;
1056 1056 int count;
1057 1057 int i;
1058 1058
1059 1059 my_lgrp->lgrp_cpu = NULL;
1060 1060
1061 1061 /*
1062 1062 * Remove this lgroup from its lgroup CPU resources and remove
1063 1063 * lgroup from lgroup topology if it doesn't have any more
1064 1064 * resources in it now
1065 1065 */
1066 1066 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1067 1067 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1068 1068 count = 0;
1069 1069 klgrpset_clear(changed);
1070 1070 count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1071 1071 lgrp_alloc_max + 1, &changed);
1072 1072 return;
1073 1073 }
1074 1074
1075 1075 /*
1076 1076 * This lgroup isn't empty, so just remove it from CPU
1077 1077 * resources of any lgroups that contain it as such
1078 1078 */
1079 1079 for (i = 0; i <= lgrp_alloc_max; i++) {
1080 1080 lgrp_t *lgrp;
1081 1081
1082 1082 lgrp = lgrp_table[i];
1083 1083 if (!LGRP_EXISTS(lgrp) ||
1084 1084 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1085 1085 lgrpid))
1086 1086 continue;
1087 1087
1088 1088 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1089 1089 }
1090 1090 return;
1091 1091 }
1092 1092
1093 1093 if (my_lgrp->lgrp_cpu == cp)
1094 1094 my_lgrp->lgrp_cpu = next;
1095 1095
1096 1096 }
1097 1097
1098 1098 /*
1099 1099 * Update memory nodes in target lgroups and return ones that get changed
1100 1100 */
1101 1101 int
1102 1102 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1103 1103 {
1104 1104 int count;
1105 1105 int i;
1106 1106 int j;
1107 1107 lgrp_t *lgrp;
1108 1108 lgrp_t *lgrp_rsrc;
1109 1109
1110 1110 count = 0;
1111 1111 if (changed)
1112 1112 klgrpset_clear(*changed);
1113 1113
1114 1114 if (klgrpset_isempty(target))
1115 1115 return (0);
1116 1116
1117 1117 /*
1118 1118 * Find each lgroup in target lgroups
1119 1119 */
1120 1120 for (i = 0; i <= lgrp_alloc_max; i++) {
1121 1121 /*
1122 1122 * Skip any lgroups that don't exist or aren't in target group
1123 1123 */
1124 1124 lgrp = lgrp_table[i];
1125 1125 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1126 1126 continue;
1127 1127 }
1128 1128
1129 1129 /*
1130 1130 * Initialize memnodes for intermediate lgroups to 0
1131 1131 * and update them from scratch since they may have completely
1132 1132 * changed
1133 1133 */
1134 1134 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1135 1135 lgrp->lgrp_mnodes = (mnodeset_t)0;
1136 1136 lgrp->lgrp_nmnodes = 0;
1137 1137 }
1138 1138
1139 1139 /*
1140 1140 * Update memory nodes of of target lgroup with memory nodes
1141 1141 * from each lgroup in its lgroup memory resource set
1142 1142 */
1143 1143 for (j = 0; j <= lgrp_alloc_max; j++) {
1144 1144 int k;
1145 1145
1146 1146 /*
1147 1147 * Skip any lgroups that don't exist or aren't in
1148 1148 * memory resources of target lgroup
1149 1149 */
1150 1150 lgrp_rsrc = lgrp_table[j];
1151 1151 if (!LGRP_EXISTS(lgrp_rsrc) ||
1152 1152 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1153 1153 j))
1154 1154 continue;
1155 1155
1156 1156 /*
1157 1157 * Update target lgroup's memnodes to include memnodes
1158 1158 * of this lgroup
1159 1159 */
1160 1160 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1161 1161 mnodeset_t mnode_mask;
1162 1162
1163 1163 mnode_mask = (mnodeset_t)1 << k;
1164 1164 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1165 1165 !(lgrp->lgrp_mnodes & mnode_mask)) {
1166 1166 lgrp->lgrp_mnodes |= mnode_mask;
1167 1167 lgrp->lgrp_nmnodes++;
1168 1168 }
1169 1169 }
1170 1170 count++;
1171 1171 if (changed)
1172 1172 klgrpset_add(*changed, lgrp->lgrp_id);
1173 1173 }
1174 1174 }
1175 1175
1176 1176 return (count);
1177 1177 }
1178 1178
1179 1179 /*
1180 1180 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1181 1181 * is moved from one board to another. The "from" and "to" arguments specify the
1182 1182 * source and the destination of the move.
1183 1183 *
1184 1184 * See plat_lgrp_config() for a detailed description of the copy-rename
1185 1185 * semantics.
1186 1186 *
1187 1187 * The lgrp_mem_rename() is called by the platform copy-rename code to update
1188 1188 * the lgroup topology which is changing as memory moves from one lgroup to
1189 1189 * another. It removes the mnode from the source lgroup and re-inserts it in the
1190 1190 * target lgroup.
1191 1191 *
1192 1192 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1193 1193 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
1194 1194 * copy-rename operation.
1195 1195 *
1196 1196 * There is one case which requires special handling. If the system contains
1197 1197 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1198 1198 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1199 1199 * lgrp_mem_init(), but there is a window when the system has no memory in the
1200 1200 * lgroup hierarchy. If another thread tries to allocate memory during this
1201 1201 * window, the allocation will fail, although the system has physical memory.
1202 1202 * This may cause a system panic or a deadlock (some sleeping memory allocations
1203 1203 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1204 1204 * the mnode back).
1205 1205 *
1206 1206 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1207 1207 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1208 1208 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
1209 1209 * but it updates the rest of the lgroup topology as if the mnode was actually
1210 1210 * removed. The lgrp_mem_init() function recognizes that the mnode being
1211 1211 * inserted represents such a special case and updates the topology
1212 1212 * appropriately.
1213 1213 */
1214 1214 void
1215 1215 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1216 1216 {
1217 1217 /*
1218 1218 * Remove the memory from the source node and add it to the destination
1219 1219 * node.
1220 1220 */
1221 1221 lgrp_mem_fini(mnode, from, B_TRUE);
1222 1222 lgrp_mem_init(mnode, to, B_TRUE);
1223 1223 }
1224 1224
1225 1225 /*
1226 1226 * Called to indicate that the lgrp with platform handle "hand" now
1227 1227 * contains the memory identified by "mnode".
1228 1228 *
1229 1229 * LOCKING for this routine is a bit tricky. Usually it is called without
1230 1230 * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1231 1231 * callers. During DR of the board containing the caged memory it may be called
1232 1232 * with cpu_lock already held and CPUs paused.
1233 1233 *
1234 1234 * If the insertion is part of the DR copy-rename and the inserted mnode (and
1235 1235 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1236 1236 * dealing with the special case of DR copy-rename described in
1237 1237 * lgrp_mem_rename().
1238 1238 */
1239 1239 void
1240 1240 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1241 1241 {
1242 1242 klgrpset_t changed;
1243 1243 int count;
1244 1244 int i;
1245 1245 lgrp_t *my_lgrp;
1246 1246 lgrp_id_t lgrpid;
1247 1247 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode);
1248 1248 boolean_t drop_lock = B_FALSE;
1249 1249 boolean_t need_synch = B_FALSE;
1250 1250
1251 1251 /*
1252 1252 * Grab CPU lock (if we haven't already)
1253 1253 */
1254 1254 if (!MUTEX_HELD(&cpu_lock)) {
1255 1255 mutex_enter(&cpu_lock);
1256 1256 drop_lock = B_TRUE;
1257 1257 }
1258 1258
1259 1259 /*
1260 1260 * This routine may be called from a context where we already
1261 1261 * hold cpu_lock, and have already paused cpus.
1262 1262 */
1263 1263 if (!cpus_paused())
1264 1264 need_synch = B_TRUE;
1265 1265
1266 1266 /*
1267 1267 * Check if this mnode is already configured and return immediately if
1268 1268 * it is.
1269 1269 *
1270 1270 * NOTE: in special case of copy-rename of the only remaining mnode,
1271 1271 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1272 1272 * recognize this case and continue as usual, but skip the update to
1273 1273 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1274 1274 * in topology, temporarily introduced by lgrp_mem_fini().
1275 1275 */
1276 1276 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1277 1277 lgrp_root->lgrp_mnodes & mnodes_mask) {
1278 1278 if (drop_lock)
1279 1279 mutex_exit(&cpu_lock);
1280 1280 return;
1281 1281 }
1282 1282
1283 1283 /*
1284 1284 * Update lgroup topology with new memory resources, keeping track of
1285 1285 * which lgroups change
1286 1286 */
1287 1287 count = 0;
1288 1288 klgrpset_clear(changed);
1289 1289 my_lgrp = lgrp_hand_to_lgrp(hand);
1290 1290 if (my_lgrp == NULL) {
1291 1291 /* new lgrp */
1292 1292 my_lgrp = lgrp_create();
1293 1293 lgrpid = my_lgrp->lgrp_id;
1294 1294 my_lgrp->lgrp_plathand = hand;
1295 1295 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1296 1296 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1297 1297 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1298 1298
1299 1299 if (need_synch)
1300 1300 pause_cpus(NULL, NULL);
1301 1301 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1302 1302 &changed);
1303 1303 if (need_synch)
1304 1304 start_cpus();
1305 1305 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1306 1306 > 0) {
1307 1307 /*
1308 1308 * Leaf lgroup was created, but latency wasn't available
1309 1309 * then. So, set latency for it and fill in rest of lgroup
1310 1310 * topology now that we know how far it is from other leaf
1311 1311 * lgroups.
1312 1312 */
1313 1313 klgrpset_clear(changed);
1314 1314 lgrpid = my_lgrp->lgrp_id;
1315 1315 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1316 1316 lgrpid))
1317 1317 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1318 1318 if (need_synch)
1319 1319 pause_cpus(NULL, NULL);
1320 1320 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1321 1321 &changed);
1322 1322 if (need_synch)
1323 1323 start_cpus();
1324 1324 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1325 1325 my_lgrp->lgrp_id)) {
1326 1326 /*
1327 1327 * Add new lgroup memory resource to existing lgroup
1328 1328 */
1329 1329 lgrpid = my_lgrp->lgrp_id;
1330 1330 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1331 1331 klgrpset_add(changed, lgrpid);
1332 1332 count++;
1333 1333 for (i = 0; i <= lgrp_alloc_max; i++) {
1334 1334 lgrp_t *lgrp;
1335 1335
1336 1336 lgrp = lgrp_table[i];
1337 1337 if (!LGRP_EXISTS(lgrp) ||
1338 1338 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1339 1339 continue;
1340 1340
1341 1341 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1342 1342 klgrpset_add(changed, lgrp->lgrp_id);
1343 1343 count++;
1344 1344 }
1345 1345 }
1346 1346
1347 1347 /*
1348 1348 * Add memory node to lgroup and remove lgroup from ones that need
1349 1349 * to be updated
1350 1350 */
1351 1351 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1352 1352 my_lgrp->lgrp_mnodes |= mnodes_mask;
1353 1353 my_lgrp->lgrp_nmnodes++;
1354 1354 }
1355 1355 klgrpset_del(changed, lgrpid);
1356 1356
1357 1357 /*
1358 1358 * Update memory node information for all lgroups that changed and
1359 1359 * contain new memory node as a resource
1360 1360 */
1361 1361 if (count)
1362 1362 (void) lgrp_mnode_update(changed, NULL);
1363 1363
1364 1364 if (drop_lock)
1365 1365 mutex_exit(&cpu_lock);
1366 1366 }
1367 1367
1368 1368 /*
1369 1369 * Called to indicate that the lgroup associated with the platform
1370 1370 * handle "hand" no longer contains given memory node
1371 1371 *
1372 1372 * LOCKING for this routine is a bit tricky. Usually it is called without
1373 1373 * cpu_lock and it must must grab cpu_lock here to prevent racing with other
1374 1374 * callers. During DR of the board containing the caged memory it may be called
1375 1375 * with cpu_lock already held and CPUs paused.
1376 1376 *
1377 1377 * If the deletion is part of the DR copy-rename and the deleted mnode is the
1378 1378 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1379 1379 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1380 1380 * the same mnode back into the topology. See lgrp_mem_rename() and
1381 1381 * lgrp_mem_init() for additional details.
1382 1382 */
1383 1383 void
1384 1384 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1385 1385 {
1386 1386 klgrpset_t changed;
1387 1387 int count;
1388 1388 int i;
1389 1389 lgrp_t *my_lgrp;
1390 1390 lgrp_id_t lgrpid;
1391 1391 mnodeset_t mnodes_mask;
1392 1392 boolean_t drop_lock = B_FALSE;
1393 1393 boolean_t need_synch = B_FALSE;
1394 1394
1395 1395 /*
1396 1396 * Grab CPU lock (if we haven't already)
1397 1397 */
1398 1398 if (!MUTEX_HELD(&cpu_lock)) {
1399 1399 mutex_enter(&cpu_lock);
1400 1400 drop_lock = B_TRUE;
1401 1401 }
1402 1402
1403 1403 /*
1404 1404 * This routine may be called from a context where we already
1405 1405 * hold cpu_lock and have already paused cpus.
1406 1406 */
1407 1407 if (!cpus_paused())
1408 1408 need_synch = B_TRUE;
1409 1409
1410 1410 my_lgrp = lgrp_hand_to_lgrp(hand);
1411 1411
1412 1412 /*
1413 1413 * The lgrp *must* be pre-existing
1414 1414 */
1415 1415 ASSERT(my_lgrp != NULL);
1416 1416
1417 1417 /*
1418 1418 * Delete memory node from lgroups which contain it
1419 1419 */
1420 1420 mnodes_mask = ((mnodeset_t)1 << mnode);
1421 1421 for (i = 0; i <= lgrp_alloc_max; i++) {
1422 1422 lgrp_t *lgrp = lgrp_table[i];
1423 1423 /*
1424 1424 * Skip any non-existent lgroups and any lgroups that don't
1425 1425 * contain leaf lgroup of memory as a memory resource
1426 1426 */
1427 1427 if (!LGRP_EXISTS(lgrp) ||
1428 1428 !(lgrp->lgrp_mnodes & mnodes_mask))
1429 1429 continue;
1430 1430
1431 1431 /*
1432 1432 * Avoid removing the last mnode from the root in the DR
1433 1433 * copy-rename case. See lgrp_mem_rename() for details.
1434 1434 */
1435 1435 if (is_copy_rename &&
1436 1436 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1437 1437 continue;
1438 1438
1439 1439 /*
1440 1440 * Remove memory node from lgroup.
1441 1441 */
1442 1442 lgrp->lgrp_mnodes &= ~mnodes_mask;
1443 1443 lgrp->lgrp_nmnodes--;
1444 1444 ASSERT(lgrp->lgrp_nmnodes >= 0);
1445 1445 }
1446 1446 ASSERT(lgrp_root->lgrp_nmnodes > 0);
1447 1447
1448 1448 /*
1449 1449 * Don't need to update lgroup topology if this lgroup still has memory.
1450 1450 *
1451 1451 * In the special case of DR copy-rename with the only mnode being
1452 1452 * removed, the lgrp_mnodes for the root is always non-zero, but we
1453 1453 * still need to update the lgroup topology.
1454 1454 */
1455 1455 if ((my_lgrp->lgrp_nmnodes > 0) &&
1456 1456 !(is_copy_rename && (my_lgrp == lgrp_root) &&
1457 1457 (my_lgrp->lgrp_mnodes == mnodes_mask))) {
1458 1458 if (drop_lock)
1459 1459 mutex_exit(&cpu_lock);
1460 1460 return;
1461 1461 }
1462 1462
1463 1463 /*
1464 1464 * This lgroup does not contain any memory now
1465 1465 */
1466 1466 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1467 1467
1468 1468 /*
1469 1469 * Remove this lgroup from lgroup topology if it does not contain any
1470 1470 * resources now
1471 1471 */
1472 1472 lgrpid = my_lgrp->lgrp_id;
1473 1473 count = 0;
1474 1474 klgrpset_clear(changed);
1475 1475 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1476 1476 /*
1477 1477 * Delete lgroup when no more resources
1478 1478 */
1479 1479 if (need_synch)
1480 1480 pause_cpus(NULL, NULL);
1481 1481 count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1482 1482 lgrp_alloc_max + 1, &changed);
1483 1483 ASSERT(count > 0);
1484 1484 if (need_synch)
1485 1485 start_cpus();
1486 1486 } else {
1487 1487 /*
1488 1488 * Remove lgroup from memory resources of any lgroups that
1489 1489 * contain it as such
1490 1490 */
1491 1491 for (i = 0; i <= lgrp_alloc_max; i++) {
1492 1492 lgrp_t *lgrp;
1493 1493
1494 1494 lgrp = lgrp_table[i];
1495 1495 if (!LGRP_EXISTS(lgrp) ||
1496 1496 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1497 1497 lgrpid))
1498 1498 continue;
1499 1499
1500 1500 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1501 1501 }
1502 1502 }
1503 1503 if (drop_lock)
1504 1504 mutex_exit(&cpu_lock);
1505 1505 }
1506 1506
1507 1507 /*
1508 1508 * Return lgroup with given platform handle
1509 1509 */
1510 1510 lgrp_t *
1511 1511 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1512 1512 {
1513 1513 int i;
1514 1514 lgrp_t *lgrp;
1515 1515
1516 1516 if (hand == LGRP_NULL_HANDLE)
1517 1517 return (NULL);
1518 1518
1519 1519 for (i = 0; i <= lgrp_alloc_max; i++) {
1520 1520 lgrp = lgrp_table[i];
1521 1521 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1522 1522 return (lgrp);
1523 1523 }
1524 1524 return (NULL);
1525 1525 }
1526 1526
1527 1527 /*
1528 1528 * Return the home lgroup of the current thread.
1529 1529 * We must do this with kernel preemption disabled, since we don't want our
1530 1530 * thread to be re-homed while we're poking around with its lpl, and the lpl
1531 1531 * should never be NULL.
1532 1532 *
1533 1533 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1534 1534 * is enabled because of DR. Callers can use disable kernel preemption
1535 1535 * around this call to guarantee that the lgroup will be valid beyond this
1536 1536 * routine, since kernel preemption can be recursive.
1537 1537 */
1538 1538 lgrp_t *
1539 1539 lgrp_home_lgrp(void)
1540 1540 {
1541 1541 lgrp_t *lgrp;
1542 1542 lpl_t *lpl;
1543 1543
1544 1544 kpreempt_disable();
1545 1545
1546 1546 lpl = curthread->t_lpl;
1547 1547 ASSERT(lpl != NULL);
1548 1548 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1549 1549 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1550 1550 lgrp = lgrp_table[lpl->lpl_lgrpid];
1551 1551
1552 1552 kpreempt_enable();
1553 1553
1554 1554 return (lgrp);
1555 1555 }
1556 1556
1557 1557 /*
1558 1558 * Return ID of home lgroup for given thread
1559 1559 * (See comments for lgrp_home_lgrp() for special care and handling
1560 1560 * instructions)
1561 1561 */
1562 1562 lgrp_id_t
1563 1563 lgrp_home_id(kthread_t *t)
1564 1564 {
1565 1565 lgrp_id_t lgrp;
1566 1566 lpl_t *lpl;
1567 1567
1568 1568 ASSERT(t != NULL);
1569 1569 /*
1570 1570 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1571 1571 * cannot since the HAT layer can call into this routine to
1572 1572 * determine the locality for its data structures in the context
1573 1573 * of a page fault.
1574 1574 */
1575 1575
1576 1576 kpreempt_disable();
1577 1577
1578 1578 lpl = t->t_lpl;
1579 1579 ASSERT(lpl != NULL);
1580 1580 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1581 1581 lgrp = lpl->lpl_lgrpid;
1582 1582
1583 1583 kpreempt_enable();
1584 1584
1585 1585 return (lgrp);
1586 1586 }
1587 1587
1588 1588 /*
1589 1589 * Return lgroup containing the physical memory for the given page frame number
1590 1590 */
1591 1591 lgrp_t *
1592 1592 lgrp_pfn_to_lgrp(pfn_t pfn)
1593 1593 {
1594 1594 lgrp_handle_t hand;
1595 1595 int i;
1596 1596 lgrp_t *lgrp;
1597 1597
1598 1598 hand = lgrp_plat_pfn_to_hand(pfn);
1599 1599 if (hand != LGRP_NULL_HANDLE)
1600 1600 for (i = 0; i <= lgrp_alloc_max; i++) {
1601 1601 lgrp = lgrp_table[i];
1602 1602 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1603 1603 return (lgrp);
1604 1604 }
1605 1605 return (NULL);
1606 1606 }
1607 1607
1608 1608 /*
1609 1609 * Return lgroup containing the physical memory for the given page frame number
1610 1610 */
1611 1611 lgrp_t *
1612 1612 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1613 1613 {
1614 1614 lgrp_handle_t hand;
1615 1615 int i;
1616 1616 lgrp_t *lgrp;
1617 1617 pfn_t pfn;
1618 1618
1619 1619 pfn = btop(physaddr);
1620 1620 hand = lgrp_plat_pfn_to_hand(pfn);
1621 1621 if (hand != LGRP_NULL_HANDLE)
1622 1622 for (i = 0; i <= lgrp_alloc_max; i++) {
1623 1623 lgrp = lgrp_table[i];
1624 1624 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1625 1625 return (lgrp);
1626 1626 }
1627 1627 return (NULL);
1628 1628 }
1629 1629
1630 1630 /*
1631 1631 * Return the leaf lgroup containing the given CPU
1632 1632 *
1633 1633 * The caller needs to take precautions necessary to prevent
1634 1634 * "cpu", and it's lpl from going away across a call to this function.
1635 1635 * hint: kpreempt_disable()/kpreempt_enable()
1636 1636 */
1637 1637 static lgrp_t *
1638 1638 lgrp_cpu_to_lgrp(cpu_t *cpu)
1639 1639 {
1640 1640 return (cpu->cpu_lpl->lpl_lgrp);
1641 1641 }
1642 1642
1643 1643 /*
1644 1644 * Return the sum of the partition loads in an lgrp divided by
1645 1645 * the number of CPUs in the lgrp. This is our best approximation
1646 1646 * of an 'lgroup load average' for a useful per-lgroup kstat.
1647 1647 */
1648 1648 static uint64_t
1649 1649 lgrp_sum_loadavgs(lgrp_t *lgrp)
1650 1650 {
1651 1651 cpu_t *cpu;
1652 1652 int ncpu;
1653 1653 uint64_t loads = 0;
1654 1654
1655 1655 mutex_enter(&cpu_lock);
1656 1656
1657 1657 cpu = lgrp->lgrp_cpu;
1658 1658 ncpu = lgrp->lgrp_cpucnt;
1659 1659
1660 1660 if (cpu == NULL || ncpu == 0) {
1661 1661 mutex_exit(&cpu_lock);
1662 1662 return (0ull);
1663 1663 }
1664 1664
1665 1665 do {
1666 1666 loads += cpu->cpu_lpl->lpl_loadavg;
1667 1667 cpu = cpu->cpu_next_lgrp;
1668 1668 } while (cpu != lgrp->lgrp_cpu);
1669 1669
1670 1670 mutex_exit(&cpu_lock);
1671 1671
1672 1672 return (loads / ncpu);
1673 1673 }
1674 1674
1675 1675 void
1676 1676 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1677 1677 {
1678 1678 struct lgrp_stats *pstats;
1679 1679
1680 1680 /*
1681 1681 * Verify that the caller isn't trying to add to
1682 1682 * a statistic for an lgroup that has gone away
1683 1683 */
1684 1684 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1685 1685 return;
1686 1686
1687 1687 pstats = &lgrp_stats[lgrpid];
1688 1688 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1689 1689 }
1690 1690
1691 1691 int64_t
1692 1692 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1693 1693 {
1694 1694 uint64_t val;
1695 1695 struct lgrp_stats *pstats;
1696 1696
1697 1697 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1698 1698 return ((int64_t)0);
1699 1699
1700 1700 pstats = &lgrp_stats[lgrpid];
1701 1701 LGRP_STAT_READ(pstats, stat, val);
1702 1702 return (val);
1703 1703 }
1704 1704
1705 1705 /*
1706 1706 * Reset all kstats for lgrp specified by its lgrpid.
1707 1707 */
1708 1708 static void
1709 1709 lgrp_kstat_reset(lgrp_id_t lgrpid)
1710 1710 {
1711 1711 lgrp_stat_t stat;
1712 1712
1713 1713 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1714 1714 return;
1715 1715
1716 1716 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1717 1717 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1718 1718 }
1719 1719 }
1720 1720
1721 1721 /*
1722 1722 * Collect all per-lgrp statistics for the lgrp associated with this
1723 1723 * kstat, and store them in the ks_data array.
1724 1724 *
1725 1725 * The superuser can reset all the running counter statistics for an
1726 1726 * lgrp by writing to any of the lgrp's stats.
1727 1727 */
1728 1728 static int
1729 1729 lgrp_kstat_extract(kstat_t *ksp, int rw)
1730 1730 {
1731 1731 lgrp_stat_t stat;
1732 1732 struct kstat_named *ksd;
1733 1733 lgrp_t *lgrp;
1734 1734 lgrp_id_t lgrpid;
1735 1735
1736 1736 lgrp = (lgrp_t *)ksp->ks_private;
1737 1737
1738 1738 ksd = (struct kstat_named *)ksp->ks_data;
1739 1739 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1740 1740
1741 1741 lgrpid = lgrp->lgrp_id;
1742 1742
1743 1743 if (lgrpid == LGRP_NONE) {
1744 1744 /*
1745 1745 * Return all zeroes as stats for freed lgrp.
1746 1746 */
1747 1747 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1748 1748 ksd[stat].value.i64 = 0;
1749 1749 }
1750 1750 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1751 1751 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1752 1752 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1753 1753 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1754 1754 ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1755 1755 } else if (rw != KSTAT_WRITE) {
1756 1756 /*
1757 1757 * Handle counter stats
1758 1758 */
1759 1759 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1760 1760 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1761 1761 }
1762 1762
1763 1763 /*
1764 1764 * Handle kernel data snapshot stats
1765 1765 */
1766 1766 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1767 1767 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1768 1768 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1769 1769 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1770 1770 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1771 1771 ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1772 1772 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1773 1773 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1774 1774 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1775 1775 lgrp_loadavg_max_effect;
1776 1776 } else {
1777 1777 lgrp_kstat_reset(lgrpid);
1778 1778 }
1779 1779
1780 1780 return (0);
1781 1781 }
1782 1782
1783 1783 int
1784 1784 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1785 1785 {
1786 1786 cpu_t *cp;
1787 1787
1788 1788 mutex_enter(&cpu_lock);
1789 1789
1790 1790 if ((cp = cpu_get(id)) == NULL) {
1791 1791 mutex_exit(&cpu_lock);
1792 1792 return (EINVAL);
1793 1793 }
1794 1794
1795 1795 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1796 1796 mutex_exit(&cpu_lock);
1797 1797 return (EINVAL);
1798 1798 }
1799 1799
1800 1800 ASSERT(cp->cpu_lpl != NULL);
1801 1801
1802 1802 *lp = cp->cpu_lpl->lpl_lgrpid;
1803 1803
1804 1804 mutex_exit(&cpu_lock);
1805 1805
1806 1806 return (0);
1807 1807 }
1808 1808
1809 1809 int
1810 1810 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1811 1811 {
1812 1812 cpu_t *cp;
1813 1813
1814 1814 mutex_enter(&cpu_lock);
1815 1815
1816 1816 if ((cp = cpu_get(id)) == NULL) {
1817 1817 mutex_exit(&cpu_lock);
1818 1818 return (EINVAL);
1819 1819 }
1820 1820
1821 1821 ASSERT(cp->cpu_lpl != NULL);
1822 1822
1823 1823 *lp = cp->cpu_lpl->lpl_loadavg;
1824 1824
1825 1825 mutex_exit(&cpu_lock);
1826 1826
1827 1827 return (0);
1828 1828 }
1829 1829
1830 1830 /*
1831 1831 * Add a resource named by lpl_leaf to rset of lpl_target
1832 1832 *
1833 1833 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1834 1834 * resource. It is adjusted here, as this is presently the only place that we
1835 1835 * can be certain a resource addition has succeeded.
1836 1836 *
1837 1837 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1838 1838 * list in order until it reaches a NULL. (This list is required to be NULL
1839 1839 * terminated, too). This is done so that we can mark start pos + 1, so that
1840 1840 * each lpl is traversed sequentially, but in a different order. We hope this
1841 1841 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
1842 1842 */
1843 1843
1844 1844 void
1845 1845 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1846 1846 {
1847 1847 int i;
1848 1848 int entry_slot = 0;
1849 1849
1850 1850 /* return if leaf is already present */
1851 1851 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1852 1852 if (lpl_target->lpl_rset[i] == lpl_leaf) {
1853 1853 return;
1854 1854 }
1855 1855
1856 1856 if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1857 1857 lpl_leaf->lpl_lgrpid) {
1858 1858 break;
1859 1859 }
1860 1860 }
1861 1861
1862 1862 /* insert leaf, update counts */
1863 1863 entry_slot = i;
1864 1864 i = lpl_target->lpl_nrset++;
1865 1865
1866 1866 /*
1867 1867 * Start at the end of the rset array and work backwards towards the
1868 1868 * slot into which the new lpl will be inserted. This effectively
1869 1869 * preserves the current ordering by scooting everybody over one entry,
1870 1870 * and placing the new entry into the space created.
1871 1871 */
1872 1872 while (i-- > entry_slot) {
1873 1873 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1874 1874 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
1875 1875 i + 1;
1876 1876 }
1877 1877
1878 1878 lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1879 1879 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
1880 1880
1881 1881 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1882 1882 }
1883 1883
1884 1884 /*
1885 1885 * Update each of lpl_parent's children with a reference to their parent.
1886 1886 * The lgrp topology is used as the reference since it is fully
1887 1887 * consistent and correct at this point.
1888 1888 * This should be called after any potential change in lpl_parent's
1889 1889 * rset.
1890 1890 */
1891 1891 static void
1892 1892 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1893 1893 {
1894 1894 klgrpset_t children;
1895 1895 int i;
1896 1896
1897 1897 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1898 1898 if (klgrpset_isempty(children))
1899 1899 return; /* nothing to do */
1900 1900
1901 1901 for (i = 0; i <= lgrp_alloc_max; i++) {
1902 1902 if (klgrpset_ismember(children, i)) {
1903 1903 /*
1904 1904 * (Re)set the parent. It may be incorrect if
1905 1905 * lpl_parent is new in the topology.
1906 1906 */
1907 1907 cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1908 1908 }
1909 1909 }
1910 1910 }
1911 1911
1912 1912 /*
1913 1913 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1914 1914 *
1915 1915 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1916 1916 * resource. The values are adjusted here, as this is the only place that we can
1917 1917 * be certain a resource was successfully deleted.
1918 1918 */
1919 1919 void
1920 1920 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1921 1921 {
1922 1922 int i;
1923 1923 lpl_t *leaf;
1924 1924
1925 1925 if (lpl_target->lpl_nrset == 0)
1926 1926 return;
1927 1927
1928 1928 /* find leaf in intermediate node */
1929 1929 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1930 1930 if (lpl_target->lpl_rset[i] == lpl_leaf)
1931 1931 break;
1932 1932 }
1933 1933
1934 1934 /* return if leaf not found */
1935 1935 if (lpl_target->lpl_rset[i] != lpl_leaf)
1936 1936 return;
1937 1937
1938 1938 /* prune leaf, compress array */
1939 1939 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1940 1940 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
1941 1941 lpl_target->lpl_ncpu--;
1942 1942 do {
1943 1943 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1944 1944 /*
1945 1945 * Update the lgrp id <=> rset mapping
1946 1946 */
1947 1947 if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
1948 1948 lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
1949 1949 }
1950 1950 } while (i++ < lpl_target->lpl_nrset);
1951 1951 }
1952 1952
1953 1953 /*
1954 1954 * Check to see if the resource set of the target lpl contains the
1955 1955 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
1956 1956 */
1957 1957
1958 1958 int
1959 1959 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1960 1960 {
1961 1961 int i;
1962 1962
1963 1963 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1964 1964 if (lpl_target->lpl_rset[i] == lpl_leaf)
1965 1965 return (1);
1966 1966 }
1967 1967
1968 1968 return (0);
1969 1969 }
1970 1970
1971 1971 /*
1972 1972 * Called when we change cpu lpl membership. This increments or decrements the
1973 1973 * per-cpu counter in every lpl in which our leaf appears.
1974 1974 */
1975 1975 void
1976 1976 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1977 1977 {
1978 1978 cpupart_t *cpupart;
1979 1979 lgrp_t *lgrp_leaf;
1980 1980 lgrp_t *lgrp_cur;
1981 1981 lpl_t *lpl_leaf;
1982 1982 lpl_t *lpl_cur;
1983 1983 int i;
1984 1984
1985 1985 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1986 1986
1987 1987 cpupart = cp->cpu_part;
1988 1988 lpl_leaf = cp->cpu_lpl;
1989 1989 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1990 1990
1991 1991 for (i = 0; i <= lgrp_alloc_max; i++) {
1992 1992 lgrp_cur = lgrp_table[i];
1993 1993
1994 1994 /*
1995 1995 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1996 1996 * for the cpu in question, or if the current lgrp and leaf
1997 1997 * don't share the same resources.
1998 1998 */
1999 1999
2000 2000 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2001 2001 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2002 2002 lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2003 2003 continue;
2004 2004
2005 2005
2006 2006 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2007 2007
2008 2008 if (lpl_cur->lpl_nrset > 0) {
2009 2009 if (act == LPL_INCREMENT) {
2010 2010 lpl_cur->lpl_ncpu++;
2011 2011 } else if (act == LPL_DECREMENT) {
2012 2012 lpl_cur->lpl_ncpu--;
2013 2013 }
2014 2014 }
2015 2015 }
2016 2016 }
2017 2017
2018 2018 /*
2019 2019 * Initialize lpl with given resources and specified lgrp
2020 2020 */
2021 2021 void
2022 2022 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2023 2023 {
2024 2024 lpl->lpl_lgrpid = lgrp->lgrp_id;
2025 2025 lpl->lpl_loadavg = 0;
2026 2026 if (lpl == lpl_leaf)
2027 2027 lpl->lpl_ncpu = 1;
2028 2028 else
2029 2029 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2030 2030 lpl->lpl_nrset = 1;
2031 2031 lpl->lpl_rset[0] = lpl_leaf;
2032 2032 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2033 2033 lpl->lpl_lgrp = lgrp;
2034 2034 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2035 2035 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2036 2036 }
2037 2037
2038 2038 /*
2039 2039 * Clear an unused lpl
2040 2040 */
2041 2041 void
2042 2042 lpl_clear(lpl_t *lpl)
2043 2043 {
2044 2044 /*
2045 2045 * Clear out all fields in the lpl except:
2046 2046 * lpl_lgrpid - to facilitate debugging
2047 2047 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2048 2048 *
2049 2049 * Note that the lpl's rset and id2rset mapping are cleared as well.
2050 2050 */
2051 2051 lpl->lpl_loadavg = 0;
2052 2052 lpl->lpl_ncpu = 0;
2053 2053 lpl->lpl_lgrp = NULL;
2054 2054 lpl->lpl_parent = NULL;
2055 2055 lpl->lpl_cpus = NULL;
2056 2056 lpl->lpl_nrset = 0;
2057 2057 lpl->lpl_homed_time = 0;
2058 2058 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2059 2059 bzero(lpl->lpl_id2rset,
2060 2060 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2061 2061 }
2062 2062
2063 2063 /*
2064 2064 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2065 2065 * is in sync with the lgroup toplogy in the system. The lpl topology may not
2066 2066 * make full use of all of the lgroup topology, but this checks to make sure
2067 2067 * that for the parts that it does use, it has correctly understood the
2068 2068 * relationships that exist. This function returns
2069 2069 * 0 if the topology is correct, and a non-zero error code, for non-debug
2070 2070 * kernels if incorrect. Asserts are spread throughout the code to aid in
2071 2071 * debugging on a DEBUG kernel.
2072 2072 */
2073 2073 int
2074 2074 lpl_topo_verify(cpupart_t *cpupart)
2075 2075 {
2076 2076 lgrp_t *lgrp;
2077 2077 lpl_t *lpl;
2078 2078 klgrpset_t rset;
2079 2079 klgrpset_t cset;
2080 2080 cpu_t *cpu;
2081 2081 cpu_t *cp_start;
2082 2082 int i;
2083 2083 int j;
2084 2084 int sum;
2085 2085
2086 2086 /* topology can't be incorrect if it doesn't exist */
2087 2087 if (!lgrp_topo_initialized || !lgrp_initialized)
2088 2088 return (LPL_TOPO_CORRECT);
2089 2089
2090 2090 ASSERT(cpupart != NULL);
2091 2091
2092 2092 for (i = 0; i <= lgrp_alloc_max; i++) {
2093 2093 lgrp = lgrp_table[i];
2094 2094 lpl = NULL;
2095 2095 /* make sure lpls are allocated */
2096 2096 ASSERT(cpupart->cp_lgrploads);
2097 2097 if (!cpupart->cp_lgrploads)
2098 2098 return (LPL_TOPO_PART_HAS_NO_LPL);
2099 2099
2100 2100 lpl = &cpupart->cp_lgrploads[i];
2101 2101 /* make sure our index is good */
2102 2102 ASSERT(i < cpupart->cp_nlgrploads);
2103 2103
2104 2104 /* if lgroup doesn't exist, make sure lpl is empty */
2105 2105 if (!LGRP_EXISTS(lgrp)) {
2106 2106 ASSERT(lpl->lpl_ncpu == 0);
2107 2107 if (lpl->lpl_ncpu > 0) {
2108 2108 return (LPL_TOPO_CPUS_NOT_EMPTY);
2109 2109 } else {
2110 2110 continue;
2111 2111 }
2112 2112 }
2113 2113
2114 2114 /* verify that lgroup and lpl are identically numbered */
2115 2115 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2116 2116
2117 2117 /* if lgroup isn't in our partition, make sure lpl is empty */
2118 2118 if (!klgrpset_intersects(lgrp->lgrp_leaves,
2119 2119 cpupart->cp_lgrpset)) {
2120 2120 ASSERT(lpl->lpl_ncpu == 0);
2121 2121 if (lpl->lpl_ncpu > 0) {
2122 2122 return (LPL_TOPO_CPUS_NOT_EMPTY);
2123 2123 }
2124 2124 /*
2125 2125 * lpl is empty, and lgroup isn't in partition. verify
2126 2126 * that lpl doesn't show up in anyone else's rsets (in
2127 2127 * this partition, anyway)
2128 2128 */
2129 2129 for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2130 2130 lpl_t *i_lpl; /* lpl we're iterating over */
2131 2131
2132 2132 i_lpl = &cpupart->cp_lgrploads[j];
2133 2133
2134 2134 ASSERT(!lpl_rset_contains(i_lpl, lpl));
2135 2135 if (lpl_rset_contains(i_lpl, lpl)) {
2136 2136 return (LPL_TOPO_LPL_ORPHANED);
2137 2137 }
2138 2138 }
2139 2139 /* lgroup is empty, and everything is ok. continue */
2140 2140 continue;
2141 2141 }
2142 2142
2143 2143
2144 2144 /* lgroup is in this partition, now check it against lpl */
2145 2145
2146 2146 /* do both have matching lgrps? */
2147 2147 ASSERT(lgrp == lpl->lpl_lgrp);
2148 2148 if (lgrp != lpl->lpl_lgrp) {
2149 2149 return (LPL_TOPO_LGRP_MISMATCH);
2150 2150 }
2151 2151
2152 2152 /* do the parent lgroups exist and do they match? */
2153 2153 if (lgrp->lgrp_parent) {
2154 2154 ASSERT(lpl->lpl_parent);
2155 2155 ASSERT(lgrp->lgrp_parent->lgrp_id ==
2156 2156 lpl->lpl_parent->lpl_lgrpid);
2157 2157
2158 2158 if (!lpl->lpl_parent) {
2159 2159 return (LPL_TOPO_MISSING_PARENT);
2160 2160 } else if (lgrp->lgrp_parent->lgrp_id !=
2161 2161 lpl->lpl_parent->lpl_lgrpid) {
2162 2162 return (LPL_TOPO_PARENT_MISMATCH);
2163 2163 }
2164 2164 }
2165 2165
2166 2166 /* only leaf lgroups keep a cpucnt, only check leaves */
2167 2167 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2168 2168
2169 2169 /* verify that lgrp is also a leaf */
2170 2170 ASSERT((lgrp->lgrp_childcnt == 0) &&
2171 2171 (klgrpset_ismember(lgrp->lgrp_leaves,
2172 2172 lpl->lpl_lgrpid)));
2173 2173
2174 2174 if ((lgrp->lgrp_childcnt > 0) ||
2175 2175 (!klgrpset_ismember(lgrp->lgrp_leaves,
2176 2176 lpl->lpl_lgrpid))) {
2177 2177 return (LPL_TOPO_LGRP_NOT_LEAF);
2178 2178 }
2179 2179
2180 2180 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2181 2181 (lpl->lpl_ncpu > 0));
2182 2182 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2183 2183 (lpl->lpl_ncpu <= 0)) {
2184 2184 return (LPL_TOPO_BAD_CPUCNT);
2185 2185 }
2186 2186
2187 2187 /*
2188 2188 * Check that lpl_ncpu also matches the number of
2189 2189 * cpus in the lpl's linked list. This only exists in
2190 2190 * leaves, but they should always match.
2191 2191 */
2192 2192 j = 0;
2193 2193 cpu = cp_start = lpl->lpl_cpus;
2194 2194 while (cpu != NULL) {
2195 2195 j++;
2196 2196
2197 2197 /* check to make sure cpu's lpl is leaf lpl */
2198 2198 ASSERT(cpu->cpu_lpl == lpl);
2199 2199 if (cpu->cpu_lpl != lpl) {
2200 2200 return (LPL_TOPO_CPU_HAS_BAD_LPL);
2201 2201 }
2202 2202
2203 2203 /* check next cpu */
2204 2204 if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2205 2205 continue;
2206 2206 } else {
2207 2207 cpu = NULL;
2208 2208 }
2209 2209 }
2210 2210
2211 2211 ASSERT(j == lpl->lpl_ncpu);
2212 2212 if (j != lpl->lpl_ncpu) {
2213 2213 return (LPL_TOPO_LPL_BAD_NCPU);
2214 2214 }
2215 2215
2216 2216 /*
2217 2217 * Also, check that leaf lpl is contained in all
2218 2218 * intermediate lpls that name the leaf as a descendant
2219 2219 */
2220 2220 for (j = 0; j <= lgrp_alloc_max; j++) {
2221 2221 klgrpset_t intersect;
2222 2222 lgrp_t *lgrp_cand;
2223 2223 lpl_t *lpl_cand;
2224 2224
2225 2225 lgrp_cand = lgrp_table[j];
2226 2226 intersect = klgrpset_intersects(
2227 2227 lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2228 2228 cpupart->cp_lgrpset);
2229 2229
2230 2230 if (!LGRP_EXISTS(lgrp_cand) ||
2231 2231 !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2232 2232 cpupart->cp_lgrpset) ||
2233 2233 (intersect == 0))
2234 2234 continue;
2235 2235
2236 2236 lpl_cand =
2237 2237 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2238 2238
2239 2239 if (klgrpset_ismember(intersect,
2240 2240 lgrp->lgrp_id)) {
2241 2241 ASSERT(lpl_rset_contains(lpl_cand,
2242 2242 lpl));
2243 2243
2244 2244 if (!lpl_rset_contains(lpl_cand, lpl)) {
2245 2245 return (LPL_TOPO_RSET_MSSNG_LF);
2246 2246 }
2247 2247 }
2248 2248 }
2249 2249
2250 2250 } else { /* non-leaf specific checks */
2251 2251
2252 2252 /*
2253 2253 * Non-leaf lpls should have lpl_cpus == NULL
2254 2254 * verify that this is so
2255 2255 */
2256 2256 ASSERT(lpl->lpl_cpus == NULL);
2257 2257 if (lpl->lpl_cpus != NULL) {
2258 2258 return (LPL_TOPO_NONLEAF_HAS_CPUS);
2259 2259 }
2260 2260
2261 2261 /*
2262 2262 * verify that the sum of the cpus in the leaf resources
2263 2263 * is equal to the total ncpu in the intermediate
2264 2264 */
2265 2265 for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2266 2266 sum += lpl->lpl_rset[j]->lpl_ncpu;
2267 2267 }
2268 2268
2269 2269 ASSERT(sum == lpl->lpl_ncpu);
2270 2270 if (sum != lpl->lpl_ncpu) {
2271 2271 return (LPL_TOPO_LPL_BAD_NCPU);
2272 2272 }
2273 2273 }
2274 2274
2275 2275 /*
2276 2276 * Check the rset of the lpl in question. Make sure that each
2277 2277 * rset contains a subset of the resources in
2278 2278 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
2279 2279 * sure that each rset doesn't include resources that are
2280 2280 * outside of that set. (Which would be resources somehow not
2281 2281 * accounted for).
2282 2282 */
2283 2283 klgrpset_clear(rset);
2284 2284 for (j = 0; j < lpl->lpl_nrset; j++) {
2285 2285 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2286 2286 }
2287 2287 klgrpset_copy(cset, rset);
2288 2288 /* make sure lpl rset matches lgrp rset */
2289 2289 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2290 2290 /* make sure rset is contained with in partition, too */
2291 2291 klgrpset_diff(cset, cpupart->cp_lgrpset);
2292 2292
2293 2293 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
2294 2294 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
2295 2295 return (LPL_TOPO_RSET_MISMATCH);
2296 2296 }
2297 2297
2298 2298 /*
2299 2299 * check to make sure lpl_nrset matches the number of rsets
2300 2300 * contained in the lpl
2301 2301 */
2302 2302 for (j = 0; j < lpl->lpl_nrset; j++) {
2303 2303 if (lpl->lpl_rset[j] == NULL)
2304 2304 break;
2305 2305 }
2306 2306
2307 2307 ASSERT(j == lpl->lpl_nrset);
2308 2308 if (j != lpl->lpl_nrset) {
2309 2309 return (LPL_TOPO_BAD_RSETCNT);
2310 2310 }
2311 2311
2312 2312 }
2313 2313 return (LPL_TOPO_CORRECT);
2314 2314 }
2315 2315
2316 2316 /*
2317 2317 * Flatten lpl topology to given number of levels. This is presently only
2318 2318 * implemented for a flatten to 2 levels, which will prune out the intermediates
2319 2319 * and home the leaf lpls to the root lpl.
2320 2320 */
2321 2321 int
2322 2322 lpl_topo_flatten(int levels)
2323 2323 {
2324 2324 int i;
2325 2325 uint_t sum;
2326 2326 lgrp_t *lgrp_cur;
2327 2327 lpl_t *lpl_cur;
2328 2328 lpl_t *lpl_root;
2329 2329 cpupart_t *cp;
2330 2330
2331 2331 if (levels != 2)
2332 2332 return (0);
2333 2333
2334 2334 /* called w/ cpus paused - grab no locks! */
2335 2335 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2336 2336 !lgrp_initialized);
2337 2337
2338 2338 cp = cp_list_head;
2339 2339 do {
2340 2340 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2341 2341 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2342 2342
2343 2343 for (i = 0; i <= lgrp_alloc_max; i++) {
2344 2344 lgrp_cur = lgrp_table[i];
2345 2345 lpl_cur = &cp->cp_lgrploads[i];
2346 2346
2347 2347 if ((lgrp_cur == lgrp_root) ||
2348 2348 (!LGRP_EXISTS(lgrp_cur) &&
2349 2349 (lpl_cur->lpl_ncpu == 0)))
2350 2350 continue;
2351 2351
2352 2352 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2353 2353 /*
2354 2354 * this should be a deleted intermediate, so
2355 2355 * clear it
2356 2356 */
2357 2357 lpl_clear(lpl_cur);
2358 2358 } else if ((lpl_cur->lpl_nrset == 1) &&
2359 2359 (lpl_cur->lpl_rset[0] == lpl_cur) &&
2360 2360 ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2361 2361 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2362 2362 /*
2363 2363 * this is a leaf whose parent was deleted, or
2364 2364 * whose parent had their lgrp deleted. (And
2365 2365 * whose parent will soon be deleted). Point
2366 2366 * this guy back to the root lpl.
2367 2367 */
2368 2368 lpl_cur->lpl_parent = lpl_root;
2369 2369 lpl_rset_add(lpl_root, lpl_cur);
2370 2370 }
2371 2371
2372 2372 }
2373 2373
2374 2374 /*
2375 2375 * Now that we're done, make sure the count on the root lpl is
2376 2376 * correct, and update the hints of the children for the sake of
2377 2377 * thoroughness
2378 2378 */
2379 2379 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2380 2380 sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2381 2381 }
2382 2382 lpl_root->lpl_ncpu = sum;
2383 2383 lpl_child_update(lpl_root, cp);
2384 2384
2385 2385 cp = cp->cp_next;
2386 2386 } while (cp != cp_list_head);
2387 2387
2388 2388 return (levels);
2389 2389 }
2390 2390
2391 2391 /*
2392 2392 * Insert a lpl into the resource hierarchy and create any additional lpls that
2393 2393 * are necessary to represent the varying states of locality for the cpu
2394 2394 * resoruces newly added to the partition.
2395 2395 *
2396 2396 * This routine is clever enough that it can correctly add resources from the
2397 2397 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie,
2398 2398 * those for which the lpl is a leaf as opposed to simply a named equally local
2399 2399 * resource). The one special case that needs additional processing is when a
2400 2400 * new intermediate lpl is introduced. Since the main loop only traverses
2401 2401 * looking to add the leaf resource where it does not yet exist, additional work
2402 2402 * is necessary to add other leaf resources that may need to exist in the newly
2403 2403 * created intermediate. This is performed by the second inner loop, and is
2404 2404 * only done when the check for more than one overlapping resource succeeds.
2405 2405 */
2406 2406
2407 2407 void
2408 2408 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2409 2409 {
2410 2410 int i;
2411 2411 int j;
2412 2412 int rset_num_intersect;
2413 2413 lgrp_t *lgrp_cur;
2414 2414 lpl_t *lpl_cur;
2415 2415 lpl_t *lpl_parent;
2416 2416 lgrp_id_t parent_id;
2417 2417 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */
2418 2418
2419 2419 for (i = 0; i <= lgrp_alloc_max; i++) {
2420 2420 lgrp_cur = lgrp_table[i];
2421 2421
2422 2422 /*
2423 2423 * Don't insert if the lgrp isn't there, if the leaf isn't
2424 2424 * contained within the current lgrp, or if the current lgrp has
2425 2425 * no leaves in this partition
2426 2426 */
2427 2427
2428 2428 if (!LGRP_EXISTS(lgrp_cur) ||
2429 2429 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2430 2430 lpl_leaf->lpl_lgrpid) ||
2431 2431 !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2432 2432 cpupart->cp_lgrpset))
2433 2433 continue;
2434 2434
2435 2435 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2436 2436 if (lgrp_cur->lgrp_parent != NULL) {
2437 2437 /* if lgrp has a parent, assign it properly */
2438 2438 parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2439 2439 lpl_parent = &cpupart->cp_lgrploads[parent_id];
2440 2440 } else {
2441 2441 /* if not, make sure parent ptr gets set to null */
2442 2442 lpl_parent = NULL;
2443 2443 }
2444 2444
2445 2445 if (lpl_cur == lpl_leaf) {
2446 2446 /*
2447 2447 * Almost all leaf state was initialized elsewhere. The
2448 2448 * only thing left to do is to set the parent.
2449 2449 */
2450 2450 lpl_cur->lpl_parent = lpl_parent;
2451 2451 continue;
2452 2452 }
2453 2453
2454 2454 lpl_clear(lpl_cur);
2455 2455 lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2456 2456
2457 2457 lpl_cur->lpl_parent = lpl_parent;
2458 2458
2459 2459 /* does new lpl need to be populated with other resources? */
2460 2460 rset_intersect =
2461 2461 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2462 2462 cpupart->cp_lgrpset);
2463 2463 klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2464 2464
2465 2465 if (rset_num_intersect > 1) {
2466 2466 /*
2467 2467 * If so, figure out what lpls have resources that
2468 2468 * intersect this one, and add them.
2469 2469 */
2470 2470 for (j = 0; j <= lgrp_alloc_max; j++) {
2471 2471 lgrp_t *lgrp_cand; /* candidate lgrp */
2472 2472 lpl_t *lpl_cand; /* candidate lpl */
2473 2473
2474 2474 lgrp_cand = lgrp_table[j];
2475 2475 if (!LGRP_EXISTS(lgrp_cand) ||
2476 2476 !klgrpset_ismember(rset_intersect,
2477 2477 lgrp_cand->lgrp_id))
2478 2478 continue;
2479 2479 lpl_cand =
2480 2480 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2481 2481 lpl_rset_add(lpl_cur, lpl_cand);
2482 2482 }
2483 2483 }
2484 2484 /*
2485 2485 * This lpl's rset has changed. Update the hint in it's
2486 2486 * children.
2487 2487 */
2488 2488 lpl_child_update(lpl_cur, cpupart);
2489 2489 }
2490 2490 }
2491 2491
2492 2492 /*
2493 2493 * remove a lpl from the hierarchy of resources, clearing its state when
2494 2494 * finished. If the lpls at the intermediate levels of the hierarchy have no
2495 2495 * remaining resources, or no longer name a leaf resource in the cpu-partition,
2496 2496 * delete them as well.
2497 2497 */
2498 2498
2499 2499 void
2500 2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2501 2501 {
2502 2502 int i;
2503 2503 lgrp_t *lgrp_cur;
2504 2504 lpl_t *lpl_cur;
2505 2505 klgrpset_t leaf_intersect; /* intersection of leaves */
2506 2506
2507 2507 for (i = 0; i <= lgrp_alloc_max; i++) {
2508 2508 lgrp_cur = lgrp_table[i];
2509 2509
2510 2510 /*
2511 2511 * Don't attempt to remove from lgrps that aren't there, that
2512 2512 * don't contain our leaf, or from the leaf itself. (We do that
2513 2513 * later)
2514 2514 */
2515 2515
2516 2516 if (!LGRP_EXISTS(lgrp_cur))
2517 2517 continue;
2518 2518
2519 2519 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2520 2520
2521 2521 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2522 2522 lpl_leaf->lpl_lgrpid) ||
2523 2523 (lpl_cur == lpl_leaf)) {
2524 2524 continue;
2525 2525 }
2526 2526
2527 2527 /*
2528 2528 * This is a slightly sleazy simplification in that we have
2529 2529 * already marked the cp_lgrpset as no longer containing the
2530 2530 * leaf we've deleted. Any lpls that pass the above checks
2531 2531 * based upon lgrp membership but not necessarily cpu-part
2532 2532 * membership also get cleared by the checks below. Currently
2533 2533 * this is harmless, as the lpls should be empty anyway.
2534 2534 *
2535 2535 * In particular, we want to preserve lpls that have additional
2536 2536 * leaf resources, even though we don't yet have a processor
2537 2537 * architecture that represents resources this way.
2538 2538 */
2539 2539
2540 2540 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2541 2541 cpupart->cp_lgrpset);
2542 2542
2543 2543 lpl_rset_del(lpl_cur, lpl_leaf);
2544 2544 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2545 2545 lpl_clear(lpl_cur);
2546 2546 } else {
2547 2547 /*
2548 2548 * Update this lpl's children
2549 2549 */
2550 2550 lpl_child_update(lpl_cur, cpupart);
2551 2551 }
2552 2552 }
2553 2553 lpl_clear(lpl_leaf);
2554 2554 }
2555 2555
2556 2556 /*
2557 2557 * add a cpu to a partition in terms of lgrp load avg bookeeping
2558 2558 *
2559 2559 * The lpl (cpu partition load average information) is now arranged in a
2560 2560 * hierarchical fashion whereby resources that are closest, ie. most local, to
2561 2561 * the cpu in question are considered to be leaves in a tree of resources.
2562 2562 * There are two general cases for cpu additon:
2563 2563 *
2564 2564 * 1. A lpl structure that contains resources already in the hierarchy tree.
2565 2565 * In this case, all of the associated lpl relationships have been defined, and
2566 2566 * all that is necessary is that we link the new cpu into the per-lpl list of
2567 2567 * cpus, and increment the ncpu count of all places where this cpu resource will
2568 2568 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2569 2569 * pushing is accomplished by this routine.
2570 2570 *
2571 2571 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2572 2572 * not exist yet. In this case, it is necessary to build the leaf lpl, and
2573 2573 * construct the hierarchy of state necessary to name it's more distant
2574 2574 * resources, if they should exist. The leaf structure is initialized by this
2575 2575 * routine, as is the cpu-partition state for the lgrp membership. This routine
2576 2576 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2577 2577 * and builds all of the "ancestoral" state necessary to identify resources at
2578 2578 * differing levels of locality.
2579 2579 */
2580 2580 void
2581 2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2582 2582 {
2583 2583 cpupart_t *cpupart;
2584 2584 lgrp_t *lgrp_leaf;
2585 2585 lpl_t *lpl_leaf;
2586 2586
2587 2587 /* called sometimes w/ cpus paused - grab no locks */
2588 2588 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2589 2589
2590 2590 cpupart = cp->cpu_part;
2591 2591 lgrp_leaf = lgrp_table[lgrpid];
2592 2592
2593 2593 /* don't add non-existent lgrp */
2594 2594 ASSERT(LGRP_EXISTS(lgrp_leaf));
2595 2595 lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2596 2596 cp->cpu_lpl = lpl_leaf;
2597 2597
2598 2598 /* only leaf lpls contain cpus */
2599 2599
2600 2600 if (lpl_leaf->lpl_ncpu++ == 0) {
2601 2601 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2602 2602 klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2603 2603 lpl_leaf_insert(lpl_leaf, cpupart);
2604 2604 } else {
2605 2605 /*
2606 2606 * the lpl should already exist in the parent, so just update
2607 2607 * the count of available CPUs
2608 2608 */
2609 2609 lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2610 2610 }
2611 2611
2612 2612 /* link cpu into list of cpus in lpl */
2613 2613
2614 2614 if (lpl_leaf->lpl_cpus) {
2615 2615 cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2616 2616 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2617 2617 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2618 2618 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2619 2619 } else {
2620 2620 /*
2621 2621 * We increment ncpu immediately after we create a new leaf
2622 2622 * lpl, so assert that ncpu == 1 for the case where we don't
2623 2623 * have any cpu pointers yet.
2624 2624 */
2625 2625 ASSERT(lpl_leaf->lpl_ncpu == 1);
2626 2626 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2627 2627 }
2628 2628
2629 2629 }
2630 2630
2631 2631
2632 2632 /*
2633 2633 * remove a cpu from a partition in terms of lgrp load avg bookeeping
2634 2634 *
2635 2635 * The lpl (cpu partition load average information) is now arranged in a
2636 2636 * hierarchical fashion whereby resources that are closest, ie. most local, to
2637 2637 * the cpu in question are considered to be leaves in a tree of resources.
2638 2638 * There are two removal cases in question:
2639 2639 *
2640 2640 * 1. Removal of the resource in the leaf leaves other resources remaining in
2641 2641 * that leaf. (Another cpu still exists at this level of locality). In this
2642 2642 * case, the count of available cpus is decremented in all assocated lpls by
2643 2643 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
2644 2644 * from the per-cpu lpl list.
2645 2645 *
2646 2646 * 2. Removal of the resource results in the lpl containing no resources. (It's
2647 2647 * empty) In this case, all of what has occurred for the first step must take
2648 2648 * place; however, additionally we must remove the lpl structure itself, prune
2649 2649 * out any stranded lpls that do not directly name a leaf resource, and mark the
2650 2650 * cpu partition in question as no longer containing resources from the lgrp of
2651 2651 * the lpl that has been delted. Cpu-partition changes are handled by this
2652 2652 * method, but the lpl_leaf_remove function deals with the details of pruning
2653 2653 * out the empty lpl and any of its orphaned direct ancestors.
2654 2654 */
2655 2655 void
2656 2656 lgrp_part_del_cpu(cpu_t *cp)
2657 2657 {
2658 2658 lpl_t *lpl;
2659 2659 lpl_t *leaf_lpl;
2660 2660 lgrp_t *lgrp_leaf;
2661 2661
2662 2662 /* called sometimes w/ cpus paused - grab no locks */
2663 2663
2664 2664 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2665 2665
2666 2666 lpl = leaf_lpl = cp->cpu_lpl;
2667 2667 lgrp_leaf = leaf_lpl->lpl_lgrp;
2668 2668
2669 2669 /* don't delete a leaf that isn't there */
2670 2670 ASSERT(LGRP_EXISTS(lgrp_leaf));
2671 2671
2672 2672 /* no double-deletes */
2673 2673 ASSERT(lpl->lpl_ncpu);
2674 2674 if (--lpl->lpl_ncpu == 0) {
2675 2675 /*
2676 2676 * This was the last cpu in this lgroup for this partition,
2677 2677 * clear its bit in the partition's lgroup bitmask
2678 2678 */
2679 2679 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2680 2680
2681 2681 /* eliminate remaning lpl link pointers in cpu, lpl */
2682 2682 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2683 2683
2684 2684 lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2685 2685 } else {
2686 2686
2687 2687 /* unlink cpu from lists of cpus in lpl */
2688 2688 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2689 2689 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2690 2690 if (lpl->lpl_cpus == cp) {
2691 2691 lpl->lpl_cpus = cp->cpu_next_lpl;
2692 2692 }
2693 2693
2694 2694 /*
2695 2695 * Update the cpu count in the lpls associated with parent
2696 2696 * lgroups.
2697 2697 */
2698 2698 lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2699 2699
2700 2700 }
2701 2701 /* clear cpu's lpl ptr when we're all done */
2702 2702 cp->cpu_lpl = NULL;
2703 2703 }
2704 2704
2705 2705 /*
2706 2706 * Recompute load average for the specified partition/lgrp fragment.
2707 2707 *
2708 2708 * We rely on the fact that this routine is called from the clock thread
2709 2709 * at a point before the clock thread can block (i.e. before its first
2710 2710 * lock request). Since the clock thread can not be preempted (since it
2711 2711 * runs at highest priority), we know that cpu partitions can not change
2712 2712 * (since doing so would require either the repartition requester or the
2713 2713 * cpu_pause thread to run on this cpu), so we can update the cpu's load
2714 2714 * without grabbing cpu_lock.
2715 2715 */
2716 2716 void
2717 2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2718 2718 {
2719 2719 uint_t ncpu;
2720 2720 int64_t old, new, f;
2721 2721
2722 2722 /*
2723 2723 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2724 2724 */
2725 2725 static short expval[] = {
2726 2726 0, 3196, 1618, 1083,
2727 2727 814, 652, 543, 466,
2728 2728 408, 363, 326, 297,
2729 2729 272, 251, 233, 218,
2730 2730 204, 192, 181, 172,
2731 2731 163, 155, 148, 142,
2732 2732 136, 130, 125, 121,
2733 2733 116, 112, 109, 105
2734 2734 };
2735 2735
2736 2736 /* ASSERT (called from clock level) */
2737 2737
2738 2738 if ((lpl == NULL) || /* we're booting - this is easiest for now */
2739 2739 ((ncpu = lpl->lpl_ncpu) == 0)) {
2740 2740 return;
2741 2741 }
2742 2742
2743 2743 for (;;) {
2744 2744
2745 2745 if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2746 2746 f = expval[1]/ncpu; /* good approx. for large ncpu */
2747 2747 else
2748 2748 f = expval[ncpu];
2749 2749
2750 2750 /*
2751 2751 * Modify the load average atomically to avoid losing
2752 2752 * anticipatory load updates (see lgrp_move_thread()).
2753 2753 */
2754 2754 if (ageflag) {
2755 2755 /*
2756 2756 * We're supposed to both update and age the load.
2757 2757 * This happens 10 times/sec. per cpu. We do a
2758 2758 * little hoop-jumping to avoid integer overflow.
2759 2759 */
2760 2760 int64_t q, r;
2761 2761
2762 2762 do {
2763 2763 old = new = lpl->lpl_loadavg;
2764 2764 q = (old >> 16) << 7;
2765 2765 r = (old & 0xffff) << 7;
2766 2766 new += ((long long)(nrcpus - q) * f -
2767 2767 ((r * f) >> 16)) >> 7;
2768 2768
2769 2769 /*
2770 2770 * Check for overflow
2771 2771 */
2772 2772 if (new > LGRP_LOADAVG_MAX)
2773 2773 new = LGRP_LOADAVG_MAX;
2774 2774 else if (new < 0)
2775 2775 new = 0;
2776 2776 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2777 2777 old, new) != old);
2778 2778 } else {
2779 2779 /*
2780 2780 * We're supposed to update the load, but not age it.
2781 2781 * This option is used to update the load (which either
2782 2782 * has already been aged in this 1/10 sec. interval or
2783 2783 * soon will be) to account for a remotely executing
2784 2784 * thread.
2785 2785 */
2786 2786 do {
2787 2787 old = new = lpl->lpl_loadavg;
2788 2788 new += f;
2789 2789 /*
2790 2790 * Check for overflow
2791 2791 * Underflow not possible here
2792 2792 */
2793 2793 if (new < old)
2794 2794 new = LGRP_LOADAVG_MAX;
2795 2795 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
2796 2796 old, new) != old);
2797 2797 }
2798 2798
2799 2799 /*
2800 2800 * Do the same for this lpl's parent
2801 2801 */
2802 2802 if ((lpl = lpl->lpl_parent) == NULL)
2803 2803 break;
2804 2804 ncpu = lpl->lpl_ncpu;
2805 2805 }
2806 2806 }
2807 2807
2808 2808 /*
2809 2809 * Initialize lpl topology in the target based on topology currently present in
2810 2810 * lpl_bootstrap.
2811 2811 *
2812 2812 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2813 2813 * initialize cp_default list of lpls. Up to this point all topology operations
2814 2814 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2815 2815 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2816 2816 * `target' points to the list of lpls in cp_default and `size' is the size of
2817 2817 * this list.
2818 2818 *
2819 2819 * This function walks the lpl topology in lpl_bootstrap and does for things:
2820 2820 *
2821 2821 * 1) Copies all fields from lpl_bootstrap to the target.
2822 2822 *
2823 2823 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2824 2824 *
2825 2825 * 3) Updates lpl_parent pointers to point to the lpls in the target list
2826 2826 * instead of lpl_bootstrap.
2827 2827 *
2828 2828 * 4) Updates pointers in the resource list of the target to point to the lpls
2829 2829 * in the target list instead of lpl_bootstrap.
2830 2830 *
2831 2831 * After lpl_topo_bootstrap() completes, target contains the same information
2832 2832 * that would be present there if it were used during boot instead of
2833 2833 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
2834 2834 * and it is bzeroed.
2835 2835 */
2836 2836 void
2837 2837 lpl_topo_bootstrap(lpl_t *target, int size)
2838 2838 {
2839 2839 lpl_t *lpl = lpl_bootstrap;
2840 2840 lpl_t *target_lpl = target;
2841 2841 lpl_t **rset;
2842 2842 int *id2rset;
2843 2843 int sz;
2844 2844 int howmany;
2845 2845 int id;
2846 2846 int i;
2847 2847
2848 2848 /*
2849 2849 * The only target that should be passed here is cp_default lpl list.
2850 2850 */
2851 2851 ASSERT(target == cp_default.cp_lgrploads);
2852 2852 ASSERT(size == cp_default.cp_nlgrploads);
2853 2853 ASSERT(!lgrp_topo_initialized);
2854 2854 ASSERT(ncpus == 1);
2855 2855
2856 2856 howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2857 2857 for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2858 2858 /*
2859 2859 * Copy all fields from lpl, except for the rset,
2860 2860 * lgrp id <=> rset mapping storage,
2861 2861 * and amount of storage
2862 2862 */
2863 2863 rset = target_lpl->lpl_rset;
2864 2864 id2rset = target_lpl->lpl_id2rset;
2865 2865 sz = target_lpl->lpl_rset_sz;
2866 2866
2867 2867 *target_lpl = *lpl;
2868 2868
2869 2869 target_lpl->lpl_rset_sz = sz;
2870 2870 target_lpl->lpl_rset = rset;
2871 2871 target_lpl->lpl_id2rset = id2rset;
2872 2872
2873 2873 /*
2874 2874 * Substitute CPU0 lpl pointer with one relative to target.
2875 2875 */
2876 2876 if (lpl->lpl_cpus == CPU) {
2877 2877 ASSERT(CPU->cpu_lpl == lpl);
2878 2878 CPU->cpu_lpl = target_lpl;
2879 2879 }
2880 2880
2881 2881 /*
2882 2882 * Substitute parent information with parent relative to target.
2883 2883 */
2884 2884 if (lpl->lpl_parent != NULL)
2885 2885 target_lpl->lpl_parent = (lpl_t *)
2886 2886 (((uintptr_t)lpl->lpl_parent -
2887 2887 (uintptr_t)lpl_bootstrap) +
2888 2888 (uintptr_t)target);
2889 2889
2890 2890 /*
2891 2891 * Walk over resource set substituting pointers relative to
2892 2892 * lpl_bootstrap's rset to pointers relative to target's
2893 2893 */
2894 2894 ASSERT(lpl->lpl_nrset <= 1);
2895 2895
2896 2896 for (id = 0; id < lpl->lpl_nrset; id++) {
2897 2897 if (lpl->lpl_rset[id] != NULL) {
2898 2898 target_lpl->lpl_rset[id] = (lpl_t *)
2899 2899 (((uintptr_t)lpl->lpl_rset[id] -
2900 2900 (uintptr_t)lpl_bootstrap) +
2901 2901 (uintptr_t)target);
2902 2902 }
2903 2903 target_lpl->lpl_id2rset[id] =
2904 2904 lpl->lpl_id2rset[id];
2905 2905 }
2906 2906 }
2907 2907
2908 2908 /*
2909 2909 * Clean up the bootstrap lpls since we have switched over to the
2910 2910 * actual lpl array in the default cpu partition.
2911 2911 *
2912 2912 * We still need to keep one empty lpl around for newly starting
2913 2913 * slave CPUs to reference should they need to make it through the
2914 2914 * dispatcher prior to their lgrp/lpl initialization.
2915 2915 *
2916 2916 * The lpl related dispatcher code has been designed to work properly
2917 2917 * (and without extra checks) for this special case of a zero'ed
2918 2918 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
2919 2919 * with lgrpid 0 and an empty resource set. Iteration over the rset
2920 2920 * array by the dispatcher is also NULL terminated for this reason.
2921 2921 *
2922 2922 * This provides the desired behaviour for an uninitialized CPU.
2923 2923 * It shouldn't see any other CPU to either dispatch to or steal
2924 2924 * from until it is properly initialized.
2925 2925 */
2926 2926 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2927 2927 bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
2928 2928 bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
2929 2929
2930 2930 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
2931 2931 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
2932 2932 }
2933 2933
2934 2934 /*
2935 2935 * If the lowest load among the lgroups a process' threads are currently
2936 2936 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2937 2937 * expanding the process to a new lgroup.
2938 2938 */
2939 2939 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2940 2940 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2941 2941
2942 2942 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
2943 2943 ((lgrp_expand_proc_thresh) / (ncpu))
2944 2944
2945 2945 /*
2946 2946 * A process will be expanded to a new lgroup only if the difference between
2947 2947 * the lowest load on the lgroups the process' thread's are currently spread
2948 2948 * across and the lowest load on the other lgroups in the process' partition
2949 2949 * is greater than lgrp_expand_proc_diff.
2950 2950 */
2951 2951 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2952 2952 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2953 2953
2954 2954 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
2955 2955 ((lgrp_expand_proc_diff) / (ncpu))
2956 2956
2957 2957 /*
2958 2958 * The loadavg tolerance accounts for "noise" inherent in the load, which may
2959 2959 * be present due to impreciseness of the load average decay algorithm.
2960 2960 *
2961 2961 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2962 2962 * tolerance is scaled by the number of cpus in the lgroup just like
2963 2963 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2964 2964 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2965 2965 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2966 2966 */
2967 2967 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2968 2968 #define LGRP_LOADAVG_TOLERANCE(ncpu) \
2969 2969 ((lgrp_loadavg_tolerance) / ncpu)
2970 2970
2971 2971 /*
2972 2972 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2973 2973 * average is above this threshold
2974 2974 */
2975 2975 uint32_t lgrp_load_thresh = UINT32_MAX;
2976 2976
2977 2977 /*
2978 2978 * lgrp_choose() will try to skip any lgroups with less memory
2979 2979 * than this free when choosing a home lgroup
2980 2980 */
2981 2981 pgcnt_t lgrp_mem_free_thresh = 0;
2982 2982
2983 2983 /*
2984 2984 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2985 2985 * one based on one of the following policies:
2986 2986 * - Random selection
2987 2987 * - Pseudo round robin placement
2988 2988 * - Longest time since a thread was last placed
2989 2989 */
2990 2990 #define LGRP_CHOOSE_RANDOM 1
2991 2991 #define LGRP_CHOOSE_RR 2
2992 2992 #define LGRP_CHOOSE_TIME 3
2993 2993
2994 2994 int lgrp_choose_policy = LGRP_CHOOSE_TIME;
2995 2995
2996 2996 /*
2997 2997 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to
2998 2998 * be bound to a CPU or processor set.
2999 2999 *
3000 3000 * Arguments:
3001 3001 * t The thread
3002 3002 * cpupart The partition the thread belongs to.
3003 3003 *
3004 3004 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3005 3005 * disabled, or thread_lock held (at splhigh) to protect against the CPU
3006 3006 * partitions changing out from under us and assumes that given thread is
3007 3007 * protected. Also, called sometimes w/ cpus paused or kernel preemption
3008 3008 * disabled, so don't grab any locks because we should never block under
3009 3009 * those conditions.
3010 3010 */
3011 3011 lpl_t *
3012 3012 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3013 3013 {
3014 3014 lgrp_load_t bestload, bestrload;
3015 3015 int lgrpid_offset, lgrp_count;
3016 3016 lgrp_id_t lgrpid, lgrpid_start;
3017 3017 lpl_t *lpl, *bestlpl, *bestrlpl;
3018 3018 klgrpset_t lgrpset;
3019 3019 proc_t *p;
3020 3020
3021 3021 ASSERT(t != NULL);
3022 3022 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3023 3023 THREAD_LOCK_HELD(t));
3024 3024 ASSERT(cpupart != NULL);
3025 3025
3026 3026 p = t->t_procp;
3027 3027
3028 3028 /* A process should always be in an active partition */
3029 3029 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3030 3030
3031 3031 bestlpl = bestrlpl = NULL;
3032 3032 bestload = bestrload = LGRP_LOADAVG_MAX;
3033 3033 lgrpset = cpupart->cp_lgrpset;
3034 3034
3035 3035 switch (lgrp_choose_policy) {
3036 3036 case LGRP_CHOOSE_RR:
3037 3037 lgrpid = cpupart->cp_lgrp_hint;
3038 3038 do {
3039 3039 if (++lgrpid > lgrp_alloc_max)
3040 3040 lgrpid = 0;
3041 3041 } while (!klgrpset_ismember(lgrpset, lgrpid));
3042 3042
3043 3043 break;
3044 3044 default:
3045 3045 case LGRP_CHOOSE_TIME:
3046 3046 case LGRP_CHOOSE_RANDOM:
3047 3047 klgrpset_nlgrps(lgrpset, lgrp_count);
3048 3048 lgrpid_offset =
3049 3049 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3050 3050 for (lgrpid = 0; ; lgrpid++) {
3051 3051 if (klgrpset_ismember(lgrpset, lgrpid)) {
3052 3052 if (--lgrpid_offset == 0)
3053 3053 break;
3054 3054 }
3055 3055 }
3056 3056 break;
3057 3057 }
3058 3058
3059 3059 lgrpid_start = lgrpid;
3060 3060
3061 3061 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3062 3062 lgrp_id_t, cpupart->cp_lgrp_hint);
3063 3063
3064 3064 /*
3065 3065 * Use lgroup affinities (if any) to choose best lgroup
3066 3066 *
3067 3067 * NOTE: Assumes that thread is protected from going away and its
3068 3068 * lgroup affinities won't change (ie. p_lock, or
3069 3069 * thread_lock() being held and/or CPUs paused)
3070 3070 */
3071 3071 if (t->t_lgrp_affinity) {
3072 3072 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3073 3073 if (lpl != NULL)
3074 3074 return (lpl);
3075 3075 }
3076 3076
3077 3077 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3078 3078
3079 3079 do {
3080 3080 pgcnt_t npgs;
3081 3081
3082 3082 /*
3083 3083 * Skip any lgroups outside of thread's pset
3084 3084 */
3085 3085 if (!klgrpset_ismember(lgrpset, lgrpid)) {
3086 3086 if (++lgrpid > lgrp_alloc_max)
3087 3087 lgrpid = 0; /* wrap the search */
3088 3088 continue;
3089 3089 }
3090 3090
3091 3091 /*
3092 3092 * Skip any non-leaf lgroups
3093 3093 */
3094 3094 if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3095 3095 continue;
3096 3096
3097 3097 /*
3098 3098 * Skip any lgroups without enough free memory
3099 3099 * (when threshold set to nonzero positive value)
3100 3100 */
3101 3101 if (lgrp_mem_free_thresh > 0) {
3102 3102 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3103 3103 if (npgs < lgrp_mem_free_thresh) {
3104 3104 if (++lgrpid > lgrp_alloc_max)
3105 3105 lgrpid = 0; /* wrap the search */
3106 3106 continue;
3107 3107 }
3108 3108 }
3109 3109
3110 3110 lpl = &cpupart->cp_lgrploads[lgrpid];
3111 3111 if (klgrpset_isempty(p->p_lgrpset) ||
3112 3112 klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3113 3113 /*
3114 3114 * Either this is a new process or the process already
3115 3115 * has threads on this lgrp, so this is a preferred
3116 3116 * lgroup for the thread.
3117 3117 */
3118 3118 if (bestlpl == NULL ||
3119 3119 lpl_pick(lpl, bestlpl)) {
3120 3120 bestload = lpl->lpl_loadavg;
3121 3121 bestlpl = lpl;
3122 3122 }
3123 3123 } else {
3124 3124 /*
3125 3125 * The process doesn't have any threads on this lgrp,
3126 3126 * but we're willing to consider this lgrp if the load
3127 3127 * difference is big enough to justify splitting up
3128 3128 * the process' threads.
3129 3129 */
3130 3130 if (bestrlpl == NULL ||
3131 3131 lpl_pick(lpl, bestrlpl)) {
3132 3132 bestrload = lpl->lpl_loadavg;
3133 3133 bestrlpl = lpl;
3134 3134 }
3135 3135 }
3136 3136 if (++lgrpid > lgrp_alloc_max)
3137 3137 lgrpid = 0; /* wrap the search */
3138 3138 } while (lgrpid != lgrpid_start);
3139 3139
3140 3140 /*
3141 3141 * Return root lgroup if threshold isn't set to maximum value and
3142 3142 * lowest lgroup load average more than a certain threshold
3143 3143 */
3144 3144 if (lgrp_load_thresh != UINT32_MAX &&
3145 3145 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3146 3146 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3147 3147
3148 3148 /*
3149 3149 * If all the lgroups over which the thread's process is spread are
3150 3150 * heavily loaded, or otherwise undesirable, we'll consider placing
3151 3151 * the thread on one of the other leaf lgroups in the thread's
3152 3152 * partition.
3153 3153 */
3154 3154 if ((bestlpl == NULL) ||
3155 3155 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3156 3156 (bestrload < bestload) && /* paranoid about wraparound */
3157 3157 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3158 3158 bestload))) {
3159 3159 bestlpl = bestrlpl;
3160 3160 }
3161 3161
3162 3162 if (bestlpl == NULL) {
3163 3163 /*
3164 3164 * No lgroup looked particularly good, but we still
3165 3165 * have to pick something. Go with the randomly selected
3166 3166 * legal lgroup we started with above.
3167 3167 */
3168 3168 bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3169 3169 }
3170 3170
3171 3171 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3172 3172 bestlpl->lpl_homed_time = gethrtime_unscaled();
3173 3173
3174 3174 ASSERT(bestlpl->lpl_ncpu > 0);
3175 3175 return (bestlpl);
3176 3176 }
3177 3177
3178 3178 /*
3179 3179 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3180 3180 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3181 3181 */
3182 3182 static int
3183 3183 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3184 3184 {
3185 3185 lgrp_load_t l1, l2;
3186 3186 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3187 3187
3188 3188 l1 = lpl1->lpl_loadavg;
3189 3189 l2 = lpl2->lpl_loadavg;
3190 3190
3191 3191 if ((l1 + tolerance < l2) && (l1 < l2)) {
3192 3192 /* lpl1 is significantly less loaded than lpl2 */
3193 3193 return (1);
3194 3194 }
3195 3195
3196 3196 if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3197 3197 l1 + tolerance >= l2 && l1 < l2 &&
3198 3198 lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3199 3199 /*
3200 3200 * lpl1's load is within the tolerance of lpl2. We're
3201 3201 * willing to consider it be to better however if
3202 3202 * it has been longer since we last homed a thread there
3203 3203 */
3204 3204 return (1);
3205 3205 }
3206 3206
3207 3207 return (0);
3208 3208 }
3209 3209
3210 3210 /*
3211 3211 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a
3212 3212 * process that uses text replication changed home lgrp. This info is used by
3213 3213 * segvn asyncronous thread to detect if it needs to recheck what lgrps
3214 3214 * should be used for text replication.
3215 3215 */
3216 3216 static uint64_t lgrp_trthr_moves = 0;
3217 3217
3218 3218 uint64_t
3219 3219 lgrp_get_trthr_migrations(void)
3220 3220 {
3221 3221 return (lgrp_trthr_moves);
3222 3222 }
3223 3223
3224 3224 void
3225 3225 lgrp_update_trthr_migrations(uint64_t incr)
3226 3226 {
3227 3227 atomic_add_64(&lgrp_trthr_moves, incr);
3228 3228 }
3229 3229
3230 3230 /*
3231 3231 * An LWP is expected to be assigned to an lgroup for at least this long
3232 3232 * for its anticipatory load to be justified. NOTE that this value should
3233 3233 * not be set extremely huge (say, larger than 100 years), to avoid problems
3234 3234 * with overflow in the calculation that uses it.
3235 3235 */
3236 3236 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */
3237 3237 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3238 3238
3239 3239 /*
3240 3240 * Routine to change a thread's lgroup affiliation. This routine updates
3241 3241 * the thread's kthread_t struct and its process' proc_t struct to note the
3242 3242 * thread's new lgroup affiliation, and its lgroup affinities.
3243 3243 *
3244 3244 * Note that this is the only routine that modifies a thread's t_lpl field,
3245 3245 * and that adds in or removes anticipatory load.
3246 3246 *
3247 3247 * If the thread is exiting, newlpl is NULL.
3248 3248 *
3249 3249 * Locking:
3250 3250 * The following lock must be held on entry:
3251 3251 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3252 3252 * doesn't get removed from t's partition
3253 3253 *
3254 3254 * This routine is not allowed to grab any locks, since it may be called
3255 3255 * with cpus paused (such as from cpu_offline).
3256 3256 */
3257 3257 void
3258 3258 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3259 3259 {
3260 3260 proc_t *p;
3261 3261 lpl_t *lpl, *oldlpl;
3262 3262 lgrp_id_t oldid;
3263 3263 kthread_t *tp;
3264 3264 uint_t ncpu;
3265 3265 lgrp_load_t old, new;
3266 3266
3267 3267 ASSERT(t);
3268 3268 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3269 3269 THREAD_LOCK_HELD(t));
3270 3270
3271 3271 /*
3272 3272 * If not changing lpls, just return
3273 3273 */
3274 3274 if ((oldlpl = t->t_lpl) == newlpl)
3275 3275 return;
3276 3276
3277 3277 /*
3278 3278 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3279 3279 * associated with process 0 rather than with its original process).
3280 3280 */
3281 3281 if (t->t_proc_flag & TP_LWPEXIT) {
3282 3282 if (newlpl != NULL) {
3283 3283 t->t_lpl = newlpl;
3284 3284 }
3285 3285 return;
3286 3286 }
3287 3287
3288 3288 p = ttoproc(t);
3289 3289
3290 3290 /*
3291 3291 * If the thread had a previous lgroup, update its process' p_lgrpset
3292 3292 * to account for it being moved from its old lgroup.
3293 3293 */
3294 3294 if ((oldlpl != NULL) && /* thread had a previous lgroup */
3295 3295 (p->p_tlist != NULL)) {
3296 3296 oldid = oldlpl->lpl_lgrpid;
3297 3297
3298 3298 if (newlpl != NULL)
3299 3299 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3300 3300
3301 3301 if ((do_lgrpset_delete) &&
3302 3302 (klgrpset_ismember(p->p_lgrpset, oldid))) {
3303 3303 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3304 3304 /*
3305 3305 * Check if a thread other than the thread
3306 3306 * that's moving is assigned to the same
3307 3307 * lgroup as the thread that's moving. Note
3308 3308 * that we have to compare lgroup IDs, rather
3309 3309 * than simply comparing t_lpl's, since the
3310 3310 * threads may belong to different partitions
3311 3311 * but be assigned to the same lgroup.
3312 3312 */
3313 3313 ASSERT(tp->t_lpl != NULL);
3314 3314
3315 3315 if ((tp != t) &&
3316 3316 (tp->t_lpl->lpl_lgrpid == oldid)) {
3317 3317 /*
3318 3318 * Another thread is assigned to the
3319 3319 * same lgroup as the thread that's
3320 3320 * moving, p_lgrpset doesn't change.
3321 3321 */
3322 3322 break;
3323 3323 } else if (tp == p->p_tlist) {
3324 3324 /*
3325 3325 * No other thread is assigned to the
3326 3326 * same lgroup as the exiting thread,
3327 3327 * clear the lgroup's bit in p_lgrpset.
3328 3328 */
3329 3329 klgrpset_del(p->p_lgrpset, oldid);
3330 3330 break;
3331 3331 }
3332 3332 }
3333 3333 }
3334 3334
3335 3335 /*
3336 3336 * If this thread was assigned to its old lgroup for such a
3337 3337 * short amount of time that the anticipatory load that was
3338 3338 * added on its behalf has aged very little, remove that
3339 3339 * anticipatory load.
3340 3340 */
3341 3341 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3342 3342 ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3343 3343 lpl = oldlpl;
3344 3344 for (;;) {
3345 3345 do {
3346 3346 old = new = lpl->lpl_loadavg;
3347 3347 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3348 3348 if (new > old) {
3349 3349 /*
3350 3350 * this can happen if the load
3351 3351 * average was aged since we
3352 3352 * added in the anticipatory
3353 3353 * load
3354 3354 */
3355 3355 new = 0;
3356 3356 }
3357 3357 } while (atomic_cas_32(
3358 3358 (lgrp_load_t *)&lpl->lpl_loadavg, old,
3359 3359 new) != old);
3360 3360
3361 3361 lpl = lpl->lpl_parent;
3362 3362 if (lpl == NULL)
3363 3363 break;
3364 3364
3365 3365 ncpu = lpl->lpl_ncpu;
3366 3366 ASSERT(ncpu > 0);
3367 3367 }
3368 3368 }
3369 3369 }
3370 3370 /*
3371 3371 * If the thread has a new lgroup (i.e. it's not exiting), update its
3372 3372 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3373 3373 * to its new lgroup to account for its move to its new lgroup.
3374 3374 */
3375 3375 if (newlpl != NULL) {
3376 3376 /*
3377 3377 * This thread is moving to a new lgroup
3378 3378 */
3379 3379 t->t_lpl = newlpl;
3380 3380 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
3381 3381 p->p_t1_lgrpid = newlpl->lpl_lgrpid;
3382 3382 membar_producer();
3383 3383 if (p->p_tr_lgrpid != LGRP_NONE &&
3384 3384 p->p_tr_lgrpid != p->p_t1_lgrpid) {
3385 3385 lgrp_update_trthr_migrations(1);
3386 3386 }
3387 3387 }
3388 3388
3389 3389 /*
3390 3390 * Reflect move in load average of new lgroup
3391 3391 * unless it is root lgroup
3392 3392 */
3393 3393 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3394 3394 return;
3395 3395
3396 3396 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3397 3397 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3398 3398 }
3399 3399
3400 3400 /*
3401 3401 * It'll take some time for the load on the new lgroup
3402 3402 * to reflect this thread's placement on it. We'd
3403 3403 * like not, however, to have all threads between now
3404 3404 * and then also piling on to this lgroup. To avoid
3405 3405 * this pileup, we anticipate the load this thread
3406 3406 * will generate on its new lgroup. The goal is to
3407 3407 * make the lgroup's load appear as though the thread
3408 3408 * had been there all along. We're very conservative
3409 3409 * in calculating this anticipatory load, we assume
3410 3410 * the worst case case (100% CPU-bound thread). This
3411 3411 * may be modified in the future to be more accurate.
3412 3412 */
3413 3413 lpl = newlpl;
3414 3414 for (;;) {
3415 3415 ncpu = lpl->lpl_ncpu;
3416 3416 ASSERT(ncpu > 0);
3417 3417 do {
3418 3418 old = new = lpl->lpl_loadavg;
3419 3419 new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3420 3420 /*
3421 3421 * Check for overflow
3422 3422 * Underflow not possible here
3423 3423 */
3424 3424 if (new < old)
3425 3425 new = UINT32_MAX;
3426 3426 } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
3427 3427 old, new) != old);
3428 3428
3429 3429 lpl = lpl->lpl_parent;
3430 3430 if (lpl == NULL)
3431 3431 break;
3432 3432 }
3433 3433 t->t_anttime = gethrtime();
3434 3434 }
3435 3435 }
3436 3436
3437 3437 /*
3438 3438 * Return lgroup memory allocation policy given advice from madvise(3C)
3439 3439 */
3440 3440 lgrp_mem_policy_t
3441 3441 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3442 3442 {
3443 3443 switch (advice) {
3444 3444 case MADV_ACCESS_LWP:
3445 3445 return (LGRP_MEM_POLICY_NEXT);
3446 3446 case MADV_ACCESS_MANY:
3447 3447 return (LGRP_MEM_POLICY_RANDOM);
3448 3448 default:
3449 3449 return (lgrp_mem_policy_default(size, type));
3450 3450 }
3451 3451 }
3452 3452
3453 3453 /*
3454 3454 * Figure out default policy
3455 3455 */
3456 3456 lgrp_mem_policy_t
3457 3457 lgrp_mem_policy_default(size_t size, int type)
3458 3458 {
3459 3459 cpupart_t *cp;
3460 3460 lgrp_mem_policy_t policy;
3461 3461 size_t pset_mem_size;
3462 3462
3463 3463 /*
3464 3464 * Randomly allocate memory across lgroups for shared memory
3465 3465 * beyond a certain threshold
3466 3466 */
3467 3467 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3468 3468 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3469 3469 /*
3470 3470 * Get total memory size of current thread's pset
3471 3471 */
3472 3472 kpreempt_disable();
3473 3473 cp = curthread->t_cpupart;
3474 3474 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3475 3475 kpreempt_enable();
3476 3476
3477 3477 /*
3478 3478 * Choose policy to randomly allocate memory across
3479 3479 * lgroups in pset if it will fit and is not default
3480 3480 * partition. Otherwise, allocate memory randomly
3481 3481 * across machine.
3482 3482 */
3483 3483 if (lgrp_mem_pset_aware && size < pset_mem_size)
3484 3484 policy = LGRP_MEM_POLICY_RANDOM_PSET;
3485 3485 else
3486 3486 policy = LGRP_MEM_POLICY_RANDOM;
3487 3487 } else
3488 3488 /*
3489 3489 * Apply default policy for private memory and
3490 3490 * shared memory under the respective random
3491 3491 * threshold.
3492 3492 */
3493 3493 policy = lgrp_mem_default_policy;
↓ open down ↓ |
3493 lines elided |
↑ open up ↑ |
3494 3494
3495 3495 return (policy);
3496 3496 }
3497 3497
3498 3498 /*
3499 3499 * Get memory allocation policy for this segment
3500 3500 */
3501 3501 lgrp_mem_policy_info_t *
3502 3502 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3503 3503 {
3504 - lgrp_mem_policy_info_t *policy_info;
3505 3504 extern struct seg_ops segspt_ops;
3506 3505 extern struct seg_ops segspt_shmops;
3507 3506
3508 3507 /*
3509 3508 * This is for binary compatibility to protect against third party
3510 3509 * segment drivers which haven't recompiled to allow for
3511 3510 * segop_getpolicy()
3512 3511 */
3513 3512 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3514 3513 seg->s_ops != &segspt_shmops)
3515 3514 return (NULL);
3516 3515
3517 - policy_info = NULL;
3518 - if (seg->s_ops->getpolicy != NULL)
3519 - policy_info = segop_getpolicy(seg, vaddr);
3520 -
3521 - return (policy_info);
3516 + return (segop_getpolicy(seg, vaddr));
3522 3517 }
3523 3518
3524 3519 /*
3525 3520 * Set policy for allocating private memory given desired policy, policy info,
3526 3521 * size in bytes of memory that policy is being applied.
3527 3522 * Return 0 if policy wasn't set already and 1 if policy was set already
3528 3523 */
3529 3524 int
3530 3525 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3531 3526 lgrp_mem_policy_info_t *policy_info, size_t size)
3532 3527 {
3533 3528
3534 3529 ASSERT(policy_info != NULL);
3535 3530
3536 3531 if (policy == LGRP_MEM_POLICY_DEFAULT)
3537 3532 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3538 3533
3539 3534 /*
3540 3535 * Policy set already?
3541 3536 */
3542 3537 if (policy == policy_info->mem_policy)
3543 3538 return (1);
3544 3539
3545 3540 /*
3546 3541 * Set policy
3547 3542 */
3548 3543 policy_info->mem_policy = policy;
3549 3544 policy_info->mem_lgrpid = LGRP_NONE;
3550 3545
3551 3546 return (0);
3552 3547 }
3553 3548
3554 3549
3555 3550 /*
3556 3551 * Get shared memory allocation policy with given tree and offset
3557 3552 */
3558 3553 lgrp_mem_policy_info_t *
3559 3554 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3560 3555 u_offset_t vn_off)
3561 3556 {
3562 3557 u_offset_t off;
3563 3558 lgrp_mem_policy_info_t *policy_info;
3564 3559 lgrp_shm_policy_seg_t *policy_seg;
3565 3560 lgrp_shm_locality_t *shm_locality;
3566 3561 avl_tree_t *tree;
3567 3562 avl_index_t where;
3568 3563
3569 3564 /*
3570 3565 * Get policy segment tree from anon_map or vnode and use specified
3571 3566 * anon index or vnode offset as offset
3572 3567 *
3573 3568 * Assume that no lock needs to be held on anon_map or vnode, since
3574 3569 * they should be protected by their reference count which must be
3575 3570 * nonzero for an existing segment
3576 3571 */
3577 3572 if (amp) {
3578 3573 ASSERT(amp->refcnt != 0);
3579 3574 shm_locality = amp->locality;
3580 3575 if (shm_locality == NULL)
3581 3576 return (NULL);
3582 3577 tree = shm_locality->loc_tree;
3583 3578 off = ptob(anon_index);
3584 3579 } else if (vp) {
3585 3580 shm_locality = vp->v_locality;
3586 3581 if (shm_locality == NULL)
3587 3582 return (NULL);
3588 3583 ASSERT(shm_locality->loc_count != 0);
3589 3584 tree = shm_locality->loc_tree;
3590 3585 off = vn_off;
3591 3586 }
3592 3587
3593 3588 if (tree == NULL)
3594 3589 return (NULL);
3595 3590
3596 3591 /*
3597 3592 * Lookup policy segment for offset into shared object and return
3598 3593 * policy info
3599 3594 */
3600 3595 rw_enter(&shm_locality->loc_lock, RW_READER);
3601 3596 policy_info = NULL;
3602 3597 policy_seg = avl_find(tree, &off, &where);
3603 3598 if (policy_seg)
3604 3599 policy_info = &policy_seg->shm_policy;
3605 3600 rw_exit(&shm_locality->loc_lock);
3606 3601
3607 3602 return (policy_info);
3608 3603 }
3609 3604
3610 3605 /*
3611 3606 * Default memory allocation policy for kernel segmap pages (lgrp_mem_choose() uses it when seg == segkmap)
3612 3607 */
3613 3608 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3614 3609
3615 3610 /*
3616 3611 * Return lgroup to use for allocating memory
3617 3612 * given the segment and address
3618 3613 *
3619 3614 * There isn't any mutual exclusion that exists between calls
3620 3615 * to this routine and DR, so this routine and whomever calls it
3621 3616 * should be mindful of the possibility that the lgrp returned
3622 3617 * may be deleted. If this happens, dereferences of the lgrp
3623 3618 * pointer will still be safe, but the resources in the lgrp will
3624 3619 * be gone, and LGRP_EXISTS() will no longer be true.
3625 3620 */
3626 3621 lgrp_t *
3627 3622 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3628 3623 {
3629 3624 int i;
3630 3625 lgrp_t *lgrp;
3631 3626 klgrpset_t lgrpset;
3632 3627 int lgrps_spanned;
3633 3628 unsigned long off;
3634 3629 lgrp_mem_policy_t policy;
3635 3630 lgrp_mem_policy_info_t *policy_info;
3636 3631 ushort_t random;
3637 3632 int stat = 0;
3638 3633 extern struct seg *segkmap;
3639 3634
3640 3635 /*
3641 3636 * Just return null if the lgrp framework hasn't finished
3642 3637 * initializing or if this is a UMA machine.
3643 3638 */
3644 3639 if (nlgrps == 1 || !lgrp_initialized)
3645 3640 return (lgrp_root);
3646 3641
3647 3642 /*
3648 3643 * Get memory allocation policy for this segment
3649 3644 */
3650 3645 policy = lgrp_mem_default_policy;
3651 3646 if (seg != NULL) {
3652 3647 if (seg->s_as == &kas) {
3653 3648 if (seg == segkmap)
3654 3649 policy = lgrp_segmap_default_policy;
3655 3650 if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3656 3651 policy == LGRP_MEM_POLICY_RANDOM_PSET)
3657 3652 policy = LGRP_MEM_POLICY_RANDOM;
3658 3653 } else {
3659 3654 policy_info = lgrp_mem_policy_get(seg, vaddr);
3660 3655 if (policy_info != NULL) {
3661 3656 policy = policy_info->mem_policy;
3662 3657 if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
3663 3658 lgrp_id_t id = policy_info->mem_lgrpid;
3664 3659 ASSERT(id != LGRP_NONE);
3665 3660 ASSERT(id < NLGRPS_MAX);
3666 3661 lgrp = lgrp_table[id];
3667 3662 if (!LGRP_EXISTS(lgrp)) {
3668 3663 policy = LGRP_MEM_POLICY_NEXT;
3669 3664 } else {
3670 3665 lgrp_stat_add(id,
3671 3666 LGRP_NUM_NEXT_SEG, 1);
3672 3667 return (lgrp);
3673 3668 }
3674 3669 }
3675 3670 }
3676 3671 }
3677 3672 }
3678 3673 lgrpset = 0;
3679 3674
3680 3675 /*
3681 3676 * Initialize lgroup to home by default
3682 3677 */
3683 3678 lgrp = lgrp_home_lgrp();
3684 3679
3685 3680 /*
3686 3681 * When homing threads on root lgrp, override default memory
3687 3682 * allocation policies with root lgroup memory allocation policy
3688 3683 */
3689 3684 if (lgrp == lgrp_root)
3690 3685 policy = lgrp_mem_policy_root;
3691 3686
3692 3687 /*
3693 3688 * Implement policy
3694 3689 */
3695 3690 switch (policy) {
3696 3691 case LGRP_MEM_POLICY_NEXT_CPU:
3697 3692
3698 3693 /*
3699 3694 * Return lgroup of current CPU which faulted on memory
3700 3695 * If the CPU isn't currently in an lgrp, then opt to
3701 3696 * allocate from the root.
3702 3697 *
3703 3698 * Kernel preemption needs to be disabled here to prevent
3704 3699 * the current CPU from going away before lgrp is found.
3705 3700 */
3706 3701 if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3707 3702 lgrp = lgrp_root;
3708 3703 } else {
3709 3704 kpreempt_disable();
3710 3705 lgrp = lgrp_cpu_to_lgrp(CPU);
3711 3706 kpreempt_enable();
3712 3707 }
3713 3708 break;
3714 3709
3715 3710 case LGRP_MEM_POLICY_NEXT:
3716 3711 case LGRP_MEM_POLICY_DEFAULT:
3717 3712 default:
3718 3713
3719 3714 /*
3720 3715 * Just return current thread's home lgroup
3721 3716 * for default policy (next touch)
3722 3717 * If the thread is homed to the root,
3723 3718 * then the default policy is random across lgroups.
3724 3719 * Fallthrough to the random case.
3725 3720 */
3726 3721 if (lgrp != lgrp_root) {
3727 3722 if (policy == LGRP_MEM_POLICY_NEXT)
3728 3723 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3729 3724 else
3730 3725 lgrp_stat_add(lgrp->lgrp_id,
3731 3726 LGRP_NUM_DEFAULT, 1);
3732 3727 break;
3733 3728 }
3734 3729 /* LINTED fallthrough on case statement */
3735 3730 case LGRP_MEM_POLICY_RANDOM:
3736 3731
3737 3732 /*
3738 3733 * Return a random leaf lgroup with memory
3739 3734 */
3740 3735 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3741 3736 /*
3742 3737 * Count how many lgroups are spanned
3743 3738 */
3744 3739 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3745 3740
3746 3741 /*
3747 3742 * There may be no memnodes in the root lgroup during DR copy
3748 3743 * rename on a system with only two boards (memnodes)
3749 3744 * configured. In this case just return the root lgrp.
3750 3745 */
3751 3746 if (lgrps_spanned == 0) {
3752 3747 lgrp = lgrp_root;
3753 3748 break;
3754 3749 }
3755 3750
3756 3751 /*
3757 3752 * Pick a random offset within lgroups spanned
3758 3753 * and return lgroup at that offset
3759 3754 */
3760 3755 random = (ushort_t)gethrtime() >> 4;
3761 3756 off = random % lgrps_spanned;
3762 3757 ASSERT(off <= lgrp_alloc_max);
3763 3758
3764 3759 for (i = 0; i <= lgrp_alloc_max; i++) {
3765 3760 if (!klgrpset_ismember(lgrpset, i))
3766 3761 continue;
3767 3762 if (off)
3768 3763 off--;
3769 3764 else {
3770 3765 lgrp = lgrp_table[i];
3771 3766 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3772 3767 1);
3773 3768 break;
3774 3769 }
3775 3770 }
3776 3771 break;
3777 3772
3778 3773 case LGRP_MEM_POLICY_RANDOM_PROC:
3779 3774
3780 3775 /*
3781 3776 * Grab copy of bitmask of lgroups spanned by
3782 3777 * this process
3783 3778 */
3784 3779 klgrpset_copy(lgrpset, curproc->p_lgrpset);
3785 3780 stat = LGRP_NUM_RANDOM_PROC;
3786 3781
3787 3782 /* LINTED fallthrough on case statement */
3788 3783 case LGRP_MEM_POLICY_RANDOM_PSET:
3789 3784
3790 3785 if (!stat)
3791 3786 stat = LGRP_NUM_RANDOM_PSET;
3792 3787
3793 3788 if (klgrpset_isempty(lgrpset)) {
3794 3789 /*
3795 3790 * Grab copy of bitmask of lgroups spanned by
3796 3791 * this processor set
3797 3792 */
3798 3793 kpreempt_disable();
3799 3794 klgrpset_copy(lgrpset,
3800 3795 curthread->t_cpupart->cp_lgrpset);
3801 3796 kpreempt_enable();
3802 3797 }
3803 3798
3804 3799 /*
3805 3800 * Count how many lgroups are spanned
3806 3801 */
3807 3802 klgrpset_nlgrps(lgrpset, lgrps_spanned);
3808 3803 ASSERT(lgrps_spanned <= nlgrps);
3809 3804
3810 3805 /*
3811 3806 * Probably lgrps_spanned should be always non-zero, but to be
3812 3807 * on the safe side we return lgrp_root if it is empty.
3813 3808 */
3814 3809 if (lgrps_spanned == 0) {
3815 3810 lgrp = lgrp_root;
3816 3811 break;
3817 3812 }
3818 3813
3819 3814 /*
3820 3815 * Pick a random offset within lgroups spanned
3821 3816 * and return lgroup at that offset
3822 3817 */
3823 3818 random = (ushort_t)gethrtime() >> 4;
3824 3819 off = random % lgrps_spanned;
3825 3820 ASSERT(off <= lgrp_alloc_max);
3826 3821
3827 3822 for (i = 0; i <= lgrp_alloc_max; i++) {
3828 3823 if (!klgrpset_ismember(lgrpset, i))
3829 3824 continue;
3830 3825 if (off)
3831 3826 off--;
3832 3827 else {
3833 3828 lgrp = lgrp_table[i];
3834 3829 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3835 3830 1);
3836 3831 break;
3837 3832 }
3838 3833 }
3839 3834 break;
3840 3835
3841 3836 case LGRP_MEM_POLICY_ROUNDROBIN:
3842 3837
3843 3838 /*
3844 3839 * Use offset within segment to determine
3845 3840 * offset from home lgroup to choose for
3846 3841 * next lgroup to allocate memory from
3847 3842 */
3848 3843 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3849 3844 (lgrp_alloc_max + 1);
3850 3845
3851 3846 kpreempt_disable();
3852 3847 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3853 3848 i = lgrp->lgrp_id;
3854 3849 kpreempt_enable();
3855 3850
3856 3851 while (off > 0) {
3857 3852 i = (i + 1) % (lgrp_alloc_max + 1);
3858 3853 lgrp = lgrp_table[i];
3859 3854 if (klgrpset_ismember(lgrpset, i))
3860 3855 off--;
3861 3856 }
3862 3857 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3863 3858
3864 3859 break;
3865 3860 }
3866 3861
3867 3862 ASSERT(lgrp != NULL);
3868 3863 return (lgrp);
3869 3864 }
3870 3865
3871 3866 /*
3872 3867 * Return the number of pages in an lgroup
3873 3868 *
3874 3869 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3875 3870 * could cause tests that rely on the numat driver to fail....
3876 3871 */
3877 3872 pgcnt_t
3878 3873 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3879 3874 {
3880 3875 lgrp_t *lgrp;
3881 3876
3882 3877 lgrp = lgrp_table[lgrpid];
3883 3878 if (!LGRP_EXISTS(lgrp) ||
3884 3879 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3885 3880 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3886 3881 return (0);
3887 3882
3888 3883 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3889 3884 }
3890 3885
3891 3886 /*
3892 3887 * Initialize lgroup shared memory allocation policy support
3893 3888 */
3894 3889 void
3895 3890 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3896 3891 {
3897 3892 lgrp_shm_locality_t *shm_locality;
3898 3893
3899 3894 /*
3900 3895 * Initialize locality field in anon_map
3901 3896 * Don't need any locks because this is called when anon_map is
3902 3897 * allocated, but not used anywhere yet.
3903 3898 */
3904 3899 if (amp) {
3905 3900 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3906 3901 if (amp->locality == NULL) {
3907 3902 /*
3908 3903 * Allocate and initialize shared memory locality info
3909 3904 * and set anon_map locality pointer to it
3910 3905 * Drop lock across kmem_alloc(KM_SLEEP)
3911 3906 */
3912 3907 ANON_LOCK_EXIT(&->a_rwlock);
3913 3908 shm_locality = kmem_alloc(sizeof (*shm_locality),
3914 3909 KM_SLEEP);
3915 3910 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3916 3911 NULL);
3917 3912 shm_locality->loc_count = 1; /* not used for amp */
3918 3913 shm_locality->loc_tree = NULL;
3919 3914
3920 3915 /*
3921 3916 * Reacquire lock and check to see whether anyone beat
3922 3917 * us to initializing the locality info
3923 3918 */
3924 3919 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3925 3920 if (amp->locality != NULL) {
3926 3921 rw_destroy(&shm_locality->loc_lock);
3927 3922 kmem_free(shm_locality,
3928 3923 sizeof (*shm_locality));
3929 3924 } else
3930 3925 amp->locality = shm_locality;
3931 3926 }
3932 3927 ANON_LOCK_EXIT(&->a_rwlock);
3933 3928 return;
3934 3929 }
3935 3930
3936 3931 /*
3937 3932 * Allocate shared vnode policy info if vnode is not locality aware yet
3938 3933 */
3939 3934 mutex_enter(&vp->v_lock);
3940 3935 if ((vp->v_flag & V_LOCALITY) == 0) {
3941 3936 /*
3942 3937 * Allocate and initialize shared memory locality info
3943 3938 */
3944 3939 mutex_exit(&vp->v_lock);
3945 3940 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3946 3941 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3947 3942 shm_locality->loc_count = 1;
3948 3943 shm_locality->loc_tree = NULL;
3949 3944
3950 3945 /*
3951 3946 * Point vnode locality field at shared vnode policy info
3952 3947 * and set locality aware flag in vnode
3953 3948 */
3954 3949 mutex_enter(&vp->v_lock);
3955 3950 if ((vp->v_flag & V_LOCALITY) == 0) {
3956 3951 vp->v_locality = shm_locality;
3957 3952 vp->v_flag |= V_LOCALITY;
3958 3953 } else {
3959 3954 /*
3960 3955 * Lost race so free locality info and increment count.
3961 3956 */
3962 3957 rw_destroy(&shm_locality->loc_lock);
3963 3958 kmem_free(shm_locality, sizeof (*shm_locality));
3964 3959 shm_locality = vp->v_locality;
3965 3960 shm_locality->loc_count++;
3966 3961 }
3967 3962 mutex_exit(&vp->v_lock);
3968 3963
3969 3964 return;
3970 3965 }
3971 3966
3972 3967 /*
3973 3968 * Increment reference count of number of segments mapping this vnode
3974 3969 * shared
3975 3970 */
3976 3971 shm_locality = vp->v_locality;
3977 3972 shm_locality->loc_count++;
3978 3973 mutex_exit(&vp->v_lock);
3979 3974 }
3980 3975
3981 3976 /*
3982 3977 * Destroy the given shared memory policy segment tree
3983 3978 */
3984 3979 void
3985 3980 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3986 3981 {
3987 3982 lgrp_shm_policy_seg_t *cur;
3988 3983 lgrp_shm_policy_seg_t *next;
3989 3984
3990 3985 if (tree == NULL)
3991 3986 return;
3992 3987
3993 3988 cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3994 3989 while (cur != NULL) {
3995 3990 next = AVL_NEXT(tree, cur);
3996 3991 avl_remove(tree, cur);
3997 3992 kmem_free(cur, sizeof (*cur));
3998 3993 cur = next;
3999 3994 }
4000 3995 kmem_free(tree, sizeof (avl_tree_t));
4001 3996 }
4002 3997
4003 3998 /*
4004 3999 * Uninitialize lgroup shared memory allocation policy support
4005 4000 */
4006 4001 void
4007 4002 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4008 4003 {
4009 4004 lgrp_shm_locality_t *shm_locality;
4010 4005
4011 4006 /*
4012 4007 * For anon_map, deallocate shared memory policy tree and
4013 4008 * zero locality field
4014 4009 * Don't need any locks because anon_map is being freed
4015 4010 */
4016 4011 if (amp) {
4017 4012 if (amp->locality == NULL)
4018 4013 return;
4019 4014 shm_locality = amp->locality;
4020 4015 shm_locality->loc_count = 0; /* not really used for amp */
4021 4016 rw_destroy(&shm_locality->loc_lock);
4022 4017 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4023 4018 kmem_free(shm_locality, sizeof (*shm_locality));
4024 4019 amp->locality = 0;
4025 4020 return;
4026 4021 }
4027 4022
4028 4023 /*
4029 4024 * For vnode, decrement reference count of segments mapping this vnode
4030 4025 * shared and delete locality info if reference count drops to 0
4031 4026 */
4032 4027 mutex_enter(&vp->v_lock);
4033 4028 shm_locality = vp->v_locality;
4034 4029 shm_locality->loc_count--;
4035 4030
4036 4031 if (shm_locality->loc_count == 0) {
4037 4032 rw_destroy(&shm_locality->loc_lock);
4038 4033 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4039 4034 kmem_free(shm_locality, sizeof (*shm_locality));
4040 4035 vp->v_locality = 0;
4041 4036 vp->v_flag &= ~V_LOCALITY;
4042 4037 }
4043 4038 mutex_exit(&vp->v_lock);
4044 4039 }
4045 4040
4046 4041 /*
4047 4042 * Compare two shared memory policy segments
4048 4043 * Used by AVL tree code for searching
4049 4044 */
4050 4045 int
4051 4046 lgrp_shm_policy_compar(const void *x, const void *y)
4052 4047 {
4053 4048 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4054 4049 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4055 4050
4056 4051 if (a->shm_off < b->shm_off)
4057 4052 return (-1);
4058 4053 if (a->shm_off >= b->shm_off + b->shm_size)
4059 4054 return (1);
4060 4055 return (0);
4061 4056 }
4062 4057
4063 4058 /*
4064 4059 * Concatenate seg1 with seg2 and remove seg2
4065 4060 */
4066 4061 static int
4067 4062 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4068 4063 lgrp_shm_policy_seg_t *seg2)
4069 4064 {
4070 4065 if (!seg1 || !seg2 ||
4071 4066 seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4072 4067 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4073 4068 return (-1);
4074 4069
4075 4070 seg1->shm_size += seg2->shm_size;
4076 4071 avl_remove(tree, seg2);
4077 4072 kmem_free(seg2, sizeof (*seg2));
4078 4073 return (0);
4079 4074 }
4080 4075
4081 4076 /*
4082 4077 * Split segment at given offset and return rightmost (uppermost) segment
4083 4078 * Assumes that there are no overlapping segments
4084 4079 */
4085 4080 static lgrp_shm_policy_seg_t *
4086 4081 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4087 4082 u_offset_t off)
4088 4083 {
4089 4084 lgrp_shm_policy_seg_t *newseg;
4090 4085 avl_index_t where;
4091 4086
4092 4087 ASSERT(seg != NULL);
4093 4088 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4094 4089
4095 4090 if (!seg || off < seg->shm_off || off > seg->shm_off +
4096 4091 seg->shm_size)
4097 4092 return (NULL);
4098 4093
4099 4094 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4100 4095 return (seg);
4101 4096
4102 4097 /*
4103 4098 * Adjust size of left segment and allocate new (right) segment
4104 4099 */
4105 4100 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4106 4101 newseg->shm_policy = seg->shm_policy;
4107 4102 newseg->shm_off = off;
4108 4103 newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4109 4104 seg->shm_size = off - seg->shm_off;
4110 4105
4111 4106 /*
4112 4107 * Find where to insert new segment in AVL tree and insert it
4113 4108 */
4114 4109 (void) avl_find(tree, &off, &where);
4115 4110 avl_insert(tree, newseg, where);
4116 4111
4117 4112 return (newseg);
4118 4113 }
4119 4114
4120 4115 /*
4121 4116 * Set shared memory allocation policy on specified shared object at given
4122 4117 * offset and length
4123 4118 *
4124 4119 * Return 0 if policy wasn't set already, 1 if policy was set already, and
4125 4120 * -1 if can't set policy.
4126 4121 */
4127 4122 int
4128 4123 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4129 4124 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4130 4125 {
4131 4126 u_offset_t eoff;
4132 4127 lgrp_shm_policy_seg_t *next;
4133 4128 lgrp_shm_policy_seg_t *newseg;
4134 4129 u_offset_t off;
4135 4130 u_offset_t oldeoff;
4136 4131 lgrp_shm_policy_seg_t *prev;
4137 4132 int retval;
4138 4133 lgrp_shm_policy_seg_t *seg;
4139 4134 lgrp_shm_locality_t *shm_locality;
4140 4135 avl_tree_t *tree;
4141 4136 avl_index_t where;
4142 4137
4143 4138 ASSERT(amp || vp);
4144 4139 ASSERT((len & PAGEOFFSET) == 0);
4145 4140
4146 4141 if (len == 0)
4147 4142 return (-1);
4148 4143
4149 4144 retval = 0;
4150 4145
4151 4146 /*
4152 4147 * Get locality info and starting offset into shared object
4153 4148 * Try anon map first and then vnode
4154 4149 * Assume that no locks need to be held on anon_map or vnode, since
4155 4150 * it should be protected by its reference count which must be nonzero
4156 4151 * for an existing segment.
4157 4152 */
4158 4153 if (amp) {
4159 4154 /*
4160 4155 * Get policy info from anon_map
4161 4156 *
4162 4157 */
4163 4158 ASSERT(amp->refcnt != 0);
4164 4159 if (amp->locality == NULL)
4165 4160 lgrp_shm_policy_init(amp, NULL);
4166 4161 shm_locality = amp->locality;
4167 4162 off = ptob(anon_index);
4168 4163 } else if (vp) {
4169 4164 /*
4170 4165 * Get policy info from vnode
4171 4166 */
4172 4167 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4173 4168 lgrp_shm_policy_init(NULL, vp);
4174 4169 shm_locality = vp->v_locality;
4175 4170 ASSERT(shm_locality->loc_count != 0);
4176 4171 off = vn_off;
4177 4172 } else
4178 4173 return (-1);
4179 4174
4180 4175 ASSERT((off & PAGEOFFSET) == 0);
4181 4176
4182 4177 /*
4183 4178 * Figure out default policy
4184 4179 */
4185 4180 if (policy == LGRP_MEM_POLICY_DEFAULT)
4186 4181 policy = lgrp_mem_policy_default(len, MAP_SHARED);
4187 4182
4188 4183 /*
4189 4184 * Create AVL tree if there isn't one yet
4190 4185 * and set locality field to point at it
4191 4186 */
4192 4187 rw_enter(&shm_locality->loc_lock, RW_WRITER);
4193 4188 tree = shm_locality->loc_tree;
4194 4189 if (!tree) {
4195 4190 rw_exit(&shm_locality->loc_lock);
4196 4191
4197 4192 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4198 4193
4199 4194 rw_enter(&shm_locality->loc_lock, RW_WRITER);
4200 4195 if (shm_locality->loc_tree == NULL) {
4201 4196 avl_create(tree, lgrp_shm_policy_compar,
4202 4197 sizeof (lgrp_shm_policy_seg_t),
4203 4198 offsetof(lgrp_shm_policy_seg_t, shm_tree));
4204 4199 shm_locality->loc_tree = tree;
4205 4200 } else {
4206 4201 /*
4207 4202 * Another thread managed to set up the tree
4208 4203 * before we could. Free the tree we allocated
4209 4204 * and use the one that's already there.
4210 4205 */
4211 4206 kmem_free(tree, sizeof (*tree));
4212 4207 tree = shm_locality->loc_tree;
4213 4208 }
4214 4209 }
4215 4210
4216 4211 /*
4217 4212 * Set policy
4218 4213 *
4219 4214 * Need to maintain hold on writer's lock to keep tree from
4220 4215 * changing out from under us
4221 4216 */
4222 4217 while (len != 0) {
4223 4218 /*
4224 4219 * Find policy segment for specified offset into shared object
4225 4220 */
4226 4221 seg = avl_find(tree, &off, &where);
4227 4222
4228 4223 /*
4229 4224 * Didn't find any existing segment that contains specified
4230 4225 * offset, so allocate new segment, insert it, and concatenate
4231 4226 * with adjacent segments if possible
4232 4227 */
4233 4228 if (seg == NULL) {
4234 4229 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4235 4230 KM_SLEEP);
4236 4231 newseg->shm_policy.mem_policy = policy;
4237 4232 newseg->shm_policy.mem_lgrpid = LGRP_NONE;
4238 4233 newseg->shm_off = off;
4239 4234 avl_insert(tree, newseg, where);
4240 4235
4241 4236 /*
4242 4237 * Check to see whether new segment overlaps with next
4243 4238 * one, set length of new segment accordingly, and
4244 4239 * calculate remaining length and next offset
4245 4240 */
4246 4241 seg = AVL_NEXT(tree, newseg);
4247 4242 if (seg == NULL || off + len <= seg->shm_off) {
4248 4243 newseg->shm_size = len;
4249 4244 len = 0;
4250 4245 } else {
4251 4246 newseg->shm_size = seg->shm_off - off;
4252 4247 off = seg->shm_off;
4253 4248 len -= newseg->shm_size;
4254 4249 }
4255 4250
4256 4251 /*
4257 4252 * Try to concatenate new segment with next and
4258 4253 * previous ones, since they might have the same policy
4259 4254 * now. Grab previous and next segments first because
4260 4255 * they will change on concatenation.
4261 4256 */
4262 4257 prev = AVL_PREV(tree, newseg);
4263 4258 next = AVL_NEXT(tree, newseg);
4264 4259 (void) lgrp_shm_policy_concat(tree, newseg, next);
4265 4260 (void) lgrp_shm_policy_concat(tree, prev, newseg);
4266 4261
4267 4262 continue;
4268 4263 }
4269 4264
4270 4265 eoff = off + len;
4271 4266 oldeoff = seg->shm_off + seg->shm_size;
4272 4267
4273 4268 /*
4274 4269 * Policy set already?
4275 4270 */
4276 4271 if (policy == seg->shm_policy.mem_policy) {
4277 4272 /*
4278 4273 * Nothing left to do if offset and length
4279 4274 * fall within this segment
4280 4275 */
4281 4276 if (eoff <= oldeoff) {
4282 4277 retval = 1;
4283 4278 break;
4284 4279 } else {
4285 4280 len = eoff - oldeoff;
4286 4281 off = oldeoff;
4287 4282 continue;
4288 4283 }
4289 4284 }
4290 4285
4291 4286 /*
4292 4287 * Specified offset and length match existing segment exactly
4293 4288 */
4294 4289 if (off == seg->shm_off && len == seg->shm_size) {
4295 4290 /*
4296 4291 * Set policy and update current length
4297 4292 */
4298 4293 seg->shm_policy.mem_policy = policy;
4299 4294 seg->shm_policy.mem_lgrpid = LGRP_NONE;
4300 4295 len = 0;
4301 4296
4302 4297 /*
4303 4298 * Try concatenating new segment with previous and next
4304 4299 * segments, since they might have the same policy now.
4305 4300 * Grab previous and next segments first because they
4306 4301 * will change on concatenation.
4307 4302 */
4308 4303 prev = AVL_PREV(tree, seg);
4309 4304 next = AVL_NEXT(tree, seg);
4310 4305 (void) lgrp_shm_policy_concat(tree, seg, next);
4311 4306 (void) lgrp_shm_policy_concat(tree, prev, seg);
4312 4307 } else {
4313 4308 /*
4314 4309 * Specified offset and length only apply to part of
4315 4310 * existing segment
4316 4311 */
4317 4312
4318 4313 /*
4319 4314 * New segment starts in middle of old one, so split
4320 4315 * new one off near beginning of old one
4321 4316 */
4322 4317 newseg = NULL;
4323 4318 if (off > seg->shm_off) {
4324 4319 newseg = lgrp_shm_policy_split(tree, seg, off);
4325 4320
4326 4321 /*
4327 4322 * New segment ends where old one did, so try
4328 4323 * to concatenate with next segment
4329 4324 */
4330 4325 if (eoff == oldeoff) {
4331 4326 newseg->shm_policy.mem_policy = policy;
4332 4327 newseg->shm_policy.mem_lgrpid =
4333 4328 LGRP_NONE;
4334 4329 (void) lgrp_shm_policy_concat(tree,
4335 4330 newseg, AVL_NEXT(tree, newseg));
4336 4331 break;
4337 4332 }
4338 4333 }
4339 4334
4340 4335 /*
4341 4336 * New segment ends before old one, so split off end of
4342 4337 * old one
4343 4338 */
4344 4339 if (eoff < oldeoff) {
4345 4340 if (newseg) {
4346 4341 (void) lgrp_shm_policy_split(tree,
4347 4342 newseg, eoff);
4348 4343 newseg->shm_policy.mem_policy = policy;
4349 4344 newseg->shm_policy.mem_lgrpid =
4350 4345 LGRP_NONE;
4351 4346 } else {
4352 4347 (void) lgrp_shm_policy_split(tree, seg,
4353 4348 eoff);
4354 4349 seg->shm_policy.mem_policy = policy;
4355 4350 seg->shm_policy.mem_lgrpid = LGRP_NONE;
4356 4351 }
4357 4352
4358 4353 if (off == seg->shm_off)
4359 4354 (void) lgrp_shm_policy_concat(tree,
4360 4355 AVL_PREV(tree, seg), seg);
4361 4356 break;
4362 4357 }
4363 4358
4364 4359 /*
4365 4360 * Calculate remaining length and next offset
4366 4361 */
4367 4362 len = eoff - oldeoff;
4368 4363 off = oldeoff;
4369 4364 }
4370 4365 }
4371 4366
4372 4367 rw_exit(&shm_locality->loc_lock);
4373 4368 return (retval);
4374 4369 }
4375 4370
4376 4371 /*
4377 4372 * Return the best memnode from which to allocate memory given
4378 4373 * an lgroup.
4379 4374 *
4380 4375 * "c" is for cookie, which is good enough for me.
4381 4376 * It references a cookie struct that should be zero'ed to initialize.
4382 4377 * The cookie should live on the caller's stack.
4383 4378 *
4384 4379 * The routine returns -1 when:
4385 4380 * - traverse is 0, and all the memnodes in "lgrp" have been returned.
4386 4381 * - traverse is 1, and all the memnodes in the system have been
4387 4382 * returned.
4388 4383 */
4389 4384 int
4390 4385 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4391 4386 {
4392 4387 lgrp_t *lp = c->lmc_lgrp;
4393 4388 mnodeset_t nodes = c->lmc_nodes;
4394 4389 int cnt = c->lmc_cnt;
4395 4390 int offset, mnode;
4396 4391
4397 4392 extern int max_mem_nodes;
4398 4393
4399 4394 /*
4400 4395 * If the set is empty, and the caller is willing, traverse
4401 4396 * up the hierarchy until we find a non-empty set.
4402 4397 */
4403 4398 while (nodes == (mnodeset_t)0 || cnt <= 0) {
4404 4399 if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4405 4400 ((lp = lp->lgrp_parent) == NULL))
4406 4401 return (-1);
4407 4402
4408 4403 nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4409 4404 cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4410 4405 }
4411 4406
4412 4407 /*
4413 4408 * Select a memnode by picking one at a "random" offset.
4414 4409 * Because of DR, memnodes can come and go at any time.
4415 4410 * This code must be able to cope with the possibility
4416 4411 * that the nodes count "cnt" is inconsistent with respect
4417 4412 * to the number of elements actually in "nodes", and
4418 4413 * therefore that the offset chosen could be greater than
4419 4414 * the number of elements in the set (some memnodes may
4420 4415 * have dissapeared just before cnt was read).
4421 4416 * If this happens, the search simply wraps back to the
4422 4417 * beginning of the set.
4423 4418 */
4424 4419 ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4425 4420 offset = c->lmc_rand % cnt;
4426 4421 do {
4427 4422 for (mnode = 0; mnode < max_mem_nodes; mnode++)
4428 4423 if (nodes & ((mnodeset_t)1 << mnode))
4429 4424 if (!offset--)
4430 4425 break;
4431 4426 } while (mnode >= max_mem_nodes);
4432 4427
4433 4428 /* Found a node. Store state before returning. */
4434 4429 c->lmc_lgrp = lp;
4435 4430 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4436 4431 c->lmc_cnt = cnt - 1;
4437 4432 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4438 4433 c->lmc_ntried++;
4439 4434
4440 4435 return (mnode);
4441 4436 }
↓ open down ↓ |
910 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX