XXXX pass in cpu_pause_func via pause_cpus
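This webrev updates every pause_cpus() caller in cmt.c to the new two-argument form introduced by this change. Below is a minimal sketch of the calling convention, assuming the second parameter is the per-call CPU pause handler that previously had to be installed through the global cpu_pause_func hook; the helper name cmt_example_topology_update is hypothetical, and NULL selects the default pause behavior, which is all the cmt.c call sites below need.

/*
 * Hedged sketch (not part of the webrev): how a caller that rearranges
 * dispatcher-visible PG state would use the updated interface.  Assumes
 * pause_cpus()'s second argument is the optional pause handler (formerly
 * the global cpu_pause_func); NULL requests the default handler.
 */
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>

static void
cmt_example_topology_update(void)
{
	ASSERT(MUTEX_HELD(&cpu_lock));	/* topology changes require cpu_lock */

	pause_cpus(NULL, NULL);		/* no CPU excluded, default pause handler */

	/* ... modify the CMT PG hierarchy while the dispatcher is quiesced ... */

	start_cpus();			/* resume the paused CPUs */
}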
--- old/usr/src/uts/common/disp/cmt.c
+++ new/usr/src/uts/common/disp/cmt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/systm.h>
26 26 #include <sys/types.h>
27 27 #include <sys/param.h>
28 28 #include <sys/thread.h>
29 29 #include <sys/cpuvar.h>
30 30 #include <sys/cpupart.h>
31 31 #include <sys/kmem.h>
32 32 #include <sys/cmn_err.h>
33 33 #include <sys/kstat.h>
34 34 #include <sys/processor.h>
35 35 #include <sys/disp.h>
36 36 #include <sys/group.h>
37 37 #include <sys/pghw.h>
38 38 #include <sys/bitset.h>
39 39 #include <sys/lgrp.h>
40 40 #include <sys/cmt.h>
41 41 #include <sys/cpu_pm.h>
42 42
43 43 /*
44 44 * CMT scheduler / dispatcher support
45 45 *
46 46 * This file implements CMT scheduler support using Processor Groups.
47 47 * The CMT processor group class creates and maintains the CMT class
48 48 * specific processor group pg_cmt_t.
49 49 *
50 50 * ---------------------------- <-- pg_cmt_t *
51 51 * | pghw_t |
52 52 * ----------------------------
53 53 * | CMT class specific data |
54 54 * | - hierarchy linkage |
55 55 * | - CMT load balancing data|
56 56 * | - active CPU group/bitset|
57 57 * ----------------------------
58 58 *
59 59 * The scheduler/dispatcher leverages knowledge of the performance
60 60 * relevant CMT sharing relationships existing between cpus to implement
61 61 * optimized affinity, load balancing, and coalescence policies.
62 62 *
63 63 * Load balancing policy seeks to improve performance by minimizing
64 64 * contention over shared processor resources / facilities. Affinity
65 65 * policies seek to improve cache and TLB utilization. Coalescence
66 66 * policies improve resource utilization and ultimately power efficiency.
67 67 *
68 68 * The CMT PGs created by this class are already arranged into a
69 69 * hierarchy (which is done in the pghw layer). To implement the top-down
70 70 * CMT load balancing algorithm, the CMT PGs additionally maintain
71 71 * parent, child and sibling hierarchy relationships.
72 72 * Parent PGs always contain a superset of their children's resources,
73 73 * each PG can have at most one parent, and siblings are the group of PGs
74 74 * sharing the same parent.
75 75 *
76 76 * On UMA based systems, the CMT load balancing algorithm begins by balancing
77 77 * load across the group of top level PGs in the system hierarchy.
78 78 * On NUMA systems, the CMT load balancing algorithm balances load across the
79 79 * group of top level PGs in each leaf lgroup...but for root homed threads,
80 80 * is willing to balance against all the top level PGs in the system.
81 81 *
82 82 * Groups of top level PGs are maintained to implement the above, one for each
83 83 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
84 84 * root lgroup) that contains all the top level PGs in the system.
85 85 */
86 86 static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */
87 87 static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */
88 88 /* used for null_proc_lpa */
89 89 cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
90 90
91 91 static int is_cpu0 = 1; /* true if this is boot CPU context */
92 92
93 93 /*
94 94 * Array of hardware sharing relationships that are blacklisted.
95 95 * CMT scheduling optimizations won't be performed for blacklisted sharing
96 96 * relationships.
97 97 */
98 98 static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
99 99
100 100 /*
101 101 * Set this to non-zero to disable CMT scheduling
102 102 * This must be done via kmdb -d, as /etc/system will be too late
103 103 */
104 104 int cmt_sched_disabled = 0;
105 105
106 106 /*
107 107 * Status codes for CMT lineage validation
108 108 * See pg_cmt_lineage_validate() below
109 109 */
110 110 typedef enum cmt_lineage_validation {
111 111 CMT_LINEAGE_VALID,
112 112 CMT_LINEAGE_NON_CONCENTRIC,
113 113 CMT_LINEAGE_PG_SPANS_LGRPS,
114 114 CMT_LINEAGE_NON_PROMOTABLE,
115 115 CMT_LINEAGE_REPAIRED,
116 116 CMT_LINEAGE_UNRECOVERABLE
117 117 } cmt_lineage_validation_t;
118 118
119 119 /*
120 120 * Status of the current lineage under construction.
121 121 * One must be holding cpu_lock to change this.
122 122 */
123 123 cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID;
124 124
125 125 /*
126 126 * Power domain definitions (on x86) are defined by ACPI, and
127 127 * therefore may be subject to BIOS bugs.
128 128 */
129 129 #define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw)
130 130
131 131 /*
132 132 * Macro to test if PG is managed by the CMT PG class
133 133 */
134 134 #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
135 135
136 136 static pg_cid_t pg_cmt_class_id; /* PG class id */
137 137
138 138 static pg_t *pg_cmt_alloc();
139 139 static void pg_cmt_free(pg_t *);
140 140 static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
141 141 static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
142 142 static void pg_cmt_cpu_active(cpu_t *);
143 143 static void pg_cmt_cpu_inactive(cpu_t *);
144 144 static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
145 145 static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
146 146 static char *pg_cmt_policy_name(pg_t *);
147 147 static void pg_cmt_hier_sort(pg_cmt_t **, int);
148 148 static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
149 149 static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
150 150 static int pg_cmt_hw(pghw_type_t);
151 151 static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
152 152 static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
153 153 static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
154 154 kthread_t *, kthread_t *);
155 155 static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
156 156 kthread_t *, kthread_t *);
157 157 static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
158 158 static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
159 159 cpu_pg_t *);
160 160
161 161 /*
162 162 * CMT PG ops
163 163 */
164 164 struct pg_ops pg_ops_cmt = {
165 165 pg_cmt_alloc,
166 166 pg_cmt_free,
167 167 pg_cmt_cpu_init,
168 168 pg_cmt_cpu_fini,
169 169 pg_cmt_cpu_active,
170 170 pg_cmt_cpu_inactive,
171 171 pg_cmt_cpupart_in,
172 172 NULL, /* cpupart_out */
173 173 pg_cmt_cpupart_move,
174 174 pg_cmt_cpu_belongs,
175 175 pg_cmt_policy_name,
176 176 };
177 177
178 178 /*
179 179 * Initialize the CMT PG class
180 180 */
181 181 void
182 182 pg_cmt_class_init(void)
183 183 {
184 184 if (cmt_sched_disabled)
185 185 return;
186 186
187 187 pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
188 188 }
189 189
190 190 /*
191 191 * Called to indicate a new CPU has started up so
192 192 * that either t0 or the slave startup thread can
193 193 * be accounted for.
194 194 */
195 195 void
196 196 pg_cmt_cpu_startup(cpu_t *cp)
197 197 {
198 198 pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
199 199 cp->cpu_thread);
200 200 }
201 201
202 202 /*
203 203 * Return non-zero if thread can migrate between "from" and "to"
204 204 * without a performance penalty
205 205 */
206 206 int
207 207 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
208 208 {
209 209 if (from->cpu_physid->cpu_cacheid ==
210 210 to->cpu_physid->cpu_cacheid)
211 211 return (1);
212 212 return (0);
213 213 }
214 214
215 215 /*
216 216 * CMT class specific PG allocation
217 217 */
218 218 static pg_t *
219 219 pg_cmt_alloc(void)
220 220 {
221 221 return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
222 222 }
223 223
224 224 /*
225 225 * Class specific PG de-allocation
226 226 */
227 227 static void
228 228 pg_cmt_free(pg_t *pg)
229 229 {
230 230 ASSERT(pg != NULL);
231 231 ASSERT(IS_CMT_PG(pg));
232 232
233 233 kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
234 234 }
235 235
236 236 /*
237 237 * Given a hardware sharing relationship, return which dispatcher
238 238 * policies should be implemented to optimize performance and efficiency
239 239 */
240 240 static pg_cmt_policy_t
241 241 pg_cmt_policy(pghw_type_t hw)
242 242 {
243 243 pg_cmt_policy_t p;
244 244
245 245 /*
246 246 * Give the platform a chance to override the default
247 247 */
248 248 if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
249 249 return (p);
250 250
251 251 switch (hw) {
252 252 case PGHW_IPIPE:
253 253 case PGHW_FPU:
254 254 case PGHW_PROCNODE:
255 255 case PGHW_CHIP:
256 256 return (CMT_BALANCE);
257 257 case PGHW_CACHE:
258 258 return (CMT_AFFINITY | CMT_BALANCE);
259 259 case PGHW_POW_ACTIVE:
260 260 case PGHW_POW_IDLE:
261 261 return (CMT_BALANCE);
262 262 default:
263 263 return (CMT_NO_POLICY);
264 264 }
265 265 }
266 266
267 267 /*
268 268 * Rank the importance of optimizing for the pg1 relationship vs.
269 269 * the pg2 relationship.
270 270 */
271 271 static pg_cmt_t *
272 272 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
273 273 {
274 274 pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
275 275 pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
276 276
277 277 /*
278 278 * A power domain is only important if CPUPM is enabled.
279 279 */
280 280 if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
281 281 if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
282 282 return (pg2);
283 283 if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
284 284 return (pg1);
285 285 }
286 286
287 287 /*
288 288 * Otherwise, ask the platform
289 289 */
290 290 if (pg_plat_hw_rank(hw1, hw2) == hw1)
291 291 return (pg1);
292 292 else
293 293 return (pg2);
294 294 }
295 295
296 296 /*
297 297 * Initialize CMT callbacks for the given PG
298 298 */
299 299 static void
300 300 cmt_callback_init(pg_t *pg)
301 301 {
302 302 /*
303 303 * Stick with the default callbacks if there isn't going to be
304 304 * any CMT thread placement optimizations implemented.
305 305 */
306 306 if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
307 307 return;
308 308
309 309 switch (((pghw_t *)pg)->pghw_hw) {
310 310 case PGHW_POW_ACTIVE:
311 311 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
312 312 pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
313 313 break;
314 314 default:
315 315 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
316 316
317 317 }
318 318 }
319 319
320 320 /*
321 321 * Promote PG above its current parent.
322 322 * This is only legal if PG has an equal or greater number of CPUs than its
323 323 * parent.
324 324 *
325 325 * This routine operates on the CPU specific processor group data (for the CPUs
326 326 * in the PG being promoted), and may be invoked from a context where one CPU's
327 327 * PG data is under construction. In this case the argument "pgdata", if not
328 328 * NULL, is a reference to the CPU's under-construction PG data.
329 329 */
330 330 static void
331 331 cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
332 332 {
333 333 pg_cmt_t *parent;
334 334 group_t *children;
335 335 cpu_t *cpu;
336 336 group_iter_t iter;
337 337 pg_cpu_itr_t cpu_iter;
338 338 int r;
339 339 int err;
340 340 int nchildren;
341 341
342 342 ASSERT(MUTEX_HELD(&cpu_lock));
343 343
344 344 parent = pg->cmt_parent;
345 345 if (parent == NULL) {
346 346 /*
347 347 * Nothing to do
348 348 */
349 349 return;
350 350 }
351 351
352 352 ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
353 353
354 354 /*
355 355 * We're changing around the hierarchy, which is actively traversed
356 356 * by the dispatcher. Pause CPUs to ensure exclusivity.
357 357 */
358 - pause_cpus(NULL);
358 + pause_cpus(NULL, NULL);
359 359
360 360 /*
361 361 * If necessary, update the parent's sibling set, replacing parent
362 362 * with PG.
363 363 */
364 364 if (parent->cmt_siblings) {
365 365 if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
366 366 != -1) {
367 367 r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
368 368 ASSERT(r != -1);
369 369 }
370 370 }
371 371
372 372 /*
373 373 * If the parent is at the top of the hierarchy, replace its entry
374 374 * in the root lgroup's group of top level PGs.
375 375 */
376 376 if (parent->cmt_parent == NULL &&
377 377 parent->cmt_siblings != &cmt_root->cl_pgs) {
378 378 if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
379 379 != -1) {
380 380 r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
381 381 ASSERT(r != -1);
382 382 }
383 383 }
384 384
385 385 /*
386 386 * We assume (and therefore assert) that the PG being promoted is an
387 387 * only child of its parent. Update the parent's children set
388 388 * replacing PG's entry with the parent (since the parent is becoming
389 389 * the child). Then have PG and the parent swap children sets and
390 390 * children counts.
391 391 */
392 392 ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
393 393 if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
394 394 r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
395 395 ASSERT(r != -1);
396 396 }
397 397
398 398 children = pg->cmt_children;
399 399 pg->cmt_children = parent->cmt_children;
400 400 parent->cmt_children = children;
401 401
402 402 nchildren = pg->cmt_nchildren;
403 403 pg->cmt_nchildren = parent->cmt_nchildren;
404 404 parent->cmt_nchildren = nchildren;
405 405
406 406 /*
407 407 * Update the sibling references for PG and its parent
408 408 */
409 409 pg->cmt_siblings = parent->cmt_siblings;
410 410 parent->cmt_siblings = pg->cmt_children;
411 411
412 412 /*
413 413 * Update any cached lineages in the per CPU pg data.
414 414 */
415 415 PG_CPU_ITR_INIT(pg, cpu_iter);
416 416 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
417 417 int idx;
418 418 int sz;
419 419 pg_cmt_t *cpu_pg;
420 420 cpu_pg_t *pgd; /* CPU's PG data */
421 421
422 422 /*
423 423 * The CPU whose lineage is under construction still
424 424 * references the bootstrap CPU PG data structure.
425 425 */
426 426 if (pg_cpu_is_bootstrapped(cpu))
427 427 pgd = pgdata;
428 428 else
429 429 pgd = cpu->cpu_pg;
430 430
431 431 /*
432 432 * Iterate over the CPU's PGs updating the children
433 433 * of the PG being promoted, since they have a new parent.
434 434 */
435 435 group_iter_init(&iter);
436 436 while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
437 437 if (cpu_pg->cmt_parent == pg) {
438 438 cpu_pg->cmt_parent = parent;
439 439 }
440 440 }
441 441
442 442 /*
443 443 * Update the CMT load balancing lineage
444 444 */
445 445 if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
446 446 /*
447 447 * Unless this is the CPU whose lineage is being
448 448 * constructed, the PG being promoted should be
449 449 * in the lineage.
450 450 */
451 451 ASSERT(pg_cpu_is_bootstrapped(cpu));
452 452 continue;
453 453 }
454 454
455 455 ASSERT(idx > 0);
456 456 ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
457 457
458 458 /*
459 459 * Have the child and the parent swap places in the CPU's
460 460 * lineage
461 461 */
462 462 group_remove_at(&pgd->cmt_pgs, idx);
463 463 group_remove_at(&pgd->cmt_pgs, idx - 1);
464 464 err = group_add_at(&pgd->cmt_pgs, parent, idx);
465 465 ASSERT(err == 0);
466 466 err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
467 467 ASSERT(err == 0);
468 468
469 469 /*
470 470 * Ensure cmt_lineage references CPU's leaf PG.
471 471 * Since cmt_pgs is top-down ordered, the bottom is the last
472 472 * element.
473 473 */
474 474 if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
475 475 pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
476 476 }
477 477
478 478 /*
479 479 * Update the parent references for PG and its parent
480 480 */
481 481 pg->cmt_parent = parent->cmt_parent;
482 482 parent->cmt_parent = pg;
483 483
484 484 start_cpus();
485 485 }
486 486
487 487 /*
488 488 * CMT class callback for a new CPU entering the system
489 489 *
490 490 * This routine operates on the CPU specific processor group data (for the CPU
491 491 * being initialized). The argument "pgdata" is a reference to the CPU's PG
492 492 * data to be constructed.
493 493 *
494 494 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
495 495 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
496 496 * calls must be careful to operate only on the "pgdata" argument, and not
497 497 * cp->cpu_pg.
498 498 */
499 499 static void
500 500 pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
501 501 {
502 502 pg_cmt_t *pg;
503 503 group_t *cmt_pgs;
504 504 int levels, level;
505 505 pghw_type_t hw;
506 506 pg_t *pg_cache = NULL;
507 507 pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
508 508 lgrp_handle_t lgrp_handle;
509 509 cmt_lgrp_t *lgrp;
510 510 cmt_lineage_validation_t lineage_status;
511 511
512 512 ASSERT(MUTEX_HELD(&cpu_lock));
513 513 ASSERT(pg_cpu_is_bootstrapped(cp));
514 514
515 515 if (cmt_sched_disabled)
516 516 return;
517 517
518 518 /*
519 519 * A new CPU is coming into the system.
520 520 * Interrogate the platform to see if the CPU
521 521 * has any performance or efficiency relevant
522 522 * sharing relationships
523 523 */
524 524 cmt_pgs = &pgdata->cmt_pgs;
525 525 pgdata->cmt_lineage = NULL;
526 526
527 527 bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
528 528 levels = 0;
529 529 for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
530 530
531 531 pg_cmt_policy_t policy;
532 532
533 533 /*
534 534 * We're only interested in the hw sharing relationships
535 535 * for which we know how to optimize.
536 536 */
537 537 policy = pg_cmt_policy(hw);
538 538 if (policy == CMT_NO_POLICY ||
539 539 pg_plat_hw_shared(cp, hw) == 0)
540 540 continue;
541 541
542 542 /*
543 543 * We will still create the PGs for hardware sharing
544 544 * relationships that have been blacklisted, but won't
545 545 * implement CMT thread placement optimizations against them.
546 546 */
547 547 if (cmt_hw_blacklisted[hw] == 1)
548 548 policy = CMT_NO_POLICY;
549 549
550 550 /*
551 551 * Find (or create) the PG associated with
552 552 * the hw sharing relationship in which cp
553 553 * belongs.
554 554 *
555 555 * Determine if a suitable PG already
556 556 * exists, or if one needs to be created.
557 557 */
558 558 pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
559 559 if (pg == NULL) {
560 560 /*
561 561 * Create a new one.
562 562 * Initialize the common...
563 563 */
564 564 pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
565 565
566 566 /* ... physical ... */
567 567 pghw_init((pghw_t *)pg, cp, hw);
568 568
569 569 /*
570 570 * ... and CMT specific portions of the
571 571 * structure.
572 572 */
573 573 pg->cmt_policy = policy;
574 574
575 575 /* CMT event callbacks */
576 576 cmt_callback_init((pg_t *)pg);
577 577
578 578 bitset_init(&pg->cmt_cpus_actv_set);
579 579 group_create(&pg->cmt_cpus_actv);
580 580 } else {
581 581 ASSERT(IS_CMT_PG(pg));
582 582 }
583 583
584 584 ((pghw_t *)pg)->pghw_generation++;
585 585
586 586 /* Add the CPU to the PG */
587 587 pg_cpu_add((pg_t *)pg, cp, pgdata);
588 588
589 589 /*
590 590 * Ensure capacity of the active CPU group/bitset
591 591 */
592 592 group_expand(&pg->cmt_cpus_actv,
593 593 GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
594 594
595 595 if (cp->cpu_seqid >=
596 596 bitset_capacity(&pg->cmt_cpus_actv_set)) {
597 597 bitset_resize(&pg->cmt_cpus_actv_set,
598 598 cp->cpu_seqid + 1);
599 599 }
600 600
601 601 /*
602 602 * Build a lineage of CMT PGs for load balancing / coalescence
603 603 */
604 604 if (policy & (CMT_BALANCE | CMT_COALESCE)) {
605 605 cpu_cmt_hier[levels++] = pg;
606 606 }
607 607
608 608 /* Cache this for later */
609 609 if (hw == PGHW_CACHE)
610 610 pg_cache = (pg_t *)pg;
611 611 }
612 612
613 613 group_expand(cmt_pgs, levels);
614 614
615 615 if (cmt_root == NULL)
616 616 cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
617 617
618 618 /*
619 619 * Find the lgrp that encapsulates this CPU's CMT hierarchy
620 620 */
621 621 lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
622 622 if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
623 623 lgrp = pg_cmt_lgrp_create(lgrp_handle);
624 624
625 625 /*
626 626 * Ascendingly sort the PGs in the lineage by number of CPUs
627 627 */
628 628 pg_cmt_hier_sort(cpu_cmt_hier, levels);
629 629
630 630 /*
631 631 * Examine the lineage and validate it.
632 632 * This routine will also try to fix the lineage along with the
633 633 * rest of the PG hierarchy should it detect an issue.
634 634 *
635 635 * If it returns anything other than VALID or REPAIRED, an
636 636 * unrecoverable error has occurred, and we cannot proceed.
637 637 */
638 638 lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
639 639 if ((lineage_status != CMT_LINEAGE_VALID) &&
640 640 (lineage_status != CMT_LINEAGE_REPAIRED)) {
641 641 /*
642 642 * In the case of an unrecoverable error where CMT scheduling
643 643 * has been disabled, assert that the under construction CPU's
644 644 * PG data has an empty CMT load balancing lineage.
645 645 */
646 646 ASSERT((cmt_sched_disabled == 0) ||
647 647 (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
648 648 return;
649 649 }
650 650
651 651 /*
652 652 * For existing PGs in the lineage, verify that the parent is
653 653 * correct, as the generation in the lineage may have changed
654 654 * as a result of the sorting. Start the traversal at the top
655 655 * of the lineage, moving down.
656 656 */
657 657 for (level = levels - 1; level >= 0; ) {
658 658 int reorg;
659 659
660 660 reorg = 0;
661 661 pg = cpu_cmt_hier[level];
662 662
663 663 /*
664 664 * Promote PGs at an incorrect generation into place.
665 665 */
666 666 while (pg->cmt_parent &&
667 667 pg->cmt_parent != cpu_cmt_hier[level + 1]) {
668 668 cmt_hier_promote(pg, pgdata);
669 669 reorg++;
670 670 }
671 671 if (reorg > 0)
672 672 level = levels - 1;
673 673 else
674 674 level--;
675 675 }
676 676
677 677 /*
678 678 * For each of the PGs in the CPU's lineage:
679 679 * - Add an entry in the CPU sorted CMT PG group
680 680 * which is used for top down CMT load balancing
681 681 * - Tie the PG into the CMT hierarchy by connecting
682 682 * it to its parent and siblings.
683 683 */
684 684 for (level = 0; level < levels; level++) {
685 685 uint_t children;
686 686 int err;
687 687
688 688 pg = cpu_cmt_hier[level];
689 689 err = group_add_at(cmt_pgs, pg, levels - level - 1);
690 690 ASSERT(err == 0);
691 691
692 692 if (level == 0)
693 693 pgdata->cmt_lineage = (pg_t *)pg;
694 694
695 695 if (pg->cmt_siblings != NULL) {
696 696 /* Already initialized */
697 697 ASSERT(pg->cmt_parent == NULL ||
698 698 pg->cmt_parent == cpu_cmt_hier[level + 1]);
699 699 ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
700 700 ((pg->cmt_parent != NULL) &&
701 701 pg->cmt_siblings == pg->cmt_parent->cmt_children));
702 702 continue;
703 703 }
704 704
705 705 if ((level + 1) == levels) {
706 706 pg->cmt_parent = NULL;
707 707
708 708 pg->cmt_siblings = &lgrp->cl_pgs;
709 709 children = ++lgrp->cl_npgs;
710 710 if (cmt_root != lgrp)
711 711 cmt_root->cl_npgs++;
712 712 } else {
713 713 pg->cmt_parent = cpu_cmt_hier[level + 1];
714 714
715 715 /*
716 716 * A good parent keeps track of their children.
717 717 * The parent's children group is also the PG's
718 718 * siblings.
719 719 */
720 720 if (pg->cmt_parent->cmt_children == NULL) {
721 721 pg->cmt_parent->cmt_children =
722 722 kmem_zalloc(sizeof (group_t), KM_SLEEP);
723 723 group_create(pg->cmt_parent->cmt_children);
724 724 }
725 725 pg->cmt_siblings = pg->cmt_parent->cmt_children;
726 726 children = ++pg->cmt_parent->cmt_nchildren;
727 727 }
728 728
729 729 group_expand(pg->cmt_siblings, children);
730 730 group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
731 731 }
732 732
733 733 /*
734 734 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
735 735 * for fast lookups later.
736 736 */
737 737 if (cp->cpu_physid) {
738 738 cp->cpu_physid->cpu_chipid =
739 739 pg_plat_hw_instance_id(cp, PGHW_CHIP);
740 740 cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
741 741
742 742 /*
743 743 * If this cpu has a PG representing shared cache, then set
744 744 * cpu_cacheid to that PG's logical id
745 745 */
746 746 if (pg_cache)
747 747 cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
748 748 }
749 749
750 750 /* CPU0 only initialization */
751 751 if (is_cpu0) {
752 752 is_cpu0 = 0;
753 753 cpu0_lgrp = lgrp;
754 754 }
755 755
756 756 }
757 757
758 758 /*
759 759 * Class callback when a CPU is leaving the system (deletion)
760 760 *
761 761 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
762 762 *
763 763 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
764 764 * references a "bootstrap" structure across this function's invocation.
765 765 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
766 766 * on the "pgdata" argument, and not cp->cpu_pg.
767 767 */
768 768 static void
769 769 pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
770 770 {
771 771 group_iter_t i;
772 772 pg_cmt_t *pg;
773 773 group_t *pgs, *cmt_pgs;
774 774 lgrp_handle_t lgrp_handle;
775 775 cmt_lgrp_t *lgrp;
776 776
777 777 if (cmt_sched_disabled)
778 778 return;
779 779
780 780 ASSERT(pg_cpu_is_bootstrapped(cp));
781 781
782 782 pgs = &pgdata->pgs;
783 783 cmt_pgs = &pgdata->cmt_pgs;
784 784
785 785 /*
786 786 * Find the lgroup that encapsulates this CPU's CMT hierarchy
787 787 */
788 788 lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
789 789
790 790 lgrp = pg_cmt_find_lgrp(lgrp_handle);
791 791 if (ncpus == 1 && lgrp != cpu0_lgrp) {
792 792 /*
793 793 * One might wonder how we could be deconfiguring the
794 794 * only CPU in the system.
795 795 *
796 796 * On Starcat systems when null_proc_lpa is detected,
797 797 * the boot CPU (which is already configured into a leaf
798 798 * lgroup), is moved into the root lgroup. This is done by
799 799 * deconfiguring it from both lgroups and processor
800 800 * groups), and then later reconfiguring it back in. This
801 801 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
802 802 *
803 803 * This special case is detected by noting that the platform
804 804 * has changed the CPU's lgrp affiliation (since it now
805 805 * belongs in the root). In this case, use the cmt_lgrp_t
806 806 * cached for the boot CPU, since this is what needs to be
807 807 * torn down.
808 808 */
809 809 lgrp = cpu0_lgrp;
810 810 }
811 811
812 812 ASSERT(lgrp != NULL);
813 813
814 814 /*
815 815 * First, clean up anything load balancing specific for each of
816 816 * the CPU's PGs that participated in CMT load balancing
817 817 */
818 818 pg = (pg_cmt_t *)pgdata->cmt_lineage;
819 819 while (pg != NULL) {
820 820
821 821 ((pghw_t *)pg)->pghw_generation++;
822 822
823 823 /*
824 824 * Remove the PG from the CPU's load balancing lineage
825 825 */
826 826 (void) group_remove(cmt_pgs, pg, GRP_RESIZE);
827 827
828 828 /*
829 829 * If it's about to become empty, destroy its children
830 830 * group, and remove its reference from its siblings.
831 831 * This is done here (rather than below) to avoid removing
832 832 * our reference from a PG that we just eliminated.
833 833 */
834 834 if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
835 835 if (pg->cmt_children != NULL)
836 836 group_destroy(pg->cmt_children);
837 837 if (pg->cmt_siblings != NULL) {
838 838 if (pg->cmt_siblings == &lgrp->cl_pgs)
839 839 lgrp->cl_npgs--;
840 840 else
841 841 pg->cmt_parent->cmt_nchildren--;
842 842 }
843 843 }
844 844 pg = pg->cmt_parent;
845 845 }
846 846 ASSERT(GROUP_SIZE(cmt_pgs) == 0);
847 847
848 848 /*
849 849 * Now that the load balancing lineage updates have happened,
850 850 * remove the CPU from all its PGs (destroying any that become
851 851 * empty).
852 852 */
853 853 group_iter_init(&i);
854 854 while ((pg = group_iterate(pgs, &i)) != NULL) {
855 855 if (IS_CMT_PG(pg) == 0)
856 856 continue;
857 857
858 858 pg_cpu_delete((pg_t *)pg, cp, pgdata);
859 859 /*
860 860 * Deleting the CPU from the PG changes the CPU's
861 861 * PG group over which we are actively iterating
862 862 * Re-initialize the iteration
863 863 */
864 864 group_iter_init(&i);
865 865
866 866 if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
867 867
868 868 /*
869 869 * The PG has become zero sized, so destroy it.
870 870 */
871 871 group_destroy(&pg->cmt_cpus_actv);
872 872 bitset_fini(&pg->cmt_cpus_actv_set);
873 873 pghw_fini((pghw_t *)pg);
874 874
875 875 pg_destroy((pg_t *)pg);
876 876 }
877 877 }
878 878 }
879 879
880 880 /*
881 881 * Class callback when a CPU is entering a cpu partition
882 882 */
883 883 static void
884 884 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
885 885 {
886 886 group_t *pgs;
887 887 pg_t *pg;
888 888 group_iter_t i;
889 889
890 890 ASSERT(MUTEX_HELD(&cpu_lock));
891 891
892 892 if (cmt_sched_disabled)
893 893 return;
894 894
895 895 pgs = &cp->cpu_pg->pgs;
896 896
897 897 /*
898 898 * Ensure that the new partition's PG bitset
899 899 * is large enough for all CMT PGs to which cp
900 900 * belongs
901 901 */
902 902 group_iter_init(&i);
903 903 while ((pg = group_iterate(pgs, &i)) != NULL) {
904 904 if (IS_CMT_PG(pg) == 0)
905 905 continue;
906 906
907 907 if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
908 908 bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
909 909 }
910 910 }
911 911
912 912 /*
913 913 * Class callback when a CPU is actually moving partitions
914 914 */
915 915 static void
916 916 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
917 917 {
918 918 cpu_t *cpp;
919 919 group_t *pgs;
920 920 pg_t *pg;
921 921 group_iter_t pg_iter;
922 922 pg_cpu_itr_t cpu_iter;
923 923 boolean_t found;
924 924
925 925 ASSERT(MUTEX_HELD(&cpu_lock));
926 926
927 927 if (cmt_sched_disabled)
928 928 return;
929 929
930 930 pgs = &cp->cpu_pg->pgs;
931 931 group_iter_init(&pg_iter);
932 932
933 933 /*
934 934 * Iterate over the CPU's CMT PGs
935 935 */
936 936 while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
937 937
938 938 if (IS_CMT_PG(pg) == 0)
939 939 continue;
940 940
941 941 /*
942 942 * Add the PG to the bitset in the new partition.
943 943 */
944 944 bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
945 945
946 946 /*
947 947 * Remove the PG from the bitset in the old partition
948 948 * if the last of the PG's CPUs have left.
949 949 */
950 950 found = B_FALSE;
951 951 PG_CPU_ITR_INIT(pg, cpu_iter);
952 952 while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
953 953 if (cpp == cp)
954 954 continue;
955 955 if (CPU_ACTIVE(cpp) &&
956 956 cpp->cpu_part->cp_id == oldpp->cp_id) {
957 957 found = B_TRUE;
958 958 break;
959 959 }
960 960 }
961 961 if (!found)
962 962 bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
963 963 }
964 964 }
965 965
966 966 /*
967 967 * Class callback when a CPU becomes active (online)
968 968 *
969 969 * This is called in a context where CPUs are paused
970 970 */
971 971 static void
972 972 pg_cmt_cpu_active(cpu_t *cp)
973 973 {
974 974 int err;
975 975 group_iter_t i;
976 976 pg_cmt_t *pg;
977 977 group_t *pgs;
978 978
979 979 ASSERT(MUTEX_HELD(&cpu_lock));
980 980
981 981 if (cmt_sched_disabled)
982 982 return;
983 983
984 984 pgs = &cp->cpu_pg->pgs;
985 985 group_iter_init(&i);
986 986
987 987 /*
988 988 * Iterate over the CPU's PGs
989 989 */
990 990 while ((pg = group_iterate(pgs, &i)) != NULL) {
991 991
992 992 if (IS_CMT_PG(pg) == 0)
993 993 continue;
994 994
995 995 /*
996 996 * Move to the next generation since topology is changing
997 997 */
998 998 ((pghw_t *)pg)->pghw_generation++;
999 999
1000 1000 err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1001 1001 ASSERT(err == 0);
1002 1002
1003 1003 /*
1004 1004 * If this is the first active CPU in the PG, and it
1005 1005 * represents a hardware sharing relationship over which
1006 1006 * CMT load balancing is performed, add it as a candidate
1007 1007 * for balancing with its siblings.
1008 1008 */
1009 1009 if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
1010 1010 (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1011 1011 err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
1012 1012 ASSERT(err == 0);
1013 1013
1014 1014 /*
1015 1015 * If this is a top level PG, add it as a balancing
1016 1016 * candidate when balancing within the root lgroup.
1017 1017 */
1018 1018 if (pg->cmt_parent == NULL &&
1019 1019 pg->cmt_siblings != &cmt_root->cl_pgs) {
1020 1020 err = group_add(&cmt_root->cl_pgs, pg,
1021 1021 GRP_NORESIZE);
1022 1022 ASSERT(err == 0);
1023 1023 }
1024 1024 }
1025 1025
1026 1026 /*
1027 1027 * Notate the CPU in the PG's active CPU bitset.
1028 1028 * Also notate the PG as being active in its associated
1029 1029 * partition
1030 1030 */
1031 1031 bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1032 1032 bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1033 1033 }
1034 1034 }
1035 1035
1036 1036 /*
1037 1037 * Class callback when a CPU goes inactive (offline)
1038 1038 *
1039 1039 * This is called in a context where CPUs are paused
1040 1040 */
1041 1041 static void
1042 1042 pg_cmt_cpu_inactive(cpu_t *cp)
1043 1043 {
1044 1044 int err;
1045 1045 group_t *pgs;
1046 1046 pg_cmt_t *pg;
1047 1047 cpu_t *cpp;
1048 1048 group_iter_t i;
1049 1049 pg_cpu_itr_t cpu_itr;
1050 1050 boolean_t found;
1051 1051
1052 1052 ASSERT(MUTEX_HELD(&cpu_lock));
1053 1053
1054 1054 if (cmt_sched_disabled)
1055 1055 return;
1056 1056
1057 1057 pgs = &cp->cpu_pg->pgs;
1058 1058 group_iter_init(&i);
1059 1059
1060 1060 while ((pg = group_iterate(pgs, &i)) != NULL) {
1061 1061
1062 1062 if (IS_CMT_PG(pg) == 0)
1063 1063 continue;
1064 1064
1065 1065 /*
1066 1066 * Move to the next generation since topology is changing
1067 1067 */
1068 1068 ((pghw_t *)pg)->pghw_generation++;
1069 1069
1070 1070 /*
1071 1071 * Remove the CPU from the CMT PG's active CPU group
1072 1072 * bitmap
1073 1073 */
1074 1074 err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1075 1075 ASSERT(err == 0);
1076 1076
1077 1077 bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1078 1078
1079 1079 /*
1080 1080 * If there are no more active CPUs in this PG over which
1081 1081 * load was balanced, remove it as a balancing candidate.
1082 1082 */
1083 1083 if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
1084 1084 (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1085 1085 err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1086 1086 ASSERT(err == 0);
1087 1087
1088 1088 if (pg->cmt_parent == NULL &&
1089 1089 pg->cmt_siblings != &cmt_root->cl_pgs) {
1090 1090 err = group_remove(&cmt_root->cl_pgs, pg,
1091 1091 GRP_NORESIZE);
1092 1092 ASSERT(err == 0);
1093 1093 }
1094 1094 }
1095 1095
1096 1096 /*
1097 1097 * Assert the number of active CPUs does not exceed
1098 1098 * the total number of CPUs in the PG
1099 1099 */
1100 1100 ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1101 1101 GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1102 1102
1103 1103 /*
1104 1104 * Update the PG bitset in the CPU's old partition
1105 1105 */
1106 1106 found = B_FALSE;
1107 1107 PG_CPU_ITR_INIT(pg, cpu_itr);
1108 1108 while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1109 1109 if (cpp == cp)
1110 1110 continue;
1111 1111 if (CPU_ACTIVE(cpp) &&
1112 1112 cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1113 1113 found = B_TRUE;
1114 1114 break;
1115 1115 }
1116 1116 }
1117 1117 if (!found) {
1118 1118 bitset_del(&cp->cpu_part->cp_cmt_pgs,
1119 1119 ((pg_t *)pg)->pg_id);
1120 1120 }
1121 1121 }
1122 1122 }
1123 1123
1124 1124 /*
1125 1125 * Return non-zero if the CPU belongs in the given PG
1126 1126 */
1127 1127 static int
1128 1128 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1129 1129 {
1130 1130 cpu_t *pg_cpu;
1131 1131
1132 1132 pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1133 1133
1134 1134 ASSERT(pg_cpu != NULL);
1135 1135
1136 1136 /*
1137 1137 * The CPU belongs if, given the nature of the hardware sharing
1138 1138 * relationship represented by the PG, the CPU has that
1139 1139 * relationship with some other CPU already in the PG
1140 1140 */
1141 1141 if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1142 1142 return (1);
1143 1143
1144 1144 return (0);
1145 1145 }
1146 1146
1147 1147 /*
1148 1148 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
1149 1149 */
1150 1150 static void
1151 1151 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1152 1152 {
1153 1153 int i, j, inc, sz;
1154 1154 int start, end;
1155 1155 pg_t *tmp;
1156 1156 pg_t **h = (pg_t **)hier;
1157 1157
1158 1158 /*
1159 1159 * First sort by number of CPUs
1160 1160 */
1161 1161 inc = size / 2;
1162 1162 while (inc > 0) {
1163 1163 for (i = inc; i < size; i++) {
1164 1164 j = i;
1165 1165 tmp = h[i];
1166 1166 while ((j >= inc) &&
1167 1167 (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
1168 1168 h[j] = h[j - inc];
1169 1169 j = j - inc;
1170 1170 }
1171 1171 h[j] = tmp;
1172 1172 }
1173 1173 if (inc == 2)
1174 1174 inc = 1;
1175 1175 else
1176 1176 inc = (inc * 5) / 11;
1177 1177 }
1178 1178
1179 1179 /*
1180 1180 * Break ties by asking the platform.
1181 1181 * Determine if h[i] outranks h[i + 1] and if so, swap them.
1182 1182 */
1183 1183 for (start = 0; start < size; start++) {
1184 1184
1185 1185 /*
1186 1186 * Find various contiguous sets of elements,
1187 1187 * in the array, with the same number of cpus
1188 1188 */
1189 1189 end = start;
1190 1190 sz = PG_NUM_CPUS(h[start]);
1191 1191 while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
1192 1192 end++;
1193 1193 /*
1194 1194 * Sort each such set of the array by rank
1195 1195 */
1196 1196 for (i = start + 1; i < end; i++) {
1197 1197 j = i - 1;
1198 1198 tmp = h[i];
1199 1199 while (j >= start &&
1200 1200 pg_cmt_hier_rank(hier[j],
1201 1201 (pg_cmt_t *)tmp) == hier[j]) {
1202 1202 h[j + 1] = h[j];
1203 1203 j--;
1204 1204 }
1205 1205 h[j + 1] = tmp;
1206 1206 }
1207 1207 }
1208 1208 }
1209 1209
1210 1210 /*
1211 1211 * Return a cmt_lgrp_t * given an lgroup handle.
1212 1212 */
1213 1213 static cmt_lgrp_t *
1214 1214 pg_cmt_find_lgrp(lgrp_handle_t hand)
1215 1215 {
1216 1216 cmt_lgrp_t *lgrp;
1217 1217
1218 1218 ASSERT(MUTEX_HELD(&cpu_lock));
1219 1219
1220 1220 lgrp = cmt_lgrps;
1221 1221 while (lgrp != NULL) {
1222 1222 if (lgrp->cl_hand == hand)
1223 1223 break;
1224 1224 lgrp = lgrp->cl_next;
1225 1225 }
1226 1226 return (lgrp);
1227 1227 }
1228 1228
1229 1229 /*
1230 1230 * Create a cmt_lgrp_t with the specified handle.
1231 1231 */
1232 1232 static cmt_lgrp_t *
1233 1233 pg_cmt_lgrp_create(lgrp_handle_t hand)
1234 1234 {
1235 1235 cmt_lgrp_t *lgrp;
1236 1236
1237 1237 ASSERT(MUTEX_HELD(&cpu_lock));
1238 1238
1239 1239 lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1240 1240
1241 1241 lgrp->cl_hand = hand;
1242 1242 lgrp->cl_npgs = 0;
1243 1243 lgrp->cl_next = cmt_lgrps;
1244 1244 cmt_lgrps = lgrp;
1245 1245 group_create(&lgrp->cl_pgs);
1246 1246
1247 1247 return (lgrp);
1248 1248 }
1249 1249
1250 1250 /*
1251 1251 * Interfaces to enable and disable power aware dispatching
1252 1252 * The caller must be holding cpu_lock.
1253 1253 *
1254 1254 * Return 0 on success and -1 on failure.
1255 1255 */
1256 1256 int
1257 1257 cmt_pad_enable(pghw_type_t type)
1258 1258 {
1259 1259 group_t *hwset;
1260 1260 group_iter_t iter;
1261 1261 pg_cmt_t *pg;
1262 1262
1263 1263 ASSERT(PGHW_IS_PM_DOMAIN(type));
1264 1264 ASSERT(MUTEX_HELD(&cpu_lock));
1265 1265
1266 1266 if (cmt_sched_disabled == 1)
1267 1267 return (-1);
1268 1268
1269 1269 if ((hwset = pghw_set_lookup(type)) == NULL ||
1270 1270 cmt_hw_blacklisted[type]) {
1271 1271 /*
1272 1272 * Unable to find any instances of the specified type
1273 1273 * of power domain, or the power domains have been blacklisted.
1274 1274 */
1275 1275 return (-1);
1276 1276 }
1277 1277
1278 1278 /*
1279 1279 * Iterate over the power domains, setting the default dispatcher
1280 1280 * policy for power/performance optimization.
1281 1281 *
1282 1282 * Simply setting the policy isn't enough in the case where the power
1283 1283 * domain is an only child of another PG. Because the dispatcher walks
1284 1284 * the PG hierarchy in a top down fashion, the higher up PG's policy
1285 1285 * will dominate. So promote the power domain above its parent if both
1286 1286 * PG and its parent have the same CPUs to ensure its policy
1287 1287 * dominates.
1288 1288 */
1289 1289 group_iter_init(&iter);
1290 1290 while ((pg = group_iterate(hwset, &iter)) != NULL) {
1291 1291 /*
1292 1292 * If the power domain is an only child to a parent
1293 1293 * not implementing the same policy, promote the child
1294 1294 * above the parent to activate the policy.
1295 1295 */
1296 1296 pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1297 1297 while ((pg->cmt_parent != NULL) &&
1298 1298 (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1299 1299 (PG_NUM_CPUS((pg_t *)pg) ==
1300 1300 PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1301 1301 cmt_hier_promote(pg, NULL);
1302 1302 }
1303 1303 }
1304 1304
1305 1305 return (0);
1306 1306 }
1307 1307
1308 1308 int
1309 1309 cmt_pad_disable(pghw_type_t type)
1310 1310 {
1311 1311 group_t *hwset;
1312 1312 group_iter_t iter;
1313 1313 pg_cmt_t *pg;
1314 1314 pg_cmt_t *child;
1315 1315
1316 1316 ASSERT(PGHW_IS_PM_DOMAIN(type));
1317 1317 ASSERT(MUTEX_HELD(&cpu_lock));
1318 1318
1319 1319 if (cmt_sched_disabled == 1)
1320 1320 return (-1);
1321 1321
1322 1322 if ((hwset = pghw_set_lookup(type)) == NULL) {
1323 1323 /*
1324 1324 * Unable to find any instances of the specified type of
1325 1325 * power domain.
1326 1326 */
1327 1327 return (-1);
1328 1328 }
1329 1329 /*
1330 1330 * Iterate over the power domains, setting the default dispatcher
1331 1331 * policy for performance optimization (load balancing).
1332 1332 */
1333 1333 group_iter_init(&iter);
1334 1334 while ((pg = group_iterate(hwset, &iter)) != NULL) {
1335 1335
1336 1336 /*
1337 1337 * If the power domain has an only child that implements
1338 1338 * policy other than load balancing, promote the child
1339 1339 * above the power domain to ensure its policy dominates.
1340 1340 */
1341 1341 if (pg->cmt_children != NULL &&
1342 1342 GROUP_SIZE(pg->cmt_children) == 1) {
1343 1343 child = GROUP_ACCESS(pg->cmt_children, 0);
1344 1344 if ((child->cmt_policy & CMT_BALANCE) == 0) {
1345 1345 cmt_hier_promote(child, NULL);
1346 1346 }
1347 1347 }
1348 1348 pg->cmt_policy = CMT_BALANCE;
1349 1349 }
1350 1350 return (0);
1351 1351 }
1352 1352
1353 1353 /* ARGSUSED */
1354 1354 static void
1355 1355 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1356 1356 kthread_t *new)
1357 1357 {
1358 1358 pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;
1359 1359
1360 1360 if (old == cp->cpu_idle_thread) {
1361 1361 atomic_add_32(&cmt_pg->cmt_utilization, 1);
1362 1362 } else if (new == cp->cpu_idle_thread) {
1363 1363 atomic_add_32(&cmt_pg->cmt_utilization, -1);
1364 1364 }
1365 1365 }
1366 1366
1367 1367 /*
1368 1368 * Macro to test whether a thread is currently runnable on a CPU in a PG.
1369 1369 */
1370 1370 #define THREAD_RUNNABLE_IN_PG(t, pg) \
1371 1371 ((t)->t_state == TS_RUN && \
1372 1372 (t)->t_disp_queue->disp_cpu && \
1373 1373 bitset_in_set(&(pg)->cmt_cpus_actv_set, \
1374 1374 (t)->t_disp_queue->disp_cpu->cpu_seqid))
1375 1375
1376 1376 static void
1377 1377 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1378 1378 kthread_t *new)
1379 1379 {
1380 1380 pg_cmt_t *cmt = (pg_cmt_t *)pg;
1381 1381 cpupm_domain_t *dom;
1382 1382 uint32_t u;
1383 1383
1384 1384 if (old == cp->cpu_idle_thread) {
1385 1385 ASSERT(new != cp->cpu_idle_thread);
1386 1386 u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1387 1387 if (u == 1) {
1388 1388 /*
1389 1389 * Notify the CPU power manager that the domain
1390 1390 * is non-idle.
1391 1391 */
1392 1392 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1393 1393 cpupm_utilization_event(cp, now, dom,
1394 1394 CPUPM_DOM_BUSY_FROM_IDLE);
1395 1395 }
1396 1396 } else if (new == cp->cpu_idle_thread) {
1397 1397 ASSERT(old != cp->cpu_idle_thread);
1398 1398 u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1399 1399 if (u == 0) {
1400 1400 /*
1401 1401 * The domain is idle, notify the CPU power
1402 1402 * manager.
1403 1403 *
1404 1404 * Avoid notifying if the thread is simply migrating
1405 1405 * between CPUs in the domain.
1406 1406 */
1407 1407 if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1408 1408 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1409 1409 cpupm_utilization_event(cp, now, dom,
1410 1410 CPUPM_DOM_IDLE_FROM_BUSY);
1411 1411 }
1412 1412 }
1413 1413 }
1414 1414 }
1415 1415
1416 1416 /* ARGSUSED */
1417 1417 static void
1418 1418 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1419 1419 {
1420 1420 pg_cmt_t *cmt = (pg_cmt_t *)pg;
1421 1421 cpupm_domain_t *dom;
1422 1422
1423 1423 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1424 1424 cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1425 1425 }
1426 1426
1427 1427 /*
1428 1428 * Return the name of the CMT scheduling policy
1429 1429 * being implemented across this PG
1430 1430 */
1431 1431 static char *
1432 1432 pg_cmt_policy_name(pg_t *pg)
1433 1433 {
1434 1434 pg_cmt_policy_t policy;
1435 1435
1436 1436 policy = ((pg_cmt_t *)pg)->cmt_policy;
1437 1437
1438 1438 if (policy & CMT_AFFINITY) {
1439 1439 if (policy & CMT_BALANCE)
1440 1440 return ("Load Balancing & Affinity");
1441 1441 else if (policy & CMT_COALESCE)
1442 1442 return ("Load Coalescence & Affinity");
1443 1443 else
1444 1444 return ("Affinity");
1445 1445 } else {
1446 1446 if (policy & CMT_BALANCE)
1447 1447 return ("Load Balancing");
1448 1448 else if (policy & CMT_COALESCE)
1449 1449 return ("Load Coalescence");
1450 1450 else
1451 1451 return ("None");
1452 1452 }
1453 1453 }
1454 1454
1455 1455 /*
1456 1456 * Prune PG, and all other instances of PG's hardware sharing relationship
1457 1457 * from the CMT PG hierarchy.
1458 1458 *
1459 1459 * This routine operates on the CPU specific processor group data (for the CPUs
1460 1460 * in the PG being pruned), and may be invoked from a context where one CPU's
1461 1461 * PG data is under construction. In this case the argument "pgdata", if not
1462 1462 * NULL, is a reference to the CPU's under-construction PG data.
1463 1463 */
1464 1464 static int
1465 1465 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1466 1466 {
1467 1467 group_t *hwset, *children;
1468 1468 int i, j, r, size = *sz;
1469 1469 group_iter_t hw_iter, child_iter;
1470 1470 pg_cpu_itr_t cpu_iter;
1471 1471 pg_cmt_t *pg, *child;
1472 1472 cpu_t *cpu;
1473 1473 int cap_needed;
1474 1474 pghw_type_t hw;
1475 1475
1476 1476 ASSERT(MUTEX_HELD(&cpu_lock));
1477 1477
1478 1478 /*
1479 1479 * Inform pghw layer that this PG is pruned.
1480 1480 */
1481 1481 pghw_cmt_fini((pghw_t *)pg_bad);
1482 1482
1483 1483 hw = ((pghw_t *)pg_bad)->pghw_hw;
1484 1484
1485 1485 if (hw == PGHW_POW_ACTIVE) {
1486 1486 cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1487 1487 "Event Based CPUPM Unavailable");
1488 1488 } else if (hw == PGHW_POW_IDLE) {
1489 1489 cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1490 1490 "Dispatcher assisted CPUPM disabled.");
1491 1491 }
1492 1492
1493 1493 /*
1494 1494 * Find and eliminate the PG from the lineage.
1495 1495 */
1496 1496 for (i = 0; i < size; i++) {
1497 1497 if (lineage[i] == pg_bad) {
1498 1498 for (j = i; j < size - 1; j++)
1499 1499 lineage[j] = lineage[j + 1];
1500 1500 *sz = size - 1;
1501 1501 break;
1502 1502 }
1503 1503 }
1504 1504
1505 1505 /*
1506 1506 * We'll prune all instances of the hardware sharing relationship
1507 1507 * represented by pg. But before we do that (and pause CPUs) we need
1508 1508 * to ensure the hierarchy's groups are properly sized.
1509 1509 */
1510 1510 hwset = pghw_set_lookup(hw);
1511 1511
1512 1512 /*
1513 1513 * Blacklist the hardware so future processor groups of this type won't
1514 1514 * participate in CMT thread placement.
1515 1515 *
1516 1516 * XXX
1517 1517 * For heterogeneous system configurations, this might be overkill.
1518 1518 * We may only need to blacklist the illegal PGs, and other instances
1519 1519 * of this hardware sharing relationship may be ok.
1520 1520 */
1521 1521 cmt_hw_blacklisted[hw] = 1;
1522 1522
1523 1523 /*
1524 1524 * For each of the PGs being pruned, ensure sufficient capacity in
1525 1525 * the siblings set for the PG's children
1526 1526 */
1527 1527 group_iter_init(&hw_iter);
1528 1528 while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1529 1529 /*
1530 1530 * PG is being pruned, but if it is bringing up more than
1531 1531 * one child, ask for more capacity in the siblings group.
1532 1532 */
1533 1533 cap_needed = 0;
1534 1534 if (pg->cmt_children &&
1535 1535 GROUP_SIZE(pg->cmt_children) > 1) {
1536 1536 cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1537 1537
1538 1538 group_expand(pg->cmt_siblings,
1539 1539 GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1540 1540
1541 1541 /*
1542 1542 * If this is a top level group, also ensure the
1543 1543 * capacity in the root lgrp level CMT grouping.
1544 1544 */
1545 1545 if (pg->cmt_parent == NULL &&
1546 1546 pg->cmt_siblings != &cmt_root->cl_pgs) {
1547 1547 group_expand(&cmt_root->cl_pgs,
1548 1548 GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1549 1549 cmt_root->cl_npgs += cap_needed;
1550 1550 }
1551 1551 }
1552 1552 }
1553 1553
1554 1554 /*
1555 1555 * We're operating on the PG hierarchy. Pause CPUs to ensure
1556 1556 * exclusivity with respect to the dispatcher.
1557 1557 */
1558 - pause_cpus(NULL);
1558 + pause_cpus(NULL, NULL);
1559 1559
1560 1560 /*
1561 1561 * Prune all PG instances of the hardware sharing relationship
1562 1562 * represented by pg.
1563 1563 */
1564 1564 group_iter_init(&hw_iter);
1565 1565 while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1566 1566
1567 1567 /*
1568 1568 * Remove PG from its group of siblings, if it's there.
1569 1569 */
1570 1570 if (pg->cmt_siblings) {
1571 1571 (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1572 1572 }
1573 1573 if (pg->cmt_parent == NULL &&
1574 1574 pg->cmt_siblings != &cmt_root->cl_pgs) {
1575 1575 (void) group_remove(&cmt_root->cl_pgs, pg,
1576 1576 GRP_NORESIZE);
1577 1577 }
1578 1578
1579 1579 /*
1580 1580 * Indicate that no CMT policy will be implemented across
1581 1581 * this PG.
1582 1582 */
1583 1583 pg->cmt_policy = CMT_NO_POLICY;
1584 1584
1585 1585 /*
1586 1586 * Move PG's children from its children set to its parent's
1587 1587 * children set. Note that the parent's children set, and PG's
1588 1588 * siblings set are the same thing.
1589 1589 *
1590 1590 * Because we are iterating over the same group that we are
1591 1591 * operating on (removing the children), first add all of PG's
1592 1592 * children to the parent's children set, and once we are done
1593 1593 * iterating, empty PG's children set.
1594 1594 */
1595 1595 if (pg->cmt_children != NULL) {
1596 1596 children = pg->cmt_children;
1597 1597
1598 1598 group_iter_init(&child_iter);
1599 1599 while ((child = group_iterate(children, &child_iter))
1600 1600 != NULL) {
1601 1601 if (pg->cmt_siblings != NULL) {
1602 1602 r = group_add(pg->cmt_siblings, child,
1603 1603 GRP_NORESIZE);
1604 1604 ASSERT(r == 0);
1605 1605
1606 1606 if (pg->cmt_parent == NULL &&
1607 1607 pg->cmt_siblings !=
1608 1608 &cmt_root->cl_pgs) {
1609 1609 r = group_add(&cmt_root->cl_pgs,
1610 1610 child, GRP_NORESIZE);
1611 1611 ASSERT(r == 0);
1612 1612 }
1613 1613 }
1614 1614 }
1615 1615 group_empty(pg->cmt_children);
1616 1616 }
1617 1617
1618 1618 /*
1619 1619 * Reset the callbacks to the defaults
1620 1620 */
1621 1621 pg_callback_set_defaults((pg_t *)pg);
1622 1622
1623 1623 /*
1624 1624 * Update all the CPU lineages in each of PG's CPUs
1625 1625 */
1626 1626 PG_CPU_ITR_INIT(pg, cpu_iter);
1627 1627 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1628 1628 pg_cmt_t *cpu_pg;
1629 1629 group_iter_t liter; /* Iterator for the lineage */
1630 1630 cpu_pg_t *cpd; /* CPU's PG data */
1631 1631
1632 1632 /*
1633 1633 * The CPU whose lineage is under construction still
1634 1634 * references the bootstrap CPU PG data structure.
1635 1635 */
1636 1636 if (pg_cpu_is_bootstrapped(cpu))
1637 1637 cpd = pgdata;
1638 1638 else
1639 1639 cpd = cpu->cpu_pg;
1640 1640
1641 1641 /*
1642 1642 * Iterate over the CPU's PGs updating the children
1643 1643 * of the PG being pruned, since they have a new
1644 1644 * parent and siblings set.
1645 1645 */
1646 1646 group_iter_init(&liter);
1647 1647 while ((cpu_pg = group_iterate(&cpd->pgs,
1648 1648 &liter)) != NULL) {
1649 1649 if (cpu_pg->cmt_parent == pg) {
1650 1650 cpu_pg->cmt_parent = pg->cmt_parent;
1651 1651 cpu_pg->cmt_siblings = pg->cmt_siblings;
1652 1652 }
1653 1653 }
1654 1654
1655 1655 /*
1656 1656 * Update the CPU's lineages
1657 1657 *
1658 1658 * Remove the PG from the CPU's group used for CMT
1659 1659 * scheduling.
1660 1660 */
1661 1661 (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
1662 1662 }
1663 1663 }
1664 1664 start_cpus();
1665 1665 return (0);
1666 1666 }
1667 1667
1668 1668 /*
1669 1669 * Disable CMT scheduling
1670 1670 */
1671 1671 static void
1672 1672 pg_cmt_disable(void)
1673 1673 {
1674 1674 cpu_t *cpu;
1675 1675
1676 1676 ASSERT(MUTEX_HELD(&cpu_lock));
1677 1677
1678 - pause_cpus(NULL);
1678 + pause_cpus(NULL, NULL);
1679 1679 cpu = cpu_list;
1680 1680
1681 1681 do {
1682 1682 if (cpu->cpu_pg)
1683 1683 group_empty(&cpu->cpu_pg->cmt_pgs);
1684 1684 } while ((cpu = cpu->cpu_next) != cpu_list);
1685 1685
1686 1686 cmt_sched_disabled = 1;
1687 1687 start_cpus();
1688 1688 cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1689 1689 }
1690 1690
1691 1691 /*
1692 1692 * CMT lineage validation
1693 1693 *
1694 1694 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
1695 1695 * of the PGs in a CPU's lineage. This is necessary because it's possible that
1696 1696 * some groupings (power domain groupings in particular) may be defined by
1697 1697 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
1698 1698 * possible to integrate those groupings into the CMT PG hierarchy, if doing
1699 1699 * so would violate the subset invariant of the hierarchy, which says that
1700 1700 * a PG must be subset of its parent (if it has one).
1701 1701 *
1702 1702 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
1703 1703 * would result in a violation of this invariant. If a violation is found,
1704 1704 * and the PG is of a grouping type whose definition is known to originate from
1705 1705 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
1706 1706 * PG (and all other instances of PG's sharing relationship type) from the CMT
1707 1707 * hierarchy. Further, future instances of that sharing relationship type won't
1708 1708 * be added. If the grouping definition doesn't originate from suspect
1709 1709 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
1710 1710 * CMT scheduling altogether.
1711 1711 *
1712 1712 * This routine is invoked after the CPU has been added to the PGs in which
1713 1713 * it belongs, but before those PGs have been added to (or had their place
1714 1714 * adjusted in) the CMT PG hierarchy.
1715 1715 *
1716 1716 * The first argument is the CPU's PG lineage (essentially an array of PGs in
1717 1717 * which the CPU belongs) that has already been sorted in ascending order
1718 1718 * by CPU count. Some of the PGs in the CPU's lineage may already have other
1719 1719 * CPUs in them, and have already been integrated into the CMT hierarchy.
1720 1720 *
1721 1721 * The addition of this new CPU to these pre-existing PGs means that those
1722 1722 * PGs may need to be promoted up in the hierarchy to satisfy the subset
1723 1723 * invariant. In addition to testing the subset invariant for the lineage,
1724 1724 * this routine also verifies that the addition of the new CPU to the
1725 1725 * existing PGs wouldn't cause the subset invariant to be violated in
1726 1726 * the existing lineages.
1727 1727 *
1728 1728 * This routine will normally return one of the following:
1729 1729 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
1730 1730 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
1731 1731 *
1732 1732 * Otherwise, this routine will return a value indicating which error it
1733 1733 * was unable to recover from (and set cmt_lineage_status along the way).
1734 1734 *
1735 1735 * This routine operates on the CPU specific processor group data (for the CPU
1736 1736 * whose lineage is being validated), which is under-construction.
1737 1737 * "pgdata" is a reference to the CPU's under-construction PG data.
1738 1738 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
1739 1739 */
1740 1740 static cmt_lineage_validation_t
1741 1741 pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1742 1742 {
1743 1743 int i, j, size;
1744 1744 pg_cmt_t *pg, *pg_next, *pg_bad, *pg_tmp, *parent;
1745 1745 cpu_t *cp;
1746 1746 pg_cpu_itr_t cpu_iter;
1747 1747 lgrp_handle_t lgrp;
1748 1748
1749 1749 ASSERT(MUTEX_HELD(&cpu_lock));
1750 1750
1751 1751 revalidate:
1752 1752 size = *sz;
1753 1753 pg_bad = NULL;
1754 1754 lgrp = LGRP_NULL_HANDLE;
1755 1755 for (i = 0; i < size; i++) {
1756 1756
1757 1757 pg = lineage[i];
1758 1758 if (i < size - 1)
1759 1759 pg_next = lineage[i + 1];
1760 1760 else
1761 1761 pg_next = NULL;
1762 1762
1763 1763 /*
1764 1764 * We assume that the lineage has already been sorted
1765 1765 * by the number of CPUs. In fact, we depend on it.
1766 1766 */
1767 1767 ASSERT(pg_next == NULL ||
1768 1768 (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
1769 1769
1770 1770 /*
1771 1771 			 * The CPU's PG lineage was passed as the first argument to
1772 1772 * this routine and contains the sorted list of the CPU's
1773 1773 * PGs. Ultimately, the ordering of the PGs in that list, and
1774 1774 * the ordering as traversed by the cmt_parent list must be
1775 1775 * the same. PG promotion will be used as the mechanism to
1776 1776 * achieve this, but first we need to look for cases where
1777 1777 			 * promotion will be necessary, and validate that it will be
1778 1778 			 * possible without violating the subset invariant described
1779 1779 * above.
1780 1780 *
1781 1781 * Since the PG topology is in the middle of being changed, we
1782 1782 * need to check whether the PG's existing parent (if any) is
1783 1783 * part of this CPU's lineage (and therefore should contain
1784 1784 * the new CPU). If not, it means that the addition of the
1785 1785 * new CPU should have made this PG have more CPUs than its
1786 1786 * parent (and other ancestors not in the same lineage) and
1787 1787 * will need to be promoted into place.
1788 1788 *
1789 1789 * We need to verify all of this to defend against a buggy
1790 1790 * BIOS giving bad power domain CPU groupings. Sigh.
1791 1791 */
1792 1792 parent = pg->cmt_parent;
1793 1793 while (parent != NULL) {
1794 1794 /*
1795 1795 * Determine if the parent/ancestor is in this lineage
1796 1796 */
1797 1797 pg_tmp = NULL;
1798 1798 for (j = 0; (j < size) && (pg_tmp != parent); j++) {
1799 1799 pg_tmp = lineage[j];
1800 1800 }
1801 1801 if (pg_tmp == parent) {
1802 1802 /*
1803 1803 * It's in the lineage. The concentricity
1804 1804 * checks will handle the rest.
1805 1805 */
1806 1806 break;
1807 1807 }
1808 1808 /*
1809 1809 * If it is not in the lineage, PG will eventually
1810 1810 * need to be promoted above it. Verify the ancestor
1811 1811 * is a proper subset. There is still an error if
1812 1812 * the ancestor has the same number of CPUs as PG,
1813 1813 * since that would imply it should be in the lineage,
1814 1814 * and we already know it isn't.
1815 1815 */
1816 1816 if (PG_NUM_CPUS((pg_t *)parent) >=
1817 1817 PG_NUM_CPUS((pg_t *)pg)) {
1818 1818 /*
1819 1819 * Not a proper subset if the parent/ancestor
1820 1820 * has the same or more CPUs than PG.
1821 1821 */
1822 1822 cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
1823 1823 goto handle_error;
1824 1824 }
1825 1825 parent = parent->cmt_parent;
1826 1826 }
1827 1827
1828 1828 /*
1829 1829 		 * Walk each of the CPUs in the PG's group and perform
1830 1830 * consistency checks along the way.
1831 1831 */
1832 1832 PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
1833 1833 while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1834 1834 /*
1835 1835 * Verify that there aren't any CPUs contained in PG
1836 1836 * that the next PG in the lineage (which is larger
1837 1837 * or same size) doesn't also contain.
1838 1838 */
1839 1839 if (pg_next != NULL &&
1840 1840 pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
1841 1841 cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
1842 1842 goto handle_error;
1843 1843 }
1844 1844
1845 1845 /*
1846 1846 * Verify that all the CPUs in the PG are in the same
1847 1847 * lgroup.
1848 1848 */
1849 1849 if (lgrp == LGRP_NULL_HANDLE) {
1850 1850 lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
1851 1851 } else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
1852 1852 cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
1853 1853 goto handle_error;
1854 1854 }
1855 1855 }
1856 1856 }
1857 1857
1858 1858 handle_error:
1859 1859 /*
1860 1860 * Some of these validation errors can result when the CPU grouping
1861 1861 * information is derived from buggy sources (for example, incorrect
1862 1862 * ACPI tables on x86 systems).
1863 1863 *
1864 1864 * We'll try to recover in such cases by pruning out the illegal
1865 1865 * groupings from the PG hierarchy, which means that we won't optimize
1866 1866 * for those levels, but we will for the remaining ones.
1867 1867 */
1868 1868 switch (cmt_lineage_status) {
1869 1869 case CMT_LINEAGE_VALID:
1870 1870 case CMT_LINEAGE_REPAIRED:
1871 1871 break;
1872 1872 case CMT_LINEAGE_PG_SPANS_LGRPS:
1873 1873 /*
1874 1874 * We've detected a PG whose CPUs span lgroups.
1875 1875 *
1876 1876 		 * This isn't supported, as the dispatcher isn't allowed
1877 1877 		 * to do CMT thread placement across lgroups, as this would
1878 1878 * conflict with policies implementing MPO thread affinity.
1879 1879 *
1880 1880 * If the PG is of a sharing relationship type known to
1881 1881 * legitimately span lgroups, specify that no CMT thread
1882 1882 * placement policy should be implemented, and prune the PG
1883 1883 * from the existing CMT PG hierarchy.
1884 1884 *
1885 1885 		 * Otherwise, fall through to the case below for handling.
1886 1886 */
1887 1887 if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
1888 1888 if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1889 1889 cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1890 1890 goto revalidate;
1891 1891 }
1892 1892 }
1893 1893 /*LINTED*/
1894 1894 case CMT_LINEAGE_NON_PROMOTABLE:
1895 1895 /*
1896 1896 * We've detected a PG that already exists in another CPU's
1897 1897 		 * lineage that cannot legally be promoted into place
1898 1898 * without breaking the invariants of the hierarchy.
1899 1899 */
1900 1900 if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1901 1901 if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1902 1902 cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1903 1903 goto revalidate;
1904 1904 }
1905 1905 }
1906 1906 /*
1907 1907 * Something went wrong trying to prune out the bad level.
1908 1908 * Disable CMT scheduling altogether.
1909 1909 */
1910 1910 pg_cmt_disable();
1911 1911 break;
1912 1912 case CMT_LINEAGE_NON_CONCENTRIC:
1913 1913 /*
1914 1914 * We've detected a non-concentric PG lineage, which means that
1915 1915 * there's a PG in the lineage that has CPUs that the next PG
1916 1916 * over in the lineage (which is the same size or larger)
1917 1917 * doesn't have.
1918 1918 *
1919 1919 * In this case, we examine the two PGs to see if either
1920 1920 * grouping is defined by potentially buggy sources.
1921 1921 *
1922 1922 		 * If one has fewer CPUs than the other, and contains CPUs
1923 1923 * not found in the parent, and it is an untrusted enumeration,
1924 1924 * then prune it. If both have the same number of CPUs, then
1925 1925 * prune the one that is untrusted.
1926 1926 *
1927 1927 * This process repeats until we have a concentric lineage,
1928 1928 		 * or we would have to prune out a level derived from what we
1929 1929 * thought was a reliable source, in which case CMT scheduling
1930 1930 * is disabled altogether.
1931 1931 */
1932 1932 if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
1933 1933 (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
1934 1934 pg_bad = pg;
1935 1935 } else if (PG_NUM_CPUS((pg_t *)pg) ==
1936 1936 PG_NUM_CPUS((pg_t *)pg_next)) {
1937 1937 if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
1938 1938 pg_bad = pg_next;
1939 1939 } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1940 1940 pg_bad = pg;
1941 1941 }
1942 1942 }
1943 1943 if (pg_bad) {
1944 1944 if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
1945 1945 cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1946 1946 goto revalidate;
1947 1947 }
1948 1948 }
1949 1949 /*
1950 1950 * Something went wrong trying to identify and/or prune out
1951 1951 * the bad level. Disable CMT scheduling altogether.
1952 1952 */
1953 1953 pg_cmt_disable();
1954 1954 break;
1955 1955 default:
1956 1956 /*
1957 1957 * If we're here, we've encountered a validation error for
1958 1958 * which we don't know how to recover. In this case, disable
1959 1959 * CMT scheduling altogether.
1960 1960 */
1961 1961 cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1962 1962 pg_cmt_disable();
1963 1963 }
1964 1964 return (cmt_lineage_status);
1965 1965 }
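As a reviewer aid, the standalone sketch below models the two properties pg_cmt_lineage_validate() enforces on a sorted lineage: every PG's CPU set must be contained in the next (equal-or-larger) PG's set, and a parent a PG will be promoted above must be a strict superset of it. The struct pg bitmask representation, the example "core/cache/chip/power" topology, and the lineage_is_concentric() helper are hypothetical and exist only to illustrate the checks; the kernel's actual tests use PG_NUM_CPUS(), pg_cpu_next(), and pg_cpu_find() as shown in the diff above.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical PG: a label plus a bitmask of member CPU ids. */
struct pg {
	const char	*name;
	uint64_t	cpus;
};

/*
 * A lineage (sorted by CPU count) is concentric if each PG's CPU set
 * is contained in the next PG's set.
 */
static int
lineage_is_concentric(const struct pg *lineage, int n)
{
	int	i;

	for (i = 0; i < n - 1; i++) {
		if ((lineage[i].cpus & ~lineage[i + 1].cpus) != 0)
			return (0);	/* CPU in pg but not in pg_next */
	}
	return (1);
}

int
main(void)
{
	/* Sorted by CPU count, as pg_cmt_lineage_validate() expects. */
	struct pg good[] = {
		{ "core",  0x3 },	/* CPUs 0-1 */
		{ "cache", 0xf },	/* CPUs 0-3 */
		{ "chip",  0xff },	/* CPUs 0-7 */
	};
	/* A bogus (e.g. BIOS-derived) power domain splitting the cache PG. */
	struct pg bad[] = {
		{ "core",  0x3 },	/* CPUs 0-1 */
		{ "power", 0x3c },	/* CPUs 2-5: not a superset of core */
		{ "chip",  0xff },	/* CPUs 0-7 */
	};

	(void) printf("good lineage concentric: %d\n",
	    lineage_is_concentric(good, 3));
	(void) printf("bad lineage concentric:  %d\n",
	    lineage_is_concentric(bad, 3));
	return (0);
}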
... 277 lines elided ...