/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                    |
 * ----------------------------
 * | CMT class specific data   |
 * | - hierarchy linkage       |
 * | - CMT load balancing data |
 * | - active CPU group/bitset |
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup...but for root homed threads,
 * is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
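 *
 * As a purely illustrative, hypothetical example (the levels actually
 * present depend on the sharing relationships the platform enumerates via
 * pg_plat_hw_shared()), a two socket system whose cores share a per-chip
 * cache might yield per-CPU lineages such as:
 *
 *	chip PG (socket)		<-- top level PG in the CPU's lgroup
 *	   |
 *	cache PG (shared cache)
 *	   |
 *	core PG (shared pipeline)	<-- leaf PG, referenced by cmt_lineage
 *
 * with the two chip PGs being siblings in their respective leaf lgroup's
 * group of top level PGs, and both appearing in the root lgroup's group.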
 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_PROCNODE:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY | CMT_BALANCE);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there aren't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
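 *
 * As a hypothetical illustration of why promotion is needed: if a
 * PGHW_POW_ACTIVE power domain PG is an only child of a PGHW_CHIP PG
 * covering exactly the same CPUs, promoting the power domain above the
 * chip PG lets its power aware policy dominate the dispatcher's top down
 * traversal (see cmt_pad_enable() below, which relies on this).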
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references the CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc, sz;
	int		start, end;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find the various contiguous sets of elements in the
		 * array with the same number of CPUs
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled == 1)
		return (-1);

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled == 1)
		return (-1);

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction.
 * In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Inform pghw layer that this PG is pruned.
	 */
	pghw_cmt_fini((pghw_t *)pg_bad);

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs).
 * In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the CMT
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPU's PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs.
		 * Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that it will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}