1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/types.h>
  26 #include <sys/systm.h>
  27 #include <sys/cmn_err.h>
  28 #include <sys/cpuvar.h>
  29 #include <sys/thread.h>
  30 #include <sys/disp.h>
  31 #include <sys/kmem.h>
  32 #include <sys/debug.h>
  33 #include <sys/cpupart.h>
  34 #include <sys/pset.h>
  35 #include <sys/var.h>
  36 #include <sys/cyclic.h>
  37 #include <sys/lgrp.h>
  38 #include <sys/pghw.h>
  39 #include <sys/loadavg.h>
  40 #include <sys/class.h>
  41 #include <sys/fss.h>
  42 #include <sys/pool.h>
  43 #include <sys/pool_pset.h>
  44 #include <sys/policy.h>
  45 
  46 /*
  47  * Calling pool_lock() protects the pools configuration, which includes
  48  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
  49  * partitions from being created or destroyed while the lock is held.
  50  * The lock ordering with respect to related locks is:
  51  *
   52  *    pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
  53  *
  54  * Blocking memory allocations may be made while holding "pool_lock"
  55  * or cpu_lock.
  56  */
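      /*
       * For example, a path that needs all of these locks (as callers of
       * cpupart_bind_thread() do) would acquire them in exactly that order:
       *
       *    pool_lock();
       *    mutex_enter(&cpu_lock);
       *    mutex_enter(&pidlock);
       *    mutex_enter(&p->p_lock);
       */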
  57 
  58 /*
   59  * The cp_default partition is allocated statically, but its lgroup load
   60  * average (lpl) list is allocated dynamically after the kmem subsystem is
   61  * initialized.  This saves some memory since the space allocated reflects
   62  * the actual number of lgroups supported by the platform.  The lgrp facility
   63  * provides temporary space to hold lpl information during system bootstrap.
  64  */
  65 
  66 cpupart_t               *cp_list_head;
  67 cpupart_t               cp_default;
  68 static cpupartid_t      cp_id_next;
  69 uint_t                  cp_numparts;
  70 uint_t                  cp_numparts_nonempty;
  71 
  72 /*
  73  * Need to limit total number of partitions to avoid slowing down the
  74  * clock code too much.  The clock code traverses the list of
  75  * partitions and needs to be able to execute in a reasonable amount
  76  * of time (less than 1/hz seconds).  The maximum is sized based on
  77  * max_ncpus so it shouldn't be a problem unless there are large
  78  * numbers of empty partitions.
  79  */
  80 static uint_t           cp_max_numparts;
  81 
  82 /*
  83  * Processor sets and CPU partitions are different but related concepts.
  84  * A processor set is a user-level abstraction allowing users to create
  85  * sets of CPUs and bind threads exclusively to those sets.  A CPU
  86  * partition is a kernel dispatcher object consisting of a set of CPUs
  87  * and a global dispatch queue.  The processor set abstraction is
  88  * implemented via a CPU partition, and currently there is a 1-1
  89  * mapping between processor sets and partitions (excluding the default
  90  * partition, which is not visible as a processor set).  Hence, the
  91  * numbering for processor sets and CPU partitions is identical.  This
  92  * may not always be true in the future, and these macros could become
  93  * less trivial if we support e.g. a processor set containing multiple
  94  * CPU partitions.
  95  */
  96 #define PSTOCP(psid)    ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
  97 #define CPTOPS(cpid)    ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
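      /*
       * For example, PSTOCP(PS_NONE) yields CP_DEFAULT and CPTOPS(CP_DEFAULT)
       * yields PS_NONE; every other ID currently maps to itself.
       */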
  98 
  99 static int cpupart_unbind_threads(cpupart_t *, boolean_t);
 100 
 101 /*
 102  * Find a CPU partition given a processor set ID.
 103  */
 104 static cpupart_t *
 105 cpupart_find_all(psetid_t psid)
 106 {
 107         cpupart_t *cp;
 108         cpupartid_t cpid = PSTOCP(psid);
 109 
 110         ASSERT(MUTEX_HELD(&cpu_lock));
 111 
 112         /* default partition not visible as a processor set */
 113         if (psid == CP_DEFAULT)
 114                 return (NULL);
 115 
 116         if (psid == PS_MYID)
 117                 return (curthread->t_cpupart);
 118 
 119         cp = cp_list_head;
 120         do {
 121                 if (cp->cp_id == cpid)
 122                         return (cp);
 123                 cp = cp->cp_next;
 124         } while (cp != cp_list_head);
 125         return (NULL);
 126 }
 127 
 128 /*
 129  * Find a CPU partition given a processor set ID if the processor set
 130  * should be visible from the calling zone.
 131  */
 132 cpupart_t *
 133 cpupart_find(psetid_t psid)
 134 {
 135         cpupart_t *cp;
 136 
 137         ASSERT(MUTEX_HELD(&cpu_lock));
 138         cp = cpupart_find_all(psid);
 139         if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
 140             zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
  141                 return (NULL);
 142         return (cp);
 143 }
 144 
 145 static int
 146 cpupart_kstat_update(kstat_t *ksp, int rw)
 147 {
 148         cpupart_t *cp = (cpupart_t *)ksp->ks_private;
 149         cpupart_kstat_t *cpksp = ksp->ks_data;
 150 
 151         if (rw == KSTAT_WRITE)
 152                 return (EACCES);
 153 
 154         cpksp->cpk_updates.value.ui64 = cp->cp_updates;
 155         cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
 156         cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
 157         cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
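              /*
               * cp_hp_avenrun[] holds high-precision load averages (16-bit
               * fraction); shift down to the FSHIFT fixed-point format used
               * by load average consumers.
               */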
 158         cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
 159             (16 - FSHIFT);
 160         cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
 161             (16 - FSHIFT);
 162         cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
 163             (16 - FSHIFT);
 164         return (0);
 165 }
 166 
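      /*
       * Create and install the "pset" kstat for the given partition.  The
       * kstat for cp_default is visible from all zones; kstats for other
       * partitions are created in the global zone once pools are enabled.
       */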
 167 static void
 168 cpupart_kstat_create(cpupart_t *cp)
 169 {
 170         kstat_t *ksp;
 171         zoneid_t zoneid;
 172 
 173         ASSERT(MUTEX_HELD(&cpu_lock));
 174 
 175         /*
 176          * We have a bit of a chicken-egg problem since this code will
 177          * get called to create the kstats for CP_DEFAULT before the
 178          * pools framework gets initialized.  We circumvent the problem
 179          * by special-casing cp_default.
 180          */
 181         if (cp != &cp_default && pool_pset_enabled())
 182                 zoneid = GLOBAL_ZONEID;
 183         else
 184                 zoneid = ALL_ZONES;
 185         ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
 186             KSTAT_TYPE_NAMED,
 187             sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
 188         if (ksp != NULL) {
 189                 cpupart_kstat_t *cpksp = ksp->ks_data;
 190 
 191                 kstat_named_init(&cpksp->cpk_updates, "updates",
 192                     KSTAT_DATA_UINT64);
 193                 kstat_named_init(&cpksp->cpk_runnable, "runnable",
 194                     KSTAT_DATA_UINT64);
 195                 kstat_named_init(&cpksp->cpk_waiting, "waiting",
 196                     KSTAT_DATA_UINT64);
 197                 kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
 198                     KSTAT_DATA_UINT32);
 199                 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
 200                     KSTAT_DATA_UINT32);
 201                 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
 202                     KSTAT_DATA_UINT32);
 203                 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
 204                     KSTAT_DATA_UINT32);
 205 
 206                 ksp->ks_update = cpupart_kstat_update;
 207                 ksp->ks_private = cp;
 208 
 209                 kstat_install(ksp);
 210         }
 211         cp->cp_kstat = ksp;
 212 }
 213 
 214 /*
  215  * Initialize the cpupart's lgrp partitions (lpls)
 216  */
 217 static void
 218 cpupart_lpl_initialize(cpupart_t *cp)
 219 {
 220         int i, sz;
 221 
 222         sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
 223         cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
 224 
 225         for (i = 0; i < sz; i++) {
 226                 /*
 227                  * The last entry of the lpl's resource set is always NULL
 228                  * by design (to facilitate iteration)...hence the "oversizing"
 229                  * by 1.
 230                  */
 231                 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
 232                 cp->cp_lgrploads[i].lpl_rset =
 233                     kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
 234                 cp->cp_lgrploads[i].lpl_id2rset =
 235                     kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
 236                 cp->cp_lgrploads[i].lpl_lgrpid = i;
 237         }
 238 }
 239 
 240 /*
  241  * Tear down the cpupart's lgrp partitions (lpls)
 242  */
 243 static void
 244 cpupart_lpl_teardown(cpupart_t *cp)
 245 {
 246         int i, sz;
 247         lpl_t *lpl;
 248 
 249         for (i = 0; i < cp->cp_nlgrploads; i++) {
 250                 lpl = &cp->cp_lgrploads[i];
 251 
 252                 sz = lpl->lpl_rset_sz;
 253                 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
 254                 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
 255                 lpl->lpl_rset = NULL;
 256                 lpl->lpl_id2rset = NULL;
 257         }
 258         kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
 259         cp->cp_lgrploads = NULL;
 260 }
 261 
 262 /*
 263  * Initialize the default partition and kpreempt disp queue.
 264  */
 265 void
 266 cpupart_initialize_default(void)
 267 {
 268         lgrp_id_t i;
 269 
 270         cp_list_head = &cp_default;
 271         cp_default.cp_next = &cp_default;
 272         cp_default.cp_prev = &cp_default;
 273         cp_default.cp_id = CP_DEFAULT;
 274         cp_default.cp_kp_queue.disp_maxrunpri = -1;
 275         cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
 276         cp_default.cp_kp_queue.disp_cpu = NULL;
 277         cp_default.cp_gen = 0;
 278         cp_default.cp_loadavg.lg_cur = 0;
 279         cp_default.cp_loadavg.lg_len = 0;
 280         cp_default.cp_loadavg.lg_total = 0;
 281         for (i = 0; i < S_LOADAVG_SZ; i++) {
 282                 cp_default.cp_loadavg.lg_loads[i] = 0;
 283         }
 284         DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
 285         cp_id_next = CP_DEFAULT + 1;
 286         cpupart_kstat_create(&cp_default);
 287         cp_numparts = 1;
 288         if (cp_max_numparts == 0)       /* allow for /etc/system tuning */
 289                 cp_max_numparts = max_ncpus * 2 + 1;
 290         /*
 291          * Allocate space for cp_default list of lgrploads
 292          */
 293         cpupart_lpl_initialize(&cp_default);
 294 
 295         /*
 296          * The initial lpl topology is created in a special lpl list
 297          * lpl_bootstrap. It should be copied to cp_default.
 298          * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
 299          *       to the correct lpl in the cp_default.cp_lgrploads list.
 300          */
 301         lpl_topo_bootstrap(cp_default.cp_lgrploads,
 302             cp_default.cp_nlgrploads);
 303 
 304 
 305         cp_default.cp_attr = PSET_NOESCAPE;
 306         cp_numparts_nonempty = 1;
 307         /*
 308          * Set t0's home
 309          */
 310         t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
 311 
 312         bitset_init(&cp_default.cp_cmt_pgs);
 313         bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
 314 
 315         bitset_resize(&cp_default.cp_haltset, max_ncpus);
 316 }
 317 
 318 
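      /*
       * Move CPU cp from its current partition to newpp.  This unbinds or
       * waits for bound threads as needed (hard-bound threads are unbound
       * only when "forced" is set), rehomes threads whose lgroup or last
       * CPU is affected, and notifies the cyclic subsystem before and after
       * the move.  Returns 0 on success or an errno value (e.g. EBUSY) if
       * the CPU cannot be moved.
       */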
 319 static int
 320 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
 321 {
 322         cpupart_t *oldpp;
 323         cpu_t   *ncp, *newlist;
 324         kthread_t *t;
 325         int     move_threads = 1;
 326         lgrp_id_t lgrpid;
 327         proc_t  *p;
 328         int lgrp_diff_lpl;
 329         lpl_t   *cpu_lpl;
 330         int     ret;
 331         boolean_t unbind_all_threads = (forced != 0);
 332 
 333         ASSERT(MUTEX_HELD(&cpu_lock));
 334         ASSERT(newpp != NULL);
 335 
 336         oldpp = cp->cpu_part;
 337         ASSERT(oldpp != NULL);
 338         ASSERT(oldpp->cp_ncpus > 0);
 339 
 340         if (newpp == oldpp) {
 341                 /*
 342                  * Don't need to do anything.
 343                  */
 344                 return (0);
 345         }
 346 
 347         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
 348 
 349         if (!disp_bound_partition(cp, 0)) {
 350                 /*
 351                  * Don't need to move threads if there are no threads in
 352                  * the partition.  Note that threads can't enter the
 353                  * partition while we're holding cpu_lock.
 354                  */
 355                 move_threads = 0;
 356         } else if (oldpp->cp_ncpus == 1) {
 357                 /*
 358                  * The last CPU is removed from a partition which has threads
 359                  * running in it. Some of these threads may be bound to this
 360                  * CPU.
 361                  *
 362                  * Attempt to unbind threads from the CPU and from the processor
 363                  * set. Note that no threads should be bound to this CPU since
  364                  * cpupart_move_thread() will refuse to move bound threads to
 365                  * other CPUs.
 366                  */
 367                 (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
 368                 (void) cpupart_unbind_threads(oldpp, B_FALSE);
 369 
 370                 if (!disp_bound_partition(cp, 0)) {
 371                         /*
 372                          * No bound threads in this partition any more
 373                          */
 374                         move_threads = 0;
 375                 } else {
 376                         /*
 377                          * There are still threads bound to the partition
 378                          */
 379                         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 380                         return (EBUSY);
 381                 }
 382         }
 383 
 384         /*
 385          * If forced flag is set unbind any threads from this CPU.
 386          * Otherwise unbind soft-bound threads only.
 387          */
 388         if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
 389                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 390                 return (ret);
 391         }
 392 
 393         /*
  394          * Stop further threads from weak-binding to this CPU.
 395          */
 396         cpu_inmotion = cp;
 397         membar_enter();
 398 
 399         /*
 400          * Notify the Processor Groups subsystem that the CPU
  401          * will be moving between CPU partitions.  This is done before
 402          * CPUs are paused to provide an opportunity for any
 403          * needed memory allocations.
 404          */
 405         pg_cpupart_out(cp, oldpp);
 406         pg_cpupart_in(cp, newpp);
 407 
 408 again:
 409         if (move_threads) {
 410                 int loop_count;
 411                 /*
 412                  * Check for threads strong or weak bound to this CPU.
 413                  */
 414                 for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
 415                         if (loop_count >= 5) {
 416                                 cpu_state_change_notify(cp->cpu_id,
 417                                     CPU_CPUPART_IN);
 418                                 pg_cpupart_out(cp, newpp);
 419                                 pg_cpupart_in(cp, oldpp);
 420                                 cpu_inmotion = NULL;
 421                                 return (EBUSY); /* some threads still bound */
 422                         }
 423                         delay(1);
 424                 }
 425         }
 426 
 427         /*
 428          * Before we actually start changing data structures, notify
 429          * the cyclic subsystem that we want to move this CPU out of its
 430          * partition.
 431          */
 432         if (!cyclic_move_out(cp)) {
 433                 /*
 434                  * This CPU must be the last CPU in a processor set with
 435                  * a bound cyclic.
 436                  */
 437                 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 438                 pg_cpupart_out(cp, newpp);
 439                 pg_cpupart_in(cp, oldpp);
 440                 cpu_inmotion = NULL;
 441                 return (EBUSY);
 442         }
 443 
 444         pause_cpus(cp);
 445 
 446         if (move_threads) {
 447                 /*
  448                  * The thread that was on this CPU before the pause thread
  449                  * may have read cpu_inmotion before we raised the barrier
  450                  * above.  Check again.
 451                  */
 452                 if (disp_bound_threads(cp, 1)) {
 453                         start_cpus();
 454                         goto again;
 455                 }
 456 
 457         }
 458 
 459         /*
 460          * Now that CPUs are paused, let the PG subsystem perform
 461          * any necessary data structure updates.
 462          */
 463         pg_cpupart_move(cp, oldpp, newpp);
 464 
 465         /* save this cpu's lgroup -- it'll be the same in the new partition */
 466         lgrpid = cp->cpu_lpl->lpl_lgrpid;
 467 
 468         cpu_lpl = cp->cpu_lpl;
 469         /*
 470          * let the lgroup framework know cp has left the partition
 471          */
 472         lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
 473 
 474         /* move out of old partition */
 475         oldpp->cp_ncpus--;
 476         if (oldpp->cp_ncpus > 0) {
 477 
 478                 ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
 479                 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
 480                 if (oldpp->cp_cpulist == cp) {
 481                         oldpp->cp_cpulist = ncp;
 482                 }
 483         } else {
 484                 ncp = oldpp->cp_cpulist = NULL;
 485                 cp_numparts_nonempty--;
 486                 ASSERT(cp_numparts_nonempty != 0);
 487         }
 488         oldpp->cp_gen++;
 489 
 490         /* move into new partition */
 491         newlist = newpp->cp_cpulist;
 492         if (newlist == NULL) {
 493                 newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
 494                 cp_numparts_nonempty++;
 495                 ASSERT(cp_numparts_nonempty != 0);
 496         } else {
 497                 cp->cpu_next_part = newlist;
 498                 cp->cpu_prev_part = newlist->cpu_prev_part;
 499                 newlist->cpu_prev_part->cpu_next_part = cp;
 500                 newlist->cpu_prev_part = cp;
 501         }
 502         cp->cpu_part = newpp;
 503         newpp->cp_ncpus++;
 504         newpp->cp_gen++;
 505 
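              /*
               * Sanity check: neither partition's set of halted CPUs should
               * have any bits set at this point.
               */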
 506         ASSERT(bitset_is_null(&newpp->cp_haltset));
 507         ASSERT(bitset_is_null(&oldpp->cp_haltset));
 508 
 509         /*
 510          * let the lgroup framework know cp has entered the partition
 511          */
 512         lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
 513 
 514         /*
 515          * If necessary, move threads off processor.
 516          */
 517         if (move_threads) {
 518                 ASSERT(ncp != NULL);
 519 
 520                 /*
  521                  * Walk through the active process list looking for
  522                  * threads that need to have a new home lgroup, or
  523                  * whose last CPU is the same CPU that is being moved
  524                  * out of the partition.
 525                  */
 526 
 527                 for (p = practive; p != NULL; p = p->p_next) {
 528 
 529                         t = p->p_tlist;
 530 
 531                         if (t == NULL)
 532                                 continue;
 533 
 534                         lgrp_diff_lpl = 0;
 535 
 536                         do {
 537 
 538                                 ASSERT(t->t_lpl != NULL);
 539 
 540                                 /*
 541                                  * Update the count of how many threads are
 542                                  * in this CPU's lgroup but have a different lpl
 543                                  */
 544 
 545                                 if (t->t_lpl != cpu_lpl &&
 546                                     t->t_lpl->lpl_lgrpid == lgrpid)
 547                                         lgrp_diff_lpl++;
 548                                 /*
 549                                  * If the lgroup that t is assigned to no
 550                                  * longer has any CPUs in t's partition,
 551                                  * we'll have to choose a new lgroup for t.
 552                                  */
 553 
 554                                 if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 555                                     t->t_cpupart)) {
 556                                         lgrp_move_thread(t,
 557                                             lgrp_choose(t, t->t_cpupart), 0);
 558                                 }
 559 
 560                                 /*
 561                                  * make sure lpl points to our own partition
 562                                  */
 563                                 ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
 564                                     (t->t_lpl < t->t_cpupart->cp_lgrploads +
 565                                     t->t_cpupart->cp_nlgrploads));
 566 
 567                                 ASSERT(t->t_lpl->lpl_ncpu > 0);
 568 
 569                                 /* Update CPU last ran on if it was this CPU */
 570                                 if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 571                                     t->t_bound_cpu != cp) {
 572                                         t->t_cpu = disp_lowpri_cpu(ncp,
 573                                             t->t_lpl, t->t_pri, NULL);
 574                                 }
 575                                 t = t->t_forw;
 576                         } while (t != p->p_tlist);
 577 
 578                         /*
 579                          * Didn't find any threads in the same lgroup as this
 580                          * CPU with a different lpl, so remove the lgroup from
 581                          * the process lgroup bitmask.
 582                          */
 583 
  584                         if (lgrp_diff_lpl == 0)
 585                                 klgrpset_del(p->p_lgrpset, lgrpid);
 586                 }
 587 
 588                 /*
 589                  * Walk thread list looking for threads that need to be
 590                  * rehomed, since there are some threads that are not in
 591                  * their process's p_tlist.
 592                  */
 593 
 594                 t = curthread;
 595 
 596                 do {
 597                         ASSERT(t != NULL && t->t_lpl != NULL);
 598 
 599                         /*
 600                          * If the lgroup that t is assigned to no
 601                          * longer has any CPUs in t's partition,
 602                          * we'll have to choose a new lgroup for t.
 603                          * Also, choose best lgroup for home when
 604                          * thread has specified lgroup affinities,
 605                          * since there may be an lgroup with more
 606                          * affinity available after moving CPUs
 607                          * around.
 608                          */
 609                         if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
 610                             t->t_cpupart) || t->t_lgrp_affinity) {
 611                                 lgrp_move_thread(t,
 612                                     lgrp_choose(t, t->t_cpupart), 1);
 613                         }
 614 
 615                         /* make sure lpl points to our own partition */
 616                         ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
 617                             (t->t_lpl < t->t_cpupart->cp_lgrploads +
 618                             t->t_cpupart->cp_nlgrploads));
 619 
 620                         ASSERT(t->t_lpl->lpl_ncpu > 0);
 621 
 622                         /* Update CPU last ran on if it was this CPU */
 623                         if (t->t_cpu == cp && t->t_cpupart == oldpp &&
 624                             t->t_bound_cpu != cp) {
 625                                 t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
 626                                     t->t_pri, NULL);
 627                         }
 628 
 629                         t = t->t_next;
 630                 } while (t != curthread);
 631 
 632                 /*
 633                  * Clear off the CPU's run queue, and the kp queue if the
 634                  * partition is now empty.
 635                  */
 636                 disp_cpu_inactive(cp);
 637 
 638                 /*
 639                  * Make cp switch to a thread from the new partition.
 640                  */
 641                 cp->cpu_runrun = 1;
 642                 cp->cpu_kprunrun = 1;
 643         }
 644 
 645         cpu_inmotion = NULL;
 646         start_cpus();
 647 
 648         /*
  649          * Let anyone interested know the CPU has joined its new partition.
 650          */
 651         cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
 652 
 653         /*
 654          * Now let the cyclic subsystem know that it can reshuffle cyclics
 655          * bound to the new processor set.
 656          */
 657         cyclic_move_in(cp);
 658 
 659         return (0);
 660 }
 661 
 662 /*
 663  * Check if thread can be moved to a new cpu partition.  Called by
 664  * cpupart_move_thread() and pset_bind_start().
 665  */
 666 int
 667 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
 668 {
 669         ASSERT(MUTEX_HELD(&cpu_lock));
 670         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 671         ASSERT(cp != NULL);
 672         ASSERT(THREAD_LOCK_HELD(tp));
 673 
 674         /*
 675          * CPU-bound threads can't be moved.
 676          */
 677         if (!ignore) {
 678                 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
 679                     tp->t_weakbound_cpu;
 680                 if (boundcpu != NULL && boundcpu->cpu_part != cp)
 681                         return (EBUSY);
 682         }
 683 
 684         if (tp->t_cid == sysdccid) {
 685                 return (EINVAL);        /* For now, sysdc threads can't move */
 686         }
 687 
 688         return (0);
 689 }
 690 
 691 /*
 692  * Move thread to new partition.  If ignore is non-zero, then CPU
 693  * bindings should be ignored (this is used when destroying a
 694  * partition).
 695  */
 696 static int
 697 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
 698     void *projbuf, void *zonebuf)
 699 {
 700         cpupart_t *oldpp = tp->t_cpupart;
 701         int ret;
 702 
 703         ASSERT(MUTEX_HELD(&cpu_lock));
 704         ASSERT(MUTEX_HELD(&pidlock));
 705         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 706         ASSERT(newpp != NULL);
 707 
 708         if (newpp->cp_cpulist == NULL)
 709                 return (EINVAL);
 710 
 711         /*
 712          * Check for errors first.
 713          */
 714         thread_lock(tp);
 715         if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
 716                 thread_unlock(tp);
 717                 return (ret);
 718         }
 719 
 720         /* move the thread */
 721         if (oldpp != newpp) {
 722                 /*
 723                  * Make the thread switch to the new partition.
 724                  */
 725                 tp->t_cpupart = newpp;
 726                 ASSERT(tp->t_lpl != NULL);
 727                 /*
 728                  * Leave the thread on the same lgroup if possible; otherwise
 729                  * choose a new lgroup for it.  In either case, update its
 730                  * t_lpl.
 731                  */
 732                 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
 733                     tp->t_lgrp_affinity == NULL) {
 734                         /*
 735                          * The thread's lgroup has CPUs in the thread's new
 736                          * partition, so the thread can stay assigned to the
 737                          * same lgroup.  Update its t_lpl to point to the
 738                          * lpl_t for its lgroup in its new partition.
 739                          */
  740                         lgrp_move_thread(tp, &tp->t_cpupart->
 741                             cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
 742                 } else {
 743                         /*
 744                          * The thread's lgroup has no cpus in its new
 745                          * partition or it has specified lgroup affinities,
 746                          * so choose the best lgroup for the thread and
 747                          * assign it to that lgroup.
 748                          */
 749                         lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
 750                             1);
 751                 }
 752                 /*
 753                  * make sure lpl points to our own partition
 754                  */
 755                 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
 756                     (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
 757                     tp->t_cpupart->cp_nlgrploads));
 758 
 759                 ASSERT(tp->t_lpl->lpl_ncpu > 0);
 760 
 761                 if (tp->t_state == TS_ONPROC) {
 762                         cpu_surrender(tp);
 763                 } else if (tp->t_state == TS_RUN) {
 764                         (void) dispdeq(tp);
 765                         setbackdq(tp);
 766                 }
 767         }
 768 
 769         /*
 770          * Our binding has changed; set TP_CHANGEBIND.
 771          */
 772         tp->t_proc_flag |= TP_CHANGEBIND;
 773         aston(tp);
 774 
 775         thread_unlock(tp);
 776         fss_changepset(tp, newpp, projbuf, zonebuf);
 777 
 778         return (0);             /* success */
 779 }
 780 
 781 
 782 /*
 783  * This function binds a thread to a partition.  Must be called with the
 784  * p_lock of the containing process held (to keep the thread from going
 785  * away), and thus also with cpu_lock held (since cpu_lock must be
 786  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 787  * should be ignored (this is used when destroying a partition).
 788  */
 789 int
 790 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
 791     void *zonebuf)
 792 {
 793         cpupart_t       *newpp;
 794 
 795         ASSERT(pool_lock_held());
 796         ASSERT(MUTEX_HELD(&cpu_lock));
 797         ASSERT(MUTEX_HELD(&pidlock));
 798         ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
 799 
 800         if (psid == PS_NONE)
 801                 newpp = &cp_default;
 802         else {
 803                 newpp = cpupart_find(psid);
 804                 if (newpp == NULL) {
 805                         return (EINVAL);
 806                 }
 807         }
 808         return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
 809 }
 810 
 811 
 812 /*
 813  * Create a new partition.  On MP systems, this also allocates a
 814  * kpreempt disp queue for that partition.
 815  */
 816 int
 817 cpupart_create(psetid_t *psid)
 818 {
 819         cpupart_t       *pp;
 820 
 821         ASSERT(pool_lock_held());
 822 
 823         pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
 824         pp->cp_nlgrploads = lgrp_plat_max_lgrps();
 825         pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
 826             KM_SLEEP);
 827 
 828         mutex_enter(&cpu_lock);
 829         if (cp_numparts == cp_max_numparts) {
 830                 mutex_exit(&cpu_lock);
 831                 kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
 832                 pp->cp_lgrploads = NULL;
 833                 kmem_free(pp, sizeof (cpupart_t));
 834                 return (ENOMEM);
 835         }
 836         cp_numparts++;
 837         /* find the next free partition ID */
 838         while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
 839                 cp_id_next++;
 840         pp->cp_id = cp_id_next++;
 841         pp->cp_ncpus = 0;
 842         pp->cp_cpulist = NULL;
 843         pp->cp_attr = 0;
 844         klgrpset_clear(pp->cp_lgrpset);
 845         pp->cp_kp_queue.disp_maxrunpri = -1;
 846         pp->cp_kp_queue.disp_max_unbound_pri = -1;
 847         pp->cp_kp_queue.disp_cpu = NULL;
 848         pp->cp_gen = 0;
 849         DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
 850         *psid = CPTOPS(pp->cp_id);
 851         disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
 852         cpupart_kstat_create(pp);
 853         cpupart_lpl_initialize(pp);
 854 
 855         bitset_init(&pp->cp_cmt_pgs);
 856 
 857         /*
 858          * Initialize and size the partition's bitset of halted CPUs.
 859          */
 860         bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
 861         bitset_resize(&pp->cp_haltset, max_ncpus);
 862 
 863         /*
 864          * Pause all CPUs while changing the partition list, to make sure
 865          * the clock thread (which traverses the list without holding
 866          * cpu_lock) isn't running.
 867          */
 868         pause_cpus(NULL);
 869         pp->cp_next = cp_list_head;
 870         pp->cp_prev = cp_list_head->cp_prev;
 871         cp_list_head->cp_prev->cp_next = pp;
 872         cp_list_head->cp_prev = pp;
 873         start_cpus();
 874         mutex_exit(&cpu_lock);
 875 
 876         return (0);
 877 }
 878 
 879 /*
  880  * Move threads from the specified partition to cp_default.  If unbind_all is
  881  * set, move all threads; otherwise move only soft-bound threads.
 882  */
 883 static int
 884 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
 885 {
 886         void    *projbuf, *zonebuf;
 887         kthread_t *t;
 888         proc_t  *p;
 889         int     err = 0;
 890         psetid_t psid = pp->cp_id;
 891 
 892         ASSERT(pool_lock_held());
 893         ASSERT(MUTEX_HELD(&cpu_lock));
 894 
 895         if (pp == NULL || pp == &cp_default) {
 896                 return (EINVAL);
 897         }
 898 
 899         /*
 900          * Pre-allocate enough buffers for FSS for all active projects and
 901          * for all active zones on the system.  Unused buffers will be
 902          * freed later by fss_freebuf().
 903          */
 904         projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 905         zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 906 
 907         mutex_enter(&pidlock);
 908         t = curthread;
 909         do {
 910                 if (t->t_bind_pset == psid) {
 911 again:                  p = ttoproc(t);
 912                         mutex_enter(&p->p_lock);
 913                         if (ttoproc(t) != p) {
 914                                 /*
 915                                  * lwp_exit has changed this thread's process
 916                                  * pointer before we grabbed its p_lock.
 917                                  */
 918                                 mutex_exit(&p->p_lock);
 919                                 goto again;
 920                         }
 921 
 922                         /*
  923                          * We can only unbind threads that have a revocable
  924                          * binding, unless a forced unbind was requested.
 925                          */
 926                         if (unbind_all || TB_PSET_IS_SOFT(t)) {
 927                                 err = cpupart_bind_thread(t, PS_NONE, 1,
 928                                     projbuf, zonebuf);
 929                                 if (err) {
 930                                         mutex_exit(&p->p_lock);
 931                                         mutex_exit(&pidlock);
 932                                         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 933                                         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 934                                         return (err);
 935                                 }
 936                                 t->t_bind_pset = PS_NONE;
 937                         }
 938                         mutex_exit(&p->p_lock);
 939                 }
 940                 t = t->t_next;
 941         } while (t != curthread);
 942 
 943         mutex_exit(&pidlock);
 944         fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 945         fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 946         return (err);
 947 }
 948 
 949 /*
 950  * Destroy a partition.
 951  */
 952 int
 953 cpupart_destroy(psetid_t psid)
 954 {
 955         cpu_t   *cp, *first_cp;
 956         cpupart_t *pp, *newpp;
 957         int     err = 0;
 958 
 959         ASSERT(pool_lock_held());
 960         mutex_enter(&cpu_lock);
 961 
 962         pp = cpupart_find(psid);
 963         if (pp == NULL || pp == &cp_default) {
 964                 mutex_exit(&cpu_lock);
 965                 return (EINVAL);
 966         }
 967 
 968         /*
 969          * Unbind all the threads currently bound to the partition.
 970          */
 971         err = cpupart_unbind_threads(pp, B_TRUE);
 972         if (err) {
 973                 mutex_exit(&cpu_lock);
 974                 return (err);
 975         }
 976 
 977         newpp = &cp_default;
 978         while ((cp = pp->cp_cpulist) != NULL) {
  979                 if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
 980                         mutex_exit(&cpu_lock);
 981                         return (err);
 982                 }
 983         }
 984 
 985         ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
 986         ASSERT(bitset_is_null(&pp->cp_haltset));
 987 
 988         /*
 989          * Teardown the partition's group of active CMT PGs and halted
 990          * CPUs now that they have all left.
 991          */
 992         bitset_fini(&pp->cp_cmt_pgs);
 993         bitset_fini(&pp->cp_haltset);
 994 
 995         /*
 996          * Reset the pointers in any offline processors so they won't
  997  * try to rejoin the destroyed partition when they are brought
  998  * online.
 999          */
1000         first_cp = cp = CPU;
1001         do {
1002                 if (cp->cpu_part == pp) {
1003                         ASSERT(cp->cpu_flags & CPU_OFFLINE);
1004                         cp->cpu_part = newpp;
1005                 }
1006                 cp = cp->cpu_next;
1007         } while (cp != first_cp);
1008 
1009         /*
1010          * Pause all CPUs while changing the partition list, to make sure
1011          * the clock thread (which traverses the list without holding
1012          * cpu_lock) isn't running.
1013          */
1014         pause_cpus(NULL);
1015         pp->cp_prev->cp_next = pp->cp_next;
1016         pp->cp_next->cp_prev = pp->cp_prev;
1017         if (cp_list_head == pp)
1018                 cp_list_head = pp->cp_next;
1019         start_cpus();
1020 
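              /*
               * Allow the ID of the destroyed partition to be reused by the
               * next cpupart_create().
               */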
1021         if (cp_id_next > pp->cp_id)
1022                 cp_id_next = pp->cp_id;
1023 
1024         if (pp->cp_kstat)
1025                 kstat_delete(pp->cp_kstat);
1026 
1027         cp_numparts--;
1028 
1029         disp_kp_free(&pp->cp_kp_queue);
1030 
1031         cpupart_lpl_teardown(pp);
1032 
1033         kmem_free(pp, sizeof (cpupart_t));
1034         mutex_exit(&cpu_lock);
1035 
1036         return (err);
1037 }
1038 
1039 
1040 /*
1041  * Return the ID of the partition to which the specified processor belongs.
1042  */
1043 psetid_t
1044 cpupart_query_cpu(cpu_t *cp)
1045 {
1046         ASSERT(MUTEX_HELD(&cpu_lock));
1047 
1048         return (CPTOPS(cp->cpu_part->cp_id));
1049 }
1050 
1051 
1052 /*
1053  * Attach a processor to an existing partition.
1054  */
1055 int
1056 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1057 {
1058         cpupart_t       *pp;
1059         int             err;
1060 
1061         ASSERT(pool_lock_held());
1062         ASSERT(MUTEX_HELD(&cpu_lock));
1063 
1064         pp = cpupart_find(psid);
1065         if (pp == NULL)
1066                 return (EINVAL);
1067         if (cp->cpu_flags & CPU_OFFLINE)
1068                 return (EINVAL);
1069 
1070         err = cpupart_move_cpu(cp, pp, forced);
1071         return (err);
1072 }
1073 
1074 /*
1075  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1076  * this just checks for a valid partition.  If numcpus is non-NULL but
1077  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1078  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1079  * and a list of those cpus up to the size originally in *numcpus is
1080  * stored in cpulist[].  Also, store the processor set id in *psid.
1081  * This is useful in case the processor set id passed in was PS_MYID.
1082  */
1083 int
1084 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1085 {
1086         cpupart_t       *pp;
1087         uint_t          ncpus;
1088         cpu_t           *c;
1089         int             i;
1090 
1091         mutex_enter(&cpu_lock);
1092         pp = cpupart_find(*psid);
1093         if (pp == NULL) {
1094                 mutex_exit(&cpu_lock);
1095                 return (EINVAL);
1096         }
1097         *psid = CPTOPS(pp->cp_id);
1098         ncpus = pp->cp_ncpus;
1099         if (numcpus) {
1100                 if (ncpus > *numcpus) {
1101                         /*
1102                          * Only copy as many cpus as were passed in, but
1103                          * pass back the real number.
1104                          */
1105                         uint_t t = ncpus;
1106                         ncpus = *numcpus;
1107                         *numcpus = t;
1108                 } else
1109                         *numcpus = ncpus;
1110 
1111                 if (cpulist) {
1112                         c = pp->cp_cpulist;
1113                         for (i = 0; i < ncpus; i++) {
1114                                 ASSERT(c != NULL);
1115                                 cpulist[i] = c->cpu_id;
1116                                 c = c->cpu_next_part;
1117                         }
1118                 }
1119         }
1120         mutex_exit(&cpu_lock);
1121         return (0);
1122 }
1123 
1124 /*
1125  * Reallocate kpreempt queues for each CPU partition.  Called from
1126  * disp_setup when a new scheduling class is loaded that increases the
1127  * number of priorities in the system.
1128  */
1129 void
1130 cpupart_kpqalloc(pri_t npri)
1131 {
1132         cpupart_t *cpp;
1133 
1134         ASSERT(MUTEX_HELD(&cpu_lock));
1135         cpp = cp_list_head;
1136         do {
1137                 disp_kp_alloc(&cpp->cp_kp_queue, npri);
1138                 cpp = cpp->cp_next;
1139         } while (cpp != cp_list_head);
1140 }
1141 
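      /*
       * Copy up to nelem load average values for the given processor set
       * into buf, converting from the high-precision cp_hp_avenrun format.
       * Returns 0 on success or EINVAL if the set does not exist.
       */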
1142 int
1143 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1144 {
1145         cpupart_t *cp;
1146         int i;
1147 
1148         ASSERT(nelem >= 0);
1149         ASSERT(nelem <= LOADAVG_NSTATS);
1150         ASSERT(MUTEX_HELD(&cpu_lock));
1151 
1152         cp = cpupart_find(psid);
1153         if (cp == NULL)
1154                 return (EINVAL);
1155         for (i = 0; i < nelem; i++)
1156                 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1157 
1158         return (0);
1159 }
1160 
1161 
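      /*
       * Fill "list" (if non-NULL) with up to nelem processor set IDs and
       * return the total number of partitions matching "flag": all
       * user-created partitions for CP_ALL, or all non-empty partitions
       * for CP_NONEMPTY.
       */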
1162 uint_t
1163 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1164 {
1165         uint_t numpart = 0;
1166         cpupart_t *cp;
1167 
1168         ASSERT(MUTEX_HELD(&cpu_lock));
1169         ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1170 
1171         if (list != NULL) {
1172                 cp = cp_list_head;
1173                 do {
1174                         if (((flag == CP_ALL) && (cp != &cp_default)) ||
1175                             ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1176                                 if (numpart == nelem)
1177                                         break;
1178                                 list[numpart++] = CPTOPS(cp->cp_id);
1179                         }
1180                         cp = cp->cp_next;
1181                 } while (cp != cp_list_head);
1182         }
1183 
1184         ASSERT(numpart < cp_numparts);
1185 
1186         if (flag == CP_ALL)
1187                 numpart = cp_numparts - 1; /* leave out default partition */
1188         else if (flag == CP_NONEMPTY)
1189                 numpart = cp_numparts_nonempty;
1190 
1191         return (numpart);
1192 }
1193 
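      /*
       * Set the attributes (e.g. PSET_NOESCAPE) of the given processor set.
       */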
1194 int
1195 cpupart_setattr(psetid_t psid, uint_t attr)
1196 {
1197         cpupart_t *cp;
1198 
1199         ASSERT(pool_lock_held());
1200 
1201         mutex_enter(&cpu_lock);
1202         if ((cp = cpupart_find(psid)) == NULL) {
1203                 mutex_exit(&cpu_lock);
1204                 return (EINVAL);
1205         }
1206         /*
 1207          * PSET_NOESCAPE is always set on the default CPU partition.
1208          */
1209         if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1210                 mutex_exit(&cpu_lock);
1211                 return (EINVAL);
1212         }
1213         cp->cp_attr = attr;
1214         mutex_exit(&cpu_lock);
1215         return (0);
1216 }
1217 
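      /*
       * Retrieve the current attributes of the given processor set.
       */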
1218 int
1219 cpupart_getattr(psetid_t psid, uint_t *attrp)
1220 {
1221         cpupart_t *cp;
1222 
1223         mutex_enter(&cpu_lock);
1224         if ((cp = cpupart_find(psid)) == NULL) {
1225                 mutex_exit(&cpu_lock);
1226                 return (EINVAL);
1227         }
1228         *attrp = cp->cp_attr;
1229         mutex_exit(&cpu_lock);
1230         return (0);
1231 }