1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/var.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/inline.h>
  42 #include <sys/disp.h>
  43 #include <sys/class.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/kmem.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/tnf.h>
  49 #include <sys/cpupart.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/pg.h>
  52 #include <sys/cmt.h>
  53 #include <sys/bitset.h>
  54 #include <sys/schedctl.h>
  55 #include <sys/atomic.h>
  56 #include <sys/dtrace.h>
  57 #include <sys/sdt.h>
  58 #include <sys/archsystm.h>
  59 
  60 #include <vm/as.h>
  61 
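      /*
       * Kinds of thread binding: to a specific CPU, to a CPU partition, or
       * the implicit binding of an interrupt thread to its CPU.
       */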
  62 #define BOUND_CPU       0x1
  63 #define BOUND_PARTITION 0x2
  64 #define BOUND_INTR      0x4
  65 
  66 /* Dispatch queue allocation structure and functions */
   67 struct disp_queue_info {
   68         disp_t  *dp;            /* dispatch queue being resized */
   69         dispq_t *olddispq;      /* old queue array, freed after switch */
   70         dispq_t *newdispq;      /* newly allocated queue array */
   71         ulong_t *olddqactmap;   /* old active-queue bitmap */
   72         ulong_t *newdqactmap;   /* newly allocated active-queue bitmap */
   73         int     oldnglobpris;   /* number of priorities in the old queue */
   74 };
  75 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76     disp_t *dp);
  77 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78 static void     disp_dq_free(struct disp_queue_info *dptr);
  79 
  80 /* platform-specific routine to call when processor is idle */
  81 static void     generic_idle_cpu();
  82 void            (*idle_cpu)() = generic_idle_cpu;
  83 
  84 /* routines invoked when a CPU enters/exits the idle loop */
  85 static void     idle_enter();
  86 static void     idle_exit();
  87 
  88 /* platform-specific routine to call when thread is enqueued */
  89 static void     generic_enq_thread(cpu_t *, int);
  90 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  91 
  92 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  93 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  94 pri_t   intr_pri;               /* interrupt thread priority base level */
  95 
  96 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  97 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  98 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
  99 disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 100 int     nswapped;               /* total number of swapped threads */
 101 void    disp_swapped_enq(kthread_t *tp);
 102 static void     disp_swapped_setrun(kthread_t *tp);
 103 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 104 
 105 /*
 106  * If this is set, only interrupt threads will cause kernel preemptions.
 107  * This is done by changing the value of kpreemptpri.  kpreemptpri
 108  * will either be the max sysclass pri + 1 or the min interrupt pri.
 109  */
 110 int     only_intr_kpreempt;
 111 
 112 extern void set_idle_cpu(int cpun);
 113 extern void unset_idle_cpu(int cpun);
 114 static void setkpdq(kthread_t *tp, int borf);
 115 #define SETKP_BACK      0
 116 #define SETKP_FRONT     1
  117 /*
  118  * Parameter that determines how recently a thread must have run
  119  * on the CPU to be considered loosely bound to that CPU, to reduce
  120  * cold cache effects.  The interval is measured in clock ticks.
  121  */
 122 #define RECHOOSE_INTERVAL 3
 123 int     rechoose_interval = RECHOOSE_INTERVAL;
 124 
  125 /*
  126  * Parameter that determines how long (in nanoseconds) a thread must
  127  * have been sitting on a run queue before it can be stolen by another
  128  * CPU, to reduce migrations.
  129  *
  130  * nosteal_nsec should be set by platform code, via
  131  * cmp_set_nosteal_interval(), to an appropriate value.  It is set to
  132  * NOSTEAL_UNINITIALIZED here to indicate that it has not yet been
  133  * initialized.  Setting nosteal_nsec to 0 effectively disables the
  134  * nosteal 'protection'.
  135  */
 136 #define NOSTEAL_UNINITIALIZED   (-1)
 137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 138 extern void cmp_set_nosteal_interval(void);
 139 
 140 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 141 
 142 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 143 disp_lock_t     stop_lock;              /* lock on stopped threads */
 144 
 145 static void     cpu_dispqalloc(int numpris);
 146 
 147 /*
 148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 149  * a thread because it was sitting on its run queue for a very short
 150  * period of time.
 151  */
 152 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 153 
 154 static kthread_t        *disp_getwork(cpu_t *to);
 155 static kthread_t        *disp_getbest(disp_t *from);
 156 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 157 
 158 void    swtch_to(kthread_t *);
 159 
 160 /*
 161  * dispatcher and scheduler initialization
 162  */
 163 
 164 /*
 165  * disp_setup - Common code to calculate and allocate dispatcher
 166  *              variables and structures based on the maximum priority.
 167  */
 168 static void
 169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 170 {
 171         pri_t   newnglobpris;
 172 
 173         ASSERT(MUTEX_HELD(&cpu_lock));
 174 
 175         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 176 
 177         if (newnglobpris > oldnglobpris) {
 178                 /*
 179                  * Allocate new kp queues for each CPU partition.
 180                  */
 181                 cpupart_kpqalloc(newnglobpris);
 182 
 183                 /*
 184                  * Allocate new dispatch queues for each CPU.
 185                  */
 186                 cpu_dispqalloc(newnglobpris);
 187 
 188                 /*
 189                  * compute new interrupt thread base priority
 190                  */
 191                 intr_pri = maxglobpri;
 192                 if (only_intr_kpreempt) {
 193                         kpreemptpri = intr_pri + 1;
 194                         if (kpqpri == KPQPRI)
 195                                 kpqpri = kpreemptpri;
 196                 }
 197                 v.v_nglobpris = newnglobpris;
 198         }
 199 }
 200 
 201 /*
 202  * dispinit - Called to initialize all loaded classes and the
 203  *            dispatcher framework.
 204  */
 205 void
 206 dispinit(void)
 207 {
 208         id_t    cid;
 209         pri_t   maxglobpri;
 210         pri_t   cl_maxglobpri;
 211 
 212         maxglobpri = -1;
 213 
 214         /*
 215          * Initialize transition lock, which will always be set.
 216          */
 217         DISP_LOCK_INIT(&transition_lock);
 218         disp_lock_enter_high(&transition_lock);
 219         DISP_LOCK_INIT(&stop_lock);
 220 
 221         mutex_enter(&cpu_lock);
 222         CPU->cpu_disp->disp_maxrunpri = -1;
 223         CPU->cpu_disp->disp_max_unbound_pri = -1;
 224 
 225         /*
 226          * Initialize the default CPU partition.
 227          */
 228         cpupart_initialize_default();
 229         /*
 230          * Call the class specific initialization functions for
 231          * all pre-installed schedulers.
 232          *
 233          * We pass the size of a class specific parameter
 234          * buffer to each of the initialization functions
 235          * to try to catch problems with backward compatibility
 236          * of class modules.
 237          *
  238  * For example, a new class module running on an old system
 239          * which didn't provide sufficiently large parameter buffers
 240          * would be bad news. Class initialization modules can check for
 241          * this and take action if they detect a problem.
 242          */
 243 
 244         for (cid = 0; cid < nclass; cid++) {
 245                 sclass_t        *sc;
 246 
 247                 sc = &sclass[cid];
 248                 if (SCHED_INSTALLED(sc)) {
 249                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 250                             &sc->cl_funcs);
 251                         if (cl_maxglobpri > maxglobpri)
 252                                 maxglobpri = cl_maxglobpri;
 253                 }
 254         }
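              /*
               * Kernel preemption is triggered for priorities above all of the
               * system-class priorities.  If only_intr_kpreempt is set,
               * disp_setup() later raises kpreemptpri so that only interrupt
               * threads cause kernel preemption.
               */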
 255         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 256         if (kpqpri == KPQPRI)
 257                 kpqpri = kpreemptpri;
 258 
 259         ASSERT(maxglobpri >= 0);
 260         disp_setup(maxglobpri, 0);
 261 
 262         mutex_exit(&cpu_lock);
 263 
 264         /*
 265          * Platform specific sticky scheduler setup.
 266          */
 267         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 268                 cmp_set_nosteal_interval();
 269 
 270         /*
 271          * Get the default class ID; this may be later modified via
 272          * dispadmin(1M).  This will load the class (normally TS) and that will
 273          * call disp_add(), which is why we had to drop cpu_lock first.
 274          */
 275         if (getcid(defaultclass, &defaultcid) != 0) {
 276                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 277                     defaultclass);
 278         }
 279 }
 280 
 281 /*
 282  * disp_add - Called with class pointer to initialize the dispatcher
 283  *            for a newly loaded class.
 284  */
 285 void
 286 disp_add(sclass_t *clp)
 287 {
 288         pri_t   maxglobpri;
 289         pri_t   cl_maxglobpri;
 290 
 291         mutex_enter(&cpu_lock);
 292         /*
 293          * Initialize the scheduler class.
 294          */
 295         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 296         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 297         if (cl_maxglobpri > maxglobpri)
 298                 maxglobpri = cl_maxglobpri;
 299 
  300         /*
  301          * Save old queue information.  Since we're initializing a
  302          * new scheduling class which has just been loaded, the
  303          * size of the dispq may have changed.  We need to handle
  304          * that here.
  305          */
 306         disp_setup(maxglobpri, v.v_nglobpris);
 307 
 308         mutex_exit(&cpu_lock);
 309 }
 310 
 311 
 312 /*
 313  * For each CPU, allocate new dispatch queues
 314  * with the stated number of priorities.
 315  */
 316 static void
 317 cpu_dispqalloc(int numpris)
 318 {
 319         cpu_t   *cpup;
 320         struct disp_queue_info  *disp_mem;
 321         int i, num;
 322 
 323         ASSERT(MUTEX_HELD(&cpu_lock));
 324 
 325         disp_mem = kmem_zalloc(NCPU *
 326             sizeof (struct disp_queue_info), KM_SLEEP);
 327 
 328         /*
 329          * This routine must allocate all of the memory before stopping
 330          * the cpus because it must not sleep in kmem_alloc while the
 331          * CPUs are stopped.  Locks they hold will not be freed until they
 332          * are restarted.
 333          */
 334         i = 0;
 335         cpup = cpu_list;
 336         do {
 337                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 338                 i++;
 339                 cpup = cpup->cpu_next;
 340         } while (cpup != cpu_list);
 341         num = i;
 342 
 343         pause_cpus(NULL, NULL);
 344         for (i = 0; i < num; i++)
 345                 disp_dq_assign(&disp_mem[i], numpris);
 346         start_cpus();
 347 
  348         /*
  349          * All of the memory must be freed after starting the cpus, because
  350          * we cannot risk sleeping in kmem_free while the cpus are stopped.
  351          */
 352         for (i = 0; i < num; i++)
 353                 disp_dq_free(&disp_mem[i]);
 354 
 355         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 356 }
 357 
 358 static void
 359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 360 {
 361         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 362         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 363             sizeof (long), KM_SLEEP);
 364         dptr->dp = dp;
 365 }
 366 
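      /*
       * Install the queue array and active-queue bitmap allocated by
       * disp_dq_alloc(), copying over the old contents and remembering the
       * old arrays so that disp_dq_free() can release them.  Nothing here
       * may block, since the caller may have paused the other CPUs.
       */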
 367 static void
 368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 369 {
 370         disp_t  *dp;
 371 
 372         dp = dptr->dp;
 373         dptr->olddispq = dp->disp_q;
 374         dptr->olddqactmap = dp->disp_qactmap;
 375         dptr->oldnglobpris = dp->disp_npri;
 376 
 377         ASSERT(dptr->oldnglobpris < numpris);
 378 
 379         if (dptr->olddispq != NULL) {
 380                 /*
 381                  * Use kcopy because bcopy is platform-specific
 382                  * and could block while we might have paused the cpus.
 383                  */
 384                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 385                     dptr->oldnglobpris * sizeof (dispq_t));
 386                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 387                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 388                     sizeof (long));
 389         }
 390         dp->disp_q = dptr->newdispq;
 391         dp->disp_qactmap = dptr->newdqactmap;
 392         dp->disp_q_limit = &dptr->newdispq[numpris];
 393         dp->disp_npri = numpris;
 394 }
 395 
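      /*
       * Free the old queue array and active-queue bitmap recorded in the
       * disp_queue_info structure, if any.
       */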
 396 static void
 397 disp_dq_free(struct disp_queue_info *dptr)
 398 {
 399         if (dptr->olddispq != NULL)
 400                 kmem_free(dptr->olddispq,
 401                     dptr->oldnglobpris * sizeof (dispq_t));
 402         if (dptr->olddqactmap != NULL)
 403                 kmem_free(dptr->olddqactmap,
 404                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 405 }
 406 
 407 /*
 408  * For a newly created CPU, initialize the dispatch queue.
 409  * This is called before the CPU is known through cpu[] or on any lists.
 410  */
 411 void
 412 disp_cpu_init(cpu_t *cp)
 413 {
 414         disp_t  *dp;
 415         dispq_t *newdispq;
 416         ulong_t *newdqactmap;
 417 
 418         ASSERT(MUTEX_HELD(&cpu_lock));      /* protect dispatcher queue sizes */
 419 
 420         if (cp == cpu0_disp.disp_cpu)
 421                 dp = &cpu0_disp;
 422         else
 423                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 424         bzero(dp, sizeof (disp_t));
 425         cp->cpu_disp = dp;
 426         dp->disp_cpu = cp;
 427         dp->disp_maxrunpri = -1;
 428         dp->disp_max_unbound_pri = -1;
 429         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 430         /*
 431          * Allocate memory for the dispatcher queue headers
 432          * and the active queue bitmap.
 433          */
 434         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 435         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 436             sizeof (long), KM_SLEEP);
 437         dp->disp_q = newdispq;
 438         dp->disp_qactmap = newdqactmap;
 439         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 440         dp->disp_npri = v.v_nglobpris;
 441 }
 442 
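      /*
       * Tear down the dispatch queue of a CPU being removed.  The queue
       * array and bitmap are released via disp_kp_free(); the disp_t itself
       * is freed unless it is the statically allocated cpu0_disp.
       */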
 443 void
 444 disp_cpu_fini(cpu_t *cp)
 445 {
 446         ASSERT(MUTEX_HELD(&cpu_lock));
 447 
 448         disp_kp_free(cp->cpu_disp);
 449         if (cp->cpu_disp != &cpu0_disp)
 450                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 451 }
 452 
 453 /*
 454  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 455  */
 456 void
 457 disp_kp_alloc(disp_t *dq, pri_t npri)
 458 {
 459         struct disp_queue_info  mem_info;
 460 
 461         if (npri > dq->disp_npri) {
 462                 /*
 463                  * Allocate memory for the new array.
 464                  */
 465                 disp_dq_alloc(&mem_info, npri, dq);
 466 
 467                 /*
 468                  * We need to copy the old structures to the new
 469                  * and free the old.
 470                  */
 471                 disp_dq_assign(&mem_info, npri);
 472                 disp_dq_free(&mem_info);
 473         }
 474 }
 475 
 476 /*
 477  * Free dispatch queue.
 478  * Used for the kpreempt queues for a removed CPU partition and
 479  * for the per-CPU queues of deleted CPUs.
 480  */
 481 void
 482 disp_kp_free(disp_t *dq)
 483 {
 484         struct disp_queue_info  mem_info;
 485 
 486         mem_info.olddispq = dq->disp_q;
 487         mem_info.olddqactmap = dq->disp_qactmap;
 488         mem_info.oldnglobpris = dq->disp_npri;
 489         disp_dq_free(&mem_info);
 490 }
 491 
 492 /*
 493  * End dispatcher and scheduler initialization.
 494  */
 495 
 496 /*
 497  * See if there's anything to do other than remain idle.
 498  * Return non-zero if there is.
 499  *
 500  * This function must be called with high spl, or with
 501  * kernel preemption disabled to prevent the partition's
 502  * active cpu list from changing while being traversed.
 503  *
 504  * This is essentially a simpler version of disp_getwork()
 505  * to be called by CPUs preparing to "halt".
 506  */
 507 int
 508 disp_anywork(void)
 509 {
 510         cpu_t           *cp = CPU;
 511         cpu_t           *ocp;
 512         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
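              /* volatile: re-read our own run-queue length on each loop pass */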
 513 
 514         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 515                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 516                         return (1);
 517 
 518                 for (ocp = cp->cpu_next_part; ocp != cp;
 519                     ocp = ocp->cpu_next_part) {
 520                         ASSERT(CPU_ACTIVE(ocp));
 521 
 522                         /*
 523                          * Something has appeared on the local run queue.
 524                          */
 525                         if (*local_nrunnable > 0)
 526                                 return (1);
  527                         /*
  528                          * If we encounter another idle CPU that will
  529                          * soon be trolling around through disp_anywork(),
  530                          * terminate our walk here and let this other CPU
  531                          * patrol the next part of the list.
  532                          */
 533                         if (ocp->cpu_dispatch_pri == -1 &&
 534                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 535                                 return (0);
  536                         /*
  537                          * Work can be taken from another CPU if:
  538                          *      - There is unbound work on the run queue
  539                          *      - That work isn't a thread undergoing a
  540                          *        context switch on an otherwise empty queue.
  541                          *      - The CPU isn't running the idle loop.
  542                          */
 543                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 544                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 545                             ocp->cpu_disp->disp_nrunnable == 1) &&
 546                             ocp->cpu_dispatch_pri != -1)
 547                                 return (1);
 548                 }
 549         }
 550         return (0);
 551 }
 552 
 553 /*
 554  * Called when CPU enters the idle loop
 555  */
 556 static void
 557 idle_enter()
 558 {
 559         cpu_t           *cp = CPU;
 560 
 561         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 562         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 563         set_idle_cpu(cp->cpu_id);    /* arch-dependent hook */
 564 }
 565 
 566 /*
 567  * Called when CPU exits the idle loop
 568  */
 569 static void
 570 idle_exit()
 571 {
 572         cpu_t           *cp = CPU;
 573 
 574         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 575         unset_idle_cpu(cp->cpu_id);  /* arch-dependent hook */
 576 }
 577 
 578 /*
 579  * Idle loop.
 580  */
 581 void
 582 idle()
 583 {
 584         struct cpu      *cp = CPU;              /* pointer to this CPU */
 585         kthread_t       *t;                     /* taken thread */
 586 
 587         idle_enter();
 588 
 589         /*
 590          * Uniprocessor version of idle loop.
 591          * Do this until notified that we're on an actual multiprocessor.
 592          */
 593         while (ncpus == 1) {
 594                 if (cp->cpu_disp->disp_nrunnable == 0) {
 595                         (*idle_cpu)();
 596                         continue;
 597                 }
 598                 idle_exit();
 599                 swtch();
 600 
 601                 idle_enter(); /* returned from swtch */
 602         }
 603 
 604         /*
 605          * Multiprocessor idle loop.
 606          */
 607         for (;;) {
 608                 /*
 609                  * If CPU is completely quiesced by p_online(2), just wait
 610                  * here with minimal bus traffic until put online.
 611                  */
 612                 while (cp->cpu_flags & CPU_QUIESCED)
 613                         (*idle_cpu)();
 614 
 615                 if (cp->cpu_disp->disp_nrunnable != 0) {
 616                         idle_exit();
 617                         swtch();
 618                 } else {
 619                         if (cp->cpu_flags & CPU_OFFLINE)
 620                                 continue;
 621                         if ((t = disp_getwork(cp)) == NULL) {
 622                                 if (cp->cpu_chosen_level != -1) {
 623                                         disp_t *dp = cp->cpu_disp;
 624                                         disp_t *kpq;
 625 
 626                                         disp_lock_enter(&dp->disp_lock);
 627                                         /*
 628                                          * Set kpq under lock to prevent
 629                                          * migration between partitions.
 630                                          */
 631                                         kpq = &cp->cpu_part->cp_kp_queue;
 632                                         if (kpq->disp_maxrunpri == -1)
 633                                                 cp->cpu_chosen_level = -1;
 634                                         disp_lock_exit(&dp->disp_lock);
 635                                 }
 636                                 (*idle_cpu)();
 637                                 continue;
 638                         }
 639                         /*
 640                          * If there was a thread but we couldn't steal
 641                          * it, then keep trying.
 642                          */
 643                         if (t == T_DONTSTEAL)
 644                                 continue;
 645                         idle_exit();
 646                         swtch_to(t);
 647                 }
 648                 idle_enter(); /* returned from swtch/swtch_to */
 649         }
 650 }
 651 
 652 
 653 /*
 654  * Preempt the currently running thread in favor of the highest
 655  * priority thread.  The class of the current thread controls
 656  * where it goes on the dispatcher queues. If panicking, turn
 657  * preemption off.
 658  */
 659 void
 660 preempt()
 661 {
 662         kthread_t       *t = curthread;
 663         klwp_t          *lwp = ttolwp(curthread);
 664 
 665         if (panicstr)
 666                 return;
 667 
 668         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 669 
 670         thread_lock(t);
 671 
 672         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
  673                 /*
  674                  * This thread has already been chosen to be run on
  675                  * another CPU.  Clear kprunrun on this CPU, since we're
  676                  * already headed for swtch().
  677                  */
 678                 CPU->cpu_kprunrun = 0;
 679                 thread_unlock_nopreempt(t);
 680                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 681         } else {
 682                 if (lwp != NULL)
 683                         lwp->lwp_ru.nivcsw++;
 684                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 685                 THREAD_TRANSITION(t);
 686                 CL_PREEMPT(t);
 687                 DTRACE_SCHED(preempt);
 688                 thread_unlock_nopreempt(t);
 689 
 690                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 691 
 692                 swtch();                /* clears CPU->cpu_runrun via disp() */
 693         }
 694 }
 695 
 696 extern kthread_t *thread_unpin();
 697 
 698 /*
 699  * disp() - find the highest priority thread for this processor to run, and
 700  * set it in TS_ONPROC state so that resume() can be called to run it.
 701  */
 702 static kthread_t *
 703 disp()
 704 {
 705         cpu_t           *cpup;
 706         disp_t          *dp;
 707         kthread_t       *tp;
 708         dispq_t         *dq;
 709         int             maxrunword;
 710         pri_t           pri;
 711         disp_t          *kpq;
 712 
 713         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 714 
 715         cpup = CPU;
 716         /*
 717          * Find the highest priority loaded, runnable thread.
 718          */
 719         dp = cpup->cpu_disp;
 720 
 721 reschedule:
 722         /*
 723          * If there is more important work on the global queue with a better
 724          * priority than the maximum on this CPU, take it now.
 725          */
 726         kpq = &cpup->cpu_part->cp_kp_queue;
 727         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 728             pri >= dp->disp_maxrunpri &&
 729             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 730             (tp = disp_getbest(kpq)) != NULL) {
 731                 if (disp_ratify(tp, kpq) != NULL) {
 732                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 733                             "disp_end:tid %p", tp);
 734                         return (tp);
 735                 }
 736         }
 737 
 738         disp_lock_enter(&dp->disp_lock);
 739         pri = dp->disp_maxrunpri;
 740 
 741         /*
 742          * If there is nothing to run, look at what's runnable on other queues.
 743          * Choose the idle thread if the CPU is quiesced.
 744          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 745          * interrupt threads, which will be the only threads on the CPU's own
 746          * queue, but cannot run threads from other queues.
 747          */
 748         if (pri == -1) {
 749                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 750                         disp_lock_exit(&dp->disp_lock);
 751                         if ((tp = disp_getwork(cpup)) == NULL ||
 752                             tp == T_DONTSTEAL) {
 753                                 tp = cpup->cpu_idle_thread;
 754                                 (void) splhigh();
 755                                 THREAD_ONPROC(tp, cpup);
 756                                 cpup->cpu_dispthread = tp;
 757                                 cpup->cpu_dispatch_pri = -1;
 758                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 759                                 cpup->cpu_chosen_level = -1;
 760                         }
 761                 } else {
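                              /*
                               * An offline CPU may only run its own interrupt
                               * threads and must not take work from other
                               * queues, so choose the idle thread.  Note that
                               * disp_lock_exit_high() below drops the lock but
                               * leaves spl raised.
                               */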
 762                         disp_lock_exit_high(&dp->disp_lock);
 763                         tp = cpup->cpu_idle_thread;
 764                         THREAD_ONPROC(tp, cpup);
 765                         cpup->cpu_dispthread = tp;
 766                         cpup->cpu_dispatch_pri = -1;
 767                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 768                         cpup->cpu_chosen_level = -1;
 769                 }
 770                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 771                     "disp_end:tid %p", tp);
 772                 return (tp);
 773         }
 774 
 775         dq = &dp->disp_q[pri];
 776         tp = dq->dq_first;
 777 
 778         ASSERT(tp != NULL);
 779         ASSERT(tp->t_schedflag & TS_LOAD);       /* thread must be swapped in */
 780 
 781         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 782 
 783         /*
 784          * Found it so remove it from queue.
 785          */
 786         dp->disp_nrunnable--;
 787         dq->dq_sruncnt--;
 788         if ((dq->dq_first = tp->t_link) == NULL) {
 789                 ulong_t *dqactmap = dp->disp_qactmap;
 790 
 791                 ASSERT(dq->dq_sruncnt == 0);
 792                 dq->dq_last = NULL;
 793 
  794                 /*
  795                  * The queue is empty, so the corresponding bit needs to be
  796                  * turned off in dqactmap.  If nrunnable != 0, we just took
  797                  * the last runnable thread off the highest non-empty
  798                  * queue, so recompute disp_maxrunpri.
  799                  */
 800                 maxrunword = pri >> BT_ULSHIFT;
 801                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 802 
 803                 if (dp->disp_nrunnable == 0) {
 804                         dp->disp_max_unbound_pri = -1;
 805                         dp->disp_maxrunpri = -1;
 806                 } else {
 807                         int ipri;
 808 
 809                         ipri = bt_gethighbit(dqactmap, maxrunword);
 810                         dp->disp_maxrunpri = ipri;
 811                         if (ipri < dp->disp_max_unbound_pri)
 812                                 dp->disp_max_unbound_pri = ipri;
 813                 }
 814         } else {
 815                 tp->t_link = NULL;
 816         }
 817 
 818         /*
 819          * Set TS_DONT_SWAP flag to prevent another processor from swapping
 820          * out this thread before we have a chance to run it.
 821          * While running, it is protected against swapping by t_lock.
 822          */
 823         tp->t_schedflag |= TS_DONT_SWAP;
 824         cpup->cpu_dispthread = tp;           /* protected by spl only */
 825         cpup->cpu_dispatch_pri = pri;
 826         ASSERT(pri == DISP_PRIO(tp));
 827         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 828         disp_lock_exit_high(&dp->disp_lock);     /* drop run queue lock */
 829 
 830         ASSERT(tp != NULL);
 831         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 832             "disp_end:tid %p", tp);
 833 
 834         if (disp_ratify(tp, kpq) == NULL)
 835                 goto reschedule;
 836 
 837         return (tp);
 838 }
 839 
 840 /*
 841  * swtch()
 842  *      Find best runnable thread and run it.
  843  *      Called with the current thread already switched to a new state:
  844  *      on a sleep queue, on a run queue, or stopped, but not zombied.
 845  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 846  *      Always drops spl to the base level (spl0()).
 847  */
 848 void
 849 swtch()
 850 {
 851         kthread_t       *t = curthread;
 852         kthread_t       *next;
 853         cpu_t           *cp;
 854 
 855         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 856 
 857         if (t->t_flag & T_INTR_THREAD)
 858                 cpu_intr_swtch_enter(t);
 859 
 860         if (t->t_intr != NULL) {
 861                 /*
  862                  * We are an interrupt thread.  Set up and return
 863                  * the interrupted thread to be resumed.
 864                  */
 865                 (void) splhigh();       /* block other scheduler action */
 866                 cp = CPU;               /* now protected against migration */
 867                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 868                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 869                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 870                 next = thread_unpin();
 871                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 872                 resume_from_intr(next);
 873         } else {
 874 #ifdef  DEBUG
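                      /*
                       * Sanity check: if an unlocked peek suggests that the
                       * current thread is still TS_ONPROC on this CPU without
                       * preemption protection, re-check under thread_lock and
                       * ASSERT that this is not actually the case.
                       */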
 875                 if (t->t_state == TS_ONPROC &&
 876                     t->t_disp_queue->disp_cpu == CPU &&
 877                     t->t_preempt == 0) {
 878                         thread_lock(t);
 879                         ASSERT(t->t_state != TS_ONPROC ||
 880                             t->t_disp_queue->disp_cpu != CPU ||
 881                             t->t_preempt != 0);      /* cannot migrate */
 882                         thread_unlock_nopreempt(t);
 883                 }
 884 #endif  /* DEBUG */
 885                 cp = CPU;
 886                 next = disp();          /* returns with spl high */
 887                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 888 
 889                 /* OK to steal anything left on run queue */
 890                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 891 
 892                 if (next != t) {
 893                         hrtime_t now;
 894 
 895                         now = gethrtime_unscaled();
 896                         pg_ev_thread_swtch(cp, now, t, next);
 897 
 898                         /*
 899                          * If t was previously in the TS_ONPROC state,
 900                          * setfrontdq and setbackdq won't have set its t_waitrq.
 901                          * Since we now finally know that we're switching away
 902                          * from this thread, set its t_waitrq if it is on a run
 903                          * queue.
 904                          */
 905                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 906                                 t->t_waitrq = now;
 907                         }
 908 
 909                         /*
 910                          * restore mstate of thread that we are switching to
 911                          */
 912                         restore_mstate(next);
 913 
 914                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 915                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 916                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 917 
 918                         if (dtrace_vtime_active)
 919                                 dtrace_vtime_switch(next);
 920 
 921                         resume(next);
 922                         /*
 923                          * The TR_RESUME_END and TR_SWTCH_END trace points
 924                          * appear at the end of resume(), because we may not
 925                          * return here
 926                          */
 927                 } else {
 928                         if (t->t_flag & T_INTR_THREAD)
 929                                 cpu_intr_swtch_exit(t);
 930                         /*
 931                          * Threads that enqueue themselves on a run queue defer
 932                          * setting t_waitrq. It is then either set in swtch()
 933                          * when the CPU is actually yielded, or not at all if it
 934                          * is remaining on the CPU.
 935                          * There is however a window between where the thread
 936                          * placed itself on a run queue, and where it selects
  937                          * itself in disp(), where a third party (e.g. clock()
 938                          * doing tick processing) may have re-enqueued this
 939                          * thread, setting t_waitrq in the process. We detect
 940                          * this race by noticing that despite switching to
 941                          * ourself, our t_waitrq has been set, and should be
 942                          * cleared.
 943                          */
 944                         if (t->t_waitrq != 0)
 945                                 t->t_waitrq = 0;
 946 
 947                         pg_ev_thread_remain(cp, t);
 948 
 949                         DTRACE_SCHED(remain__cpu);
 950                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 951                         (void) spl0();
 952                 }
 953         }
 954 }
 955 
 956 /*
 957  * swtch_from_zombie()
 958  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 959  *      eliminated from normal resume.
 960  *      Find best runnable thread and run it.
 961  *      Called with the current thread zombied.
 962  *      Zombies cannot migrate, so CPU references are safe.
 963  */
 964 void
 965 swtch_from_zombie()
 966 {
 967         kthread_t       *next;
 968         cpu_t           *cpu = CPU;
 969 
 970         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 971 
 972         ASSERT(curthread->t_state == TS_ZOMB);
 973 
 974         next = disp();                  /* returns with spl high */
 975         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 976         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 977         ASSERT(next != curthread);
 978         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 979 
 980         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 981 
 982         restore_mstate(next);
 983 
 984         if (dtrace_vtime_active)
 985                 dtrace_vtime_switch(next);
 986 
 987         resume_from_zombie(next);
 988         /*
 989          * The TR_RESUME_END and TR_SWTCH_END trace points
 990          * appear at the end of resume(), because we certainly will not
 991          * return here
 992          */
 993 }
 994 
 995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 996 
 997 /*
 998  * search_disp_queues()
 999  *      Search the given dispatch queues for thread tp.
1000  *      Return 1 if tp is found, otherwise return 0.
1001  */
1002 static int
1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 {
1005         dispq_t         *dq;
1006         dispq_t         *eq;
1007 
1008         disp_lock_enter_high(&dp->disp_lock);
1009 
1010         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011                 kthread_t       *rp;
1012 
1013                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 
1015                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016                         if (tp == rp) {
1017                                 disp_lock_exit_high(&dp->disp_lock);
1018                                 return (1);
1019                         }
1020         }
1021         disp_lock_exit_high(&dp->disp_lock);
1022 
1023         return (0);
1024 }
1025 
1026 /*
1027  * thread_on_queue()
1028  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1029  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030  */
1031 static int
1032 thread_on_queue(kthread_t *tp)
1033 {
1034         cpu_t           *cp;
1035         struct cpupart  *part;
1036 
1037         ASSERT(getpil() >= DISP_LEVEL);
1038 
1039         /*
1040          * Search the per-CPU dispatch queues for tp.
1041          */
1042         cp = CPU;
1043         do {
1044                 if (search_disp_queues(cp->cpu_disp, tp))
1045                         return (1);
1046         } while ((cp = cp->cpu_next_onln) != CPU);
1047 
1048         /*
1049          * Search the partition-wide kpreempt queues for tp.
1050          */
1051         part = CPU->cpu_part;
1052         do {
1053                 if (search_disp_queues(&part->cp_kp_queue, tp))
1054                         return (1);
1055         } while ((part = part->cp_next) != CPU->cpu_part);
1056 
1057         return (0);
1058 }
1059 
1060 #else
1061 
1062 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1063 
1064 #endif  /* DEBUG */
1065 
 1066 /*
 1067  * Like swtch(), but switch to a specified thread taken from another CPU.
 1068  *      Called with spl high.
 1069  */
1070 void
1071 swtch_to(kthread_t *next)
1072 {
1073         cpu_t                   *cp = CPU;
1074         hrtime_t                now;
1075 
1076         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 
1078         /*
1079          * Update context switch statistics.
1080          */
1081         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 
1083         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 
1085         now = gethrtime_unscaled();
1086         pg_ev_thread_swtch(cp, now, curthread, next);
1087 
1088         /* OK to steal anything left on run queue */
1089         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 
1091         /* record last execution time */
1092         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 
1094         /*
1095          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096          * won't have set its t_waitrq.  Since we now finally know that we're
1097          * switching away from this thread, set its t_waitrq if it is on a run
1098          * queue.
1099          */
1100         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101                 curthread->t_waitrq = now;
1102         }
1103 
1104         /* restore next thread to previously running microstate */
1105         restore_mstate(next);
1106 
1107         if (dtrace_vtime_active)
1108                 dtrace_vtime_switch(next);
1109 
1110         resume(next);
1111         /*
1112          * The TR_RESUME_END and TR_SWTCH_END trace points
1113          * appear at the end of resume(), because we may not
1114          * return here
1115          */
1116 }
1117 
1118 #define CPU_IDLING(pri) ((pri) == -1)
1119 
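      /*
       * If the specified CPU is currently running a lower-priority thread
       * than tpri, flag it for preemption: cpu_runrun for ordinary
       * preemption and cpu_kprunrun for kernel preemption, and poke the CPU
       * if it is not the one we are running on.
       */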
1120 static void
1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 {
1123         int     call_poke_cpu = 0;
1124         pri_t   cpupri = cp->cpu_dispatch_pri;
1125 
1126         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130                         cp->cpu_runrun = 1;
1131                         aston(cp->cpu_dispthread);
1132                         if (tpri < kpreemptpri && cp != CPU)
1133                                 call_poke_cpu = 1;
1134                 }
1135                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136                         cp->cpu_kprunrun = 1;
1137                         if (cp != CPU)
1138                                 call_poke_cpu = 1;
1139                 }
1140         }
1141 
1142         /*
 1143          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1144          */
1145         membar_enter();
1146 
1147         if (call_poke_cpu)
1148                 poke_cpu(cp->cpu_id);
1149 }
1150 
1151 /*
1152  * setbackdq() keeps runqs balanced such that the difference in length
1153  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 1154  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
 1155  * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156  * try to keep runqs perfectly balanced regardless of the thread priority.
1157  */
1158 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1159 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1160 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 
1162 /*
1163  * Macro that evaluates to true if it is likely that the thread has cache
1164  * warmth. This is based on the amount of time that has elapsed since the
1165  * thread last ran. If that amount of time is less than "rechoose_interval"
1166  * ticks, then we decide that the thread has enough cache warmth to warrant
1167  * some affinity for t->t_cpu.
1168  */
1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170         ((thread == curthread) ||       \
1171         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 
1173 /*
1174  * Put the specified thread on the front/back of the dispatcher queue
1175  * corresponding to its current priority.
1176  *
1177  * Called with the thread in transition, onproc or stopped state and locked
1178  * (transition implies locked) and at high spl.  Returns with the thread in
1179  * TS_RUN state and still locked.
1180  */
1181 static void
1182 setfrontbackdq(kthread_t *tp, boolean_t front)
1183 {
1184         dispq_t         *dq;
1185         disp_t          *dp;
1186         cpu_t           *cp;
1187         pri_t           tpri;
1188         boolean_t       bound;
1189         boolean_t       self;
1190 
1191         ASSERT(THREAD_LOCK_HELD(tp));
1192         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1193         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1194 
1195         /*
 1196          * If the thread is "swapped" or on the swap queue, don't
1197          * queue it, but wake sched.
1198          */
1199         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1200                 disp_swapped_setrun(tp);
1201                 return;
1202         }
1203 
1204         self  = (tp == curthread);
1205         bound = (tp->t_bound_cpu || tp->t_weakbound_cpu);
1206 
1207         tpri = DISP_PRIO(tp);
1208         if (ncpus == 1)
1209                 cp = tp->t_cpu;
1210         else if (!bound) {
1211                 if (tpri >= kpqpri) {
1212                         setkpdq(tp, front ? SETKP_FRONT : SETKP_BACK);
1213                         return;
1214                 }
1215 
1216                 cp = tp->t_cpu;
1217 
1218                 if (!front) {
1219                         /*
1220                          * We'll generally let this thread continue to run where
1221                          * it last ran...but will consider migration if:
 1222                          * - The thread probably doesn't have much cache warmth.
 1223                          * - The CPU where it last ran is the target of an offline
 1224                          *   request.
 1225                          * - The thread last ran outside its home lgroup.
1226                          */
1227                         if ((!THREAD_HAS_CACHE_WARMTH(tp)) || (cp == cpu_inmotion)) {
1228                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1229                         } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1230                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1231                                     self ? tp->t_cpu : NULL);
1232                         }
1233 
1234                 }
1235 
1236                 if (tp->t_cpupart == cp->cpu_part) {
1237                         if (front) {
1238                                 /*
1239                                  * We'll generally let this thread continue to run
1240                                  * where it last ran, but will consider migration if:
 1241                                  * - The thread last ran outside its home lgroup.
1242                                  * - The CPU where it last ran is the target of an
1243                                  *   offline request (a thread_nomigrate() on the in
1244                                  *   motion CPU relies on this when forcing a preempt).
1245                                  * - The thread isn't the highest priority thread where
1246                                  *   it last ran, and it is considered not likely to
1247                                  *   have significant cache warmth.
1248                                  */
1249                                 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1250                                     (cp == cpu_inmotion)) {
1251                                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1252                                             self ? cp : NULL);
1253                                 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1254                                     (!THREAD_HAS_CACHE_WARMTH(tp))) {
1255                                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1256                                             NULL);
1257                                 }
1258                         } else {
1259                                 int     qlen;
1260 
1261                                 /*
1262                                  * Perform any CMT load balancing
1263                                  */
1264                                 cp = cmt_balance(tp, cp);
1265 
1266                                 /*
1267                                  * Balance across the run queues
1268                                  */
1269                                 qlen = RUNQ_LEN(cp, tpri);
1270                                 if (tpri >= RUNQ_MATCH_PRI &&
1271                                     !(tp->t_schedflag & TS_RUNQMATCH))
1272                                         qlen -= RUNQ_MAX_DIFF;
1273                                 if (qlen > 0) {
1274                                         cpu_t *newcp;
1275 
1276                                         if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1277                                                 newcp = cp->cpu_next_part;
1278                                         } else if ((newcp = cp->cpu_next_lpl) == cp) {
1279                                                 newcp = cp->cpu_next_part;
1280                                         }
1281 
1282                                         if (RUNQ_LEN(newcp, tpri) < qlen) {
1283                                                 DTRACE_PROBE3(runq__balance,
1284                                                     kthread_t *, tp,
1285                                                     cpu_t *, cp, cpu_t *, newcp);
1286                                                 cp = newcp;
1287                                         }
1288                                 }
1289                         }
1290                 } else {
1291                         /*
1292                          * Migrate to a cpu in the new partition.
1293                          */
1294                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1295                             tp->t_lpl, tp->t_pri, NULL);
1296                 }
1297 
1298                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1299         } else {
1300                 /*
1301                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1302                  * a short time until weak binding that existed when the
1303                  * strong binding was established has dropped) so we must
1304                  * favour weak binding over strong.
1305                  */
1306                 cp = tp->t_weakbound_cpu ?
1307                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1308         }
1309 
1310         /*
1311          * A thread that is ONPROC may be temporarily placed on the run queue
1312          * but then chosen to run again by disp.  If the thread we're placing on
1313          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1314          * replacement process is actually scheduled in swtch().  In this
1315          * situation, curthread is the only thread that could be in the ONPROC
1316          * state.
1317          */
1318         if ((!self) && (tp->t_waitrq == 0)) {
1319                 hrtime_t curtime;
1320 
1321                 curtime = gethrtime_unscaled();
1322                 (void) cpu_update_pct(tp, curtime);
1323                 tp->t_waitrq = curtime;
1324         } else {
1325                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1326         }
1327 
1328         dp = cp->cpu_disp;
1329         disp_lock_enter_high(&dp->disp_lock);
1330 
1331         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, front);
1332         if (front) {
1333                 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri,
1334                         tp);
1335         } else {
1336                 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1337                         tpri, cp, tp);
1338         }
1339 
1340 #ifndef NPROBE
1341         /* Kernel probe */
1342         if (tnf_tracing_active)
1343                 tnf_thread_queue(tp, cp, tpri);
1344 #endif /* NPROBE */
1345 
1346         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1347 
1348         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1349         tp->t_disp_queue = dp;
1350         tp->t_link = NULL;
1351 
1352         dq = &dp->disp_q[tpri];
1353         dp->disp_nrunnable++;
1354         if (!bound)
1355                 dp->disp_steal = 0;
1356         membar_enter();
1357 
1358         if (dq->dq_sruncnt++ != 0) {
1359                 if (front) {
1360                         ASSERT(dq->dq_last != NULL);
1361                         tp->t_link = dq->dq_first;
1362                         dq->dq_first = tp;
1363                 } else {
1364                         ASSERT(dq->dq_first != NULL);
1365                         dq->dq_last->t_link = tp;
1366                         dq->dq_last = tp;
1367                 }
1368         } else {
1369                 ASSERT(dq->dq_first == NULL);
1370                 ASSERT(dq->dq_last == NULL);
1371                 dq->dq_first = dq->dq_last = tp;
1372                 BT_SET(dp->disp_qactmap, tpri);
1373                 if (tpri > dp->disp_maxrunpri) {
1374                         dp->disp_maxrunpri = tpri;
1375                         membar_enter();
1376                         cpu_resched(cp, tpri);
1377                 }
1378         }
1379 
1380         if (!bound && tpri > dp->disp_max_unbound_pri) {
1381                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1382                         /*
1383                          * If there are no other unbound threads on the
1384                          * run queue, don't allow other CPUs to steal
1385                          * this thread while we are in the middle of a
1386                          * context switch. We may just switch to it
1387                          * again right away. CPU_DISP_DONTSTEAL is cleared
1388                          * in swtch and swtch_to.
1389                          */
1390                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1391                 }
1392                 dp->disp_max_unbound_pri = tpri;
1393         }
1394 
1395         (*disp_enq_thread)(cp, bound);
1396 }
1397 
1398 /*
1399  * Put the specified thread on the back of the dispatcher
1400  * queue corresponding to its current priority.
1401  *
1402  * Called with the thread in transition, onproc or stopped state
1403  * and locked (transition implies locked) and at high spl.
1404  * Returns with the thread in TS_RUN state and still locked.
1405  */
1406 void
1407 setbackdq(kthread_t *tp)
1408 {
1409         setfrontbackdq(tp, B_FALSE);
1410 }
1411 
1412 /*
1413  * Put the specified thread on the front of the dispatcher
1414  * queue corresponding to its current priority.
1415  *
1416  * Called with the thread in transition, onproc or stopped state
1417  * and locked (transition implies locked) and at high spl.
1418  * Returns with the thread in TS_RUN state and still locked.
1419  */
1420 void
1421 setfrontdq(kthread_t *tp)
1422 {
1423         setfrontbackdq(tp, B_TRUE);
1424 }
1425 
1426 /*
1427  * Put a high-priority unbound thread on the kp queue
1428  */
1429 static void
1430 setkpdq(kthread_t *tp, int borf)
1431 {
1432         dispq_t *dq;
1433         disp_t  *dp;
1434         cpu_t   *cp;
1435         pri_t   tpri;
1436 
1437         tpri = DISP_PRIO(tp);
1438 
1439         dp = &tp->t_cpupart->cp_kp_queue;
1440         disp_lock_enter_high(&dp->disp_lock);
1441 
1442         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1443 
1444         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1445         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1446         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1447         tp->t_disp_queue = dp;
1448         dp->disp_nrunnable++;
1449         dq = &dp->disp_q[tpri];
1450 
1451         if (dq->dq_sruncnt++ != 0) {
1452                 if (borf == SETKP_BACK) {
1453                         ASSERT(dq->dq_first != NULL);
1454                         tp->t_link = NULL;
1455                         dq->dq_last->t_link = tp;
1456                         dq->dq_last = tp;
1457                 } else {
1458                         ASSERT(dq->dq_last != NULL);
1459                         tp->t_link = dq->dq_first;
1460                         dq->dq_first = tp;
1461                 }
1462         } else {
1463                 if (borf == SETKP_BACK) {
1464                         ASSERT(dq->dq_first == NULL);
1465                         ASSERT(dq->dq_last == NULL);
1466                         dq->dq_first = dq->dq_last = tp;
1467                 } else {
1468                         ASSERT(dq->dq_last == NULL);
1469                         ASSERT(dq->dq_first == NULL);
1470                         tp->t_link = NULL;
1471                         dq->dq_first = dq->dq_last = tp;
1472                 }
1473                 BT_SET(dp->disp_qactmap, tpri);
1474                 if (tpri > dp->disp_max_unbound_pri)
1475                         dp->disp_max_unbound_pri = tpri;
1476                 if (tpri > dp->disp_maxrunpri) {
1477                         dp->disp_maxrunpri = tpri;
1478                         membar_enter();
1479                 }
1480         }
1481 
1482         cp = tp->t_cpu;
1483         if (tp->t_cpupart != cp->cpu_part) {
1484                 /* migrate to a cpu in the new partition */
1485                 cp = tp->t_cpupart->cp_cpulist;
1486         }
1487         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1488         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1489         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1490 
1491 #ifndef NPROBE
1492         /* Kernel probe */
1493         if (tnf_tracing_active)
1494                 tnf_thread_queue(tp, cp, tpri);
1495 #endif /* NPROBE */
1496 
1497         if (cp->cpu_chosen_level < tpri)
1498                 cp->cpu_chosen_level = tpri;
1499         cpu_resched(cp, tpri);
1500         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1501         (*disp_enq_thread)(cp, 0);
1502 }
1503 
1504 /*
1505  * Remove a thread from the dispatcher queue if it is on one.
1506  * It is not an error if the thread is not found; we return whether
1507  * or not it was found so that the caller can check.
1508  */
1509 int
1510 dispdeq(kthread_t *tp)
1511 {
1512         disp_t          *dp;
1513         dispq_t         *dq;
1514         kthread_t       *rp;
1515         kthread_t       *trp;
1516         kthread_t       **ptp;
1517         int             tpri;
1518 
1519         ASSERT(THREAD_LOCK_HELD(tp));
1520 
1521         if (tp->t_state != TS_RUN)
1522                 return (0);
1523 
1524         /*
1525          * The thread is "swapped" or is on the swap queue and
1526          * hence no longer on the run queue, so return true.
1527          */
1528         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1529                 return (1);
1530 
1531         tpri = DISP_PRIO(tp);
1532         dp = tp->t_disp_queue;
1533         ASSERT(tpri < dp->disp_npri);
1534         dq = &dp->disp_q[tpri];
1535         ptp = &dq->dq_first;
1536         rp = *ptp;
1537         trp = NULL;
1538 
1539         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1540 
1541         /*
1542          * Search for thread in queue.
1543          * Double links would simplify this at the expense of disp/setrun.
1544          */
1545         while (rp != tp && rp != NULL) {
1546                 trp = rp;
1547                 ptp = &trp->t_link;
1548                 rp = trp->t_link;
1549         }
1550 
1551         if (rp == NULL) {
1552                 panic("dispdeq: thread not on queue");
1553         }
1554 
1555         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1556 
1557         /*
1558          * Found it so remove it from queue.
1559          */
1560         if ((*ptp = rp->t_link) == NULL)
1561                 dq->dq_last = trp;
1562 
1563         dp->disp_nrunnable--;
1564         if (--dq->dq_sruncnt == 0) {
1565                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1566                 if (dp->disp_nrunnable == 0) {
1567                         dp->disp_max_unbound_pri = -1;
1568                         dp->disp_maxrunpri = -1;
1569                 } else if (tpri == dp->disp_maxrunpri) {
1570                         int ipri;
1571 
1572                         ipri = bt_gethighbit(dp->disp_qactmap,
1573                             dp->disp_maxrunpri >> BT_ULSHIFT);
1574                         if (ipri < dp->disp_max_unbound_pri)
1575                                 dp->disp_max_unbound_pri = ipri;
1576                         dp->disp_maxrunpri = ipri;
1577                 }
1578         }
1579         tp->t_link = NULL;
1580         THREAD_TRANSITION(tp);          /* put in intermediate state */
1581         return (1);
1582 }
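
/*
 * A minimal sketch (hypothetical helper; real scheduling classes go through
 * their own priority-change routines) of the usual dequeue/modify/requeue
 * pattern built on dispdeq(): a dispatcher-visible property such as the
 * dispatch priority can only be changed once the thread has been pulled
 * off the priority-indexed run queue.  The thread lock is held throughout,
 * and setbackdq() handles the case where the thread is swapped out.
 *
 *	static void
 *	example_change_pri(kthread_t *tp, pri_t newpri)
 *	{
 *		ASSERT(THREAD_LOCK_HELD(tp));
 *		if (tp->t_state == TS_RUN && dispdeq(tp)) {
 *			tp->t_pri = newpri;
 *			setbackdq(tp);
 *		} else {
 *			tp->t_pri = newpri;
 *		}
 *	}
 */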
1583 
1584 
1585 /*
1586  * dq_sruninc and dq_srundec are public functions for
1587  * incrementing/decrementing the sruncnts when a thread on
1588  * a dispatcher queue is made schedulable/unschedulable by
1589  * resetting the TS_LOAD flag.
1590  *
1591  * The caller MUST hold the thread lock, and therefore the dispatcher
1592  * queue lock, so that the operation which changes the flag, the
1593  * operation that checks whether the thread is on a disp queue, and
1594  * the call to this function are one atomic operation with respect
1595  * to interrupts.
1596  */
1597 
1598 /*
1599  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1600  */
1601 void
1602 dq_sruninc(kthread_t *t)
1603 {
1604         ASSERT(t->t_state == TS_RUN);
1605         ASSERT(t->t_schedflag & TS_LOAD);
1606 
1607         THREAD_TRANSITION(t);
1608         setfrontdq(t);
1609 }
1610 
1611 /*
1612  * See comment on calling conventions above.
1613  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1614  */
1615 void
1616 dq_srundec(kthread_t *t)
1617 {
1618         ASSERT(t->t_schedflag & TS_LOAD);
1619 
1620         (void) dispdeq(t);
1621         disp_swapped_enq(t);
1622 }
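
/*
 * A condensed sketch (hypothetical helpers; the real calls are made from
 * sched) of how the calling conventions above pair these routines with
 * TS_LOAD updates under the thread lock: TS_LOAD is set before
 * dq_sruninc() on swap-in, and cleared only after dq_srundec() on
 * swap-out.
 *
 *	static void
 *	example_swapin_notify(kthread_t *tp)
 *	{
 *		thread_lock(tp);
 *		tp->t_schedflag |= TS_LOAD;
 *		dq_sruninc(tp);
 *		thread_unlock(tp);
 *	}
 *
 *	static void
 *	example_swapout_notify(kthread_t *tp)
 *	{
 *		thread_lock(tp);
 *		dq_srundec(tp);
 *		tp->t_schedflag &= ~TS_LOAD;
 *		thread_unlock(tp);
 *	}
 */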
1623 
1624 /*
1625  * Change the dispatcher lock of thread to the "swapped_lock"
1626  * and return with thread lock still held.
1627  *
1628  * Called with thread_lock held, in transition state, and at high spl.
1629  */
1630 void
1631 disp_swapped_enq(kthread_t *tp)
1632 {
1633         ASSERT(THREAD_LOCK_HELD(tp));
1634         ASSERT(tp->t_schedflag & TS_LOAD);
1635 
1636         switch (tp->t_state) {
1637         case TS_RUN:
1638                 disp_lock_enter_high(&swapped_lock);
1639                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1640                 break;
1641         case TS_ONPROC:
1642                 disp_lock_enter_high(&swapped_lock);
1643                 THREAD_TRANSITION(tp);
1644                 wake_sched_sec = 1;             /* tell clock to wake sched */
1645                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1646                 break;
1647         default:
1648                 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1649         }
1650 }
1651 
1652 /*
1653  * This routine is called by setbackdq/setfrontdq if the thread is
1654  * either not loaded, or loaded but on the swap queue.
1655  *
1656  * Thread state TS_SLEEP implies that a swapped thread
1657  * has been woken up and needs to be swapped in by the swapper.
1658  *
1659  * Thread state TS_RUN implies that the priority of a swapped
1660  * thread is being increased by its scheduling class (e.g. ts_update).
1661  */
1662 static void
1663 disp_swapped_setrun(kthread_t *tp)
1664 {
1665         ASSERT(THREAD_LOCK_HELD(tp));
1666         ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1667 
1668         switch (tp->t_state) {
1669         case TS_SLEEP:
1670                 disp_lock_enter_high(&swapped_lock);
1671                 /*
1672                  * Wakeup sched immediately (i.e., next tick) if the
1673                  * thread priority is above maxclsyspri.
1674                  */
1675                 if (DISP_PRIO(tp) > maxclsyspri)
1676                         wake_sched = 1;
1677                 else
1678                         wake_sched_sec = 1;
1679                 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1680                 break;
1681         case TS_RUN:                            /* called from ts_update */
1682                 break;
1683         default:
1684                 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1685         }
1686 }
1687 
1688 /*
1689  *      Make a thread give up its processor.  Find the processor on
1690  *      which this thread is executing, and have that processor
1691  *      preempt.
1692  *
1693  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1694  *      they are running at kernel priorities.  To implement this, we always
1695  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1696  *      calls cpu_surrender() very often, we only preempt if there is anyone
1697  *      competing with us.
1698  */
1699 void
1700 cpu_surrender(kthread_t *tp)
1701 {
1702         cpu_t   *cpup;
1703         int     max_pri;
1704         int     max_run_pri;
1705         klwp_t  *lwp;
1706 
1707         ASSERT(THREAD_LOCK_HELD(tp));
1708 
1709         if (tp->t_state != TS_ONPROC)
1710                 return;
1711         cpup = tp->t_disp_queue->disp_cpu;        /* CPU thread dispatched to */
1712         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1713         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1714         if (max_pri < max_run_pri)
1715                 max_pri = max_run_pri;
1716 
1717         if (tp->t_cid == sysdccid) {
1718                 uint_t t_pri = DISP_PRIO(tp);
1719                 if (t_pri > max_pri)
1720                         return;         /* we are not competing w/ anyone */
1721                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1722         } else {
1723                 cpup->cpu_runrun = 1;
1724                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1725                         cpup->cpu_kprunrun = 1;
1726                 }
1727         }
1728 
1729         /*
1730          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1731          */
1732         membar_enter();
1733 
1734         DTRACE_SCHED1(surrender, kthread_t *, tp);
1735 
1736         /*
1737          * Make the target thread take an excursion through trap()
1738          * to do preempt() (unless we're already in trap or post_syscall,
1739          * calling cpu_surrender via CL_TRAPRET).
1740          */
1741         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1742             lwp->lwp_state != LWP_USER) {
1743                 aston(tp);
1744                 if (cpup != CPU)
1745                         poke_cpu(cpup->cpu_id);
1746         }
1747         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1748             "cpu_surrender:tid %p cpu %p", tp, cpup);
1749 }
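
/*
 * A minimal sketch (hypothetical helper; the real callers are the
 * scheduling classes) of the usual way cpu_surrender() is used: after
 * lowering the effective priority of a running thread, ask the dispatcher
 * to make that thread's CPU reschedule.  cpu_surrender() itself is a
 * no-op unless the thread is ONPROC.
 *
 *	static void
 *	example_demote_running(kthread_t *tp, pri_t newpri)
 *	{
 *		ASSERT(THREAD_LOCK_HELD(tp));
 *		tp->t_pri = newpri;
 *		if (tp->t_state == TS_ONPROC)
 *			cpu_surrender(tp);
 *	}
 */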
1750 
1751 /*
1752  * Commit to and ratify a scheduling decision
1753  */
1754 /*ARGSUSED*/
1755 static kthread_t *
1756 disp_ratify(kthread_t *tp, disp_t *kpq)
1757 {
1758         pri_t   tpri, maxpri;
1759         pri_t   maxkpri;
1760         cpu_t   *cpup;
1761 
1762         ASSERT(tp != NULL);
1763         /*
1764          * Commit to, then ratify scheduling decision
1765          */
1766         cpup = CPU;
1767         if (cpup->cpu_runrun != 0)
1768                 cpup->cpu_runrun = 0;
1769         if (cpup->cpu_kprunrun != 0)
1770                 cpup->cpu_kprunrun = 0;
1771         if (cpup->cpu_chosen_level != -1)
1772                 cpup->cpu_chosen_level = -1;
1773         membar_enter();
1774         tpri = DISP_PRIO(tp);
1775         maxpri = cpup->cpu_disp->disp_maxrunpri;
1776         maxkpri = kpq->disp_maxrunpri;
1777         if (maxpri < maxkpri)
1778                 maxpri = maxkpri;
1779         if (tpri < maxpri) {
1780                 /*
1781                  * should have done better
1782                  * put this one back and indicate to try again
1783                  */
1784                 cpup->cpu_dispthread = curthread;    /* fixup dispthread */
1785                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1786                 thread_lock_high(tp);
1787                 THREAD_TRANSITION(tp);
1788                 setfrontdq(tp);
1789                 thread_unlock_nopreempt(tp);
1790 
1791                 tp = NULL;
1792         }
1793         return (tp);
1794 }
1795 
1796 /*
1797  * See if there is any work on the dispatcher queue for other CPUs.
1798  * If there is, dequeue the best thread and return.
1799  */
1800 static kthread_t *
1801 disp_getwork(cpu_t *cp)
1802 {
1803         cpu_t           *ocp;           /* other CPU */
1804         cpu_t           *ocp_start;
1805         cpu_t           *tcp;           /* target local CPU */
1806         kthread_t       *tp;
1807         kthread_t       *retval = NULL;
1808         pri_t           maxpri;
1809         disp_t          *kpq;           /* kp queue for this partition */
1810         lpl_t           *lpl, *lpl_leaf;
1811         int             leafidx, startidx;
1812         hrtime_t        stealtime;
1813         lgrp_id_t       local_id;
1814 
1815         maxpri = -1;
1816         tcp = NULL;
1817 
1818         kpq = &cp->cpu_part->cp_kp_queue;
1819         while (kpq->disp_maxrunpri >= 0) {
1820                 /*
1821                  * Try to take a thread from the kp_queue.
1822                  */
1823                 tp = (disp_getbest(kpq));
1824                 if (tp)
1825                         return (disp_ratify(tp, kpq));
1826         }
1827 
1828         kpreempt_disable();             /* protect the cpu_active list */
1829 
1830         /*
1831          * Try to find something to do on another CPU's run queue.
1832          * Loop through all other CPUs looking for the one with the highest
1833          * priority unbound thread.
1834          *
1835          * On NUMA machines, the partition's CPUs are consulted in order of
1836          * distance from the current CPU. This way, the first available
1837          * work found is also the closest, and will suffer the least
1838          * from being migrated.
1839          */
1840         lpl = lpl_leaf = cp->cpu_lpl;
1841         local_id = lpl_leaf->lpl_lgrpid;
1842         leafidx = startidx = 0;
1843 
1844         /*
1845          * This loop traverses the lpl hierarchy. Higher level lpls represent
1846          * broader levels of locality
1847          */
1848         do {
1849                 /* This loop iterates over the lpl's leaves */
1850                 do {
1851                         if (lpl_leaf != cp->cpu_lpl)
1852                                 ocp = lpl_leaf->lpl_cpus;
1853                         else
1854                                 ocp = cp->cpu_next_lpl;
1855 
1856                         /* This loop iterates over the CPUs in the leaf */
1857                         ocp_start = ocp;
1858                         do {
1859                                 pri_t pri;
1860 
1861                                 ASSERT(CPU_ACTIVE(ocp));
1862 
1863                                 /*
1864                                  * End our stroll around this lpl if:
1865                                  *
1866                                  * - Something became runnable on the local
1867                                  *   queue...which also ends our stroll around
1868                                  *   the partition.
1869                                  *
1870                                  * - We happen across another idle CPU.
1871                                  *   Since it is patrolling the next portion
1872                                  *   of the lpl's list (assuming it's not
1873                                  *   halted, or busy servicing an interrupt),
1874                                  *   move to the next higher level of locality.
1875                                  */
1876                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1877                                         kpreempt_enable();
1878                                         return (NULL);
1879                                 }
1880                                 if (ocp->cpu_dispatch_pri == -1) {
1881                                         if (ocp->cpu_disp_flags &
1882                                             CPU_DISP_HALTED ||
1883                                             ocp->cpu_intr_actv != 0)
1884                                                 continue;
1885                                         else
1886                                                 goto next_level;
1887                                 }
1888 
1889                                 /*
1890                                  * If there's only one thread and the CPU
1891                                  * is in the middle of a context switch,
1892                                  * or it's currently running the idle thread,
1893                                  * don't steal it.
1894                                  */
1895                                 if ((ocp->cpu_disp_flags &
1896                                     CPU_DISP_DONTSTEAL) &&
1897                                     ocp->cpu_disp->disp_nrunnable == 1)
1898                                         continue;
1899 
1900                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
1901                                 if (pri > maxpri) {
1902                                         /*
1903                                          * Don't steal threads that we attempted
1904                                          * to steal recently until they're ready
1905                                          * to be stolen again.
1906                                          */
1907                                         stealtime = ocp->cpu_disp->disp_steal;
1908                                         if (stealtime == 0 ||
1909                                             stealtime - gethrtime() <= 0) {
1910                                                 maxpri = pri;
1911                                                 tcp = ocp;
1912                                         } else {
1913                                                 /*
1914                                                  * Don't update tcp, just set
1915                                                  * the retval to T_DONTSTEAL, so
1916                                                  * that if no acceptable CPUs
1917                                                  * are found the return value
1918                                                  * will be T_DONTSTEAL rather
1919                                                  * then NULL.
1920                                                  */
1921                                                 retval = T_DONTSTEAL;
1922                                         }
1923                                 }
1924                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1925 
1926                         /*
1927                          * Iterate to the next leaf lpl in the resource set
1928                          * at this level of locality. If we hit the end of
1929                          * the set, wrap back around to the beginning.
1930                          *
1931                          * Note: This iteration is NULL terminated for a reason;
1932                          * see lpl_topo_bootstrap() in lgrp.c for details.
1933                          */
1934                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1935                                 leafidx = 0;
1936                                 lpl_leaf = lpl->lpl_rset[leafidx];
1937                         }
1938                 } while (leafidx != startidx);
1939 
1940 next_level:
1941                 /*
1942                  * Expand the search to include farther away CPUs (next
1943                  * locality level). The closer CPUs that have already been
1944                  * checked will be checked again. In doing so, idle CPUs
1945                  * will tend to be more aggressive about stealing from CPUs
1946                  * that are closer (since the closer CPUs will be considered
1947                  * more often).
1948                  * Begin at this level with the CPUs local leaf lpl.
1949                  */
1950                 if ((lpl = lpl->lpl_parent) != NULL) {
1951                         leafidx = startidx = lpl->lpl_id2rset[local_id];
1952                         lpl_leaf = lpl->lpl_rset[leafidx];
1953                 }
1954         } while (!tcp && lpl);
1955 
1956         kpreempt_enable();
1957 
1958         /*
1959          * If another queue looks good, and there is still nothing on
1960          * the local queue, try to transfer one or more threads
1961          * from it to our queue.
1962          */
1963         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1964                 tp = disp_getbest(tcp->cpu_disp);
1965                 if (tp == NULL || tp == T_DONTSTEAL)
1966                         return (tp);
1967                 return (disp_ratify(tp, kpq));
1968         }
1969         return (retval);
1970 }
1971 
1972 
1973 /*
1974  * disp_fix_unbound_pri()
1975  *      Determines the maximum priority of unbound threads on the queue.
1976  *      The priority is kept for the queue, but is only increased, never
1977  *      reduced unless some CPU is looking for something on that queue.
1978  *
1979  *      The priority argument is the known upper limit.
1980  *
1981  *      Perhaps this should be kept accurately, but that probably means
1982  *      separate bitmaps for bound and unbound threads.  Since only idled
1983  *      CPUs will have to do this recalculation, it seems better this way.
1984  */
1985 static void
1986 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1987 {
1988         kthread_t       *tp;
1989         dispq_t         *dq;
1990         ulong_t         *dqactmap = dp->disp_qactmap;
1991         ulong_t         mapword;
1992         int             wx;
1993 
1994         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1995 
1996         ASSERT(pri >= 0);                    /* checked by caller */
1997 
1998         /*
1999          * Start the search at the next lowest priority below the supplied
2000          * priority.  This depends on the bitmap implementation.
2001          */
2002         do {
2003                 wx = pri >> BT_ULSHIFT;           /* index of word in map */
2004 
2005                 /*
2006                  * Form mask for all lower priorities in the word.
2007                  */
2008                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2009 
2010                 /*
2011                  * Get next lower active priority.
2012                  */
2013                 if (mapword != 0) {
2014                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2015                 } else if (wx > 0) {
2016                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2017                         if (pri < 0)
2018                                 break;
2019                 } else {
2020                         pri = -1;
2021                         break;
2022                 }
2023 
2024                 /*
2025                  * Search the queue for unbound, runnable threads.
2026                  */
2027                 dq = &dp->disp_q[pri];
2028                 tp = dq->dq_first;
2029 
2030                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2031                         tp = tp->t_link;
2032                 }
2033 
2034                 /*
2035                  * If a thread was found, set the priority and return.
2036                  */
2037         } while (tp == NULL);
2038 
2039         /*
2040          * pri holds the maximum unbound thread priority or -1.
2041          */
2042         if (dp->disp_max_unbound_pri != pri)
2043                 dp->disp_max_unbound_pri = pri;
2044 }
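
/*
 * A worked example of the bitmap arithmetic above, assuming 64-bit words
 * (so BT_ULSHIFT is 6 and each word of disp_qactmap covers 64 priorities).
 * Suppose the search starts at pri == 70:
 *
 *	wx      = 70 >> 6 = 1			word covering pris 64..127
 *	mask    = BT_BIW(70) - 1 = (1 << 6) - 1	bits for pris 64..69
 *	mapword = dqactmap[1] & mask
 *
 * If only priority 67 is active in that range, mapword has bit 3 set,
 * highbit(mapword) returns 4, and the next pri examined is
 * (1 << 6) + 4 - 1 == 67.  If mapword were 0, bt_gethighbit(dqactmap, 0)
 * would be used to find the highest active priority among 0..63, or -1
 * if no bits are set in that part of the map.
 */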
2045 
2046 /*
2047  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2048  *      check if the CPU to which it was previously bound should have
2049  *      its disp_max_unbound_pri increased.
2050  */
2051 void
2052 disp_adjust_unbound_pri(kthread_t *tp)
2053 {
2054         disp_t *dp;
2055         pri_t tpri;
2056 
2057         ASSERT(THREAD_LOCK_HELD(tp));
2058 
2059         /*
2060          * Don't do anything if the thread is not bound, or
2061          * currently not runnable or swapped out.
2062          */
2063         if (tp->t_bound_cpu == NULL ||
2064             tp->t_state != TS_RUN ||
2065             tp->t_schedflag & TS_ON_SWAPQ)
2066                 return;
2067 
2068         tpri = DISP_PRIO(tp);
2069         dp = tp->t_bound_cpu->cpu_disp;
2070         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2071         if (tpri > dp->disp_max_unbound_pri)
2072                 dp->disp_max_unbound_pri = tpri;
2073 }
2074 
2075 /*
2076  * disp_getbest()
2077  *   De-queue the highest priority unbound runnable thread.
2078  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2079  *   Returns NULL if nothing found.
2080  *   Returns T_DONTSTEAL if the thread was not stealable,
2081  *   so that the caller will try again later.
2082  *
2083  *   Passed a pointer to a dispatch queue that is not associated with
2084  *   this CPU.
2085  */
2086 static kthread_t *
2087 disp_getbest(disp_t *dp)
2088 {
2089         kthread_t       *tp;
2090         dispq_t         *dq;
2091         pri_t           pri;
2092         cpu_t           *cp, *tcp;
2093         boolean_t       allbound;
2094 
2095         disp_lock_enter(&dp->disp_lock);
2096 
2097         /*
2098          * If there is nothing to run, or the CPU is in the middle of a
2099          * context switch of the only thread, return NULL.
2100          */
2101         tcp = dp->disp_cpu;
2102         cp = CPU;
2103         pri = dp->disp_max_unbound_pri;
2104         if (pri == -1 ||
2105             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2106             tcp->cpu_disp->disp_nrunnable == 1)) {
2107                 disp_lock_exit_nopreempt(&dp->disp_lock);
2108                 return (NULL);
2109         }
2110 
2111         dq = &dp->disp_q[pri];
2112 
2114         /*
2115          * Assume that all threads are bound on this queue, and change it
2116          * later when we find out that it is not the case.
2117          */
2118         allbound = B_TRUE;
2119         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2120                 hrtime_t now, nosteal, rqtime;
2121 
2122                 /*
2123                  * Skip over bound threads which could be here even
2124                  * though disp_max_unbound_pri indicated this level.
2125                  */
2126                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2127                         continue;
2128 
2129                 /*
2130                  * We've got some unbound threads on this queue, so turn
2131                  * the allbound flag off now.
2132                  */
2133                 allbound = B_FALSE;
2134 
2135                 /*
2136                  * The thread is a candidate for stealing from its run queue. We
2137                  * don't want to steal threads that became runnable just a
2138                  * moment ago. This improves CPU affinity for threads that get
2139                  * preempted for short periods of time and go back on the run
2140                  * queue.
2141                  *
2142                  * We want to let it stay on its run queue if it was only placed
2143                  * there recently and it was running on the same CPU before that
2144                  * to preserve its cache investment. For the thread to remain on
2145                  * its run queue, ALL of the following conditions must be
2146                  * satisfied:
2147                  *
2148                  * - the disp queue should not be the kernel preemption queue
2149                  * - delayed idle stealing should not be disabled
2150                  * - nosteal_nsec should be non-zero
2151                  * - it should run with user priority
2152                  * - it should be on the run queue of the CPU where it was
2153                  *   running before being placed on the run queue
2154                  * - it should be the only thread on the run queue (to prevent
2155                  *   extra scheduling latency for other threads)
2156                  * - it should sit on the run queue for less than per-chip
2157                  *   nosteal interval or global nosteal interval
2158                  * - in case of CPUs with shared cache it should sit in a run
2159                  *   queue of a CPU from a different chip
2160                  *
2161                  * The checks are arranged so that the ones that are faster are
2162                  * placed earlier.
2163                  */
2164                 if (tcp == NULL ||
2165                     pri >= minclsyspri ||
2166                     tp->t_cpu != tcp)
2167                         break;
2168 
2169                 /*
2170                  * Steal immediately if, due to the CMT processor architecture,
2171                  * migration between cp and tcp would incur no performance
2172                  * penalty.
2173                  */
2174                 if (pg_cmt_can_migrate(cp, tcp))
2175                         break;
2176 
2177                 nosteal = nosteal_nsec;
2178                 if (nosteal == 0)
2179                         break;
2180 
2181                 /*
2182                  * Calculate time spent sitting on run queue
2183                  */
2184                 now = gethrtime_unscaled();
2185                 rqtime = now - tp->t_waitrq;
2186                 scalehrtime(&rqtime);
2187 
2188                 /*
2189                  * Steal immediately if the time spent on this run queue is more
2190                  * than allowed nosteal delay.
2191                  *
2192                  * Negative rqtime check is needed here to avoid infinite
2193                  * stealing delays caused by unlikely but not impossible
2194                  * drifts between CPU times on different CPUs.
2195                  */
2196                 if (rqtime > nosteal || rqtime < 0)
2197                         break;
2198 
2199                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2200                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2201                 scalehrtime(&now);
2202                 /*
2203                  * Calculate when this thread becomes stealable
2204                  */
2205                 now += (nosteal - rqtime);
2206 
2207                 /*
2208                  * Calculate time when some thread becomes stealable
2209                  */
2210                 if (now < dp->disp_steal)
2211                         dp->disp_steal = now;
2212         }
2213 
2214         /*
2215          * If there were no unbound threads on this queue, fix up the
2216          * recorded maximum unbound priority and then return later. The value of
2217          * disp_max_unbound_pri is not always accurate because it isn't
2218          * reduced until another idle CPU looks for work.
2219          */
2220         if (allbound)
2221                 disp_fix_unbound_pri(dp, pri);
2222 
2223         /*
2224          * If we reached the end of the queue and found no unbound threads
2225          * then return NULL so that other CPUs will be considered.  If there
2226          * are unbound threads but they cannot yet be stolen, then
2227          * return T_DONTSTEAL and try again later.
2228          */
2229         if (tp == NULL) {
2230                 disp_lock_exit_nopreempt(&dp->disp_lock);
2231                 return (allbound ? NULL : T_DONTSTEAL);
2232         }
2233 
2234         /*
2235          * Found a runnable, unbound thread, so remove it from queue.
2236          * dispdeq() requires that we have the thread locked, and we do,
2237          * by virtue of holding the dispatch queue lock.  dispdeq() will
2238          * put the thread in transition state, thereby dropping the dispq
2239          * lock.
2240          */
2241 
2242 #ifdef DEBUG
2243         {
2244                 int     thread_was_on_queue;
2245 
2246                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2247                 ASSERT(thread_was_on_queue);
2248         }
2249 
2250 #else /* DEBUG */
2251         (void) dispdeq(tp);                     /* drops disp_lock */
2252 #endif /* DEBUG */
2253 
2254         /*
2255          * Reset the disp_queue steal time - we do not know what the
2256          * smallest value across the queue is.
2257          */
2258         dp->disp_steal = 0;
2259 
2260         tp->t_schedflag |= TS_DONT_SWAP;
2261 
2262         /*
2263          * Setup thread to run on the current CPU.
2264          */
2265         tp->t_disp_queue = cp->cpu_disp;
2266 
2267         cp->cpu_dispthread = tp;             /* protected by spl only */
2268         cp->cpu_dispatch_pri = pri;
2269 
2270         /*
2271          * There can be a memory synchronization race between disp_getbest()
2272          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2273          * to preempt the current thread to run the enqueued thread while
2274          * disp_getbest() and disp_ratify() are changing the current thread
2275          * to the stolen thread. This may lead to a situation where
2276          * cpu_resched() tries to preempt the wrong thread and the
2277          * stolen thread continues to run on the CPU which has been tagged
2278          * for preemption.
2279          * Later the clock thread gets enqueued but doesn't get to run on the
2280          * CPU causing the system to hang.
2281          *
2282          * To avoid this, grabbing and dropping the disp_lock (which does
2283          * a memory barrier) is needed to synchronize the execution of
2284          * cpu_resched() with disp_getbest() and disp_ratify() and
2285          * synchronize the memory read and written by cpu_resched(),
2286          * disp_getbest(), and disp_ratify() with each other.
2287          *  (see CR#6482861 for more details).
2288          */
2289         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2290         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2291 
2292         ASSERT(pri == DISP_PRIO(tp));
2293 
2294         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2295 
2296         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2297 
2298         /*
2299          * Return with spl high so that swtch() won't need to raise it.
2300          * The disp_lock was dropped by dispdeq().
2301          */
2302 
2303         return (tp);
2304 }
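
/*
 * A worked example of the steal-delay arithmetic above, using an
 * illustrative (not authoritative) nosteal interval of 100000ns: suppose
 * the candidate thread has been sitting on tcp's run queue for
 * rqtime == 40000ns.  Since 0 <= rqtime <= nosteal, the thread is left
 * alone, and the queue's disp_steal is set (if the new value is earlier
 * than the currently recorded one) to
 *
 *	scaled(now) + (nosteal - rqtime) == scaled(now) + 60000ns
 *
 * which is the time at which the thread becomes stealable.  disp_getwork()
 * compares a non-zero disp_steal against gethrtime() and skips this queue
 * until that time has passed; once rqtime exceeds nosteal (or goes
 * negative due to clock drift between CPUs), the thread is stolen
 * immediately.
 */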
2305 
2306 /*
2307  * disp_bound_common() - common routine for higher level functions
2308  *      that check for bound threads under certain conditions.
2309  *      If 'threadlistsafe' is set then there is no need to acquire
2310  *      pidlock to stop the thread list from changing (e.g., if
2311  *      disp_bound_* is called with cpus paused).
2312  */
2313 static int
2314 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2315 {
2316         int             found = 0;
2317         kthread_t       *tp;
2318 
2319         ASSERT(flag);
2320 
2321         if (!threadlistsafe)
2322                 mutex_enter(&pidlock);
2323         tp = curthread;         /* faster than allthreads */
2324         do {
2325                 if (tp->t_state != TS_FREE) {
2326                         /*
2327                          * If an interrupt thread is busy, but the
2328                          * caller doesn't care (i.e. BOUND_INTR is off),
2329                          * then just ignore it and continue through.
2330                          */
2331                         if ((tp->t_flag & T_INTR_THREAD) &&
2332                             !(flag & BOUND_INTR))
2333                                 continue;
2334 
2335                         /*
2336                          * Skip the idle thread for the CPU
2337                          * we're about to set offline.
2338                          */
2339                         if (tp == cp->cpu_idle_thread)
2340                                 continue;
2341 
2342                         /*
2343                          * Skip the pause thread for the CPU
2344                          * we're about to set offline.
2345                          */
2346                         if (tp == cp->cpu_pause_thread)
2347                                 continue;
2348 
2349                         if ((flag & BOUND_CPU) &&
2350                             (tp->t_bound_cpu == cp ||
2351                             tp->t_bind_cpu == cp->cpu_id ||
2352                             tp->t_weakbound_cpu == cp)) {
2353                                 found = 1;
2354                                 break;
2355                         }
2356 
2357                         if ((flag & BOUND_PARTITION) &&
2358                             (tp->t_cpupart == cp->cpu_part)) {
2359                                 found = 1;
2360                                 break;
2361                         }
2362                 }
2363         } while ((tp = tp->t_next) != curthread && found == 0);
2364         if (!threadlistsafe)
2365                 mutex_exit(&pidlock);
2366         return (found);
2367 }
2368 
2369 /*
2370  * disp_bound_threads - return nonzero if threads are bound to the processor.
2371  *      Called infrequently.  Keep this simple.
2372  *      Includes threads that are asleep or stopped but not onproc.
2373  */
2374 int
2375 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2376 {
2377         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2378 }
2379 
2380 /*
2381  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2382  * to the given processor, including interrupt threads.
2383  */
2384 int
2385 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2386 {
2387         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2388 }
2389 
2390 /*
2391  * disp_bound_partition - return nonzero if threads are bound to the same
2392  * partition as the processor.
2393  *      Called infrequently.  Keep this simple.
2394  *      Includes threads that are asleep or stopped but not onproc.
2395  */
2396 int
2397 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2398 {
2399         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2400 }
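
/*
 * A minimal sketch (hypothetical; the real checks of this kind live in the
 * CPU offline and partition-move paths) of how these predicates are used:
 * refuse to proceed while any thread, including interrupt threads, is
 * still bound to the CPU.  The thread list is not known to be stable
 * here, so threadlistsafe is 0 and pidlock is taken internally.
 *
 *	static int
 *	example_check_offline_ok(cpu_t *cp)
 *	{
 *		if (disp_bound_anythreads(cp, 0))
 *			return (EBUSY);
 *		return (0);
 *	}
 */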
2401 
2402 /*
2403  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2404  * threads to other CPUs.
2405  */
2406 void
2407 disp_cpu_inactive(cpu_t *cp)
2408 {
2409         kthread_t       *tp;
2410         disp_t          *dp = cp->cpu_disp;
2411         dispq_t         *dq;
2412         pri_t           pri;
2413         int             wasonq;
2414 
2415         disp_lock_enter(&dp->disp_lock);
2416         while ((pri = dp->disp_max_unbound_pri) != -1) {
2417                 dq = &dp->disp_q[pri];
2418                 tp = dq->dq_first;
2419 
2420                 /*
2421                  * Skip over bound threads.
2422                  */
2423                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2424                         tp = tp->t_link;
2425                 }
2426 
2427                 if (tp == NULL) {
2428                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2429                         disp_fix_unbound_pri(dp, pri);
2430                         continue;
2431                 }
2432 
2433                 wasonq = dispdeq(tp);           /* drops disp_lock */
2434                 ASSERT(wasonq);
2435                 ASSERT(tp->t_weakbound_cpu == NULL);
2436 
2437                 setbackdq(tp);
2438                 /*
2439                  * Called from cpu_offline:
2440                  *
2441                  * cp has already been removed from the list of active cpus
2442                  * and tp->t_cpu has been changed so there is no risk of
2443                  * tp ending up back on cp.
2444                  *
2445                  * Called from cpupart_move_cpu:
2446                  *
2447                  * The cpu has moved to a new cpupart.  Any threads that
2448                  * were on its dispatch queues before the move remain
2449                  * in the old partition and can't run in the new partition.
2450                  */
2451                 ASSERT(tp->t_cpu != cp);
2452                 thread_unlock(tp);
2453 
2454                 disp_lock_enter(&dp->disp_lock);
2455         }
2456         disp_lock_exit(&dp->disp_lock);
2457 }
2458 
2459 /*
2460  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2461  *      The hint passed in is used as a starting point so we don't favor
2462  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2463  *      used CPU for the thread.
2464  *
2465  *      The lgroup and priority are used to determine the best CPU to run on
2466  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2467  *      the thread priority will indicate whether the thread will actually run
2468  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2469  *      lgroup which are running the lowest priority threads are found.  The
2470  *      remote CPU is chosen only if the thread will not run locally on a CPU
2471  *      within the lgroup, but will run on the remote CPU. If the thread
2472  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2473  *
2474  *      The lpl specified also identifies the cpu partition from which
2475  *      disp_lowpri_cpu should select a CPU.
2476  *
2477  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2478  *      behalf of the current thread. (curthread is looking for a new cpu)
2479  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2480  *      ignored.
2481  *
2482  *      If a cpu is the target of an offline request then try to avoid it.
2483  *
2484  *      This function must be called at either high SPL, or with preemption
2485  *      disabled, so that the "hint" CPU cannot be removed from the online
2486  *      CPU list while we are traversing it.
2487  */
2488 cpu_t *
2489 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2490 {
2491         cpu_t   *bestcpu;
2492         cpu_t   *besthomecpu;
2493         cpu_t   *cp, *cpstart;
2494 
2495         pri_t   bestpri;
2496         pri_t   cpupri;
2497 
2498         klgrpset_t      done;
2499         klgrpset_t      cur_set;
2500 
2501         lpl_t           *lpl_iter, *lpl_leaf;
2502         int             i;
2503 
2504         /*
2505          * Scan for a CPU currently running the lowest priority thread.
2506          * Cannot get cpu_lock here because it is adaptive.
2507          * We do not require a lock on the CPU list.
2508          */
2509         ASSERT(hint != NULL);
2510         ASSERT(lpl != NULL);
2511         ASSERT(lpl->lpl_ncpu > 0);
2512 
2513         /*
2514          * First examine local CPUs. Note that it's possible the hint CPU
2515          * passed in is remote to the specified home lgroup. If our priority
2516          * isn't high enough for us to run immediately at home, then we
2517          * examine CPUs remote to our home lgroup.
2518          * We would like to give preference to CPUs closest to "home".
2519          * If we can't find a CPU where we'll run at a given level
2520          * of locality, we expand our search to include the next level.
2521          */
2522         bestcpu = besthomecpu = NULL;
2523         klgrpset_clear(done);
2524         /* start with lpl we were passed */
2525 
2526         lpl_iter = lpl;
2527 
2528         do {
2529 
2530                 bestpri = SHRT_MAX;
2531                 klgrpset_clear(cur_set);
2532 
2533                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2534                         lpl_leaf = lpl_iter->lpl_rset[i];
2535                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2536                                 continue;
2537 
2538                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2539 
2540                         if (hint->cpu_lpl == lpl_leaf)
2541                                 cp = cpstart = hint;
2542                         else
2543                                 cp = cpstart = lpl_leaf->lpl_cpus;
2544 
2545                         do {
2546                                 if (cp == curcpu)
2547                                         cpupri = -1;
2548                                 else if (cp == cpu_inmotion)
2549                                         cpupri = SHRT_MAX;
2550                                 else
2551                                         cpupri = cp->cpu_dispatch_pri;
2552                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2553                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2554                                 if (cp->cpu_chosen_level > cpupri)
2555                                         cpupri = cp->cpu_chosen_level;
2556                                 if (cpupri < bestpri) {
2557                                         if (CPU_IDLING(cpupri)) {
2558                                                 ASSERT((cp->cpu_flags &
2559                                                     CPU_QUIESCED) == 0);
2560                                                 return (cp);
2561                                         }
2562                                         bestcpu = cp;
2563                                         bestpri = cpupri;
2564                                 }
2565                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2566                 }
2567 
2568                 if (bestcpu && (tpri > bestpri)) {
2569                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2570                         return (bestcpu);
2571                 }
2572                 if (besthomecpu == NULL)
2573                         besthomecpu = bestcpu;
2574                 /*
2575                  * Add the lgrps we just considered to the "done" set
2576                  */
2577                 klgrpset_or(done, cur_set);
2578 
2579         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2580 
2581         /*
2582          * The specified priority isn't high enough to run immediately
2583          * anywhere, so just return the best CPU from the home lgroup.
2584          */
2585         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2586         return (besthomecpu);
2587 }
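
/*
 * A minimal sketch (hypothetical helper) of the usual call pattern, as in
 * setkpdq() above: pass the thread's last CPU as the hint, along with its
 * home lpl and dispatch priority, with preemption disabled so the hint
 * CPU cannot leave the online list underneath us.  curcpu is NULL here
 * because a CPU is being chosen for some other thread, not for curthread.
 *
 *	static cpu_t *
 *	example_pick_cpu(kthread_t *tp)
 *	{
 *		cpu_t	*cp;
 *
 *		kpreempt_disable();
 *		cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, DISP_PRIO(tp), NULL);
 *		kpreempt_enable();
 *		return (cp);
 *	}
 */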
2588 
2589 /*
2590  * This routine provides the generic idle cpu function for all processors.
2591  * If a processor has some specific code to execute when idle (say, to stop
2592  * the pipeline and save power) then that routine should be defined in the
2593  * processors specific code (module_xx.c) and the global variable idle_cpu
2594  * set to that function.
2595  */
2596 static void
2597 generic_idle_cpu(void)
2598 {
2599 }
2600 
2601 /*ARGSUSED*/
2602 static void
2603 generic_enq_thread(cpu_t *cpu, int bound)
2604 {
2605 }
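
/*
 * A minimal sketch (hypothetical platform code, not part of this file) of
 * how a platform overrides the generic hooks: define a routine with the
 * same signature in the platform-specific module and point the global
 * function pointer at it during startup.  The names below are invented
 * for illustration only; the body of the idle routine (e.g. halting the
 * pipeline until the next interrupt) is omitted.
 *
 *	static void
 *	example_platform_idle(void)
 *	{
 *	}
 *
 *	void
 *	example_platform_init(void)
 *	{
 *		idle_cpu = example_platform_idle;
 *	}
 */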