1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/var.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/inline.h>
  42 #include <sys/disp.h>
  43 #include <sys/class.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/kmem.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/tnf.h>
  49 #include <sys/cpupart.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/pg.h>
  52 #include <sys/cmt.h>
  53 #include <sys/bitset.h>
  54 #include <sys/schedctl.h>
  55 #include <sys/atomic.h>
  56 #include <sys/dtrace.h>
  57 #include <sys/sdt.h>
  58 #include <sys/archsystm.h>
  59 
  60 #include <vm/as.h>
  61 
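     /*
      * Ways in which a thread may be bound: to a specific CPU, to a CPU
      * partition, or to a CPU by virtue of being an interrupt thread.
      */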
  62 #define BOUND_CPU       0x1
  63 #define BOUND_PARTITION 0x2
  64 #define BOUND_INTR      0x4
  65 
  66 /* Dispatch queue allocation structure and functions */
  67 struct disp_queue_info {
  68         disp_t  *dp;
  69         dispq_t *olddispq;
  70         dispq_t *newdispq;
  71         ulong_t *olddqactmap;
  72         ulong_t *newdqactmap;
  73         int     oldnglobpris;
  74 };
  75 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76     disp_t *dp);
  77 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78 static void     disp_dq_free(struct disp_queue_info *dptr);
  79 
  80 /* platform-specific routine to call when processor is idle */
  81 static void     generic_idle_cpu();
  82 void            (*idle_cpu)() = generic_idle_cpu;
  83 
  84 /* routines invoked when a CPU enters/exits the idle loop */
  85 static void     idle_enter();
  86 static void     idle_exit();
  87 
  88 /* platform-specific routine to call when thread is enqueued */
  89 static void     generic_enq_thread(cpu_t *, int);
  90 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  91 
  92 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  93 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  94 pri_t   intr_pri;               /* interrupt thread priority base level */
  95 
  96 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  97 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  98 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
  99 int     nswapped;               /* total number of swapped threads */
 100 static void     disp_swapped_setrun(kthread_t *tp);
 101 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 102 
 103 /*
 104  * If this is set, only interrupt threads will cause kernel preemptions.
 105  * This is done by changing the value of kpreemptpri.  kpreemptpri
 106  * will either be the max sysclass pri + 1 or the min interrupt pri.
 107  */
 108 int     only_intr_kpreempt;
 109 
 110 extern void set_idle_cpu(int cpun);
 111 extern void unset_idle_cpu(int cpun);
 112 static void setkpdq(kthread_t *tp, int borf);
 113 #define SETKP_BACK      0
 114 #define SETKP_FRONT     1
 115 /*
 116  * Parameter that determines how recently a thread must have run
 117  * on the CPU to be considered loosely-bound to that CPU to reduce
 118  * cold cache effects.  The interval is in clock ticks (lbolt).
 119  */
 120 #define RECHOOSE_INTERVAL 3
 121 int     rechoose_interval = RECHOOSE_INTERVAL;
 122 
 123 /*
 124  * Parameter that determines how long (in nanoseconds) a thread must
 125  * be sitting on a run queue before it can be stolen by another CPU,
 126  * to reduce migrations.
 127  *
 128  * Platform code should set nosteal_nsec to an appropriate value via
 129  * cmp_set_nosteal_interval().  nosteal_nsec is set to
 130  * NOSTEAL_UNINITIALIZED here, indicating that it is uninitialized.
 131  *
 132  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 133  */
 134 #define NOSTEAL_UNINITIALIZED   (-1)
 135 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 136 extern void cmp_set_nosteal_interval(void);
 137 
 138 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 139 
 140 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 141 disp_lock_t     stop_lock;              /* lock on stopped threads */
 142 
 143 static void     cpu_dispqalloc(int numpris);
 144 
 145 /*
 146  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 147  * a thread because it was sitting on its run queue for a very short
 148  * period of time.
 149  */
 150 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 151 
 152 static kthread_t        *disp_getwork(cpu_t *to);
 153 static kthread_t        *disp_getbest(disp_t *from);
 154 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 155 
 156 void    swtch_to(kthread_t *);
 157 
 158 /*
 159  * dispatcher and scheduler initialization
 160  */
 161 
 162 /*
 163  * disp_setup - Common code to calculate and allocate dispatcher
 164  *              variables and structures based on the maximum priority.
 165  */
 166 static void
 167 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 168 {
 169         pri_t   newnglobpris;
 170 
 171         ASSERT(MUTEX_HELD(&cpu_lock));
 172 
 173         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 174 
 175         if (newnglobpris > oldnglobpris) {
 176                 /*
 177                  * Allocate new kp queues for each CPU partition.
 178                  */
 179                 cpupart_kpqalloc(newnglobpris);
 180 
 181                 /*
 182                  * Allocate new dispatch queues for each CPU.
 183                  */
 184                 cpu_dispqalloc(newnglobpris);
 185 
 186                 /*
 187                  * compute new interrupt thread base priority
 188                  */
 189                 intr_pri = maxglobpri;
 190                 if (only_intr_kpreempt) {
 191                         kpreemptpri = intr_pri + 1;
 192                         if (kpqpri == KPQPRI)
 193                                 kpqpri = kpreemptpri;
 194                 }
 195                 v.v_nglobpris = newnglobpris;
 196         }
 197 }
 198 
 199 /*
 200  * dispinit - Called to initialize all loaded classes and the
 201  *            dispatcher framework.
 202  */
 203 void
 204 dispinit(void)
 205 {
 206         id_t    cid;
 207         pri_t   maxglobpri;
 208         pri_t   cl_maxglobpri;
 209 
 210         maxglobpri = -1;
 211 
 212         /*
 213          * Initialize transition lock, which will always be set.
 214          */
 215         DISP_LOCK_INIT(&transition_lock);
 216         disp_lock_enter_high(&transition_lock);
 217         DISP_LOCK_INIT(&stop_lock);
 218 
 219         mutex_enter(&cpu_lock);
 220         CPU->cpu_disp->disp_maxrunpri = -1;
 221         CPU->cpu_disp->disp_max_unbound_pri = -1;
 222 
 223         /*
 224          * Initialize the default CPU partition.
 225          */
 226         cpupart_initialize_default();
 227         /*
 228          * Call the class specific initialization functions for
 229          * all pre-installed schedulers.
 230          *
 231          * We pass the size of a class specific parameter
 232          * buffer to each of the initialization functions
 233          * to try to catch problems with backward compatibility
 234          * of class modules.
 235          *
 236          * For example a new class module running on an old system
 237          * which didn't provide sufficiently large parameter buffers
 238          * would be bad news. Class initialization modules can check for
 239          * this and take action if they detect a problem.
 240          */
 241 
 242         for (cid = 0; cid < nclass; cid++) {
 243                 sclass_t        *sc;
 244 
 245                 sc = &sclass[cid];
 246                 if (SCHED_INSTALLED(sc)) {
 247                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 248                             &sc->cl_funcs);
 249                         if (cl_maxglobpri > maxglobpri)
 250                                 maxglobpri = cl_maxglobpri;
 251                 }
 252         }
 253         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 254         if (kpqpri == KPQPRI)
 255                 kpqpri = kpreemptpri;
 256 
 257         ASSERT(maxglobpri >= 0);
 258         disp_setup(maxglobpri, 0);
 259 
 260         mutex_exit(&cpu_lock);
 261 
 262         /*
 263          * Platform specific sticky scheduler setup.
 264          */
 265         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 266                 cmp_set_nosteal_interval();
 267 
 268         /*
 269          * Get the default class ID; this may be later modified via
 270          * dispadmin(1M).  This will load the class (normally TS) and that will
 271          * call disp_add(), which is why we had to drop cpu_lock first.
 272          */
 273         if (getcid(defaultclass, &defaultcid) != 0) {
 274                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 275                     defaultclass);
 276         }
 277 }
 278 
 279 /*
 280  * disp_add - Called with class pointer to initialize the dispatcher
 281  *            for a newly loaded class.
 282  */
 283 void
 284 disp_add(sclass_t *clp)
 285 {
 286         pri_t   maxglobpri;
 287         pri_t   cl_maxglobpri;
 288 
 289         mutex_enter(&cpu_lock);
 290         /*
 291          * Initialize the scheduler class.
 292          */
 293         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 294         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 295         if (cl_maxglobpri > maxglobpri)
 296                 maxglobpri = cl_maxglobpri;
 297 
 298         /*
 299          * Save old queue information.  Since we're initializing a
 300          * new scheduling class which has just been loaded, the size
 301          * of the dispq may have changed.  We need to handle
 302          * that here.
 303          */
 304         disp_setup(maxglobpri, v.v_nglobpris);
 305 
 306         mutex_exit(&cpu_lock);
 307 }
 308 
 309 
 310 /*
 311  * For each CPU, allocate new dispatch queues
 312  * with the stated number of priorities.
 313  */
 314 static void
 315 cpu_dispqalloc(int numpris)
 316 {
 317         cpu_t   *cpup;
 318         struct disp_queue_info  *disp_mem;
 319         int i, num;
 320 
 321         ASSERT(MUTEX_HELD(&cpu_lock));
 322 
 323         disp_mem = kmem_zalloc(NCPU *
 324             sizeof (struct disp_queue_info), KM_SLEEP);
 325 
 326         /*
 327          * This routine must allocate all of the memory before stopping
 328          * the CPUs because it must not sleep in kmem_alloc while the
 329          * CPUs are stopped.  Locks they hold will not be released until
 330          * they are restarted.
 331          */
 332         i = 0;
 333         cpup = cpu_list;
 334         do {
 335                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 336                 i++;
 337                 cpup = cpup->cpu_next;
 338         } while (cpup != cpu_list);
 339         num = i;
 340 
 341         pause_cpus(NULL);
 342         for (i = 0; i < num; i++)
 343                 disp_dq_assign(&disp_mem[i], numpris);
 344         start_cpus();
 345 
 346         /*
 347          * I must free all of the memory after starting the CPUs because
 348          * I cannot risk sleeping in kmem_free while the CPUs are stopped.
 349          */
 350         for (i = 0; i < num; i++)
 351                 disp_dq_free(&disp_mem[i]);
 352 
 353         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 354 }
 355 
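     /*
      * disp_dq_alloc - preallocate a dispatch queue and active-queue bitmap
      * sized for numpris priorities.  Nothing is installed here; that is
      * done later by disp_dq_assign() while the CPUs are paused.
      */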
 356 static void
 357 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 358 {
 359         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 360         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 361             sizeof (long), KM_SLEEP);
 362         dptr->dp = dp;
 363 }
 364 
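     /*
      * disp_dq_assign - install the preallocated queue and bitmap into the
      * disp_t, copying over the contents of the old (smaller) structures.
      * Runs while CPUs are paused, so it must not block (hence kcopy below).
      */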
 365 static void
 366 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 367 {
 368         disp_t  *dp;
 369 
 370         dp = dptr->dp;
 371         dptr->olddispq = dp->disp_q;
 372         dptr->olddqactmap = dp->disp_qactmap;
 373         dptr->oldnglobpris = dp->disp_npri;
 374 
 375         ASSERT(dptr->oldnglobpris < numpris);
 376 
 377         if (dptr->olddispq != NULL) {
 378                 /*
 379                  * Use kcopy because bcopy is platform-specific
 380                  * and could block while we might have paused the cpus.
 381                  */
 382                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 383                     dptr->oldnglobpris * sizeof (dispq_t));
 384                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 385                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 386                     sizeof (long));
 387         }
 388         dp->disp_q = dptr->newdispq;
 389         dp->disp_qactmap = dptr->newdqactmap;
 390         dp->disp_q_limit = &dptr->newdispq[numpris];
 391         dp->disp_npri = numpris;
 392 }
 393 
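     /*
      * disp_dq_free - free the old dispatch queue and bitmap recorded in
      * *dptr.  Callers invoke this only after any paused CPUs have been
      * restarted, since kmem_free may sleep.
      */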
 394 static void
 395 disp_dq_free(struct disp_queue_info *dptr)
 396 {
 397         if (dptr->olddispq != NULL)
 398                 kmem_free(dptr->olddispq,
 399                     dptr->oldnglobpris * sizeof (dispq_t));
 400         if (dptr->olddqactmap != NULL)
 401                 kmem_free(dptr->olddqactmap,
 402                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 403 }
 404 
 405 /*
 406  * For a newly created CPU, initialize the dispatch queue.
 407  * This is called before the CPU is known through cpu[] or on any lists.
 408  */
 409 void
 410 disp_cpu_init(cpu_t *cp)
 411 {
 412         disp_t  *dp;
 413         dispq_t *newdispq;
 414         ulong_t *newdqactmap;
 415 
 416         ASSERT(MUTEX_HELD(&cpu_lock));      /* protect dispatcher queue sizes */
 417 
 418         if (cp == cpu0_disp.disp_cpu)
 419                 dp = &cpu0_disp;
 420         else
 421                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 422         bzero(dp, sizeof (disp_t));
 423         cp->cpu_disp = dp;
 424         dp->disp_cpu = cp;
 425         dp->disp_maxrunpri = -1;
 426         dp->disp_max_unbound_pri = -1;
 427         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 428         /*
 429          * Allocate memory for the dispatcher queue headers
 430          * and the active queue bitmap.
 431          */
 432         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 433         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 434             sizeof (long), KM_SLEEP);
 435         dp->disp_q = newdispq;
 436         dp->disp_qactmap = newdqactmap;
 437         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 438         dp->disp_npri = v.v_nglobpris;
 439 }
 440 
 441 void
 442 disp_cpu_fini(cpu_t *cp)
 443 {
 444         ASSERT(MUTEX_HELD(&cpu_lock));
 445 
 446         disp_kp_free(cp->cpu_disp);
 447         if (cp->cpu_disp != &cpu0_disp)
 448                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 449 }
 450 
 451 /*
 452  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 453  */
 454 void
 455 disp_kp_alloc(disp_t *dq, pri_t npri)
 456 {
 457         struct disp_queue_info  mem_info;
 458 
 459         if (npri > dq->disp_npri) {
 460                 /*
 461                  * Allocate memory for the new array.
 462                  */
 463                 disp_dq_alloc(&mem_info, npri, dq);
 464 
 465                 /*
 466                  * We need to copy the old structures to the new
 467                  * and free the old.
 468                  */
 469                 disp_dq_assign(&mem_info, npri);
 470                 disp_dq_free(&mem_info);
 471         }
 472 }
 473 
 474 /*
 475  * Free dispatch queue.
 476  * Used for the kpreempt queues for a removed CPU partition and
 477  * for the per-CPU queues of deleted CPUs.
 478  */
 479 void
 480 disp_kp_free(disp_t *dq)
 481 {
 482         struct disp_queue_info  mem_info;
 483 
 484         mem_info.olddispq = dq->disp_q;
 485         mem_info.olddqactmap = dq->disp_qactmap;
 486         mem_info.oldnglobpris = dq->disp_npri;
 487         disp_dq_free(&mem_info);
 488 }
 489 
 490 /*
 491  * End dispatcher and scheduler initialization.
 492  */
 493 
 494 /*
 495  * See if there's anything to do other than remain idle.
 496  * Return non-zero if there is.
 497  *
 498  * This function must be called with high spl, or with
 499  * kernel preemption disabled to prevent the partition's
 500  * active cpu list from changing while being traversed.
 501  *
 502  * This is essentially a simpler version of disp_getwork()
 503  * to be called by CPUs preparing to "halt".
 504  */
 505 int
 506 disp_anywork(void)
 507 {
 508         cpu_t           *cp = CPU;
 509         cpu_t           *ocp;
 510         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 511 
 512         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 513                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 514                         return (1);
 515 
 516                 for (ocp = cp->cpu_next_part; ocp != cp;
 517                     ocp = ocp->cpu_next_part) {
 518                         ASSERT(CPU_ACTIVE(ocp));
 519 
 520                         /*
 521                          * Something has appeared on the local run queue.
 522                          */
 523                         if (*local_nrunnable > 0)
 524                                 return (1);
 525                         /*
 526                          * If we encounter another idle CPU that will
 527                          * soon be trolling around through disp_anywork()
 528                          * soon be trolling around through disp_anywork(),
 529                          * patrol the next part of the list.
 530                          */
 531                         if (ocp->cpu_dispatch_pri == -1 &&
 532                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 533                                 return (0);
 534                         /*
 535                          * Work can be taken from another CPU if:
 536                          *      - There is unbound work on the run queue
 537                          *      - That work isn't a thread undergoing a
 538                          *        context switch on an otherwise empty queue.
 539                          *      - The CPU isn't running the idle loop.
 540                          */
 541                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 542                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 543                             ocp->cpu_disp->disp_nrunnable == 1) &&
 544                             ocp->cpu_dispatch_pri != -1)
 545                                 return (1);
 546                 }
 547         }
 548         return (0);
 549 }
 550 
 551 /*
 552  * Called when CPU enters the idle loop
 553  */
 554 static void
 555 idle_enter()
 556 {
 557         cpu_t           *cp = CPU;
 558 
 559         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 560         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 561         set_idle_cpu(cp->cpu_id);    /* arch-dependent hook */
 562 }
 563 
 564 /*
 565  * Called when CPU exits the idle loop
 566  */
 567 static void
 568 idle_exit()
 569 {
 570         cpu_t           *cp = CPU;
 571 
 572         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 573         unset_idle_cpu(cp->cpu_id);  /* arch-dependent hook */
 574 }
 575 
 576 /*
 577  * Idle loop.
 578  */
 579 void
 580 idle()
 581 {
 582         struct cpu      *cp = CPU;              /* pointer to this CPU */
 583         kthread_t       *t;                     /* taken thread */
 584 
 585         idle_enter();
 586 
 587         /*
 588          * Uniprocessor version of idle loop.
 589          * Do this until notified that we're on an actual multiprocessor.
 590          */
 591         while (ncpus == 1) {
 592                 if (cp->cpu_disp->disp_nrunnable == 0) {
 593                         (*idle_cpu)();
 594                         continue;
 595                 }
 596                 idle_exit();
 597                 swtch();
 598 
 599                 idle_enter(); /* returned from swtch */
 600         }
 601 
 602         /*
 603          * Multiprocessor idle loop.
 604          */
 605         for (;;) {
 606                 /*
 607                  * If CPU is completely quiesced by p_online(2), just wait
 608                  * here with minimal bus traffic until put online.
 609                  */
 610                 while (cp->cpu_flags & CPU_QUIESCED)
 611                         (*idle_cpu)();
 612 
 613                 if (cp->cpu_disp->disp_nrunnable != 0) {
 614                         idle_exit();
 615                         swtch();
 616                 } else {
 617                         if (cp->cpu_flags & CPU_OFFLINE)
 618                                 continue;
 619                         if ((t = disp_getwork(cp)) == NULL) {
 620                                 if (cp->cpu_chosen_level != -1) {
 621                                         disp_t *dp = cp->cpu_disp;
 622                                         disp_t *kpq;
 623 
 624                                         disp_lock_enter(&dp->disp_lock);
 625                                         /*
 626                                          * Set kpq under lock to prevent
 627                                          * migration between partitions.
 628                                          */
 629                                         kpq = &cp->cpu_part->cp_kp_queue;
 630                                         if (kpq->disp_maxrunpri == -1)
 631                                                 cp->cpu_chosen_level = -1;
 632                                         disp_lock_exit(&dp->disp_lock);
 633                                 }
 634                                 (*idle_cpu)();
 635                                 continue;
 636                         }
 637                         /*
 638                          * If there was a thread but we couldn't steal
 639                          * it, then keep trying.
 640                          */
 641                         if (t == T_DONTSTEAL)
 642                                 continue;
 643                         idle_exit();
 644                         swtch_to(t);
 645                 }
 646                 idle_enter(); /* returned from swtch/swtch_to */
 647         }
 648 }
 649 
 650 
 651 /*
 652  * Preempt the currently running thread in favor of the highest
 653  * priority thread.  The class of the current thread controls
 654  * where it goes on the dispatcher queues. If panicking, turn
 655  * preemption off.
 656  */
 657 void
 658 preempt()
 659 {
 660         kthread_t       *t = curthread;
 661         klwp_t          *lwp = ttolwp(curthread);
 662 
 663         if (panicstr)
 664                 return;
 665 
 666         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 667 
 668         thread_lock(t);
 669 
 670         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 671                 /*
 672                  * this thread has already been chosen to be run on
 673                  * another CPU. Clear kprunrun on this CPU since we're
 674                  * already headed for swtch().
 675                  */
 676                 CPU->cpu_kprunrun = 0;
 677                 thread_unlock_nopreempt(t);
 678                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 679         } else {
 680                 if (lwp != NULL)
 681                         lwp->lwp_ru.nivcsw++;
 682                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 683                 THREAD_TRANSITION(t);
 684                 CL_PREEMPT(t);
 685                 DTRACE_SCHED(preempt);
 686                 thread_unlock_nopreempt(t);
 687 
 688                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 689 
 690                 swtch();                /* clears CPU->cpu_runrun via disp() */
 691         }
 692 }
 693 
 694 extern kthread_t *thread_unpin();
 695 
 696 /*
 697  * disp() - find the highest priority thread for this processor to run, and
 698  * set it in TS_ONPROC state so that resume() can be called to run it.
 699  */
 700 static kthread_t *
 701 disp()
 702 {
 703         cpu_t           *cpup;
 704         disp_t          *dp;
 705         kthread_t       *tp;
 706         dispq_t         *dq;
 707         int             maxrunword;
 708         pri_t           pri;
 709         disp_t          *kpq;
 710 
 711         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 712 
 713         cpup = CPU;
 714         /*
 715          * Find the highest priority loaded, runnable thread.
 716          */
 717         dp = cpup->cpu_disp;
 718 
 719 reschedule:
 720         /*
 721          * If there is more important work on the global queue with a better
 722          * priority than the maximum on this CPU, take it now.
 723          */
 724         kpq = &cpup->cpu_part->cp_kp_queue;
 725         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 726             pri >= dp->disp_maxrunpri &&
 727             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 728             (tp = disp_getbest(kpq)) != NULL) {
 729                 if (disp_ratify(tp, kpq) != NULL) {
 730                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 731                             "disp_end:tid %p", tp);
 732                         return (tp);
 733                 }
 734         }
 735 
 736         disp_lock_enter(&dp->disp_lock);
 737         pri = dp->disp_maxrunpri;
 738 
 739         /*
 740          * If there is nothing to run, look at what's runnable on other queues.
 741          * Choose the idle thread if the CPU is quiesced.
 742          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 743          * interrupt threads, which will be the only threads on the CPU's own
 744          * queue, but cannot run threads from other queues.
 745          */
 746         if (pri == -1) {
 747                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 748                         disp_lock_exit(&dp->disp_lock);
 749                         if ((tp = disp_getwork(cpup)) == NULL ||
 750                             tp == T_DONTSTEAL) {
 751                                 tp = cpup->cpu_idle_thread;
 752                                 (void) splhigh();
 753                                 THREAD_ONPROC(tp, cpup);
 754                                 cpup->cpu_dispthread = tp;
 755                                 cpup->cpu_dispatch_pri = -1;
 756                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 757                                 cpup->cpu_chosen_level = -1;
 758                         }
 759                 } else {
 760                         disp_lock_exit_high(&dp->disp_lock);
 761                         tp = cpup->cpu_idle_thread;
 762                         THREAD_ONPROC(tp, cpup);
 763                         cpup->cpu_dispthread = tp;
 764                         cpup->cpu_dispatch_pri = -1;
 765                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 766                         cpup->cpu_chosen_level = -1;
 767                 }
 768                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 769                     "disp_end:tid %p", tp);
 770                 return (tp);
 771         }
 772 
 773         dq = &dp->disp_q[pri];
 774         tp = dq->dq_first;
 775 
 776         ASSERT(tp != NULL);
 777 
 778         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 779 
 780         /*
 781          * Found it so remove it from queue.
 782          */
 783         dp->disp_nrunnable--;
 784         dq->dq_sruncnt--;
 785         if ((dq->dq_first = tp->t_link) == NULL) {
 786                 ulong_t *dqactmap = dp->disp_qactmap;
 787 
 788                 ASSERT(dq->dq_sruncnt == 0);
 789                 dq->dq_last = NULL;
 790 
 791                 /*
 792                  * The queue is empty, so the corresponding bit needs to be
 793                  * turned off in dqactmap.  If disp_nrunnable is still
 794                  * non-zero, we just took the last runnable thread off the
 795                  * highest priority queue, so recompute disp_maxrunpri.
 796                  */
 797                 maxrunword = pri >> BT_ULSHIFT;
 798                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 799 
 800                 if (dp->disp_nrunnable == 0) {
 801                         dp->disp_max_unbound_pri = -1;
 802                         dp->disp_maxrunpri = -1;
 803                 } else {
 804                         int ipri;
 805 
 806                         ipri = bt_gethighbit(dqactmap, maxrunword);
 807                         dp->disp_maxrunpri = ipri;
 808                         if (ipri < dp->disp_max_unbound_pri)
 809                                 dp->disp_max_unbound_pri = ipri;
 810                 }
 811         } else {
 812                 tp->t_link = NULL;
 813         }
 814 
 815         cpup->cpu_dispthread = tp;           /* protected by spl only */
 816         cpup->cpu_dispatch_pri = pri;
 817         ASSERT(pri == DISP_PRIO(tp));
 818         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 819         disp_lock_exit_high(&dp->disp_lock);     /* drop run queue lock */
 820 
 821         ASSERT(tp != NULL);
 822         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 823             "disp_end:tid %p", tp);
 824 
 825         if (disp_ratify(tp, kpq) == NULL)
 826                 goto reschedule;
 827 
 828         return (tp);
 829 }
 830 
 831 /*
 832  * swtch()
 833  *      Find best runnable thread and run it.
 834  *      Called with the current thread already switched to a new state,
 835  *      on a sleep queue, run queue, stopped, and not zombied.
 836  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 837  *      Always drops spl to the base level (spl0()).
 838  */
 839 void
 840 swtch()
 841 {
 842         kthread_t       *t = curthread;
 843         kthread_t       *next;
 844         cpu_t           *cp;
 845 
 846         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 847 
 848         if (t->t_flag & T_INTR_THREAD)
 849                 cpu_intr_swtch_enter(t);
 850 
 851         if (t->t_intr != NULL) {
 852                 /*
 853                  * We are an interrupt thread.  Set up and return
 854                  * the interrupted thread to be resumed.
 855                  */
 856                 (void) splhigh();       /* block other scheduler action */
 857                 cp = CPU;               /* now protected against migration */
 858                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 859                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 860                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 861                 next = thread_unpin();
 862                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 863                 resume_from_intr(next);
 864         } else {
 865 #ifdef  DEBUG
 866                 if (t->t_state == TS_ONPROC &&
 867                     t->t_disp_queue->disp_cpu == CPU &&
 868                     t->t_preempt == 0) {
 869                         thread_lock(t);
 870                         ASSERT(t->t_state != TS_ONPROC ||
 871                             t->t_disp_queue->disp_cpu != CPU ||
 872                             t->t_preempt != 0);      /* cannot migrate */
 873                         thread_unlock_nopreempt(t);
 874                 }
 875 #endif  /* DEBUG */
 876                 cp = CPU;
 877                 next = disp();          /* returns with spl high */
 878                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 879 
 880                 /* OK to steal anything left on run queue */
 881                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 882 
 883                 if (next != t) {
 884                         hrtime_t now;
 885 
 886                         now = gethrtime_unscaled();
 887                         pg_ev_thread_swtch(cp, now, t, next);
 888 
 889                         /*
 890                          * If t was previously in the TS_ONPROC state,
 891                          * setfrontdq and setbackdq won't have set its t_waitrq.
 892                          * Since we now finally know that we're switching away
 893                          * from this thread, set its t_waitrq if it is on a run
 894                          * queue.
 895                          */
 896                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 897                                 t->t_waitrq = now;
 898                         }
 899 
 900                         /*
 901                          * restore mstate of thread that we are switching to
 902                          */
 903                         restore_mstate(next);
 904 
 905                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 906                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 907                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 908 
 909                         if (dtrace_vtime_active)
 910                                 dtrace_vtime_switch(next);
 911 
 912                         resume(next);
 913                         /*
 914                          * The TR_RESUME_END and TR_SWTCH_END trace points
 915                          * appear at the end of resume(), because we may not
 916                          * return here
 917                          */
 918                 } else {
 919                         if (t->t_flag & T_INTR_THREAD)
 920                                 cpu_intr_swtch_exit(t);
 921                         /*
 922                          * Threads that enqueue themselves on a run queue defer
 923                          * setting t_waitrq. It is then either set in swtch()
 924                          * when the CPU is actually yielded, or not at all if it
 925                          * is remaining on the CPU.
 926                          * There is however a window between where the thread
 927                          * placed itself on a run queue, and where it selects
 928                          * itself in disp(), where a third party (e.g. clock()
 929                          * doing tick processing) may have re-enqueued this
 930                          * thread, setting t_waitrq in the process. We detect
 931                          * this race by noticing that despite switching to
 932                          * ourself, our t_waitrq has been set, and should be
 933                          * cleared.
 934                          */
 935                         if (t->t_waitrq != 0)
 936                                 t->t_waitrq = 0;
 937 
 938                         pg_ev_thread_remain(cp, t);
 939 
 940                         DTRACE_SCHED(remain__cpu);
 941                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 942                         (void) spl0();
 943                 }
 944         }
 945 }
 946 
 947 /*
 948  * swtch_from_zombie()
 949  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 950  *      eliminated from normal resume.
 951  *      Find best runnable thread and run it.
 952  *      Called with the current thread zombied.
 953  *      Zombies cannot migrate, so CPU references are safe.
 954  */
 955 void
 956 swtch_from_zombie()
 957 {
 958         kthread_t       *next;
 959         cpu_t           *cpu = CPU;
 960 
 961         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 962 
 963         ASSERT(curthread->t_state == TS_ZOMB);
 964 
 965         next = disp();                  /* returns with spl high */
 966         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 967         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 968         ASSERT(next != curthread);
 969         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 970 
 971         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 972 
 973         restore_mstate(next);
 974 
 975         if (dtrace_vtime_active)
 976                 dtrace_vtime_switch(next);
 977 
 978         resume_from_zombie(next);
 979         /*
 980          * The TR_RESUME_END and TR_SWTCH_END trace points
 981          * appear at the end of resume(), because we certainly will not
 982          * return here
 983          */
 984 }
 985 
 986 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 987 
 988 /*
 989  * search_disp_queues()
 990  *      Search the given dispatch queues for thread tp.
 991  *      Return 1 if tp is found, otherwise return 0.
 992  */
 993 static int
 994 search_disp_queues(disp_t *dp, kthread_t *tp)
 995 {
 996         dispq_t         *dq;
 997         dispq_t         *eq;
 998 
 999         disp_lock_enter_high(&dp->disp_lock);
1000 
1001         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1002                 kthread_t       *rp;
1003 
1004                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1005 
1006                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1007                         if (tp == rp) {
1008                                 disp_lock_exit_high(&dp->disp_lock);
1009                                 return (1);
1010                         }
1011         }
1012         disp_lock_exit_high(&dp->disp_lock);
1013 
1014         return (0);
1015 }
1016 
1017 /*
1018  * thread_on_queue()
1019  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1020  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1021  */
1022 static int
1023 thread_on_queue(kthread_t *tp)
1024 {
1025         cpu_t           *cp;
1026         struct cpupart  *part;
1027 
1028         ASSERT(getpil() >= DISP_LEVEL);
1029 
1030         /*
1031          * Search the per-CPU dispatch queues for tp.
1032          */
1033         cp = CPU;
1034         do {
1035                 if (search_disp_queues(cp->cpu_disp, tp))
1036                         return (1);
1037         } while ((cp = cp->cpu_next_onln) != CPU);
1038 
1039         /*
1040          * Search the partition-wide kpreempt queues for tp.
1041          */
1042         part = CPU->cpu_part;
1043         do {
1044                 if (search_disp_queues(&part->cp_kp_queue, tp))
1045                         return (1);
1046         } while ((part = part->cp_next) != CPU->cpu_part);
1047 
1048         return (0);
1049 }
1050 
1051 #else
1052 
1053 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1054 
1055 #endif  /* DEBUG */
1056 
1057 /*
1058  * Like swtch(), but switch to a specified thread taken from another CPU.
1059  *      Called with spl high.
1060  */
1061 void
1062 swtch_to(kthread_t *next)
1063 {
1064         cpu_t                   *cp = CPU;
1065         hrtime_t                now;
1066 
1067         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1068 
1069         /*
1070          * Update context switch statistics.
1071          */
1072         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1073 
1074         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1075 
1076         now = gethrtime_unscaled();
1077         pg_ev_thread_swtch(cp, now, curthread, next);
1078 
1079         /* OK to steal anything left on run queue */
1080         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1081 
1082         /* record last execution time */
1083         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1084 
1085         /*
1086          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1087          * won't have set its t_waitrq.  Since we now finally know that we're
1088          * switching away from this thread, set its t_waitrq if it is on a run
1089          * queue.
1090          */
1091         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1092                 curthread->t_waitrq = now;
1093         }
1094 
1095         /* restore next thread to previously running microstate */
1096         restore_mstate(next);
1097 
1098         if (dtrace_vtime_active)
1099                 dtrace_vtime_switch(next);
1100 
1101         resume(next);
1102         /*
1103          * The TR_RESUME_END and TR_SWTCH_END trace points
1104          * appear at the end of resume(), because we may not
1105          * return here
1106          */
1107 }
1108 
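     /*
      * A CPU is considered idle here when its dispatch priority is -1,
      * i.e. it is running its idle thread (see disp()).
      */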
1109 #define CPU_IDLING(pri) ((pri) == -1)
1110 
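     /*
      * cpu_resched - a thread of priority tpri has been made runnable on
      * cp's queue.  If it outranks what cp is currently running, request a
      * user (cpu_runrun) or kernel (cpu_kprunrun) preemption and poke the
      * CPU if it is not the current one.
      */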
1111 static void
1112 cpu_resched(cpu_t *cp, pri_t tpri)
1113 {
1114         int     call_poke_cpu = 0;
1115         pri_t   cpupri = cp->cpu_dispatch_pri;
1116 
1117         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1118                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1119                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1120                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1121                         cp->cpu_runrun = 1;
1122                         aston(cp->cpu_dispthread);
1123                         if (tpri < kpreemptpri && cp != CPU)
1124                                 call_poke_cpu = 1;
1125                 }
1126                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1127                         cp->cpu_kprunrun = 1;
1128                         if (cp != CPU)
1129                                 call_poke_cpu = 1;
1130                 }
1131         }
1132 
1133         /*
1134          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1135          */
1136         membar_enter();
1137 
1138         if (call_poke_cpu)
1139                 poke_cpu(cp->cpu_id);
1140 }
1141 
1142 /*
1143  * setbackdq() keeps runqs balanced such that the difference in length
1144  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1145  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1146  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1147  * try to keep runqs perfectly balanced regardless of the thread priority.
1148  */
1149 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1150 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1151 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1152 
1153 /*
1154  * Macro that evaluates to true if it is likely that the thread has cache
1155  * warmth. This is based on the amount of time that has elapsed since the
1156  * thread last ran. If that amount of time is less than "rechoose_interval"
1157  * ticks, then we decide that the thread has enough cache warmth to warrant
1158  * some affinity for t->t_cpu.
1159  */
1160 #define THREAD_HAS_CACHE_WARMTH(thread) \
1161         ((thread == curthread) ||       \
1162         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1163 /*
1164  * Put the specified thread on the back of the dispatcher
1165  * queue corresponding to its current priority.
1166  *
1167  * Called with the thread in transition, onproc or stopped state
1168  * and locked (transition implies locked) and at high spl.
1169  * Returns with the thread in TS_RUN state and still locked.
1170  */
1171 void
1172 setbackdq(kthread_t *tp)
1173 {
1174         dispq_t *dq;
1175         disp_t          *dp;
1176         cpu_t           *cp;
1177         pri_t           tpri;
1178         int             bound;
1179         boolean_t       self;
1180 
1181         ASSERT(THREAD_LOCK_HELD(tp));
1182         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1183         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1184 
1185         self = (tp == curthread);
1186 
1187         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1188                 bound = 1;
1189         else
1190                 bound = 0;
1191 
1192         tpri = DISP_PRIO(tp);
1193         if (ncpus == 1)
1194                 cp = tp->t_cpu;
1195         else if (!bound) {
1196                 if (tpri >= kpqpri) {
1197                         setkpdq(tp, SETKP_BACK);
1198                         return;
1199                 }
1200 
1201                 /*
1202                  * We'll generally let this thread continue to run where
1203                  * it last ran...but will consider migration if:
1204                  * - The thread probably doesn't have much cache warmth.
1205                  * - The CPU where it last ran is the target of an offline
1206                  *   request.
1207                  * - The thread last ran outside its home lgroup.
1208                  */
1209                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1210                     (tp->t_cpu == cpu_inmotion)) {
1211                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1212                 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1213                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1214                             self ? tp->t_cpu : NULL);
1215                 } else {
1216                         cp = tp->t_cpu;
1217                 }
1218 
1219                 if (tp->t_cpupart == cp->cpu_part) {
1220                         int     qlen;
1221 
1222                         /*
1223                          * Perform any CMT load balancing
1224                          */
1225                         cp = cmt_balance(tp, cp);
1226 
1227                         /*
1228                          * Balance across the run queues
1229                          */
1230                         qlen = RUNQ_LEN(cp, tpri);
1231                         if (tpri >= RUNQ_MATCH_PRI &&
1232                             !(tp->t_schedflag & TS_RUNQMATCH))
1233                                 qlen -= RUNQ_MAX_DIFF;
1234                         if (qlen > 0) {
1235                                 cpu_t *newcp;
1236 
1237                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1238                                         newcp = cp->cpu_next_part;
1239                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1240                                         newcp = cp->cpu_next_part;
1241                                 }
1242 
1243                                 if (RUNQ_LEN(newcp, tpri) < qlen) {
1244                                         DTRACE_PROBE3(runq__balance,
1245                                             kthread_t *, tp,
1246                                             cpu_t *, cp, cpu_t *, newcp);
1247                                         cp = newcp;
1248                                 }
1249                         }
1250                 } else {
1251                         /*
1252                          * Migrate to a cpu in the new partition.
1253                          */
1254                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1255                             tp->t_lpl, tp->t_pri, NULL);
1256                 }
1257                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1258         } else {
1259                 /*
1260                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1261                  * a short time until weak binding that existed when the
1262                  * strong binding was established has dropped) so we must
1263                  * favour weak binding over strong.
1264                  */
1265                 cp = tp->t_weakbound_cpu ?
1266                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1267         }
1268         /*
1269          * A thread that is ONPROC may be temporarily placed on the run queue
1270          * but then chosen to run again by disp.  If the thread we're placing on
1271          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1272          * replacement process is actually scheduled in swtch().  In this
1273          * situation, curthread is the only thread that could be in the ONPROC
1274          * state.
1275          */
1276         if ((!self) && (tp->t_waitrq == 0)) {
1277                 hrtime_t curtime;
1278 
1279                 curtime = gethrtime_unscaled();
1280                 (void) cpu_update_pct(tp, curtime);
1281                 tp->t_waitrq = curtime;
1282         } else {
1283                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1284         }
1285 
1286         dp = cp->cpu_disp;
1287         disp_lock_enter_high(&dp->disp_lock);
1288 
1289         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1290         TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1291             tpri, cp, tp);
1292 
1293 #ifndef NPROBE
1294         /* Kernel probe */
1295         if (tnf_tracing_active)
1296                 tnf_thread_queue(tp, cp, tpri);
1297 #endif /* NPROBE */
1298 
1299         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1300 
1301         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1302         tp->t_disp_queue = dp;
1303         tp->t_link = NULL;
1304 
1305         dq = &dp->disp_q[tpri];
1306         dp->disp_nrunnable++;
1307         if (!bound)
1308                 dp->disp_steal = 0;
1309         membar_enter();
1310 
1311         if (dq->dq_sruncnt++ != 0) {
1312                 ASSERT(dq->dq_first != NULL);
1313                 dq->dq_last->t_link = tp;
1314                 dq->dq_last = tp;
1315         } else {
1316                 ASSERT(dq->dq_first == NULL);
1317                 ASSERT(dq->dq_last == NULL);
1318                 dq->dq_first = dq->dq_last = tp;
1319                 BT_SET(dp->disp_qactmap, tpri);
1320                 if (tpri > dp->disp_maxrunpri) {
1321                         dp->disp_maxrunpri = tpri;
1322                         membar_enter();
1323                         cpu_resched(cp, tpri);
1324                 }
1325         }
1326 
1327         if (!bound && tpri > dp->disp_max_unbound_pri) {
1328                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1329                         /*
1330                          * If there are no other unbound threads on the
1331                          * run queue, don't allow other CPUs to steal
1332                          * this thread while we are in the middle of a
1333                          * context switch. We may just switch to it
1334                          * again right away. CPU_DISP_DONTSTEAL is cleared
1335                          * in swtch and swtch_to.
1336                          */
1337                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1338                 }
1339                 dp->disp_max_unbound_pri = tpri;
1340         }
1341         (*disp_enq_thread)(cp, bound);
1342 }
1343 
1344 /*
1345  * Put the specified thread on the front of the dispatcher
1346  * queue corresponding to its current priority.
1347  *
1348  * Called with the thread in transition, onproc or stopped state
1349  * and locked (transition implies locked) and at high spl.
1350  * Returns with the thread in TS_RUN state and still locked.
1351  */
1352 void
1353 setfrontdq(kthread_t *tp)
1354 {
1355         disp_t          *dp;
1356         dispq_t         *dq;
1357         cpu_t           *cp;
1358         pri_t           tpri;
1359         int             bound;
1360 
1361         ASSERT(THREAD_LOCK_HELD(tp));
1362         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1363         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1364 
1365         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1366                 bound = 1;
1367         else
1368                 bound = 0;
1369 
1370         tpri = DISP_PRIO(tp);
1371         if (ncpus == 1)
1372                 cp = tp->t_cpu;
1373         else if (!bound) {
1374                 if (tpri >= kpqpri) {
1375                         setkpdq(tp, SETKP_FRONT);
1376                         return;
1377                 }
1378                 cp = tp->t_cpu;
1379                 if (tp->t_cpupart == cp->cpu_part) {
1380                         /*
1381                          * We'll generally let this thread continue to run
1382                          * where it last ran, but will consider migration if:
1383                          * - The thread last ran outside its home lgroup.
1384                          * - The CPU where it last ran is the target of an
1385                          *   offline request (a thread_nomigrate() on the in
1386                          *   motion CPU relies on this when forcing a preempt).
1387                          * - The thread isn't the highest priority thread where
1388                          *   it last ran, and it is considered not likely to
1389                          *   have significant cache warmth.
1390                          */
1391                         if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1392                             (cp == cpu_inmotion)) {
1393                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1394                                     (tp == curthread) ? cp : NULL);
1395                         } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1396                             (!THREAD_HAS_CACHE_WARMTH(tp))) {
1397                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1398                                     NULL);
1399                         }
1400                 } else {
1401                         /*
1402                          * Migrate to a cpu in the new partition.
1403                          */
1404                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1405                             tp->t_lpl, tp->t_pri, NULL);
1406                 }
1407                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1408         } else {
1409                 /*
1410                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1411                  * a short time until weak binding that existed when the
1412                  * strong binding was established has dropped) so we must
1413                  * favour weak binding over strong.
1414                  */
1415                 cp = tp->t_weakbound_cpu ?
1416                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1417         }
1418 
1419         /*
1420          * A thread that is ONPROC may be temporarily placed on the run queue
1421          * but then chosen to run again by disp.  If the thread we're placing on
1422          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1423          * replacement process is actually scheduled in swtch().  In this
1424          * situation, curthread is the only thread that could be in the ONPROC
1425          * state.
1426          */
1427         if ((tp != curthread) && (tp->t_waitrq == 0)) {
1428                 hrtime_t curtime;
1429 
1430                 curtime = gethrtime_unscaled();
1431                 (void) cpu_update_pct(tp, curtime);
1432                 tp->t_waitrq = curtime;
1433         } else {
1434                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1435         }
1436 
1437         dp = cp->cpu_disp;
1438         disp_lock_enter_high(&dp->disp_lock);
1439 
1440         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1441         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1442 
1443 #ifndef NPROBE
1444         /* Kernel probe */
1445         if (tnf_tracing_active)
1446                 tnf_thread_queue(tp, cp, tpri);
1447 #endif /* NPROBE */
1448 
1449         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1450 
1451         THREAD_RUN(tp, &dp->disp_lock);          /* set TS_RUN state and lock */
1452         tp->t_disp_queue = dp;
1453 
1454         dq = &dp->disp_q[tpri];
1455         dp->disp_nrunnable++;
1456         if (!bound)
1457                 dp->disp_steal = 0;
1458         membar_enter();
1459 
1460         if (dq->dq_sruncnt++ != 0) {
1461                 ASSERT(dq->dq_last != NULL);
1462                 tp->t_link = dq->dq_first;
1463                 dq->dq_first = tp;
1464         } else {
1465                 ASSERT(dq->dq_last == NULL);
1466                 ASSERT(dq->dq_first == NULL);
1467                 tp->t_link = NULL;
1468                 dq->dq_first = dq->dq_last = tp;
1469                 BT_SET(dp->disp_qactmap, tpri);
1470                 if (tpri > dp->disp_maxrunpri) {
1471                         dp->disp_maxrunpri = tpri;
1472                         membar_enter();
1473                         cpu_resched(cp, tpri);
1474                 }
1475         }
1476 
1477         if (!bound && tpri > dp->disp_max_unbound_pri) {
1478                 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1479                     cp == CPU) {
1480                         /*
1481                          * If there are no other unbound threads on the
1482                          * run queue, don't allow other CPUs to steal
1483                          * this thread while we are in the middle of a
1484                          * context switch. We may just switch to it
1485                          * again right away. CPU_DISP_DONTSTEAL is cleared
1486                          * in swtch and swtch_to.
1487                          */
1488                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1489                 }
1490                 dp->disp_max_unbound_pri = tpri;
1491         }
1492         (*disp_enq_thread)(cp, bound);
1493 }
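
     /*
      * Illustrative sketch: the ex_* names below are hypothetical,
      * simplified types that only mirror the two enqueue cases handled in
      * setfrontdq() above: pushing onto a non-empty priority level versus
      * initializing an empty one.  Bitmap and maxrunpri maintenance is
      * omitted.
      */
     typedef struct ex_thr {
             struct ex_thr   *ex_link;               /* like t_link */
     } ex_thr_t;

     typedef struct ex_dispq {
             ex_thr_t        *ex_first;              /* like dq_first */
             ex_thr_t        *ex_last;               /* like dq_last */
             int             ex_sruncnt;             /* like dq_sruncnt */
     } ex_dispq_t;

     static void
     ex_push_front(ex_dispq_t *dq, ex_thr_t *tp)
     {
             if (dq->ex_sruncnt++ != 0) {
                     /* non-empty level: the new thread becomes the head */
                     tp->ex_link = dq->ex_first;
                     dq->ex_first = tp;
             } else {
                     /* empty level: the thread is both head and tail */
                     tp->ex_link = NULL;
                     dq->ex_first = dq->ex_last = tp;
             }
     }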
1494 
1495 /*
1496  * Put a high-priority unbound thread on the kp queue
1497  */
1498 static void
1499 setkpdq(kthread_t *tp, int borf)
1500 {
1501         dispq_t *dq;
1502         disp_t  *dp;
1503         cpu_t   *cp;
1504         pri_t   tpri;
1505 
1506         tpri = DISP_PRIO(tp);
1507 
1508         dp = &tp->t_cpupart->cp_kp_queue;
1509         disp_lock_enter_high(&dp->disp_lock);
1510 
1511         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1512 
1513         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1514         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1515         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1516         tp->t_disp_queue = dp;
1517         dp->disp_nrunnable++;
1518         dq = &dp->disp_q[tpri];
1519 
1520         if (dq->dq_sruncnt++ != 0) {
1521                 if (borf == SETKP_BACK) {
1522                         ASSERT(dq->dq_first != NULL);
1523                         tp->t_link = NULL;
1524                         dq->dq_last->t_link = tp;
1525                         dq->dq_last = tp;
1526                 } else {
1527                         ASSERT(dq->dq_last != NULL);
1528                         tp->t_link = dq->dq_first;
1529                         dq->dq_first = tp;
1530                 }
1531         } else {
1532                 if (borf == SETKP_BACK) {
1533                         ASSERT(dq->dq_first == NULL);
1534                         ASSERT(dq->dq_last == NULL);
1535                         dq->dq_first = dq->dq_last = tp;
1536                 } else {
1537                         ASSERT(dq->dq_last == NULL);
1538                         ASSERT(dq->dq_first == NULL);
1539                         tp->t_link = NULL;
1540                         dq->dq_first = dq->dq_last = tp;
1541                 }
1542                 BT_SET(dp->disp_qactmap, tpri);
1543                 if (tpri > dp->disp_max_unbound_pri)
1544                         dp->disp_max_unbound_pri = tpri;
1545                 if (tpri > dp->disp_maxrunpri) {
1546                         dp->disp_maxrunpri = tpri;
1547                         membar_enter();
1548                 }
1549         }
1550 
1551         cp = tp->t_cpu;
1552         if (tp->t_cpupart != cp->cpu_part) {
1553                 /* migrate to a cpu in the new partition */
1554                 cp = tp->t_cpupart->cp_cpulist;
1555         }
1556         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1557         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1558         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1559 
1560 #ifndef NPROBE
1561         /* Kernel probe */
1562         if (tnf_tracing_active)
1563                 tnf_thread_queue(tp, cp, tpri);
1564 #endif /* NPROBE */
1565 
1566         if (cp->cpu_chosen_level < tpri)
1567                 cp->cpu_chosen_level = tpri;
1568         cpu_resched(cp, tpri);
1569         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1570         (*disp_enq_thread)(cp, 0);
1571 }
1572 
1573 /*
1574  * Remove a thread from the dispatcher queue if it is on it.
1575  * It is not an error if it is not found, but we return whether
1576  * or not it was found so the caller can check.
1577  */
1578 int
1579 dispdeq(kthread_t *tp)
1580 {
1581         disp_t          *dp;
1582         dispq_t         *dq;
1583         kthread_t       *rp;
1584         kthread_t       *trp;
1585         kthread_t       **ptp;
1586         int             tpri;
1587 
1588         ASSERT(THREAD_LOCK_HELD(tp));
1589 
1590         if (tp->t_state != TS_RUN)
1591                 return (0);
1592 
1593         tpri = DISP_PRIO(tp);
1594         dp = tp->t_disp_queue;
1595         ASSERT(tpri < dp->disp_npri);
1596         dq = &dp->disp_q[tpri];
1597         ptp = &dq->dq_first;
1598         rp = *ptp;
1599         trp = NULL;
1600 
1601         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1602 
1603         /*
1604          * Search for thread in queue.
1605          * Double links would simplify this at the expense of disp/setrun.
1606          */
1607         while (rp != tp && rp != NULL) {
1608                 trp = rp;
1609                 ptp = &trp->t_link;
1610                 rp = trp->t_link;
1611         }
1612 
1613         if (rp == NULL) {
1614                 panic("dispdeq: thread not on queue");
1615         }
1616 
1617         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1618 
1619         /*
1620          * Found it so remove it from queue.
1621          */
1622         if ((*ptp = rp->t_link) == NULL)
1623                 dq->dq_last = trp;
1624 
1625         dp->disp_nrunnable--;
1626         if (--dq->dq_sruncnt == 0) {
1627                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1628                 if (dp->disp_nrunnable == 0) {
1629                         dp->disp_max_unbound_pri = -1;
1630                         dp->disp_maxrunpri = -1;
1631                 } else if (tpri == dp->disp_maxrunpri) {
1632                         int ipri;
1633 
1634                         ipri = bt_gethighbit(dp->disp_qactmap,
1635                             dp->disp_maxrunpri >> BT_ULSHIFT);
1636                         if (ipri < dp->disp_max_unbound_pri)
1637                                 dp->disp_max_unbound_pri = ipri;
1638                         dp->disp_maxrunpri = ipri;
1639                 }
1640         }
1641         tp->t_link = NULL;
1642         THREAD_TRANSITION(tp);          /* put in intermediate state */
1643         return (1);
1644 }
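
     /*
      * Illustrative sketch: the singly linked removal performed by dispdeq()
      * above, reduced to its essentials.  Keeping a pointer to the previous
      * link (ptp) lets the found element be unlinked without doubly linked
      * queues, at the cost of the linear search noted in the comment.  The
      * ex_* names are hypothetical simplifications, not kernel interfaces.
      */
     typedef struct ex_node {
             struct ex_node  *ex_link;
     } ex_node_t;

     static void
     ex_list_remove(ex_node_t **headp, ex_node_t **lastp, ex_node_t *tp)
     {
             ex_node_t       **ptp = headp;          /* link to patch */
             ex_node_t       *trp = NULL;            /* trailing node */
             ex_node_t       *rp = *ptp;

             while (rp != NULL && rp != tp) {
                     trp = rp;
                     ptp = &trp->ex_link;
                     rp = trp->ex_link;
             }
             if (rp == NULL)
                     return;                         /* not on the list */
             if ((*ptp = rp->ex_link) == NULL)
                     *lastp = trp;                   /* removed the tail */
             tp->ex_link = NULL;
     }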
1645 
1646 /*
1647  *      Make a thread give up its processor.  Find the processor on
1648  *      which this thread is executing, and have that processor
1649  *      preempt.
1650  *
1651  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1652  *      they are running at kernel priorities.  To implement this, we always
1653  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1654  *      calls cpu_surrender() very often, we only preempt if there is anyone
1655  *      competing with us.
1656  */
1657 void
1658 cpu_surrender(kthread_t *tp)
1659 {
1660         cpu_t   *cpup;
1661         int     max_pri;
1662         int     max_run_pri;
1663         klwp_t  *lwp;
1664 
1665         ASSERT(THREAD_LOCK_HELD(tp));
1666 
1667         if (tp->t_state != TS_ONPROC)
1668                 return;
1669         cpup = tp->t_disp_queue->disp_cpu;        /* CPU thread dispatched to */
1670         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1671         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1672         if (max_pri < max_run_pri)
1673                 max_pri = max_run_pri;
1674 
1675         if (tp->t_cid == sysdccid) {
1676                 uint_t t_pri = DISP_PRIO(tp);
1677                 if (t_pri > max_pri)
1678                         return;         /* we are not competing w/ anyone */
1679                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1680         } else {
1681                 cpup->cpu_runrun = 1;
1682                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1683                         cpup->cpu_kprunrun = 1;
1684                 }
1685         }
1686 
1687         /*
1688          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1689          */
1690         membar_enter();
1691 
1692         DTRACE_SCHED1(surrender, kthread_t *, tp);
1693 
1694         /*
1695          * Make the target thread take an excursion through trap()
1696          * to do preempt() (unless we're already in trap or post_syscall,
1697          * calling cpu_surrender via CL_TRAPRET).
1698          */
1699         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1700             lwp->lwp_state != LWP_USER) {
1701                 aston(tp);
1702                 if (cpup != CPU)
1703                         poke_cpu(cpup->cpu_id);
1704         }
1705         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1706             "cpu_surrender:tid %p cpu %p", tp, cpup);
1707 }
1708 
1709 /*
1710  * Commit to and ratify a scheduling decision
1711  */
1712 /*ARGSUSED*/
1713 static kthread_t *
1714 disp_ratify(kthread_t *tp, disp_t *kpq)
1715 {
1716         pri_t   tpri, maxpri;
1717         pri_t   maxkpri;
1718         cpu_t   *cpup;
1719 
1720         ASSERT(tp != NULL);
1721         /*
1722          * Commit to, then ratify scheduling decision
1723          */
1724         cpup = CPU;
1725         if (cpup->cpu_runrun != 0)
1726                 cpup->cpu_runrun = 0;
1727         if (cpup->cpu_kprunrun != 0)
1728                 cpup->cpu_kprunrun = 0;
1729         if (cpup->cpu_chosen_level != -1)
1730                 cpup->cpu_chosen_level = -1;
1731         membar_enter();
1732         tpri = DISP_PRIO(tp);
1733         maxpri = cpup->cpu_disp->disp_maxrunpri;
1734         maxkpri = kpq->disp_maxrunpri;
1735         if (maxpri < maxkpri)
1736                 maxpri = maxkpri;
1737         if (tpri < maxpri) {
1738                 /*
1739                  * should have done better
1740                  * put this one back and indicate to try again
1741                  */
1742                 cpup->cpu_dispthread = curthread;    /* fixup dispthread */
1743                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1744                 thread_lock_high(tp);
1745                 THREAD_TRANSITION(tp);
1746                 setfrontdq(tp);
1747                 thread_unlock_nopreempt(tp);
1748 
1749                 tp = NULL;
1750         }
1751         return (tp);
1752 }
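
     /*
      * Illustrative sketch: the ratification test above, isolated.  A
      * decision is revoked (and the thread requeued) when a higher priority
      * thread has appeared on either the local dispatch queue or the
      * partition's kernel preemption queue.  The ex_ name is hypothetical.
      */
     static int
     ex_should_requeue(pri_t tpri, disp_t *localq, disp_t *kpq)
     {
             pri_t   maxpri = localq->disp_maxrunpri;

             if (kpq->disp_maxrunpri > maxpri)
                     maxpri = kpq->disp_maxrunpri;
             return (tpri < maxpri);
     }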
1753 
1754 /*
1755  * See if there is any work on the dispatcher queue for other CPUs.
1756  * If there is, dequeue the best thread and return.
1757  */
1758 static kthread_t *
1759 disp_getwork(cpu_t *cp)
1760 {
1761         cpu_t           *ocp;           /* other CPU */
1762         cpu_t           *ocp_start;
1763         cpu_t           *tcp;           /* target local CPU */
1764         kthread_t       *tp;
1765         kthread_t       *retval = NULL;
1766         pri_t           maxpri;
1767         disp_t          *kpq;           /* kp queue for this partition */
1768         lpl_t           *lpl, *lpl_leaf;
1769         int             leafidx, startidx;
1770         hrtime_t        stealtime;
1771         lgrp_id_t       local_id;
1772 
1773         maxpri = -1;
1774         tcp = NULL;
1775 
1776         kpq = &cp->cpu_part->cp_kp_queue;
1777         while (kpq->disp_maxrunpri >= 0) {
1778                 /*
1779                  * Try to take a thread from the kp_queue.
1780                  */
1781                 tp = disp_getbest(kpq);
1782                 if (tp)
1783                         return (disp_ratify(tp, kpq));
1784         }
1785 
1786         kpreempt_disable();             /* protect the cpu_active list */
1787 
1788         /*
1789          * Try to find something to do on another CPU's run queue.
1790          * Loop through all other CPUs looking for the one with the highest
1791          * priority unbound thread.
1792          *
1793          * On NUMA machines, the partition's CPUs are consulted in order of
1794          * distance from the current CPU. This way, the first available
1795          * work found is also the closest, and will suffer the least
1796          * from being migrated.
1797          */
1798         lpl = lpl_leaf = cp->cpu_lpl;
1799         local_id = lpl_leaf->lpl_lgrpid;
1800         leafidx = startidx = 0;
1801 
1802         /*
1803          * This loop traverses the lpl hierarchy. Higher level lpls represent
1804          * broader levels of locality
1805          */
1806         do {
1807                 /* This loop iterates over the lpl's leaves */
1808                 do {
1809                         if (lpl_leaf != cp->cpu_lpl)
1810                                 ocp = lpl_leaf->lpl_cpus;
1811                         else
1812                                 ocp = cp->cpu_next_lpl;
1813 
1814                         /* This loop iterates over the CPUs in the leaf */
1815                         ocp_start = ocp;
1816                         do {
1817                                 pri_t pri;
1818 
1819                                 ASSERT(CPU_ACTIVE(ocp));
1820 
1821                                 /*
1822                                  * End our stroll around this lpl if:
1823                                  *
1824                                  * - Something became runnable on the local
1825                                  *   queue...which also ends our stroll around
1826                                  *   the partition.
1827                                  *
1828                                  * - We happen across another idle CPU.
1829                                  *   Since it is patrolling the next portion
1830                                  *   of the lpl's list (assuming it's not
1831                                  *   halted, or busy servicing an interrupt),
1832                                  *   move to the next higher level of locality.
1833                                  */
1834                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1835                                         kpreempt_enable();
1836                                         return (NULL);
1837                                 }
1838                                 if (ocp->cpu_dispatch_pri == -1) {
1839                                         if (ocp->cpu_disp_flags &
1840                                             CPU_DISP_HALTED ||
1841                                             ocp->cpu_intr_actv != 0)
1842                                                 continue;
1843                                         else
1844                                                 goto next_level;
1845                                 }
1846 
1847                                 /*
1848                                  * If there's only one thread and the CPU
1849                                  * is in the middle of a context switch,
1850                                  * or it's currently running the idle thread,
1851                                  * don't steal it.
1852                                  */
1853                                 if ((ocp->cpu_disp_flags &
1854                                     CPU_DISP_DONTSTEAL) &&
1855                                     ocp->cpu_disp->disp_nrunnable == 1)
1856                                         continue;
1857 
1858                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
1859                                 if (pri > maxpri) {
1860                                         /*
1861                                          * Don't steal threads that we attempted
1862                                          * to steal recently until they're ready
1863                                          * to be stolen again.
1864                                          */
1865                                         stealtime = ocp->cpu_disp->disp_steal;
1866                                         if (stealtime == 0 ||
1867                                             stealtime - gethrtime() <= 0) {
1868                                                 maxpri = pri;
1869                                                 tcp = ocp;
1870                                         } else {
1871                                                 /*
1872                                                  * Don't update tcp, just set
1873                                                  * the retval to T_DONTSTEAL, so
1874                                                  * that if no acceptable CPUs
1875                                                  * are found the return value
1876                                                  * will be T_DONTSTEAL rather
1877                                                  * then NULL.
1878                                                  */
1879                                                 retval = T_DONTSTEAL;
1880                                         }
1881                                 }
1882                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1883 
1884                         /*
1885                          * Iterate to the next leaf lpl in the resource set
1886                          * at this level of locality. If we hit the end of
1887                          * the set, wrap back around to the beginning.
1888                          *
1889                          * Note: This iteration is NULL terminated for a reason;
1890                          * see lpl_topo_bootstrap() in lgrp.c for details.
1891                          */
1892                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1893                                 leafidx = 0;
1894                                 lpl_leaf = lpl->lpl_rset[leafidx];
1895                         }
1896                 } while (leafidx != startidx);
1897 
1898 next_level:
1899                 /*
1900                  * Expand the search to include farther away CPUs (next
1901                  * locality level). The closer CPUs that have already been
1902                  * checked will be checked again. In doing so, idle CPUs
1903                  * will tend to be more aggressive about stealing from CPUs
1904                  * that are closer (since the closer CPUs will be considered
1905                  * more often).
1906                  * Begin at this level with the CPU's local leaf lpl.
1907                  */
1908                 if ((lpl = lpl->lpl_parent) != NULL) {
1909                         leafidx = startidx = lpl->lpl_id2rset[local_id];
1910                         lpl_leaf = lpl->lpl_rset[leafidx];
1911                 }
1912         } while (!tcp && lpl);
1913 
1914         kpreempt_enable();
1915 
1916         /*
1917          * If another queue looks good, and there is still nothing on
1918          * the local queue, try to transfer one or more threads
1919          * from it to our queue.
1920          */
1921         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1922                 tp = disp_getbest(tcp->cpu_disp);
1923                 if (tp == NULL || tp == T_DONTSTEAL)
1924                         return (tp);
1925                 return (disp_ratify(tp, kpq));
1926         }
1927         return (retval);
1928 }
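
     /*
      * Illustrative sketch: the per-queue steal gate used in disp_getwork()
      * above, isolated.  A queue advertises, via disp_steal, the earliest
      * time at which its threads may be taken; zero means no restriction.
      * The ex_ name is hypothetical.
      */
     static int
     ex_queue_stealable(hrtime_t disp_steal, hrtime_t now)
     {
             return (disp_steal == 0 || disp_steal - now <= 0);
     }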
1929 
1930 
1931 /*
1932  * disp_fix_unbound_pri()
1933  *      Determines the maximum priority of unbound threads on the queue.
1934  *      The priority is kept for the queue, but is only increased, never
1935  *      reduced unless some CPU is looking for something on that queue.
1936  *
1937  *      The priority argument is the known upper limit.
1938  *
1939  *      Perhaps this should be kept accurately, but that probably means
1940  *      separate bitmaps for bound and unbound threads.  Since only idled
1941  *      CPUs will have to do this recalculation, it seems better this way.
1942  */
1943 static void
1944 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1945 {
1946         kthread_t       *tp;
1947         dispq_t         *dq;
1948         ulong_t         *dqactmap = dp->disp_qactmap;
1949         ulong_t         mapword;
1950         int             wx;
1951 
1952         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1953 
1954         ASSERT(pri >= 0);                    /* checked by caller */
1955 
1956         /*
1957          * Start the search at the next lowest priority below the supplied
1958          * priority.  This depends on the bitmap implementation.
1959          */
1960         do {
1961                 wx = pri >> BT_ULSHIFT;           /* index of word in map */
1962 
1963                 /*
1964                  * Form mask for all lower priorities in the word.
1965                  */
1966                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1967 
1968                 /*
1969                  * Get next lower active priority.
1970                  */
1971                 if (mapword != 0) {
1972                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1973                 } else if (wx > 0) {
1974                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1975                         if (pri < 0)
1976                                 break;
1977                 } else {
1978                         pri = -1;
1979                         break;
1980                 }
1981 
1982                 /*
1983                  * Search the queue for unbound, runnable threads.
1984                  */
1985                 dq = &dp->disp_q[pri];
1986                 tp = dq->dq_first;
1987 
1988                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1989                         tp = tp->t_link;
1990                 }
1991 
1992                 /*
1993                  * If a thread was found, set the priority and return.
1994                  */
1995         } while (tp == NULL);
1996 
1997         /*
1998          * pri holds the maximum unbound thread priority or -1.
1999          */
2000         if (dp->disp_max_unbound_pri != pri)
2001                 dp->disp_max_unbound_pri = pri;
2002 }
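
     /*
      * Illustrative sketch: the "next lower active priority" step of
      * disp_fix_unbound_pri() above, isolated.  Within the word holding pri,
      * masking with BT_BIW(pri) - 1 keeps only the bits for strictly lower
      * priorities; once that word is exhausted the remaining words are
      * scanned with bt_gethighbit().  The ex_ name is hypothetical.
      */
     static pri_t
     ex_next_lower_pri(ulong_t *dqactmap, pri_t pri)
     {
             int     wx = pri >> BT_ULSHIFT;
             ulong_t mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

             if (mapword != 0)
                     return ((wx << BT_ULSHIFT) + highbit(mapword) - 1);
             if (wx > 0)
                     return (bt_gethighbit(dqactmap, wx - 1));
             return (-1);
     }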
2003 
2004 /*
2005  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2006  *      check if the CPU to which it was previously bound should have
2007  *      its disp_max_unbound_pri increased.
2008  */
2009 void
2010 disp_adjust_unbound_pri(kthread_t *tp)
2011 {
2012         disp_t *dp;
2013         pri_t tpri;
2014 
2015         ASSERT(THREAD_LOCK_HELD(tp));
2016 
2017         /*
2018          * Don't do anything if the thread is not bound, or
2019          * currently not runnable.
2020          */
2021         if (tp->t_bound_cpu == NULL ||
2022             tp->t_state != TS_RUN)
2023                 return;
2024 
2025         tpri = DISP_PRIO(tp);
2026         dp = tp->t_bound_cpu->cpu_disp;
2027         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2028         if (tpri > dp->disp_max_unbound_pri)
2029                 dp->disp_max_unbound_pri = tpri;
2030 }
2031 
2032 /*
2033  * disp_getbest()
2034  *   De-queue the highest priority unbound runnable thread.
2035  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2036  *   Returns NULL if nothing found.
2037  *   Returns T_DONTSTEAL if the thread was not stealable,
2038  *   so that the caller will try again later.
2039  *
2040  *   Passed a pointer to a dispatch queue that is not associated with
2041  *   this CPU.
2042  */
2043 static kthread_t *
2044 disp_getbest(disp_t *dp)
2045 {
2046         kthread_t       *tp;
2047         dispq_t         *dq;
2048         pri_t           pri;
2049         cpu_t           *cp, *tcp;
2050         boolean_t       allbound;
2051 
2052         disp_lock_enter(&dp->disp_lock);
2053 
2054         /*
2055          * If there is nothing to run, or the CPU is in the middle of a
2056          * context switch of the only thread, return NULL.
2057          */
2058         tcp = dp->disp_cpu;
2059         cp = CPU;
2060         pri = dp->disp_max_unbound_pri;
2061         if (pri == -1 ||
2062             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2063             tcp->cpu_disp->disp_nrunnable == 1)) {
2064                 disp_lock_exit_nopreempt(&dp->disp_lock);
2065                 return (NULL);
2066         }
2067 
2068         dq = &dp->disp_q[pri];
2069 
2070 
2071         /*
2072          * Assume that all threads are bound on this queue, and change it
2073          * later when we find out that it is not the case.
2074          */
2075         allbound = B_TRUE;
2076         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2077                 hrtime_t now, nosteal, rqtime;
2078 
2079                 /*
2080                  * Skip over bound threads which could be here even
2081                  * though disp_max_unbound_pri indicated this level.
2082                  */
2083                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2084                         continue;
2085 
2086                 /*
2087                  * We've got some unbound threads on this queue, so turn
2088                  * the allbound flag off now.
2089                  */
2090                 allbound = B_FALSE;
2091 
2092                 /*
2093                  * The thread is a candidate for stealing from its run queue. We
2094                  * don't want to steal threads that became runnable just a
2095                  * moment ago. This improves CPU affinity for threads that get
2096                  * preempted for short periods of time and go back on the run
2097                  * queue.
2098                  *
2099                  * We want to let it stay on its run queue if it was only placed
2100                  * there recently and it was running on the same CPU before that
2101                  * to preserve its cache investment. For the thread to remain on
2102                  * its run queue, ALL of the following conditions must be
2103                  * satisfied:
2104                  *
2105                  * - the disp queue should not be the kernel preemption queue
2106                  * - delayed idle stealing should not be disabled
2107                  * - nosteal_nsec should be non-zero
2108                  * - it should run with user priority
2109                  * - it should be on the run queue of the CPU where it was
2110                  *   running before being placed on the run queue
2111                  * - it should be the only thread on the run queue (to prevent
2112                  *   extra scheduling latency for other threads)
2113                  * - it should sit on the run queue for less than per-chip
2114                  *   nosteal interval or global nosteal interval
2115                  * - in case of CPUs with shared cache it should sit in a run
2116                  *   queue of a CPU from a different chip
2117                  *
2118                  * The checks are arranged so that the ones that are faster are
2119                  * placed earlier.
2120                  */
2121                 if (tcp == NULL ||
2122                     pri >= minclsyspri ||
2123                     tp->t_cpu != tcp)
2124                         break;
2125 
2126                 /*
2127                  * Steal immediately if, due to the CMT processor architecture,
2128                  * migration between cp and tcp would incur no performance
2129                  * penalty.
2130                  */
2131                 if (pg_cmt_can_migrate(cp, tcp))
2132                         break;
2133 
2134                 nosteal = nosteal_nsec;
2135                 if (nosteal == 0)
2136                         break;
2137 
2138                 /*
2139                  * Calculate time spent sitting on run queue
2140                  */
2141                 now = gethrtime_unscaled();
2142                 rqtime = now - tp->t_waitrq;
2143                 scalehrtime(&rqtime);
2144 
2145                 /*
2146                  * Steal immediately if the time spent on this run queue is more
2147                  * than allowed nosteal delay.
2148                  *
2149                  * Negative rqtime check is needed here to avoid infinite
2150                  * stealing delays caused by unlikely but not impossible
2151                  * drifts between CPU times on different CPUs.
2152                  */
2153                 if (rqtime > nosteal || rqtime < 0)
2154                         break;
2155 
2156                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2157                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2158                 scalehrtime(&now);
2159                 /*
2160                  * Calculate when this thread becomes stealable
2161                  */
2162                 now += (nosteal - rqtime);
2163 
2164                 /*
2165                  * Calculate time when some thread becomes stealable
2166                  */
2167                 if (now < dp->disp_steal)
2168                         dp->disp_steal = now;
2169         }
2170 
2171         /*
2172          * If there were no unbound threads on this queue, fix its stale
2173          * disp_max_unbound_pri before returning. The value of
2174          * disp_max_unbound_pri is not always accurate because it isn't
2175          * reduced until another idle CPU looks for work.
2176          */
2177         if (allbound)
2178                 disp_fix_unbound_pri(dp, pri);
2179 
2180         /*
2181          * If we reached the end of the queue and found no unbound threads
2182          * then return NULL so that other CPUs will be considered.  If there
2183          * are unbound threads but they cannot yet be stolen, then
2184          * return T_DONTSTEAL and try again later.
2185          */
2186         if (tp == NULL) {
2187                 disp_lock_exit_nopreempt(&dp->disp_lock);
2188                 return (allbound ? NULL : T_DONTSTEAL);
2189         }
2190 
2191         /*
2192          * Found a runnable, unbound thread, so remove it from queue.
2193          * dispdeq() requires that we have the thread locked, and we do,
2194          * by virtue of holding the dispatch queue lock.  dispdeq() will
2195          * put the thread in transition state, thereby dropping the dispq
2196          * lock.
2197          */
2198 
2199 #ifdef DEBUG
2200         {
2201                 int     thread_was_on_queue;
2202 
2203                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2204                 ASSERT(thread_was_on_queue);
2205         }
2206 
2207 #else /* DEBUG */
2208         (void) dispdeq(tp);                     /* drops disp_lock */
2209 #endif /* DEBUG */
2210 
2211         /*
2212          * Reset the disp_queue steal time - we do not know what the smallest
2213          * value across the queue is.
2214          */
2215         dp->disp_steal = 0;
2216 
2217         /*
2218          * Setup thread to run on the current CPU.
2219          */
2220         tp->t_disp_queue = cp->cpu_disp;
2221 
2222         cp->cpu_dispthread = tp;             /* protected by spl only */
2223         cp->cpu_dispatch_pri = pri;
2224 
2225         /*
2226          * There can be a memory synchronization race between disp_getbest()
2227          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2228          * to preempt the current thread to run the enqueued thread while
2229          * disp_getbest() and disp_ratify() are changing the current thread
2230          * to the stolen thread. This may lead to a situation where
2231          * cpu_resched() tries to preempt the wrong thread and the
2232          * stolen thread continues to run on the CPU which has been tagged
2233          * for preemption.
2234          * Later the clock thread gets enqueued but doesn't get to run on the
2235          * CPU causing the system to hang.
2236          *
2237          * To avoid this, grabbing and dropping the disp_lock (which does
2238          * a memory barrier) is needed to synchronize the execution of
2239          * cpu_resched() with disp_getbest() and disp_ratify() and
2240          * synchronize the memory read and written by cpu_resched(),
2241          * disp_getbest(), and disp_ratify() with each other.
2242          *  (see CR#6482861 for more details).
2243          */
2244         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2245         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2246 
2247         ASSERT(pri == DISP_PRIO(tp));
2248 
2249         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2250 
2251         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2252 
2253         /*
2254          * Return with spl high so that swtch() won't need to raise it.
2255          * The disp_lock was dropped by dispdeq().
2256          */
2257 
2258         return (tp);
2259 }
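
     /*
      * Illustrative sketch: the nosteal decision made in disp_getbest()
      * above, isolated.  Given the time the thread went runnable (t_waitrq),
      * the current unscaled time, and the allowed delay, return 0 if the
      * thread may be stolen immediately, otherwise the earliest (scaled)
      * time at which it becomes stealable, which the caller would fold into
      * disp_steal.  The ex_ name is hypothetical.
      */
     static hrtime_t
     ex_stealable_at(hrtime_t t_waitrq, hrtime_t now_unscaled, hrtime_t nosteal)
     {
             hrtime_t        rqtime = now_unscaled - t_waitrq;
             hrtime_t        now = now_unscaled;

             scalehrtime(&rqtime);
             if (rqtime > nosteal || rqtime < 0)
                     return (0);                     /* stealable right away */
             scalehrtime(&now);
             return (now + (nosteal - rqtime));      /* earliest steal time */
     }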
2260 
2261 /*
2262  * disp_bound_common() - common routine for higher level functions
2263  *      that check for bound threads under certain conditions.
2264  *      If 'threadlistsafe' is set then there is no need to acquire
2265  *      pidlock to stop the thread list from changing (eg, if
2266  *      disp_bound_* is called with cpus paused).
2267  */
2268 static int
2269 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2270 {
2271         int             found = 0;
2272         kthread_t       *tp;
2273 
2274         ASSERT(flag);
2275 
2276         if (!threadlistsafe)
2277                 mutex_enter(&pidlock);
2278         tp = curthread;         /* faster than allthreads */
2279         do {
2280                 if (tp->t_state != TS_FREE) {
2281                         /*
2282                          * If an interrupt thread is busy, but the
2283                          * caller doesn't care (i.e. BOUND_INTR is off),
2284                          * then just ignore it and continue through.
2285                          */
2286                         if ((tp->t_flag & T_INTR_THREAD) &&
2287                             !(flag & BOUND_INTR))
2288                                 continue;
2289 
2290                         /*
2291                          * Skip the idle thread for the CPU
2292                          * we're about to set offline.
2293                          */
2294                         if (tp == cp->cpu_idle_thread)
2295                                 continue;
2296 
2297                         /*
2298                          * Skip the pause thread for the CPU
2299                          * we're about to set offline.
2300                          */
2301                         if (tp == cp->cpu_pause_thread)
2302                                 continue;
2303 
2304                         if ((flag & BOUND_CPU) &&
2305                             (tp->t_bound_cpu == cp ||
2306                             tp->t_bind_cpu == cp->cpu_id ||
2307                             tp->t_weakbound_cpu == cp)) {
2308                                 found = 1;
2309                                 break;
2310                         }
2311 
2312                         if ((flag & BOUND_PARTITION) &&
2313                             (tp->t_cpupart == cp->cpu_part)) {
2314                                 found = 1;
2315                                 break;
2316                         }
2317                 }
2318         } while ((tp = tp->t_next) != curthread && found == 0);
2319         if (!threadlistsafe)
2320                 mutex_exit(&pidlock);
2321         return (found);
2322 }
2323 
2324 /*
2325  * disp_bound_threads - return nonzero if threads are bound to the processor.
2326  *      Called infrequently.  Keep this simple.
2327  *      Includes threads that are asleep or stopped but not onproc.
2328  */
2329 int
2330 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2331 {
2332         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2333 }
2334 
2335 /*
2336  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2337  * to the given processor, including interrupt threads.
2338  */
2339 int
2340 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2341 {
2342         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2343 }
2344 
2345 /*
2346  * disp_bound_partition - return nonzero if threads are bound to the same
2347  * partition as the processor.
2348  *      Called infrequently.  Keep this simple.
2349  *      Includes threads that are asleep or stopped but not onproc.
2350  */
2351 int
2352 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2353 {
2354         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2355 }
2356 
2357 /*
2358  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2359  * threads to other CPUs.
2360  */
2361 void
2362 disp_cpu_inactive(cpu_t *cp)
2363 {
2364         kthread_t       *tp;
2365         disp_t          *dp = cp->cpu_disp;
2366         dispq_t         *dq;
2367         pri_t           pri;
2368         int             wasonq;
2369 
2370         disp_lock_enter(&dp->disp_lock);
2371         while ((pri = dp->disp_max_unbound_pri) != -1) {
2372                 dq = &dp->disp_q[pri];
2373                 tp = dq->dq_first;
2374 
2375                 /*
2376                  * Skip over bound threads.
2377                  */
2378                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2379                         tp = tp->t_link;
2380                 }
2381 
2382                 if (tp == NULL) {
2383                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2384                         disp_fix_unbound_pri(dp, pri);
2385                         continue;
2386                 }
2387 
2388                 wasonq = dispdeq(tp);           /* drops disp_lock */
2389                 ASSERT(wasonq);
2390                 ASSERT(tp->t_weakbound_cpu == NULL);
2391 
2392                 setbackdq(tp);
2393                 /*
2394                  * Called from cpu_offline:
2395                  *
2396                  * cp has already been removed from the list of active cpus
2397                  * and tp->t_cpu has been changed so there is no risk of
2398                  * tp ending up back on cp.
2399                  *
2400                  * Called from cpupart_move_cpu:
2401                  *
2402                  * The cpu has moved to a new cpupart.  Any threads that
2403                  * were on its dispatch queues before the move remain
2404                  * in the old partition and can't run in the new partition.
2405                  */
2406                 ASSERT(tp->t_cpu != cp);
2407                 thread_unlock(tp);
2408 
2409                 disp_lock_enter(&dp->disp_lock);
2410         }
2411         disp_lock_exit(&dp->disp_lock);
2412 }
2413 
2414 /*
2415  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2416  *      The hint passed in is used as a starting point so we don't favor
2417  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2418  *      used CPU for the thread.
2419  *
2420  *      The lgroup and priority are used to determine the best CPU to run on
2421  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2422  *      the thread priority will indicate whether the thread will actually run
2423  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2424  *      lgroup which are running the lowest priority threads are found.  The
2425  *      remote CPU is chosen only if the thread will not run locally on a CPU
2426  *      within the lgroup, but will run on the remote CPU. If the thread
2427  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2428  *
2429  *      The lpl specified also identifies the cpu partition from which
2430  *      disp_lowpri_cpu should select a CPU.
2431  *
2432  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2433  *      behalf of the current thread. (curthread is looking for a new cpu)
2434  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2435  *      ignored.
2436  *
2437  *      If a cpu is the target of an offline request then try to avoid it.
2438  *
2439  *      This function must be called at either high SPL, or with preemption
2440  *      disabled, so that the "hint" CPU cannot be removed from the online
2441  *      CPU list while we are traversing it.
2442  */
2443 cpu_t *
2444 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2445 {
2446         cpu_t   *bestcpu;
2447         cpu_t   *besthomecpu;
2448         cpu_t   *cp, *cpstart;
2449 
2450         pri_t   bestpri;
2451         pri_t   cpupri;
2452 
2453         klgrpset_t      done;
2454         klgrpset_t      cur_set;
2455 
2456         lpl_t           *lpl_iter, *lpl_leaf;
2457         int             i;
2458 
2459         /*
2460          * Scan for a CPU currently running the lowest priority thread.
2461          * Cannot get cpu_lock here because it is adaptive.
2462          * We do not require a lock on the CPU list.
2463          */
2464         ASSERT(hint != NULL);
2465         ASSERT(lpl != NULL);
2466         ASSERT(lpl->lpl_ncpu > 0);
2467 
2468         /*
2469          * First examine local CPUs. Note that it's possible the hint CPU
2470          * passed in is remote to the specified home lgroup. If our priority
2471          * isn't sufficient to let us run immediately at home, then examine
2472          * CPUs remote to our home lgroup.
2473          * We would like to give preference to CPUs closest to "home".
2474          * If we can't find a CPU where we'll run at a given level
2475          * of locality, we expand our search to include the next level.
2476          */
2477         bestcpu = besthomecpu = NULL;
2478         klgrpset_clear(done);
2479         /* start with lpl we were passed */
2480 
2481         lpl_iter = lpl;
2482 
2483         do {
2484 
2485                 bestpri = SHRT_MAX;
2486                 klgrpset_clear(cur_set);
2487 
2488                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2489                         lpl_leaf = lpl_iter->lpl_rset[i];
2490                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2491                                 continue;
2492 
2493                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2494 
2495                         if (hint->cpu_lpl == lpl_leaf)
2496                                 cp = cpstart = hint;
2497                         else
2498                                 cp = cpstart = lpl_leaf->lpl_cpus;
2499 
2500                         do {
2501                                 if (cp == curcpu)
2502                                         cpupri = -1;
2503                                 else if (cp == cpu_inmotion)
2504                                         cpupri = SHRT_MAX;
2505                                 else
2506                                         cpupri = cp->cpu_dispatch_pri;
2507                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2508                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2509                                 if (cp->cpu_chosen_level > cpupri)
2510                                         cpupri = cp->cpu_chosen_level;
2511                                 if (cpupri < bestpri) {
2512                                         if (CPU_IDLING(cpupri)) {
2513                                                 ASSERT((cp->cpu_flags &
2514                                                     CPU_QUIESCED) == 0);
2515                                                 return (cp);
2516                                         }
2517                                         bestcpu = cp;
2518                                         bestpri = cpupri;
2519                                 }
2520                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2521                 }
2522 
2523                 if (bestcpu && (tpri > bestpri)) {
2524                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2525                         return (bestcpu);
2526                 }
2527                 if (besthomecpu == NULL)
2528                         besthomecpu = bestcpu;
2529                 /*
2530                  * Add the lgrps we just considered to the "done" set
2531                  */
2532                 klgrpset_or(done, cur_set);
2533 
2534         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2535 
2536         /*
2537          * The specified priority isn't high enough to run immediately
2538          * anywhere, so just return the best CPU from the home lgroup.
2539          */
2540         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2541         return (besthomecpu);
2542 }
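
     /*
      * Illustrative sketch: the per-CPU "effective priority" used by the
      * scan in disp_lowpri_cpu() above, isolated.  The caller's own CPU
      * counts as idle (-1) and the CPU targeted by an offline request counts
      * as busiest (SHRT_MAX), so it tends to be avoided.  The ex_ name is
      * hypothetical.
      */
     static pri_t
     ex_effective_pri(cpu_t *cp, cpu_t *curcpu)
     {
             pri_t   cpupri;

             if (cp == curcpu)
                     cpupri = -1;
             else if (cp == cpu_inmotion)
                     cpupri = SHRT_MAX;
             else
                     cpupri = cp->cpu_dispatch_pri;
             if (cp->cpu_disp->disp_maxrunpri > cpupri)
                     cpupri = cp->cpu_disp->disp_maxrunpri;
             if (cp->cpu_chosen_level > cpupri)
                     cpupri = cp->cpu_chosen_level;
             return (cpupri);
     }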
2543 
2544 /*
2545  * This routine provides the generic idle cpu function for all processors.
2546  * If a processor has some specific code to execute when idle (say, to stop
2547  * the pipeline and save power) then that routine should be defined in the
2548  * processors specific code (module_xx.c) and the global variable idle_cpu
2549  * set to that function.
2550  */
2551 static void
2552 generic_idle_cpu(void)
2553 {
2554 }
2555 
2556 /*ARGSUSED*/
2557 static void
2558 generic_enq_thread(cpu_t *cpu, int bound)
2559 {
2560 }