1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/signal.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/var.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/inline.h>
  42 #include <sys/disp.h>
  43 #include <sys/class.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/kmem.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/vtrace.h>
  48 #include <sys/tnf.h>
  49 #include <sys/cpupart.h>
  50 #include <sys/lgrp.h>
  51 #include <sys/pg.h>
  52 #include <sys/cmt.h>
  53 #include <sys/bitset.h>
  54 #include <sys/schedctl.h>
  55 #include <sys/atomic.h>
  56 #include <sys/dtrace.h>
  57 #include <sys/sdt.h>
  58 #include <sys/archsystm.h>
  59 
  60 #include <vm/as.h>
  61 
  62 #define BOUND_CPU       0x1
  63 #define BOUND_PARTITION 0x2
  64 #define BOUND_INTR      0x4
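/*
 * Flags describing the kinds of thread binding of interest when scanning
 * for bound threads: bound to a specific CPU, bound to a processor set
 * (CPU partition), or an interrupt thread bound to its CPU.  They may be
 * OR-ed together into a single mask.
 */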
  65 
  66 /* Dispatch queue allocation structure and functions */
  67 struct disp_queue_info {
  68         disp_t  *dp;
  69         dispq_t *olddispq;
  70         dispq_t *newdispq;
  71         ulong_t *olddqactmap;
  72         ulong_t *newdqactmap;
  73         int     oldnglobpris;
  74 };
  75 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76     disp_t *dp);
  77 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78 static void     disp_dq_free(struct disp_queue_info *dptr);
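/*
 * These helpers are used as a unit when the number of global priorities
 * grows: disp_dq_alloc() preallocates the larger queue array and active
 * bitmap while sleeping is still safe, disp_dq_assign() swaps the new
 * structures in and records the old ones, and disp_dq_free() releases
 * the old storage once it is again safe to sleep.
 */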
  79 
  80 /* platform-specific routine to call when processor is idle */
  81 static void     generic_idle_cpu();
  82 void            (*idle_cpu)() = generic_idle_cpu;
  83 
  84 /* routines invoked when a CPU enters/exits the idle loop */
  85 static void     idle_enter();
  86 static void     idle_exit();
  87 
  88 /* platform-specific routine to call when thread is enqueued */
  89 static void     generic_enq_thread(cpu_t *, int);
  90 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  91 
  92 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  93 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  94 pri_t   intr_pri;               /* interrupt thread priority base level */
  95 
  96 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  97 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  98 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
  99 disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 100 int     nswapped;               /* total number of swapped threads */
 101 void    disp_swapped_enq(kthread_t *tp);
 102 static void     disp_swapped_setrun(kthread_t *tp);
 103 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 104 
 105 /*
 106  * If this is set, only interrupt threads will cause kernel preemptions.
 107  * This is done by changing the value of kpreemptpri.  kpreemptpri
 108  * will either be the max sysclass pri + 1 or the min interrupt pri.
 109  */
 110 int     only_intr_kpreempt;
 111 
 112 extern void set_idle_cpu(int cpun);
 113 extern void unset_idle_cpu(int cpun);
 114 static void setkpdq(kthread_t *tp, int borf);
 115 #define SETKP_BACK      0
 116 #define SETKP_FRONT     1
 117 /*
 118  * Parameter that determines how recently a thread must have run
 119  * on a CPU to be considered loosely bound to that CPU (to reduce
 120  * cold cache effects).  The interval is measured in clock ticks.
 121  */
 122 #define RECHOOSE_INTERVAL 3
 123 int     rechoose_interval = RECHOOSE_INTERVAL;
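/*
 * For example, with the common clock rate of hz = 100, the default
 * rechoose_interval of 3 ticks corresponds to roughly 30ms: a thread
 * that last ran within that window is still treated as having cache
 * warmth (see THREAD_HAS_CACHE_WARMTH below).
 */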
 124 
 125 /*
 126  * Parameter that determines how long (in nanoseconds) a thread must
 127  * sit on a run queue before it can be stolen by another CPU, in order
 128  * to reduce migrations.
 129  *
 130  * nosteal_nsec should be set to an appropriate value by platform code
 131  * via cmp_set_nosteal_interval().  It is set to NOSTEAL_UNINITIALIZED
 132  * here to indicate that it has not yet been initialized.
 133  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 134  */
 136 #define NOSTEAL_UNINITIALIZED   (-1)
 137 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 138 extern void cmp_set_nosteal_interval(void);
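/*
 * The effect (see also T_DONTSTEAL below): when disp_getbest() finds a
 * candidate thread that has been sitting on its run queue for less than
 * nosteal_nsec, it declines to steal it and returns T_DONTSTEAL so that
 * the caller retries instead of going idle.
 */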
 139 
 140 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 141 
 142 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 143 disp_lock_t     stop_lock;              /* lock on stopped threads */
 144 
 145 static void     cpu_dispqalloc(int numpris);
 146 
 147 /*
 148  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 149  * a thread because it had been sitting on its run queue for only a
 150  * very short period of time.
 151  */
 152 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 153 
 154 static kthread_t        *disp_getwork(cpu_t *to);
 155 static kthread_t        *disp_getbest(disp_t *from);
 156 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 157 
 158 void    swtch_to(kthread_t *);
 159 
 160 /*
 161  * dispatcher and scheduler initialization
 162  */
 163 
 164 /*
 165  * disp_setup - Common code to calculate and allocate dispatcher
 166  *              variables and structures based on the maximum priority.
 167  */
 168 static void
 169 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 170 {
 171         pri_t   newnglobpris;
 172 
 173         ASSERT(MUTEX_HELD(&cpu_lock));
 174 
 175         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 176 
 177         if (newnglobpris > oldnglobpris) {
 178                 /*
 179                  * Allocate new kp queues for each CPU partition.
 180                  */
 181                 cpupart_kpqalloc(newnglobpris);
 182 
 183                 /*
 184                  * Allocate new dispatch queues for each CPU.
 185                  */
 186                 cpu_dispqalloc(newnglobpris);
 187 
 188                 /*
 189                  * compute new interrupt thread base priority
 190                  */
 191                 intr_pri = maxglobpri;
 192                 if (only_intr_kpreempt) {
 193                         kpreemptpri = intr_pri + 1;
 194                         if (kpqpri == KPQPRI)
 195                                 kpqpri = kpreemptpri;
 196                 }
 197                 v.v_nglobpris = newnglobpris;
 198         }
 199 }
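/*
 * Worked example (illustrative numbers): if the highest global priority
 * reported by any loaded class is 159 (e.g. the RT class) and LOCK_LEVEL
 * is 10, then newnglobpris is 159 + 1 + 10 = 170, and the partition kp
 * queues and per-CPU dispatch queues are grown to hold 170 priority
 * levels.
 */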
 200 
 201 /*
 202  * dispinit - Called to initialize all loaded classes and the
 203  *            dispatcher framework.
 204  */
 205 void
 206 dispinit(void)
 207 {
 208         id_t    cid;
 209         pri_t   maxglobpri;
 210         pri_t   cl_maxglobpri;
 211 
 212         maxglobpri = -1;
 213 
 214         /*
 215          * Initialize transition lock, which will always be set.
 216          */
 217         DISP_LOCK_INIT(&transition_lock);
 218         disp_lock_enter_high(&transition_lock);
 219         DISP_LOCK_INIT(&stop_lock);
 220 
 221         mutex_enter(&cpu_lock);
 222         CPU->cpu_disp->disp_maxrunpri = -1;
 223         CPU->cpu_disp->disp_max_unbound_pri = -1;
 224 
 225         /*
 226          * Initialize the default CPU partition.
 227          */
 228         cpupart_initialize_default();
 229         /*
 230          * Call the class specific initialization functions for
 231          * all pre-installed schedulers.
 232          *
 233          * We pass the size of a class specific parameter
 234          * buffer to each of the initialization functions
 235          * to try to catch problems with backward compatibility
 236          * of class modules.
 237          *
 238  * For example, a new class module running on an old system
 239  * that didn't provide sufficiently large parameter buffers
 240  * would be bad news.  Class initialization modules can check for
 241          * this and take action if they detect a problem.
 242          */
 243 
 244         for (cid = 0; cid < nclass; cid++) {
 245                 sclass_t        *sc;
 246 
 247                 sc = &sclass[cid];
 248                 if (SCHED_INSTALLED(sc)) {
 249                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 250                             &sc->cl_funcs);
 251                         if (cl_maxglobpri > maxglobpri)
 252                                 maxglobpri = cl_maxglobpri;
 253                 }
 254         }
 255         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 256         if (kpqpri == KPQPRI)
 257                 kpqpri = kpreemptpri;
 258 
 259         ASSERT(maxglobpri >= 0);
 260         disp_setup(maxglobpri, 0);
 261 
 262         mutex_exit(&cpu_lock);
 263 
 264         /*
 265          * Platform specific sticky scheduler setup.
 266          */
 267         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 268                 cmp_set_nosteal_interval();
 269 
 270         /*
 271          * Get the default class ID; this may be later modified via
 272          * dispadmin(1M).  This will load the class (normally TS) and that will
 273          * call disp_add(), which is why we had to drop cpu_lock first.
 274          */
 275         if (getcid(defaultclass, &defaultcid) != 0) {
 276                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 277                     defaultclass);
 278         }
 279 }
 280 
 281 /*
 282  * disp_add - Called with class pointer to initialize the dispatcher
 283  *            for a newly loaded class.
 284  */
 285 void
 286 disp_add(sclass_t *clp)
 287 {
 288         pri_t   maxglobpri;
 289         pri_t   cl_maxglobpri;
 290 
 291         mutex_enter(&cpu_lock);
 292         /*
 293          * Initialize the scheduler class.
 294          */
 295         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 296         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 297         if (cl_maxglobpri > maxglobpri)
 298                 maxglobpri = cl_maxglobpri;
 299 
 300         /*
 301          * Save old queue information.  Since we're initializing a
 302          * new scheduling class that has just been loaded, the size
 303          * of the dispq may have changed.  We need to handle that
 304          * here.
 305          */
 306         disp_setup(maxglobpri, v.v_nglobpris);
 307 
 308         mutex_exit(&cpu_lock);
 309 }
 310 
 311 
 312 /*
 313  * For each CPU, allocate new dispatch queues
 314  * with the stated number of priorities.
 315  */
 316 static void
 317 cpu_dispqalloc(int numpris)
 318 {
 319         cpu_t   *cpup;
 320         struct disp_queue_info  *disp_mem;
 321         int i, num;
 322 
 323         ASSERT(MUTEX_HELD(&cpu_lock));
 324 
 325         disp_mem = kmem_zalloc(NCPU *
 326             sizeof (struct disp_queue_info), KM_SLEEP);
 327 
 328         /*
 329          * This routine must allocate all of the memory before stopping
 330          * the CPUs because it must not sleep in kmem_alloc while the
 331          * CPUs are stopped; locks they hold would not be released until
 332          * they are restarted.
 333          */
 334         i = 0;
 335         cpup = cpu_list;
 336         do {
 337                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 338                 i++;
 339                 cpup = cpup->cpu_next;
 340         } while (cpup != cpu_list);
 341         num = i;
 342 
 343         pause_cpus(NULL, NULL);
 344         for (i = 0; i < num; i++)
 345                 disp_dq_assign(&disp_mem[i], numpris);
 346         start_cpus();
 347 
 348         /*
 349          * All of the memory must be freed after restarting the CPUs, because
 350          * we cannot risk sleeping in kmem_free while the CPUs are stopped.
 351          */
 352         for (i = 0; i < num; i++)
 353                 disp_dq_free(&disp_mem[i]);
 354 
 355         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 356 }
 357 
 358 static void
 359 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 360 {
 361         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 362         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 363             sizeof (long), KM_SLEEP);
 364         dptr->dp = dp;
 365 }
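/*
 * The active-queue bitmap needs one bit per priority level, hence the
 * (numpris / BT_NBIPUL) + 1 words allocated above; for example, with
 * 170 priorities on a 64-bit kernel (BT_NBIPUL == 64) this is 3 words.
 */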
 366 
 367 static void
 368 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 369 {
 370         disp_t  *dp;
 371 
 372         dp = dptr->dp;
 373         dptr->olddispq = dp->disp_q;
 374         dptr->olddqactmap = dp->disp_qactmap;
 375         dptr->oldnglobpris = dp->disp_npri;
 376 
 377         ASSERT(dptr->oldnglobpris < numpris);
 378 
 379         if (dptr->olddispq != NULL) {
 380                 /*
 381                  * Use kcopy because bcopy is platform-specific
 382                  * and could block while we might have paused the cpus.
 383                  */
 384                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 385                     dptr->oldnglobpris * sizeof (dispq_t));
 386                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 387                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 388                     sizeof (long));
 389         }
 390         dp->disp_q = dptr->newdispq;
 391         dp->disp_qactmap = dptr->newdqactmap;
 392         dp->disp_q_limit = &dptr->newdispq[numpris];
 393         dp->disp_npri = numpris;
 394 }
 395 
 396 static void
 397 disp_dq_free(struct disp_queue_info *dptr)
 398 {
 399         if (dptr->olddispq != NULL)
 400                 kmem_free(dptr->olddispq,
 401                     dptr->oldnglobpris * sizeof (dispq_t));
 402         if (dptr->olddqactmap != NULL)
 403                 kmem_free(dptr->olddqactmap,
 404                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 405 }
 406 
 407 /*
 408  * For a newly created CPU, initialize the dispatch queue.
 409  * This is called before the CPU is known through cpu[] or on any lists.
 410  */
 411 void
 412 disp_cpu_init(cpu_t *cp)
 413 {
 414         disp_t  *dp;
 415         dispq_t *newdispq;
 416         ulong_t *newdqactmap;
 417 
 418         ASSERT(MUTEX_HELD(&cpu_lock));      /* protect dispatcher queue sizes */
 419 
 420         if (cp == cpu0_disp.disp_cpu)
 421                 dp = &cpu0_disp;
 422         else
 423                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 424         bzero(dp, sizeof (disp_t));
 425         cp->cpu_disp = dp;
 426         dp->disp_cpu = cp;
 427         dp->disp_maxrunpri = -1;
 428         dp->disp_max_unbound_pri = -1;
 429         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 430         /*
 431          * Allocate memory for the dispatcher queue headers
 432          * and the active queue bitmap.
 433          */
 434         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 435         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 436             sizeof (long), KM_SLEEP);
 437         dp->disp_q = newdispq;
 438         dp->disp_qactmap = newdqactmap;
 439         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 440         dp->disp_npri = v.v_nglobpris;
 441 }
 442 
 443 void
 444 disp_cpu_fini(cpu_t *cp)
 445 {
 446         ASSERT(MUTEX_HELD(&cpu_lock));
 447 
 448         disp_kp_free(cp->cpu_disp);
 449         if (cp->cpu_disp != &cpu0_disp)
 450                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 451 }
 452 
 453 /*
 454  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 455  */
 456 void
 457 disp_kp_alloc(disp_t *dq, pri_t npri)
 458 {
 459         struct disp_queue_info  mem_info;
 460 
 461         if (npri > dq->disp_npri) {
 462                 /*
 463                  * Allocate memory for the new array.
 464                  */
 465                 disp_dq_alloc(&mem_info, npri, dq);
 466 
 467                 /*
 468                  * We need to copy the old structures to the new
 469                  * and free the old.
 470                  */
 471                 disp_dq_assign(&mem_info, npri);
 472                 disp_dq_free(&mem_info);
 473         }
 474 }
 475 
 476 /*
 477  * Free dispatch queue.
 478  * Used for the kpreempt queues for a removed CPU partition and
 479  * for the per-CPU queues of deleted CPUs.
 480  */
 481 void
 482 disp_kp_free(disp_t *dq)
 483 {
 484         struct disp_queue_info  mem_info;
 485 
 486         mem_info.olddispq = dq->disp_q;
 487         mem_info.olddqactmap = dq->disp_qactmap;
 488         mem_info.oldnglobpris = dq->disp_npri;
 489         disp_dq_free(&mem_info);
 490 }
 491 
 492 /*
 493  * End dispatcher and scheduler initialization.
 494  */
 495 
 496 /*
 497  * See if there's anything to do other than remain idle.
 498  * Return non-zero if there is.
 499  *
 500  * This function must be called with high spl, or with
 501  * kernel preemption disabled to prevent the partition's
 502  * active cpu list from changing while being traversed.
 503  *
 504  * This is essentially a simpler version of disp_getwork()
 505  * to be called by CPUs preparing to "halt".
 506  */
 507 int
 508 disp_anywork(void)
 509 {
 510         cpu_t           *cp = CPU;
 511         cpu_t           *ocp;
 512         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 513 
 514         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 515                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 516                         return (1);
 517 
 518                 for (ocp = cp->cpu_next_part; ocp != cp;
 519                     ocp = ocp->cpu_next_part) {
 520                         ASSERT(CPU_ACTIVE(ocp));
 521 
 522                         /*
 523                          * Something has appeared on the local run queue.
 524                          */
 525                         if (*local_nrunnable > 0)
 526                                 return (1);
 527                         /*
 528                          * If we encounter another idle CPU that will
 529                          * soon be trolling around through disp_anywork(),
 530                          * terminate our walk here and let this other CPU
 531                          * patrol the next part of the list.
 532                          */
 533                         if (ocp->cpu_dispatch_pri == -1 &&
 534                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 535                                 return (0);
 536                         /*
 537                          * Work can be taken from another CPU if:
 538                          *      - There is unbound work on the run queue
 539                          *      - That work isn't a thread undergoing a
 540                          *        context switch on an otherwise empty queue.
 541                          *      - The CPU isn't running the idle loop.
 542                          */
 543                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 544                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 545                             ocp->cpu_disp->disp_nrunnable == 1) &&
 546                             ocp->cpu_dispatch_pri != -1)
 547                                 return (1);
 548                 }
 549         }
 550         return (0);
 551 }
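/*
 * A typical caller (the routine name here is illustrative) is a platform
 * idle/halt routine such as cpu_halt(), which re-checks disp_anywork()
 * with preemption disabled immediately before halting and skips the halt
 * if work has appeared.
 */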
 552 
 553 /*
 554  * Called when CPU enters the idle loop
 555  */
 556 static void
 557 idle_enter()
 558 {
 559         cpu_t           *cp = CPU;
 560 
 561         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 562         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 563         set_idle_cpu(cp->cpu_id);    /* arch-dependent hook */
 564 }
 565 
 566 /*
 567  * Called when CPU exits the idle loop
 568  */
 569 static void
 570 idle_exit()
 571 {
 572         cpu_t           *cp = CPU;
 573 
 574         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 575         unset_idle_cpu(cp->cpu_id);  /* arch-dependent hook */
 576 }
 577 
 578 /*
 579  * Idle loop.
 580  */
 581 void
 582 idle()
 583 {
 584         struct cpu      *cp = CPU;              /* pointer to this CPU */
 585         kthread_t       *t;                     /* taken thread */
 586 
 587         idle_enter();
 588 
 589         /*
 590          * Uniprocessor version of idle loop.
 591          * Do this until notified that we're on an actual multiprocessor.
 592          */
 593         while (ncpus == 1) {
 594                 if (cp->cpu_disp->disp_nrunnable == 0) {
 595                         (*idle_cpu)();
 596                         continue;
 597                 }
 598                 idle_exit();
 599                 swtch();
 600 
 601                 idle_enter(); /* returned from swtch */
 602         }
 603 
 604         /*
 605          * Multiprocessor idle loop.
 606          */
 607         for (;;) {
 608                 /*
 609                  * If CPU is completely quiesced by p_online(2), just wait
 610                  * here with minimal bus traffic until put online.
 611                  */
 612                 while (cp->cpu_flags & CPU_QUIESCED)
 613                         (*idle_cpu)();
 614 
 615                 if (cp->cpu_disp->disp_nrunnable != 0) {
 616                         idle_exit();
 617                         swtch();
 618                 } else {
 619                         if (cp->cpu_flags & CPU_OFFLINE)
 620                                 continue;
 621                         if ((t = disp_getwork(cp)) == NULL) {
 622                                 if (cp->cpu_chosen_level != -1) {
 623                                         disp_t *dp = cp->cpu_disp;
 624                                         disp_t *kpq;
 625 
 626                                         disp_lock_enter(&dp->disp_lock);
 627                                         /*
 628                                          * Set kpq under lock to prevent
 629                                          * migration between partitions.
 630                                          */
 631                                         kpq = &cp->cpu_part->cp_kp_queue;
 632                                         if (kpq->disp_maxrunpri == -1)
 633                                                 cp->cpu_chosen_level = -1;
 634                                         disp_lock_exit(&dp->disp_lock);
 635                                 }
 636                                 (*idle_cpu)();
 637                                 continue;
 638                         }
 639                         /*
 640                          * If there was a thread but we couldn't steal
 641                          * it, then keep trying.
 642                          */
 643                         if (t == T_DONTSTEAL)
 644                                 continue;
 645                         idle_exit();
 646                         swtch_to(t);
 647                 }
 648                 idle_enter(); /* returned from swtch/swtch_to */
 649         }
 650 }
 651 
 652 
 653 /*
 654  * Preempt the currently running thread in favor of the highest
 655  * priority thread.  The class of the current thread controls
 656  * where it goes on the dispatcher queues. If panicking, turn
 657  * preemption off.
 658  */
 659 void
 660 preempt()
 661 {
 662         kthread_t       *t = curthread;
 663         klwp_t          *lwp = ttolwp(curthread);
 664 
 665         if (panicstr)
 666                 return;
 667 
 668         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 669 
 670         thread_lock(t);
 671 
 672         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 673                 /*
 674                  * this thread has already been chosen to be run on
 675                  * another CPU. Clear kprunrun on this CPU since we're
 676                  * already headed for swtch().
 677                  */
 678                 CPU->cpu_kprunrun = 0;
 679                 thread_unlock_nopreempt(t);
 680                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 681         } else {
 682                 if (lwp != NULL)
 683                         lwp->lwp_ru.nivcsw++;
 684                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 685                 THREAD_TRANSITION(t);
 686                 CL_PREEMPT(t);
 687                 DTRACE_SCHED(preempt);
 688                 thread_unlock_nopreempt(t);
 689 
 690                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 691 
 692                 swtch();                /* clears CPU->cpu_runrun via disp() */
 693         }
 694 }
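/*
 * CL_PREEMPT() invokes the thread's class-specific preemption handler,
 * which for the standard classes places the thread back on a dispatch
 * queue (via setfrontdq()/setbackdq()); the subsequent swtch() then
 * selects the best runnable thread through disp().
 */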
 695 
 696 extern kthread_t *thread_unpin();
 697 
 698 /*
 699  * disp() - find the highest priority thread for this processor to run, and
 700  * set it in TS_ONPROC state so that resume() can be called to run it.
 701  */
 702 static kthread_t *
 703 disp()
 704 {
 705         cpu_t           *cpup;
 706         disp_t          *dp;
 707         kthread_t       *tp;
 708         dispq_t         *dq;
 709         int             maxrunword;
 710         pri_t           pri;
 711         disp_t          *kpq;
 712 
 713         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 714 
 715         cpup = CPU;
 716         /*
 717          * Find the highest priority loaded, runnable thread.
 718          */
 719         dp = cpup->cpu_disp;
 720 
 721 reschedule:
 722         /*
 723          * If there is more important work on the global queue with a better
 724          * priority than the maximum on this CPU, take it now.
 725          */
 726         kpq = &cpup->cpu_part->cp_kp_queue;
 727         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 728             pri >= dp->disp_maxrunpri &&
 729             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 730             (tp = disp_getbest(kpq)) != NULL) {
 731                 if (disp_ratify(tp, kpq) != NULL) {
 732                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 733                             "disp_end:tid %p", tp);
 734                         return (tp);
 735                 }
 736         }
 737 
 738         disp_lock_enter(&dp->disp_lock);
 739         pri = dp->disp_maxrunpri;
 740 
 741         /*
 742          * If there is nothing to run, look at what's runnable on other queues.
 743          * Choose the idle thread if the CPU is quiesced.
 744          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 745          * interrupt threads, which will be the only threads on the CPU's own
 746          * queue, but cannot run threads from other queues.
 747          */
 748         if (pri == -1) {
 749                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 750                         disp_lock_exit(&dp->disp_lock);
 751                         if ((tp = disp_getwork(cpup)) == NULL ||
 752                             tp == T_DONTSTEAL) {
 753                                 tp = cpup->cpu_idle_thread;
 754                                 (void) splhigh();
 755                                 THREAD_ONPROC(tp, cpup);
 756                                 cpup->cpu_dispthread = tp;
 757                                 cpup->cpu_dispatch_pri = -1;
 758                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 759                                 cpup->cpu_chosen_level = -1;
 760                         }
 761                 } else {
 762                         disp_lock_exit_high(&dp->disp_lock);
 763                         tp = cpup->cpu_idle_thread;
 764                         THREAD_ONPROC(tp, cpup);
 765                         cpup->cpu_dispthread = tp;
 766                         cpup->cpu_dispatch_pri = -1;
 767                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 768                         cpup->cpu_chosen_level = -1;
 769                 }
 770                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 771                     "disp_end:tid %p", tp);
 772                 return (tp);
 773         }
 774 
 775         dq = &dp->disp_q[pri];
 776         tp = dq->dq_first;
 777 
 778         ASSERT(tp != NULL);
 779         ASSERT(tp->t_schedflag & TS_LOAD);       /* thread must be swapped in */
 780 
 781         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 782 
 783         /*
 784          * Found it so remove it from queue.
 785          */
 786         dp->disp_nrunnable--;
 787         dq->dq_sruncnt--;
 788         if ((dq->dq_first = tp->t_link) == NULL) {
 789                 ulong_t *dqactmap = dp->disp_qactmap;
 790 
 791                 ASSERT(dq->dq_sruncnt == 0);
 792                 dq->dq_last = NULL;
 793 
 794                 /*
 795                  * The queue is empty, so the corresponding bit needs to be
 796                  * turned off in dqactmap.  If nrunnable != 0, we just took
 797                  * the last runnable thread off the highest-priority queue,
 798                  * so recompute disp_maxrunpri.
 799                  */
 800                 maxrunword = pri >> BT_ULSHIFT;
 801                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 802 
 803                 if (dp->disp_nrunnable == 0) {
 804                         dp->disp_max_unbound_pri = -1;
 805                         dp->disp_maxrunpri = -1;
 806                 } else {
 807                         int ipri;
 808 
 809                         ipri = bt_gethighbit(dqactmap, maxrunword);
 810                         dp->disp_maxrunpri = ipri;
 811                         if (ipri < dp->disp_max_unbound_pri)
 812                                 dp->disp_max_unbound_pri = ipri;
 813                 }
 814         } else {
 815                 tp->t_link = NULL;
 816         }
 817 
 818         /*
 819          * Set TS_DONT_SWAP flag to prevent another processor from swapping
 820          * out this thread before we have a chance to run it.
 821          * While running, it is protected against swapping by t_lock.
 822          */
 823         tp->t_schedflag |= TS_DONT_SWAP;
 824         cpup->cpu_dispthread = tp;           /* protected by spl only */
 825         cpup->cpu_dispatch_pri = pri;
 826         ASSERT(pri == DISP_PRIO(tp));
 827         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 828         disp_lock_exit_high(&dp->disp_lock);     /* drop run queue lock */
 829 
 830         ASSERT(tp != NULL);
 831         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 832             "disp_end:tid %p", tp);
 833 
 834         if (disp_ratify(tp, kpq) == NULL)
 835                 goto reschedule;
 836 
 837         return (tp);
 838 }
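/*
 * Note that disp() may return the CPU's idle thread when nothing is
 * runnable; callers such as swtch() simply resume whatever thread is
 * returned, so the idle loop is entered through an ordinary context
 * switch.
 */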
 839 
 840 /*
 841  * swtch()
 842  *      Find best runnable thread and run it.
 843  *      Called with the current thread already switched to a new state,
 844  *      on a sleep queue, run queue, stopped, and not zombied.
 845  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 846  *      Always drops spl to the base level (spl0()).
 847  */
 848 void
 849 swtch()
 850 {
 851         kthread_t       *t = curthread;
 852         kthread_t       *next;
 853         cpu_t           *cp;
 854 
 855         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 856 
 857         if (t->t_flag & T_INTR_THREAD)
 858                 cpu_intr_swtch_enter(t);
 859 
 860         if (t->t_intr != NULL) {
 861                 /*
 862                  * We are an interrupt thread.  Setup and return
 863                  * the interrupted thread to be resumed.
 864                  */
 865                 (void) splhigh();       /* block other scheduler action */
 866                 cp = CPU;               /* now protected against migration */
 867                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 868                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 869                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 870                 next = thread_unpin();
 871                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 872                 resume_from_intr(next);
 873         } else {
 874 #ifdef  DEBUG
 875                 if (t->t_state == TS_ONPROC &&
 876                     t->t_disp_queue->disp_cpu == CPU &&
 877                     t->t_preempt == 0) {
 878                         thread_lock(t);
 879                         ASSERT(t->t_state != TS_ONPROC ||
 880                             t->t_disp_queue->disp_cpu != CPU ||
 881                             t->t_preempt != 0);      /* cannot migrate */
 882                         thread_unlock_nopreempt(t);
 883                 }
 884 #endif  /* DEBUG */
 885                 cp = CPU;
 886                 next = disp();          /* returns with spl high */
 887                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 888 
 889                 /* OK to steal anything left on run queue */
 890                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 891 
 892                 if (next != t) {
 893                         hrtime_t now;
 894 
 895                         now = gethrtime_unscaled();
 896                         pg_ev_thread_swtch(cp, now, t, next);
 897 
 898                         /*
 899                          * If t was previously in the TS_ONPROC state,
 900                          * setfrontdq and setbackdq won't have set its t_waitrq.
 901                          * Since we now finally know that we're switching away
 902                          * from this thread, set its t_waitrq if it is on a run
 903                          * queue.
 904                          */
 905                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 906                                 t->t_waitrq = now;
 907                         }
 908 
 909                         /*
 910                          * restore mstate of thread that we are switching to
 911                          */
 912                         restore_mstate(next);
 913 
 914                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 915                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 916                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 917 
 918                         if (dtrace_vtime_active)
 919                                 dtrace_vtime_switch(next);
 920 
 921                         resume(next);
 922                         /*
 923                          * The TR_RESUME_END and TR_SWTCH_END trace points
 924                          * appear at the end of resume(), because we may not
 925                          * return here
 926                          */
 927                 } else {
 928                         if (t->t_flag & T_INTR_THREAD)
 929                                 cpu_intr_swtch_exit(t);
 930                         /*
 931                          * Threads that enqueue themselves on a run queue defer
 932                          * setting t_waitrq. It is then either set in swtch()
 933                          * when the CPU is actually yielded, or not at all if it
 934                          * is remaining on the CPU.
 935                          * There is however a window between where the thread
 936                          * placed itself on a run queue, and where it selects
 937                          * itself in disp(), where a third party (e.g. clock()
 938                          * doing tick processing) may have re-enqueued this
 939                          * thread, setting t_waitrq in the process.  We detect
 940                          * this race by noticing that, despite switching to
 941                          * ourselves, our t_waitrq has been set and should be
 942                          * cleared.
 943                          */
 944                         if (t->t_waitrq != 0)
 945                                 t->t_waitrq = 0;
 946 
 947                         pg_ev_thread_remain(cp, t);
 948 
 949                         DTRACE_SCHED(remain__cpu);
 950                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 951                         (void) spl0();
 952                 }
 953         }
 954 }
 955 
 956 /*
 957  * swtch_from_zombie()
 958  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 959  *      eliminated from normal resume.
 960  *      Find best runnable thread and run it.
 961  *      Called with the current thread zombied.
 962  *      Zombies cannot migrate, so CPU references are safe.
 963  */
 964 void
 965 swtch_from_zombie()
 966 {
 967         kthread_t       *next;
 968         cpu_t           *cpu = CPU;
 969 
 970         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 971 
 972         ASSERT(curthread->t_state == TS_ZOMB);
 973 
 974         next = disp();                  /* returns with spl high */
 975         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 976         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 977         ASSERT(next != curthread);
 978         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 979 
 980         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 981 
 982         restore_mstate(next);
 983 
 984         if (dtrace_vtime_active)
 985                 dtrace_vtime_switch(next);
 986 
 987         resume_from_zombie(next);
 988         /*
 989          * The TR_RESUME_END and TR_SWTCH_END trace points
 990          * appear at the end of resume(), because we certainly will not
 991          * return here
 992          */
 993 }
 994 
 995 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 996 
 997 /*
 998  * search_disp_queues()
 999  *      Search the given dispatch queues for thread tp.
1000  *      Return 1 if tp is found, otherwise return 0.
1001  */
1002 static int
1003 search_disp_queues(disp_t *dp, kthread_t *tp)
1004 {
1005         dispq_t         *dq;
1006         dispq_t         *eq;
1007 
1008         disp_lock_enter_high(&dp->disp_lock);
1009 
1010         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011                 kthread_t       *rp;
1012 
1013                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 
1015                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1016                         if (tp == rp) {
1017                                 disp_lock_exit_high(&dp->disp_lock);
1018                                 return (1);
1019                         }
1020         }
1021         disp_lock_exit_high(&dp->disp_lock);
1022 
1023         return (0);
1024 }
1025 
1026 /*
1027  * thread_on_queue()
1028  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1029  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030  */
1031 static int
1032 thread_on_queue(kthread_t *tp)
1033 {
1034         cpu_t           *cp;
1035         struct cpupart  *part;
1036 
1037         ASSERT(getpil() >= DISP_LEVEL);
1038 
1039         /*
1040          * Search the per-CPU dispatch queues for tp.
1041          */
1042         cp = CPU;
1043         do {
1044                 if (search_disp_queues(cp->cpu_disp, tp))
1045                         return (1);
1046         } while ((cp = cp->cpu_next_onln) != CPU);
1047 
1048         /*
1049          * Search the partition-wide kpreempt queues for tp.
1050          */
1051         part = CPU->cpu_part;
1052         do {
1053                 if (search_disp_queues(&part->cp_kp_queue, tp))
1054                         return (1);
1055         } while ((part = part->cp_next) != CPU->cpu_part);
1056 
1057         return (0);
1058 }
1059 
1060 #else
1061 
1062 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1063 
1064 #endif  /* DEBUG */
1065 
1066 /*
1067  * Like swtch(), but switch to a specified thread taken from another CPU.
1068  *      Called with spl high.
1069  */
1070 void
1071 swtch_to(kthread_t *next)
1072 {
1073         cpu_t                   *cp = CPU;
1074         hrtime_t                now;
1075 
1076         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 
1078         /*
1079          * Update context switch statistics.
1080          */
1081         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 
1083         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 
1085         now = gethrtime_unscaled();
1086         pg_ev_thread_swtch(cp, now, curthread, next);
1087 
1088         /* OK to steal anything left on run queue */
1089         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 
1091         /* record last execution time */
1092         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 
1094         /*
1095          * If curthread was previously in the TS_ONPROC state, setfrontdq
1096          * and setbackdq won't have set its t_waitrq.  Since we now finally
1097          * know that we're switching away from it, set its t_waitrq if it
1098          * is on a run queue.
1099          */
1100         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101                 curthread->t_waitrq = now;
1102         }
1103 
1104         /* restore next thread to previously running microstate */
1105         restore_mstate(next);
1106 
1107         if (dtrace_vtime_active)
1108                 dtrace_vtime_switch(next);
1109 
1110         resume(next);
1111         /*
1112          * The TR_RESUME_END and TR_SWTCH_END trace points
1113          * appear at the end of resume(), because we may not
1114          * return here
1115          */
1116 }
1117 
1118 #define CPU_IDLING(pri) ((pri) == -1)
1119 
1120 static void
1121 cpu_resched(cpu_t *cp, pri_t tpri)
1122 {
1123         int     call_poke_cpu = 0;
1124         pri_t   cpupri = cp->cpu_dispatch_pri;
1125 
1126         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130                         cp->cpu_runrun = 1;
1131                         aston(cp->cpu_dispthread);
1132                         if (tpri < kpreemptpri && cp != CPU)
1133                                 call_poke_cpu = 1;
1134                 }
1135                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136                         cp->cpu_kprunrun = 1;
1137                         if (cp != CPU)
1138                                 call_poke_cpu = 1;
1139                 }
1140         }
1141 
1142         /*
1143          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1144          */
1145         membar_enter();
1146 
1147         if (call_poke_cpu)
1148                 poke_cpu(cp->cpu_id);
1149 }
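/*
 * In short: a runnable thread at or above upreemptpri requests a user
 * preemption (cpu_runrun plus an AST on the dispatched thread), and one
 * at or above kpreemptpri also requests a kernel preemption
 * (cpu_kprunrun).  A remote CPU is poked so that it notices the new
 * flags promptly; the local CPU re-evaluates on its own.
 */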
1150 
1151 /*
1152  * setbackdq() keeps runqs balanced such that the difference in length
1153  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154  * For threads with priorities below RUNQ_MATCH_PRI, the runq lengths
1155  * must match.  When the per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156  * try to keep runqs perfectly balanced regardless of the thread priority.
1157  */
1158 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1159 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1160 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
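/*
 * Worked example: with RUNQ_MAX_DIFF of 2, a priority-40 thread whose
 * chosen CPU already has 3 runnable threads at that priority migrates
 * to the neighboring CPU only if that CPU's queue at the same priority
 * is shorter than 3 - 2 = 1, i.e. empty.  A priority-10 thread (below
 * RUNQ_MATCH_PRI) migrates whenever the neighbor's queue is strictly
 * shorter.
 */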
1161 
1162 /*
1163  * Macro that evaluates to true if it is likely that the thread has cache
1164  * warmth. This is based on the amount of time that has elapsed since the
1165  * thread last ran. If that amount of time is less than "rechoose_interval"
1166  * ticks, then we decide that the thread has enough cache warmth to warrant
1167  * some affinity for t->t_cpu.
1168  */
1169 #define THREAD_HAS_CACHE_WARMTH(thread) \
1170         ((thread == curthread) ||       \
1171         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 /*
1173  * Put the specified thread on the back of the dispatcher
1174  * queue corresponding to its current priority.
1175  *
1176  * Called with the thread in transition, onproc or stopped state
1177  * and locked (transition implies locked) and at high spl.
1178  * Returns with the thread in TS_RUN state and still locked.
1179  */
1180 void
1181 setbackdq(kthread_t *tp)
1182 {
1183         dispq_t *dq;
1184         disp_t          *dp;
1185         cpu_t           *cp;
1186         pri_t           tpri;
1187         int             bound;
1188         boolean_t       self;
1189 
1190         ASSERT(THREAD_LOCK_HELD(tp));
1191         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1193 
1194         /*
1195          * If thread is "swapped" or on the swap queue don't
1196          * queue it, but wake sched.
1197          */
1198         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199                 disp_swapped_setrun(tp);
1200                 return;
1201         }
1202 
1203         self = (tp == curthread);
1204 
1205         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206                 bound = 1;
1207         else
1208                 bound = 0;
1209 
1210         tpri = DISP_PRIO(tp);
1211         if (ncpus == 1)
1212                 cp = tp->t_cpu;
1213         else if (!bound) {
1214                 if (tpri >= kpqpri) {
1215                         setkpdq(tp, SETKP_BACK);
1216                         return;
1217                 }
1218 
1219                 /*
1220                  * We'll generally let this thread continue to run where
1221                  * it last ran...but will consider migration if:
1222                  * - The thread probably doesn't have much cache warmth.
1223                  * - The CPU where it last ran is the target of an offline
1224                  *   request.
1225                  * - The thread last ran outside its home lgroup.
1226                  */
1227                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228                     (tp->t_cpu == cpu_inmotion)) {
1229                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230                 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232                             self ? tp->t_cpu : NULL);
1233                 } else {
1234                         cp = tp->t_cpu;
1235                 }
1236 
1237                 if (tp->t_cpupart == cp->cpu_part) {
1238                         int     qlen;
1239 
1240                         /*
1241                          * Perform any CMT load balancing
1242                          */
1243                         cp = cmt_balance(tp, cp);
1244 
1245                         /*
1246                          * Balance across the run queues
1247                          */
1248                         qlen = RUNQ_LEN(cp, tpri);
1249                         if (tpri >= RUNQ_MATCH_PRI &&
1250                             !(tp->t_schedflag & TS_RUNQMATCH))
1251                                 qlen -= RUNQ_MAX_DIFF;
1252                         if (qlen > 0) {
1253                                 cpu_t *newcp;
1254 
1255                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256                                         newcp = cp->cpu_next_part;
1257                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258                                         newcp = cp->cpu_next_part;
1259                                 }
1260 
1261                                 if (RUNQ_LEN(newcp, tpri) < qlen) {
1262                                         DTRACE_PROBE3(runq__balance,
1263                                             kthread_t *, tp,
1264                                             cpu_t *, cp, cpu_t *, newcp);
1265                                         cp = newcp;
1266                                 }
1267                         }
1268                 } else {
1269                         /*
1270                          * Migrate to a cpu in the new partition.
1271                          */
1272                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273                             tp->t_lpl, tp->t_pri, NULL);
1274                 }
1275                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276         } else {
1277                 /*
1278                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279                  * a short time until weak binding that existed when the
1280                  * strong binding was established has dropped) so we must
1281                  * favour weak binding over strong.
1282                  */
1283                 cp = tp->t_weakbound_cpu ?
1284                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1285         }
1286         /*
1287          * A thread that is ONPROC may be temporarily placed on the run queue
1288          * but then chosen to run again by disp.  If the thread we're placing on
1289          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290          * replacement thread is actually scheduled in swtch().  In this
1291          * situation, curthread is the only thread that could be in the ONPROC
1292          * state.
1293          */
1294         if ((!self) && (tp->t_waitrq == 0)) {
1295                 hrtime_t curtime;
1296 
1297                 curtime = gethrtime_unscaled();
1298                 (void) cpu_update_pct(tp, curtime);
1299                 tp->t_waitrq = curtime;
1300         } else {
1301                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1302         }
1303 
1304         dp = cp->cpu_disp;
1305         disp_lock_enter_high(&dp->disp_lock);
1306 
1307         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308         TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309             tpri, cp, tp);
1310 
1311 #ifndef NPROBE
1312         /* Kernel probe */
1313         if (tnf_tracing_active)
1314                 tnf_thread_queue(tp, cp, tpri);
1315 #endif /* NPROBE */
1316 
1317         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 
1319         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1320         tp->t_disp_queue = dp;
1321         tp->t_link = NULL;
1322 
1323         dq = &dp->disp_q[tpri];
1324         dp->disp_nrunnable++;
1325         if (!bound)
1326                 dp->disp_steal = 0;
1327         membar_enter();
1328 
1329         if (dq->dq_sruncnt++ != 0) {
1330                 ASSERT(dq->dq_first != NULL);
1331                 dq->dq_last->t_link = tp;
1332                 dq->dq_last = tp;
1333         } else {
1334                 ASSERT(dq->dq_first == NULL);
1335                 ASSERT(dq->dq_last == NULL);
1336                 dq->dq_first = dq->dq_last = tp;
1337                 BT_SET(dp->disp_qactmap, tpri);
1338                 if (tpri > dp->disp_maxrunpri) {
1339                         dp->disp_maxrunpri = tpri;
1340                         membar_enter();
1341                         cpu_resched(cp, tpri);
1342                 }
1343         }
1344 
1345         if (!bound && tpri > dp->disp_max_unbound_pri) {
1346                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347                         /*
1348                          * If there are no other unbound threads on the
1349                          * run queue, don't allow other CPUs to steal
1350                          * this thread while we are in the middle of a
1351                          * context switch. We may just switch to it
1352                          * again right away. CPU_DISP_DONTSTEAL is cleared
1353                          * in swtch and swtch_to.
1354                          */
1355                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356                 }
1357                 dp->disp_max_unbound_pri = tpri;
1358         }
1359         (*disp_enq_thread)(cp, bound);
1360 }
1361 
1362 /*
1363  * Put the specified thread on the front of the dispatcher
1364  * queue corresponding to its current priority.
1365  *
1366  * Called with the thread in transition, onproc or stopped state
1367  * and locked (transition implies locked) and at high spl.
1368  * Returns with the thread in TS_RUN state and still locked.
1369  */
1370 void
1371 setfrontdq(kthread_t *tp)
1372 {
1373         disp_t          *dp;
1374         dispq_t         *dq;
1375         cpu_t           *cp;
1376         pri_t           tpri;
1377         int             bound;
1378 
1379         ASSERT(THREAD_LOCK_HELD(tp));
1380         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1382 
1383         /*
1384          * If thread is "swapped" or on the swap queue don't
1385          * queue it, but wake sched.
1386          */
1387         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388                 disp_swapped_setrun(tp);
1389                 return;
1390         }
1391 
1392         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393                 bound = 1;
1394         else
1395                 bound = 0;
1396 
1397         tpri = DISP_PRIO(tp);
1398         if (ncpus == 1)
1399                 cp = tp->t_cpu;
1400         else if (!bound) {
1401                 if (tpri >= kpqpri) {
1402                         setkpdq(tp, SETKP_FRONT);
1403                         return;
1404                 }
1405                 cp = tp->t_cpu;
1406                 if (tp->t_cpupart == cp->cpu_part) {
1407                         /*
1408                          * We'll generally let this thread continue to run
1409                          * where it last ran, but will consider migration if:
1410                          * - The thread last ran outside its home lgroup.
1411                          * - The CPU where it last ran is the target of an
1412                          *   offline request (a thread_nomigrate() on the in
1413                          *   motion CPU relies on this when forcing a preempt).
1414                          * - The thread isn't the highest priority thread where
1415                          *   it last ran, and it is considered not likely to
1416                          *   have significant cache warmth.
1417                          */
1418                         if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419                             (cp == cpu_inmotion)) {
1420                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421                                     (tp == curthread) ? cp : NULL);
1422                         } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423                             (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425                                     NULL);
1426                         }
1427                 } else {
1428                         /*
1429                          * Migrate to a cpu in the new partition.
1430                          */
1431                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432                             tp->t_lpl, tp->t_pri, NULL);
1433                 }
1434                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435         } else {
1436                 /*
1437                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438                  * a short time until weak binding that existed when the
1439                  * strong binding was established has dropped) so we must
1440                  * favour weak binding over strong.
1441                  */
1442                 cp = tp->t_weakbound_cpu ?
1443                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1444         }
1445 
1446         /*
1447          * A thread that is ONPROC may be temporarily placed on the run queue
1448          * but then chosen to run again by disp.  If the thread we're placing on
1449          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450          * replacement process is actually scheduled in swtch().  In this
1451          * situation, curthread is the only thread that could be in the ONPROC
1452          * state.
1453          */
1454         if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455                 hrtime_t curtime;
1456 
1457                 curtime = gethrtime_unscaled();
1458                 (void) cpu_update_pct(tp, curtime);
1459                 tp->t_waitrq = curtime;
1460         } else {
1461                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1462         }
1463 
1464         dp = cp->cpu_disp;
1465         disp_lock_enter_high(&dp->disp_lock);
1466 
1467         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 
1470 #ifndef NPROBE
1471         /* Kernel probe */
1472         if (tnf_tracing_active)
1473                 tnf_thread_queue(tp, cp, tpri);
1474 #endif /* NPROBE */
1475 
1476         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 
1478         THREAD_RUN(tp, &dp->disp_lock);          /* set TS_RUN state and lock */
1479         tp->t_disp_queue = dp;
1480 
1481         dq = &dp->disp_q[tpri];
1482         dp->disp_nrunnable++;
1483         if (!bound)
1484                 dp->disp_steal = 0;
1485         membar_enter();
1486 
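             /*
              * If other runnable threads already exist at this priority,
              * link the thread at the front of the list.  Otherwise start
              * a new list, mark this priority active in the queue bitmap,
              * and raise disp_maxrunpri (prodding the CPU to reschedule)
              * if this thread is now the best on the queue.
              */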
1487         if (dq->dq_sruncnt++ != 0) {
1488                 ASSERT(dq->dq_last != NULL);
1489                 tp->t_link = dq->dq_first;
1490                 dq->dq_first = tp;
1491         } else {
1492                 ASSERT(dq->dq_last == NULL);
1493                 ASSERT(dq->dq_first == NULL);
1494                 tp->t_link = NULL;
1495                 dq->dq_first = dq->dq_last = tp;
1496                 BT_SET(dp->disp_qactmap, tpri);
1497                 if (tpri > dp->disp_maxrunpri) {
1498                         dp->disp_maxrunpri = tpri;
1499                         membar_enter();
1500                         cpu_resched(cp, tpri);
1501                 }
1502         }
1503 
1504         if (!bound && tpri > dp->disp_max_unbound_pri) {
1505                 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506                     cp == CPU) {
1507                         /*
1508                          * If there are no other unbound threads on the
1509                          * run queue, don't allow other CPUs to steal
1510                          * this thread while we are in the middle of a
1511                          * context switch. We may just switch to it
1512                          * again right away. CPU_DISP_DONTSTEAL is cleared
1513                          * in swtch and swtch_to.
1514                          */
1515                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516                 }
1517                 dp->disp_max_unbound_pri = tpri;
1518         }
1519         (*disp_enq_thread)(cp, bound);
1520 }
1521 
1522 /*
1523  * Put a high-priority unbound thread on the kp queue
1524  */
1525 static void
1526 setkpdq(kthread_t *tp, int borf)
1527 {
1528         dispq_t *dq;
1529         disp_t  *dp;
1530         cpu_t   *cp;
1531         pri_t   tpri;
1532 
1533         tpri = DISP_PRIO(tp);
1534 
1535         dp = &tp->t_cpupart->cp_kp_queue;
1536         disp_lock_enter_high(&dp->disp_lock);
1537 
1538         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 
1540         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1543         tp->t_disp_queue = dp;
1544         dp->disp_nrunnable++;
1545         dq = &dp->disp_q[tpri];
1546 
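             /*
              * Link the thread at the back or the front of the priority
              * list, depending on whether the caller passed SETKP_BACK or
              * SETKP_FRONT.  If the list was empty, also mark the priority
              * active in the bitmap and update the queue-wide priority
              * hints.
              */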
1547         if (dq->dq_sruncnt++ != 0) {
1548                 if (borf == SETKP_BACK) {
1549                         ASSERT(dq->dq_first != NULL);
1550                         tp->t_link = NULL;
1551                         dq->dq_last->t_link = tp;
1552                         dq->dq_last = tp;
1553                 } else {
1554                         ASSERT(dq->dq_last != NULL);
1555                         tp->t_link = dq->dq_first;
1556                         dq->dq_first = tp;
1557                 }
1558         } else {
1559                 if (borf == SETKP_BACK) {
1560                         ASSERT(dq->dq_first == NULL);
1561                         ASSERT(dq->dq_last == NULL);
1562                         dq->dq_first = dq->dq_last = tp;
1563                 } else {
1564                         ASSERT(dq->dq_last == NULL);
1565                         ASSERT(dq->dq_first == NULL);
1566                         tp->t_link = NULL;
1567                         dq->dq_first = dq->dq_last = tp;
1568                 }
1569                 BT_SET(dp->disp_qactmap, tpri);
1570                 if (tpri > dp->disp_max_unbound_pri)
1571                         dp->disp_max_unbound_pri = tpri;
1572                 if (tpri > dp->disp_maxrunpri) {
1573                         dp->disp_maxrunpri = tpri;
1574                         membar_enter();
1575                 }
1576         }
1577 
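             /*
              * The thread is now on the partition-wide kp queue.  Pick a
              * CPU in its partition that is running the lowest priority
              * thread and prod it to reschedule so the thread is picked
              * up promptly.
              */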
1578         cp = tp->t_cpu;
1579         if (tp->t_cpupart != cp->cpu_part) {
1580                 /* migrate to a cpu in the new partition */
1581                 cp = tp->t_cpupart->cp_cpulist;
1582         }
1583         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 
1587 #ifndef NPROBE
1588         /* Kernel probe */
1589         if (tnf_tracing_active)
1590                 tnf_thread_queue(tp, cp, tpri);
1591 #endif /* NPROBE */
1592 
1593         if (cp->cpu_chosen_level < tpri)
1594                 cp->cpu_chosen_level = tpri;
1595         cpu_resched(cp, tpri);
1596         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597         (*disp_enq_thread)(cp, 0);
1598 }
1599 
1600 /*
1601  * Remove a thread from the dispatcher queue if it is on it.
1602  * It is not an error if it is not found but we return whether
1603  * or not it was found in case the caller wants to check.
1604  */
1605 int
1606 dispdeq(kthread_t *tp)
1607 {
1608         disp_t          *dp;
1609         dispq_t         *dq;
1610         kthread_t       *rp;
1611         kthread_t       *trp;
1612         kthread_t       **ptp;
1613         int             tpri;
1614 
1615         ASSERT(THREAD_LOCK_HELD(tp));
1616 
1617         if (tp->t_state != TS_RUN)
1618                 return (0);
1619 
1620         /*
1621          * The thread is "swapped" or is on the swap queue and
1622          * hence no longer on the run queue, so return true.
1623          */
1624         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625                 return (1);
1626 
1627         tpri = DISP_PRIO(tp);
1628         dp = tp->t_disp_queue;
1629         ASSERT(tpri < dp->disp_npri);
1630         dq = &dp->disp_q[tpri];
1631         ptp = &dq->dq_first;
1632         rp = *ptp;
1633         trp = NULL;
1634 
1635         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 
1637         /*
1638          * Search for thread in queue.
1639          * Double links would simplify this at the expense of disp/setrun.
1640          */
1641         while (rp != tp && rp != NULL) {
1642                 trp = rp;
1643                 ptp = &trp->t_link;
1644                 rp = trp->t_link;
1645         }
1646 
1647         if (rp == NULL) {
1648                 panic("dispdeq: thread not on queue");
1649         }
1650 
1651         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 
1653         /*
1654          * Found it so remove it from queue.
1655          */
1656         if ((*ptp = rp->t_link) == NULL)
1657                 dq->dq_last = trp;
1658 
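             /*
              * Update the queue accounting.  If this was the last runnable
              * thread at this priority, clear the priority's bit in the
              * active bitmap and, if necessary, recompute disp_maxrunpri
              * (and clamp disp_max_unbound_pri) from the remaining active
              * priorities.
              */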
1659         dp->disp_nrunnable--;
1660         if (--dq->dq_sruncnt == 0) {
1661                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662                 if (dp->disp_nrunnable == 0) {
1663                         dp->disp_max_unbound_pri = -1;
1664                         dp->disp_maxrunpri = -1;
1665                 } else if (tpri == dp->disp_maxrunpri) {
1666                         int ipri;
1667 
1668                         ipri = bt_gethighbit(dp->disp_qactmap,
1669                             dp->disp_maxrunpri >> BT_ULSHIFT);
1670                         if (ipri < dp->disp_max_unbound_pri)
1671                                 dp->disp_max_unbound_pri = ipri;
1672                         dp->disp_maxrunpri = ipri;
1673                 }
1674         }
1675         tp->t_link = NULL;
1676         THREAD_TRANSITION(tp);          /* put in intermediate state */
1677         return (1);
1678 }
1679 
1680 
1681 /*
1682  * dq_sruninc and dq_srundec are public functions for
1683  * incrementing/decrementing the sruncnts when a thread on
1684  * a dispatcher queue is made schedulable/unschedulable by
1685  * resetting the TS_LOAD flag.
1686  *
1687  * The caller MUST hold the thread lock (and therefore the dispatcher
1688  * queue lock), so that the operation which changes the flag, the
1689  * operation that checks whether the thread is on a dispatch queue,
1690  * AND the call to this function are one atomic operation with
1691  * respect to interrupts.
1692  */
1693 
1694 /*
1695  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696  */
1697 void
1698 dq_sruninc(kthread_t *t)
1699 {
1700         ASSERT(t->t_state == TS_RUN);
1701         ASSERT(t->t_schedflag & TS_LOAD);
1702 
1703         THREAD_TRANSITION(t);
1704         setfrontdq(t);
1705 }
1706 
1707 /*
1708  * See comment on calling conventions above.
1709  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710  */
1711 void
1712 dq_srundec(kthread_t *t)
1713 {
1714         ASSERT(t->t_schedflag & TS_LOAD);
1715 
1716         (void) dispdeq(t);
1717         disp_swapped_enq(t);
1718 }
1719 
1720 /*
1721  * Change the dispatcher lock of thread to the "swapped_lock"
1722  * and return with thread lock still held.
1723  *
1724  * Called with thread_lock held, in transition state, and at high spl.
1725  */
1726 void
1727 disp_swapped_enq(kthread_t *tp)
1728 {
1729         ASSERT(THREAD_LOCK_HELD(tp));
1730         ASSERT(tp->t_schedflag & TS_LOAD);
1731 
1732         switch (tp->t_state) {
1733         case TS_RUN:
1734                 disp_lock_enter_high(&swapped_lock);
1735                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1736                 break;
1737         case TS_ONPROC:
1738                 disp_lock_enter_high(&swapped_lock);
1739                 THREAD_TRANSITION(tp);
1740                 wake_sched_sec = 1;             /* tell clock to wake sched */
1741                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1742                 break;
1743         default:
1744                 panic("disp_swapped_enq: tp: %p bad t_state", (void *)tp);
1745         }
1746 }
1747 
1748 /*
1749  * This routine is called by setbackdq/setfrontdq if the thread is
1750  * not loaded, or is loaded but on the swap queue.
1751  *
1752  * Thread state TS_SLEEP implies that a swapped thread
1753  * has been woken up and needs to be swapped in by the swapper.
1754  *
1755  * A thread state of TS_RUN implies that the priority of a swapped
1756  * thread is being increased by its scheduling class (e.g. ts_update).
1757  */
1758 static void
1759 disp_swapped_setrun(kthread_t *tp)
1760 {
1761         ASSERT(THREAD_LOCK_HELD(tp));
1762         ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 
1764         switch (tp->t_state) {
1765         case TS_SLEEP:
1766                 disp_lock_enter_high(&swapped_lock);
1767                 /*
1768                  * Wakeup sched immediately (i.e., next tick) if the
1769                  * thread priority is above maxclsyspri.
1770                  */
1771                 if (DISP_PRIO(tp) > maxclsyspri)
1772                         wake_sched = 1;
1773                 else
1774                         wake_sched_sec = 1;
1775                 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776                 break;
1777         case TS_RUN:                            /* called from ts_update */
1778                 break;
1779         default:
1780                 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781         }
1782 }
1783 
1784 /*
1785  *      Make a thread give up its processor.  Find the processor on
1786  *      which this thread is executing, and have that processor
1787  *      preempt.
1788  *
1789  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1790  *      they are running at kernel priorities.  To implement this, we always
1791  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1792  *      calls cpu_surrender() very often, we only preempt if there is anyone
1793  *      competing with us.
1794  */
1795 void
1796 cpu_surrender(kthread_t *tp)
1797 {
1798         cpu_t   *cpup;
1799         int     max_pri;
1800         int     max_run_pri;
1801         klwp_t  *lwp;
1802 
1803         ASSERT(THREAD_LOCK_HELD(tp));
1804 
1805         if (tp->t_state != TS_ONPROC)
1806                 return;
1807         cpup = tp->t_disp_queue->disp_cpu;        /* CPU thread dispatched to */
1808         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810         if (max_pri < max_run_pri)
1811                 max_pri = max_run_pri;
1812 
1813         if (tp->t_cid == sysdccid) {
1814                 uint_t t_pri = DISP_PRIO(tp);
1815                 if (t_pri > max_pri)
1816                         return;         /* we are not competing w/ anyone */
1817                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818         } else {
1819                 cpup->cpu_runrun = 1;
1820                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821                         cpup->cpu_kprunrun = 1;
1822                 }
1823         }
1824 
1825         /*
1826          * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1827          */
1828         membar_enter();
1829 
1830         DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 
1832         /*
1833          * Make the target thread take an excursion through trap()
1834          * to do preempt() (unless we're already in trap or post_syscall,
1835          * calling cpu_surrender via CL_TRAPRET).
1836          */
1837         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838             lwp->lwp_state != LWP_USER) {
1839                 aston(tp);
1840                 if (cpup != CPU)
1841                         poke_cpu(cpup->cpu_id);
1842         }
1843         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844             "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 }
1846 
1847 /*
1848  * Commit to and ratify a scheduling decision
1849  */
1850 /*ARGSUSED*/
1851 static kthread_t *
1852 disp_ratify(kthread_t *tp, disp_t *kpq)
1853 {
1854         pri_t   tpri, maxpri;
1855         pri_t   maxkpri;
1856         cpu_t   *cpup;
1857 
1858         ASSERT(tp != NULL);
1859         /*
1860          * Commit to, then ratify scheduling decision
1861          */
1862         cpup = CPU;
1863         if (cpup->cpu_runrun != 0)
1864                 cpup->cpu_runrun = 0;
1865         if (cpup->cpu_kprunrun != 0)
1866                 cpup->cpu_kprunrun = 0;
1867         if (cpup->cpu_chosen_level != -1)
1868                 cpup->cpu_chosen_level = -1;
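             /*
              * Make the cleared flags globally visible before re-reading
              * the queue priorities below to ratify the decision.
              */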
1869         membar_enter();
1870         tpri = DISP_PRIO(tp);
1871         maxpri = cpup->cpu_disp->disp_maxrunpri;
1872         maxkpri = kpq->disp_maxrunpri;
1873         if (maxpri < maxkpri)
1874                 maxpri = maxkpri;
1875         if (tpri < maxpri) {
1876                 /*
1877                  * should have done better
1878                  * put this one back and indicate to try again
1879                  */
1880                 cpup->cpu_dispthread = curthread;    /* fixup dispthread */
1881                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882                 thread_lock_high(tp);
1883                 THREAD_TRANSITION(tp);
1884                 setfrontdq(tp);
1885                 thread_unlock_nopreempt(tp);
1886 
1887                 tp = NULL;
1888         }
1889         return (tp);
1890 }
1891 
1892 /*
1893  * See if there is any work on the dispatcher queue for other CPUs.
1894  * If there is, dequeue the best thread and return.
1895  */
1896 static kthread_t *
1897 disp_getwork(cpu_t *cp)
1898 {
1899         cpu_t           *ocp;           /* other CPU */
1900         cpu_t           *ocp_start;
1901         cpu_t           *tcp;           /* target local CPU */
1902         kthread_t       *tp;
1903         kthread_t       *retval = NULL;
1904         pri_t           maxpri;
1905         disp_t          *kpq;           /* kp queue for this partition */
1906         lpl_t           *lpl, *lpl_leaf;
1907         int             leafidx, startidx;
1908         hrtime_t        stealtime;
1909         lgrp_id_t       local_id;
1910 
1911         maxpri = -1;
1912         tcp = NULL;
1913 
1914         kpq = &cp->cpu_part->cp_kp_queue;
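             /*
              * Check the partition-wide kernel preemption queue first.
              * Keep retrying while it appears non-empty, since
              * disp_getbest() can return NULL if another CPU takes the
              * thread before we get to it.
              */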
1915         while (kpq->disp_maxrunpri >= 0) {
1916                 /*
1917                  * Try to take a thread from the kp_queue.
1918                  */
1919                 tp = disp_getbest(kpq);
1920                 if (tp)
1921                         return (disp_ratify(tp, kpq));
1922         }
1923 
1924         kpreempt_disable();             /* protect the cpu_active list */
1925 
1926         /*
1927          * Try to find something to do on another CPU's run queue.
1928          * Loop through all other CPUs looking for the one with the highest
1929          * priority unbound thread.
1930          *
1931          * On NUMA machines, the partition's CPUs are consulted in order of
1932          * distance from the current CPU. This way, the first available
1933          * work found is also the closest, and will suffer the least
1934          * from being migrated.
1935          */
1936         lpl = lpl_leaf = cp->cpu_lpl;
1937         local_id = lpl_leaf->lpl_lgrpid;
1938         leafidx = startidx = 0;
1939 
1940         /*
1941          * This loop traverses the lpl hierarchy. Higher level lpls represent
1942          * broader levels of locality
1943          */
1944         do {
1945                 /* This loop iterates over the lpl's leaves */
1946                 do {
1947                         if (lpl_leaf != cp->cpu_lpl)
1948                                 ocp = lpl_leaf->lpl_cpus;
1949                         else
1950                                 ocp = cp->cpu_next_lpl;
1951 
1952                         /* This loop iterates over the CPUs in the leaf */
1953                         ocp_start = ocp;
1954                         do {
1955                                 pri_t pri;
1956 
1957                                 ASSERT(CPU_ACTIVE(ocp));
1958 
1959                                 /*
1960                                  * End our stroll around this lpl if:
1961                                  *
1962                                  * - Something became runnable on the local
1963                                  *   queue...which also ends our stroll around
1964                                  *   the partition.
1965                                  *
1966                                  * - We happen across another idle CPU.
1967                                  *   Since it is patrolling the next portion
1968                                  *   of the lpl's list (assuming it's not
1969                                  *   halted, or busy servicing an interrupt),
1970                                  *   move to the next higher level of locality.
1971                                  */
1972                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1973                                         kpreempt_enable();
1974                                         return (NULL);
1975                                 }
1976                                 if (ocp->cpu_dispatch_pri == -1) {
1977                                         if (ocp->cpu_disp_flags &
1978                                             CPU_DISP_HALTED ||
1979                                             ocp->cpu_intr_actv != 0)
1980                                                 continue;
1981                                         else
1982                                                 goto next_level;
1983                                 }
1984 
1985                                 /*
1986                                  * If there's only one thread and the CPU
1987                                  * is in the middle of a context switch,
1988                                  * or it's currently running the idle thread,
1989                                  * don't steal it.
1990                                  */
1991                                 if ((ocp->cpu_disp_flags &
1992                                     CPU_DISP_DONTSTEAL) &&
1993                                     ocp->cpu_disp->disp_nrunnable == 1)
1994                                         continue;
1995 
1996                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
1997                                 if (pri > maxpri) {
1998                                         /*
1999                                          * Don't steal threads that we attempted
2000                                          * to steal recently until they're ready
2001                                          * to be stolen again.
2002                                          */
2003                                         stealtime = ocp->cpu_disp->disp_steal;
2004                                         if (stealtime == 0 ||
2005                                             stealtime - gethrtime() <= 0) {
2006                                                 maxpri = pri;
2007                                                 tcp = ocp;
2008                                         } else {
2009                                                 /*
2010                                                  * Don't update tcp, just set
2011                                                  * the retval to T_DONTSTEAL, so
2012                                                  * that if no acceptable CPUs
2013                                                  * are found the return value
2014                                                  * will be T_DONTSTEAL rather
2015                                                  * then NULL.
2016                                                  */
2017                                                 retval = T_DONTSTEAL;
2018                                         }
2019                                 }
2020                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 
2022                         /*
2023                          * Iterate to the next leaf lpl in the resource set
2024                          * at this level of locality. If we hit the end of
2025                          * the set, wrap back around to the beginning.
2026                          *
2027                          * Note: This iteration is NULL terminated for a reason;
2028                          * see lpl_topo_bootstrap() in lgrp.c for details.
2029                          */
2030                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031                                 leafidx = 0;
2032                                 lpl_leaf = lpl->lpl_rset[leafidx];
2033                         }
2034                 } while (leafidx != startidx);
2035 
2036 next_level:
2037                 /*
2038                  * Expand the search to include farther away CPUs (next
2039                  * locality level). The closer CPUs that have already been
2040                  * checked will be checked again. In doing so, idle CPUs
2041                  * will tend to be more aggressive about stealing from CPUs
2042                  * that are closer (since the closer CPUs will be considered
2043                  * more often).
2044                  * Begin at this level with the CPUs local leaf lpl.
2045                  */
2046                 if ((lpl = lpl->lpl_parent) != NULL) {
2047                         leafidx = startidx = lpl->lpl_id2rset[local_id];
2048                         lpl_leaf = lpl->lpl_rset[leafidx];
2049                 }
2050         } while (!tcp && lpl);
2051 
2052         kpreempt_enable();
2053 
2054         /*
2055          * If another queue looks good, and there is still nothing on
2056          * the local queue, try to transfer one or more threads
2057          * from it to our queue.
2058          */
2059         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060                 tp = disp_getbest(tcp->cpu_disp);
2061                 if (tp == NULL || tp == T_DONTSTEAL)
2062                         return (tp);
2063                 return (disp_ratify(tp, kpq));
2064         }
2065         return (retval);
2066 }
2067 
2068 
2069 /*
2070  * disp_fix_unbound_pri()
2071  *      Determines the maximum priority of unbound threads on the queue.
2072  *      The priority is kept for the queue, but is only increased, never
2073  *      reduced unless some CPU is looking for something on that queue.
2074  *
2075  *      The priority argument is the known upper limit.
2076  *
2077  *      Perhaps this should be kept accurately, but that probably means
2078  *      separate bitmaps for bound and unbound threads.  Since only idled
2079  *      CPUs will have to do this recalculation, it seems better this way.
2080  */
2081 static void
2082 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 {
2084         kthread_t       *tp;
2085         dispq_t         *dq;
2086         ulong_t         *dqactmap = dp->disp_qactmap;
2087         ulong_t         mapword;
2088         int             wx;
2089 
2090         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 
2092         ASSERT(pri >= 0);                    /* checked by caller */
2093 
2094         /*
2095          * Start the search at the next lowest priority below the supplied
2096          * priority.  This depends on the bitmap implementation.
2097          */
2098         do {
2099                 wx = pri >> BT_ULSHIFT;           /* index of word in map */
2100 
2101                 /*
2102                  * Form mask for all lower priorities in the word.
2103                  */
2104                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 
2106                 /*
2107                  * Get next lower active priority.
2108                  */
2109                 if (mapword != 0) {
2110                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111                 } else if (wx > 0) {
2112                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113                         if (pri < 0)
2114                                 break;
2115                 } else {
2116                         pri = -1;
2117                         break;
2118                 }
2119 
2120                 /*
2121                  * Search the queue for unbound, runnable threads.
2122                  */
2123                 dq = &dp->disp_q[pri];
2124                 tp = dq->dq_first;
2125 
2126                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127                         tp = tp->t_link;
2128                 }
2129 
2130                 /*
2131                  * If a thread was found, set the priority and return.
2132                  */
2133         } while (tp == NULL);
2134 
2135         /*
2136          * pri holds the maximum unbound thread priority or -1.
2137          */
2138         if (dp->disp_max_unbound_pri != pri)
2139                 dp->disp_max_unbound_pri = pri;
2140 }
2141 
2142 /*
2143  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144  *      check if the CPU to which it was previously bound should have
2145  *      its disp_max_unbound_pri increased.
2146  */
2147 void
2148 disp_adjust_unbound_pri(kthread_t *tp)
2149 {
2150         disp_t *dp;
2151         pri_t tpri;
2152 
2153         ASSERT(THREAD_LOCK_HELD(tp));
2154 
2155         /*
2156          * Don't do anything if the thread is not bound, or
2157          * currently not runnable or swapped out.
2158          */
2159         if (tp->t_bound_cpu == NULL ||
2160             tp->t_state != TS_RUN ||
2161             tp->t_schedflag & TS_ON_SWAPQ)
2162                 return;
2163 
2164         tpri = DISP_PRIO(tp);
2165         dp = tp->t_bound_cpu->cpu_disp;
2166         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167         if (tpri > dp->disp_max_unbound_pri)
2168                 dp->disp_max_unbound_pri = tpri;
2169 }
2170 
2171 /*
2172  * disp_getbest()
2173  *   De-queue the highest priority unbound runnable thread.
2174  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175  *   Returns NULL if nothing found.
2176  *   Returns T_DONTSTEAL if the best unbound thread was not stealable,
2177  *   so that the caller will try again later.
2178  *
2179  *   Passed a pointer to a dispatch queue that is not associated with
2180  *   this CPU.
2181  */
2182 static kthread_t *
2183 disp_getbest(disp_t *dp)
2184 {
2185         kthread_t       *tp;
2186         dispq_t         *dq;
2187         pri_t           pri;
2188         cpu_t           *cp, *tcp;
2189         boolean_t       allbound;
2190 
2191         disp_lock_enter(&dp->disp_lock);
2192 
2193         /*
2194          * If there is nothing to run, or the CPU is in the middle of a
2195          * context switch of the only thread, return NULL.
2196          */
2197         tcp = dp->disp_cpu;
2198         cp = CPU;
2199         pri = dp->disp_max_unbound_pri;
2200         if (pri == -1 ||
2201             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202             tcp->cpu_disp->disp_nrunnable == 1)) {
2203                 disp_lock_exit_nopreempt(&dp->disp_lock);
2204                 return (NULL);
2205         }
2206 
2207         dq = &dp->disp_q[pri];
2208 
2209 
2210         /*
2211          * Assume that all threads are bound on this queue, and change it
2212          * later when we find out that it is not the case.
2213          */
2214         allbound = B_TRUE;
2215         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216                 hrtime_t now, nosteal, rqtime;
2217 
2218                 /*
2219                  * Skip over bound threads which could be here even
2220                  * though disp_max_unbound_pri indicated this level.
2221                  */
2222                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223                         continue;
2224 
2225                 /*
2226                  * We've got some unbound threads on this queue, so turn
2227                  * the allbound flag off now.
2228                  */
2229                 allbound = B_FALSE;
2230 
2231                 /*
2232                  * The thread is a candidate for stealing from its run queue. We
2233                  * don't want to steal threads that became runnable just a
2234                  * moment ago. This improves CPU affinity for threads that get
2235                  * preempted for short periods of time and go back on the run
2236                  * queue.
2237                  *
2238                  * We want to let it stay on its run queue if it was only placed
2239                  * there recently and it was running on the same CPU before that
2240                  * to preserve its cache investment. For the thread to remain on
2241                  * its run queue, ALL of the following conditions must be
2242                  * satisfied:
2243                  *
2244                  * - the disp queue should not be the kernel preemption queue
2245                  * - delayed idle stealing should not be disabled
2246                  * - nosteal_nsec should be non-zero
2247                  * - it should run with user priority
2248                  * - it should be on the run queue of the CPU where it was
2249                  *   running before being placed on the run queue
2250                  * - it should be the only thread on the run queue (to prevent
2251                  *   extra scheduling latency for other threads)
2252                  * - it should sit on the run queue for less than per-chip
2253                  *   nosteal interval or global nosteal interval
2254                  * - in case of CPUs with shared cache it should sit in a run
2255                  *   queue of a CPU from a different chip
2256                  *
2257                  * The checks are arranged so that the ones that are faster are
2258                  * placed earlier.
2259                  */
2260                 if (tcp == NULL ||
2261                     pri >= minclsyspri ||
2262                     tp->t_cpu != tcp)
2263                         break;
2264 
2265                 /*
2266                  * Steal immediately if, due to the CMT processor architecture,
2267                  * migration between cp and tcp would incur no performance
2268                  * penalty.
2269                  */
2270                 if (pg_cmt_can_migrate(cp, tcp))
2271                         break;
2272 
2273                 nosteal = nosteal_nsec;
2274                 if (nosteal == 0)
2275                         break;
2276 
2277                 /*
2278                  * Calculate time spent sitting on run queue
2279                  */
2280                 now = gethrtime_unscaled();
2281                 rqtime = now - tp->t_waitrq;
2282                 scalehrtime(&rqtime);
2283 
2284                 /*
2285                  * Steal immediately if the time spent on this run queue is more
2286                  * than allowed nosteal delay.
2287                  *
2288                  * Negative rqtime check is needed here to avoid infinite
2289                  * stealing delays caused by unlikely but not impossible
2290                  * drifts between CPU times on different CPUs.
2291                  */
2292                 if (rqtime > nosteal || rqtime < 0)
2293                         break;
2294 
2295                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297                 scalehrtime(&now);
2298                 /*
2299                  * Calculate when this thread becomes stealable
2300                  */
2301                 now += (nosteal - rqtime);
2302 
2303                 /*
2304                  * Calculate time when some thread becomes stealable
2305                  */
2306                 if (now < dp->disp_steal)
2307                         dp->disp_steal = now;
2308         }
2309 
2310         /*
2311          * If there were no unbound threads at this priority level,
2312          * recompute this queue's disp_max_unbound_pri and return.  Its
2313          * value is not always accurate because it isn't reduced until
2314          * another idle CPU looks for work.
2315          */
2316         if (allbound)
2317                 disp_fix_unbound_pri(dp, pri);
2318 
2319         /*
2320          * If we reached the end of the queue and found no unbound threads
2321          * then return NULL so that other CPUs will be considered.  If there
2322          * are unbound threads but they cannot yet be stolen, then
2323          * return T_DONTSTEAL and try again later.
2324          */
2325         if (tp == NULL) {
2326                 disp_lock_exit_nopreempt(&dp->disp_lock);
2327                 return (allbound ? NULL : T_DONTSTEAL);
2328         }
2329 
2330         /*
2331          * Found a runnable, unbound thread, so remove it from queue.
2332          * dispdeq() requires that we have the thread locked, and we do,
2333          * by virtue of holding the dispatch queue lock.  dispdeq() will
2334          * put the thread in transition state, thereby dropping the dispq
2335          * lock.
2336          */
2337 
2338 #ifdef DEBUG
2339         {
2340                 int     thread_was_on_queue;
2341 
2342                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2343                 ASSERT(thread_was_on_queue);
2344         }
2345 
2346 #else /* DEBUG */
2347         (void) dispdeq(tp);                     /* drops disp_lock */
2348 #endif /* DEBUG */
2349 
2350         /*
2351          * Reset the disp_queue steal time - we do not know what the
2352          * smallest value across the queue is.
2353          */
2354         dp->disp_steal = 0;
2355 
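             /*
              * Mark the thread so it will not be swapped out while we are
              * in the middle of dispatching it onto this CPU.
              */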
2356         tp->t_schedflag |= TS_DONT_SWAP;
2357 
2358         /*
2359          * Setup thread to run on the current CPU.
2360          */
2361         tp->t_disp_queue = cp->cpu_disp;
2362 
2363         cp->cpu_dispthread = tp;             /* protected by spl only */
2364         cp->cpu_dispatch_pri = pri;
2365 
2366         /*
2367          * There can be a memory synchronization race between disp_getbest()
2368          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369          * to preempt the current thread to run the enqueued thread while
2370          * disp_getbest() and disp_ratify() are changing the current thread
2371          * to the stolen thread. This may lead to a situation where
2372          * cpu_resched() tries to preempt the wrong thread and the
2373          * stolen thread continues to run on the CPU which has been tagged
2374          * for preemption.
2375          * Later the clock thread gets enqueued but doesn't get to run on the
2376          * CPU causing the system to hang.
2377          *
2378          * To avoid this, grabbing and dropping the disp_lock (which does
2379          * a memory barrier) is needed to synchronize the execution of
2380          * cpu_resched() with disp_getbest() and disp_ratify() and
2381          * synchronize the memory read and written by cpu_resched(),
2382          * disp_getbest(), and disp_ratify() with each other.
2383          *  (see CR#6482861 for more details).
2384          */
2385         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 
2388         ASSERT(pri == DISP_PRIO(tp));
2389 
2390         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 
2392         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2393 
2394         /*
2395          * Return with spl high so that swtch() won't need to raise it.
2396          * The disp_lock was dropped by dispdeq().
2397          */
2398 
2399         return (tp);
2400 }
2401 
2402 /*
2403  * disp_bound_common() - common routine for higher level functions
2404  *      that check for bound threads under certain conditions.
2405  *      If 'threadlistsafe' is set then there is no need to acquire
2406  *      pidlock to stop the thread list from changing (eg, if
2407  *      disp_bound_* is called with cpus paused).
2408  */
2409 static int
2410 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 {
2412         int             found = 0;
2413         kthread_t       *tp;
2414 
2415         ASSERT(flag);
2416 
2417         if (!threadlistsafe)
2418                 mutex_enter(&pidlock);
2419         tp = curthread;         /* faster than allthreads */
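             /*
              * Walk the circular list of all threads (linked by t_next),
              * starting with curthread, until we wrap around or find a
              * match.
              */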
2420         do {
2421                 if (tp->t_state != TS_FREE) {
2422                         /*
2423                          * If an interrupt thread is busy, but the
2424                          * caller doesn't care (i.e. BOUND_INTR is off),
2425                          * then just ignore it and continue through.
2426                          */
2427                         if ((tp->t_flag & T_INTR_THREAD) &&
2428                             !(flag & BOUND_INTR))
2429                                 continue;
2430 
2431                         /*
2432                          * Skip the idle thread for the CPU
2433                          * we're about to set offline.
2434                          */
2435                         if (tp == cp->cpu_idle_thread)
2436                                 continue;
2437 
2438                         /*
2439                          * Skip the pause thread for the CPU
2440                          * we're about to set offline.
2441                          */
2442                         if (tp == cp->cpu_pause_thread)
2443                                 continue;
2444 
2445                         if ((flag & BOUND_CPU) &&
2446                             (tp->t_bound_cpu == cp ||
2447                             tp->t_bind_cpu == cp->cpu_id ||
2448                             tp->t_weakbound_cpu == cp)) {
2449                                 found = 1;
2450                                 break;
2451                         }
2452 
2453                         if ((flag & BOUND_PARTITION) &&
2454                             (tp->t_cpupart == cp->cpu_part)) {
2455                                 found = 1;
2456                                 break;
2457                         }
2458                 }
2459         } while ((tp = tp->t_next) != curthread && found == 0);
2460         if (!threadlistsafe)
2461                 mutex_exit(&pidlock);
2462         return (found);
2463 }
2464 
2465 /*
2466  * disp_bound_threads - return nonzero if threads are bound to the processor.
2467  *      Called infrequently.  Keep this simple.
2468  *      Includes threads that are asleep or stopped but not onproc.
2469  */
2470 int
2471 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 {
2473         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 }
2475 
2476 /*
2477  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478  * to the given processor, including interrupt threads.
2479  */
2480 int
2481 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 {
2483         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 }
2485 
2486 /*
2487  * disp_bound_partition - return nonzero if threads are bound to the same
2488  * partition as the processor.
2489  *      Called infrequently.  Keep this simple.
2490  *      Includes threads that are asleep or stopped but not onproc.
2491  */
2492 int
2493 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 {
2495         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 }
2497 
2498 /*
2499  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500  * threads to other CPUs.
2501  */
2502 void
2503 disp_cpu_inactive(cpu_t *cp)
2504 {
2505         kthread_t       *tp;
2506         disp_t          *dp = cp->cpu_disp;
2507         dispq_t         *dq;
2508         pri_t           pri;
2509         int             wasonq;
2510 
2511         disp_lock_enter(&dp->disp_lock);
2512         while ((pri = dp->disp_max_unbound_pri) != -1) {
2513                 dq = &dp->disp_q[pri];
2514                 tp = dq->dq_first;
2515 
2516                 /*
2517                  * Skip over bound threads.
2518                  */
2519                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2520                         tp = tp->t_link;
2521                 }
2522 
2523                 if (tp == NULL) {
2524                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2525                         disp_fix_unbound_pri(dp, pri);
2526                         continue;
2527                 }
2528 
2529                 wasonq = dispdeq(tp);           /* drops disp_lock */
2530                 ASSERT(wasonq);
2531                 ASSERT(tp->t_weakbound_cpu == NULL);
2532 
2533                 setbackdq(tp);
2534                 /*
2535                  * Called from cpu_offline:
2536                  *
2537                  * cp has already been removed from the list of active cpus
2538                  * and tp->t_cpu has been changed so there is no risk of
2539                  * tp ending up back on cp.
2540                  *
2541                  * Called from cpupart_move_cpu:
2542                  *
2543                  * The cpu has moved to a new cpupart.  Any threads that
2544                  * were on its dispatch queues before the move remain
2545                  * in the old partition and can't run in the new partition.
2546                  */
2547                 ASSERT(tp->t_cpu != cp);
2548                 thread_unlock(tp);
2549 
2550                 disp_lock_enter(&dp->disp_lock);
2551         }
2552         disp_lock_exit(&dp->disp_lock);
2553 }
2554 
2555 /*
2556  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557  *      The hint passed in is used as a starting point so we don't favor
2558  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2559  *      used CPU for the thread.
2560  *
2561  *      The lgroup and priority are used to determine the best CPU to run on
2562  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563  *      the thread priority will indicate whether the thread will actually run
2564  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2565  *      lgroup which are running the lowest priority threads are found.  The
2566  *      remote CPU is chosen only if the thread will not run locally on a CPU
2567  *      within the lgroup, but will run on the remote CPU. If the thread
2568  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2569  *
2570  *      The lpl specified also identifies the cpu partition from which
2571  *      disp_lowpri_cpu should select a CPU.
2572  *
2573  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2574  *      behalf of the current thread. (curthread is looking for a new cpu)
2575  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576  *      ignored.
2577  *
2578  *      If a cpu is the target of an offline request then try to avoid it.
2579  *
2580  *      This function must be called at either high SPL, or with preemption
2581  *      disabled, so that the "hint" CPU cannot be removed from the online
2582  *      CPU list while we are traversing it.
2583  */
2584 cpu_t *
2585 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 {
2587         cpu_t   *bestcpu;
2588         cpu_t   *besthomecpu;
2589         cpu_t   *cp, *cpstart;
2590 
2591         pri_t   bestpri;
2592         pri_t   cpupri;
2593 
2594         klgrpset_t      done;
2595         klgrpset_t      cur_set;
2596 
2597         lpl_t           *lpl_iter, *lpl_leaf;
2598         int             i;
2599 
2600         /*
2601          * Scan for a CPU currently running the lowest priority thread.
2602          * Cannot get cpu_lock here because it is adaptive.
2603          * We do not require a lock on the CPU list.
2604          */
2605         ASSERT(hint != NULL);
2606         ASSERT(lpl != NULL);
2607         ASSERT(lpl->lpl_ncpu > 0);
2608 
2609         /*
2610          * First examine local CPUs. Note that it's possible the hint CPU
2611          * passed in is remote to the specified home lgroup. If our priority
2612          * isn't high enough for us to run immediately at home,
2613          * then examine CPUs remote to our home lgroup.
2614          * We would like to give preference to CPUs closest to "home".
2615          * If we can't find a CPU where we'll run at a given level
2616          * of locality, we expand our search to include the next level.
2617          */
2618         bestcpu = besthomecpu = NULL;
2619         klgrpset_clear(done);
2620         /* start with lpl we were passed */
2621 
2622         lpl_iter = lpl;
2623 
2624         do {
2625 
2626                 bestpri = SHRT_MAX;
2627                 klgrpset_clear(cur_set);
2628 
2629                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630                         lpl_leaf = lpl_iter->lpl_rset[i];
2631                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632                                 continue;
2633 
2634                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 
2636                         if (hint->cpu_lpl == lpl_leaf)
2637                                 cp = cpstart = hint;
2638                         else
2639                                 cp = cpstart = lpl_leaf->lpl_cpus;
2640 
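                             /*
                              * Walk the CPUs of this leaf lpl.  A CPU's effective
                              * priority is the highest of what it is running now
                              * (ignored for the caller's own CPU), the best thread
                              * on its run queue, and any thread already chosen for
                              * it; the CPU being offlined is treated as busy.  An
                              * idle CPU is taken immediately.
                              */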
2641                         do {
2642                                 if (cp == curcpu)
2643                                         cpupri = -1;
2644                                 else if (cp == cpu_inmotion)
2645                                         cpupri = SHRT_MAX;
2646                                 else
2647                                         cpupri = cp->cpu_dispatch_pri;
2648                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2650                                 if (cp->cpu_chosen_level > cpupri)
2651                                         cpupri = cp->cpu_chosen_level;
2652                                 if (cpupri < bestpri) {
2653                                         if (CPU_IDLING(cpupri)) {
2654                                                 ASSERT((cp->cpu_flags &
2655                                                     CPU_QUIESCED) == 0);
2656                                                 return (cp);
2657                                         }
2658                                         bestcpu = cp;
2659                                         bestpri = cpupri;
2660                                 }
2661                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2662                 }
2663 
2664                 if (bestcpu && (tpri > bestpri)) {
2665                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666                         return (bestcpu);
2667                 }
2668                 if (besthomecpu == NULL)
2669                         besthomecpu = bestcpu;
2670                 /*
2671                  * Add the lgrps we just considered to the "done" set
2672                  */
2673                 klgrpset_or(done, cur_set);
2674 
2675         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 
2677         /*
2678          * The specified priority isn't high enough to run immediately
2679          * anywhere, so just return the best CPU from the home lgroup.
2680          */
2681         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682         return (besthomecpu);
2683 }
2684 
2685 /*
2686  * This routine provides the generic idle cpu function for all processors.
2687  * If a processor has some specific code to execute when idle (say, to stop
2688  * the pipeline and save power) then that routine should be defined in the
2689  * the pipeline and save power), then that routine should be defined in the
2690  * processor-specific code (module_xx.c) and the global variable idle_cpu
2691  */
2692 static void
2693 generic_idle_cpu(void)
2694 {
2695 }
2696 
2697 /*ARGSUSED*/
2698 static void
2699 generic_enq_thread(cpu_t *cpu, int bound)
2700 {
2701 }