/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysinfo.h>
#include <sys/var.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/inline.h>
#include <sys/disp.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/pg.h>
#include <sys/cmt.h>
#include <sys/bitset.h>
#include <sys/schedctl.h>
#include <sys/atomic.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>

#include <vm/as.h>

#define	BOUND_CPU	0x1
#define	BOUND_PARTITION	0x2
#define	BOUND_INTR	0x4

/* Dispatch queue allocation structure and functions */
struct disp_queue_info {
	disp_t	*dp;
	dispq_t	*olddispq;
	dispq_t	*newdispq;
	ulong_t	*olddqactmap;
	ulong_t	*newdqactmap;
	int	oldnglobpris;
};
static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
    disp_t *dp);
static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
static void	disp_dq_free(struct disp_queue_info *dptr);

/* platform-specific routine to call when processor is idle */
static void	generic_idle_cpu();
void		(*idle_cpu)() = generic_idle_cpu;

/* routines invoked when a CPU enters/exits the idle loop */
static void	idle_enter();
static void	idle_exit();

/* platform-specific routine to call when thread is enqueued */
static void	generic_enq_thread(cpu_t *, int);
void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;

pri_t	kpreemptpri;		/* priority where kernel preemption applies */
pri_t	upreemptpri = 0;	/* priority where normal preemption applies */
pri_t	intr_pri;		/* interrupt thread priority base level */

#define	KPQPRI	-1		/* pri where cpu affinity is dropped for kpq */
pri_t	kpqpri = KPQPRI;	/* can be set in /etc/system */
disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
int	nswapped;		/* total number of swapped threads */
static void	disp_swapped_setrun(kthread_t *tp);
static void	cpu_resched(cpu_t *cp, pri_t tpri);

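/*
 * upreemptpri and kpreemptpri split preemption into two classes: a
 * runnable thread at or above upreemptpri causes cpu_runrun to be set,
 * asking the target CPU to preempt at its next convenient point, while a
 * thread at or above kpreemptpri also sets cpu_kprunrun, requesting
 * kernel preemption.  See cpu_resched() below for the mechanics.
 */
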
/*
 * If this is set, only interrupt threads will cause kernel preemptions.
 * This is done by changing the value of kpreemptpri.  kpreemptpri
 * will either be the max sysclass pri + 1 or the min interrupt pri.
 */
int	only_intr_kpreempt;

extern void set_idle_cpu(int cpun);
extern void unset_idle_cpu(int cpun);
static void setkpdq(kthread_t *tp, int borf);
#define	SETKP_BACK	0
#define	SETKP_FRONT	1
/*
 * Parameter that determines how recently a thread must have run
 * on the CPU to be considered loosely-bound to that CPU to reduce
 * cold cache effects.  The interval is in clock ticks.
 */
#define	RECHOOSE_INTERVAL	3
int	rechoose_interval = RECHOOSE_INTERVAL;

/*
 * Parameter that determines how long (in nanoseconds) a thread must
 * sit on a run queue before it can be stolen by another CPU, to
 * reduce migrations.
 *
 * nosteal_nsec should be set by the platform code in
 * cmp_set_nosteal_interval() to an appropriate value.  It is set to
 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 */
#define	NOSTEAL_UNINITIALIZED	(-1)
hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
extern void cmp_set_nosteal_interval(void);

id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */

disp_lock_t	transition_lock;	/* lock on transitioning threads */
disp_lock_t	stop_lock;		/* lock on stopped threads */

static void	cpu_dispqalloc(int numpris);

/*
 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 * a thread because it was sitting on its run queue for a very short
 * period of time.
 */
#define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */

static kthread_t	*disp_getwork(cpu_t *to);
static kthread_t	*disp_getbest(disp_t *from);
static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);

void	swtch_to(kthread_t *);

/*
 * dispatcher and scheduler initialization
 */

/*
 * disp_setup - Common code to calculate and allocate dispatcher
 *		variables and structures based on the maximum priority.
 */
static void
disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
{
	pri_t	newnglobpris;

	ASSERT(MUTEX_HELD(&cpu_lock));

	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;

	if (newnglobpris > oldnglobpris) {
		/*
		 * Allocate new kp queues for each CPU partition.
		 */
		cpupart_kpqalloc(newnglobpris);

		/*
		 * Allocate new dispatch queues for each CPU.
		 */
		cpu_dispqalloc(newnglobpris);

		/*
		 * compute new interrupt thread base priority
		 */
		intr_pri = maxglobpri;
		if (only_intr_kpreempt) {
			kpreemptpri = intr_pri + 1;
			if (kpqpri == KPQPRI)
				kpqpri = kpreemptpri;
		}
		v.v_nglobpris = newnglobpris;
	}
}

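/*
 * Worked example (the numbers depend on which classes are loaded): with
 * the stock TS and SYS classes the largest class-reported global
 * priority is v.v_maxsyspri, typically 99, so the tables are sized for
 * 99 + 1 + LOCK_LEVEL (10) = 110 priority levels, and levels 100-109
 * sit above every class priority for use by interrupt threads.
 */
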
/*
 * dispinit - Called to initialize all loaded classes and the
 *	      dispatcher framework.
 */
void
dispinit(void)
{
	id_t	cid;
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	maxglobpri = -1;

	/*
	 * Initialize transition lock, which will always be set.
	 */
	DISP_LOCK_INIT(&transition_lock);
	disp_lock_enter_high(&transition_lock);
	DISP_LOCK_INIT(&stop_lock);

	mutex_enter(&cpu_lock);
	CPU->cpu_disp->disp_maxrunpri = -1;
	CPU->cpu_disp->disp_max_unbound_pri = -1;

	/*
	 * Initialize the default CPU partition.
	 */
	cpupart_initialize_default();
	/*
	 * Call the class specific initialization functions for
	 * all pre-installed schedulers.
	 *
	 * We pass the size of a class specific parameter
	 * buffer to each of the initialization functions
	 * to try to catch problems with backward compatibility
	 * of class modules.
	 *
	 * For example, a new class module running on an old system
	 * which didn't provide sufficiently large parameter buffers
	 * would be bad news.  Class initialization modules can check for
	 * this and take action if they detect a problem.
	 */

	for (cid = 0; cid < nclass; cid++) {
		sclass_t	*sc;

		sc = &sclass[cid];
		if (SCHED_INSTALLED(sc)) {
			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
			    &sc->cl_funcs);
			if (cl_maxglobpri > maxglobpri)
				maxglobpri = cl_maxglobpri;
		}
	}
	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
	if (kpqpri == KPQPRI)
		kpqpri = kpreemptpri;

	ASSERT(maxglobpri >= 0);
	disp_setup(maxglobpri, 0);

	mutex_exit(&cpu_lock);

	/*
	 * Platform specific sticky scheduler setup.
	 */
	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
		cmp_set_nosteal_interval();

	/*
	 * Get the default class ID; this may be later modified via
	 * dispadmin(1M).  This will load the class (normally TS) and that will
	 * call disp_add(), which is why we had to drop cpu_lock first.
	 */
	if (getcid(defaultclass, &defaultcid) != 0) {
		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
		    defaultclass);
	}
}

/*
 * disp_add - Called with class pointer to initialize the dispatcher
 *	      for a newly loaded class.
 */
void
disp_add(sclass_t *clp)
{
	pri_t	maxglobpri;
	pri_t	cl_maxglobpri;

	mutex_enter(&cpu_lock);
	/*
	 * Initialize the scheduler class.
	 */
	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
	if (cl_maxglobpri > maxglobpri)
		maxglobpri = cl_maxglobpri;

	/*
	 * Save old queue information.  Since we're initializing a
	 * new scheduling class which has just been loaded, then
	 * the size of the dispq may have changed.  We need to handle
	 * that here.
	 */
	disp_setup(maxglobpri, v.v_nglobpris);

	mutex_exit(&cpu_lock);
}

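/*
 * disp_add() is invoked whenever a scheduling class module is loaded;
 * loading the default class from dispinit() is one such path (see the
 * getcid() comment above).  Because it may grow every dispatch queue,
 * it takes cpu_lock itself rather than relying on its caller.
 */
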
/*
 * For each CPU, allocate new dispatch queues
 * with the stated number of priorities.
 */
static void
cpu_dispqalloc(int numpris)
{
	cpu_t	*cpup;
	struct disp_queue_info	*disp_mem;
	int i, num;

	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_mem = kmem_zalloc(NCPU *
	    sizeof (struct disp_queue_info), KM_SLEEP);

	/*
	 * This routine must allocate all of the memory before stopping
	 * the cpus because it must not sleep in kmem_alloc while the
	 * CPUs are stopped.  Locks they hold will not be freed until they
	 * are restarted.
	 */
	i = 0;
	cpup = cpu_list;
	do {
		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
		i++;
		cpup = cpup->cpu_next;
	} while (cpup != cpu_list);
	num = i;

	pause_cpus(NULL);
	for (i = 0; i < num; i++)
		disp_dq_assign(&disp_mem[i], numpris);
	start_cpus();

	/*
	 * I must free all of the memory after starting the cpus because
	 * I can not risk sleeping in kmem_free while the cpus are stopped.
	 */
	for (i = 0; i < num; i++)
		disp_dq_free(&disp_mem[i]);

	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
}

static void
disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
{
	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dptr->dp = dp;
}

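/*
 * The active-queue bitmap allocated above has one bit per priority
 * level, packed into longs: with BT_NBIPUL bits per word (64 on LP64
 * kernels), 110 priority levels need (110 / 64) + 1 = 2 words.  The
 * "+ 1" keeps the arithmetic simple at the cost of occasionally
 * allocating one extra word.
 */
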
static void
disp_dq_assign(struct disp_queue_info *dptr, int numpris)
{
	disp_t	*dp;

	dp = dptr->dp;
	dptr->olddispq = dp->disp_q;
	dptr->olddqactmap = dp->disp_qactmap;
	dptr->oldnglobpris = dp->disp_npri;

	ASSERT(dptr->oldnglobpris < numpris);

	if (dptr->olddispq != NULL) {
		/*
		 * Use kcopy because bcopy is platform-specific
		 * and could block while we might have paused the cpus.
		 */
		(void) kcopy(dptr->olddispq, dptr->newdispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
		    sizeof (long));
	}
	dp->disp_q = dptr->newdispq;
	dp->disp_qactmap = dptr->newdqactmap;
	dp->disp_q_limit = &dptr->newdispq[numpris];
	dp->disp_npri = numpris;
}

static void
disp_dq_free(struct disp_queue_info *dptr)
{
	if (dptr->olddispq != NULL)
		kmem_free(dptr->olddispq,
		    dptr->oldnglobpris * sizeof (dispq_t));
	if (dptr->olddqactmap != NULL)
		kmem_free(dptr->olddqactmap,
		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
}

/*
 * For a newly created CPU, initialize the dispatch queue.
 * This is called before the CPU is known through cpu[] or on any lists.
 */
void
disp_cpu_init(cpu_t *cp)
{
	disp_t	*dp;
	dispq_t	*newdispq;
	ulong_t	*newdqactmap;

	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */

	if (cp == cpu0_disp.disp_cpu)
		dp = &cpu0_disp;
	else
		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
	bzero(dp, sizeof (disp_t));
	cp->cpu_disp = dp;
	dp->disp_cpu = cp;
	dp->disp_maxrunpri = -1;
	dp->disp_max_unbound_pri = -1;
	DISP_LOCK_INIT(&cp->cpu_thread_lock);
	/*
	 * Allocate memory for the dispatcher queue headers
	 * and the active queue bitmap.
	 */
	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
	    sizeof (long), KM_SLEEP);
	dp->disp_q = newdispq;
	dp->disp_qactmap = newdqactmap;
	dp->disp_q_limit = &newdispq[v.v_nglobpris];
	dp->disp_npri = v.v_nglobpris;
}

void
disp_cpu_fini(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	disp_kp_free(cp->cpu_disp);
	if (cp->cpu_disp != &cpu0_disp)
		kmem_free(cp->cpu_disp, sizeof (disp_t));
}

/*
 * Allocate new, larger kpreempt dispatch queue to replace the old one.
 */
void
disp_kp_alloc(disp_t *dq, pri_t npri)
{
	struct disp_queue_info	mem_info;

	if (npri > dq->disp_npri) {
		/*
		 * Allocate memory for the new array.
		 */
		disp_dq_alloc(&mem_info, npri, dq);

		/*
		 * We need to copy the old structures to the new
		 * and free the old.
		 */
		disp_dq_assign(&mem_info, npri);
		disp_dq_free(&mem_info);
	}
}

/*
 * Free dispatch queue.
 * Used for the kpreempt queues for a removed CPU partition and
 * for the per-CPU queues of deleted CPUs.
 */
void
disp_kp_free(disp_t *dq)
{
	struct disp_queue_info	mem_info;

	mem_info.olddispq = dq->disp_q;
	mem_info.olddqactmap = dq->disp_qactmap;
	mem_info.oldnglobpris = dq->disp_npri;
	disp_dq_free(&mem_info);
}

/*
 * End dispatcher and scheduler initialization.
 */

/*
 * See if there's anything to do other than remain idle.
 * Return non-zero if there is.
 *
 * This function must be called with high spl, or with
 * kernel preemption disabled to prevent the partition's
 * active cpu list from changing while being traversed.
 *
 * This is essentially a simpler version of disp_getwork()
 * to be called by CPUs preparing to "halt".
 */
int
disp_anywork(void)
{
	cpu_t		*cp = CPU;
	cpu_t		*ocp;
	volatile int	*local_nrunnable = &cp->cpu_disp->disp_nrunnable;

	if (!(cp->cpu_flags & CPU_OFFLINE)) {
		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
			return (1);

		for (ocp = cp->cpu_next_part; ocp != cp;
		    ocp = ocp->cpu_next_part) {
			ASSERT(CPU_ACTIVE(ocp));

			/*
			 * Something has appeared on the local run queue.
			 */
			if (*local_nrunnable > 0)
				return (1);
			/*
			 * If we encounter another idle CPU that will
			 * soon be trolling around through disp_anywork(),
			 * terminate our walk here and let this other CPU
			 * patrol the next part of the list.
			 */
			if (ocp->cpu_dispatch_pri == -1 &&
			    (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
				return (0);
			/*
			 * Work can be taken from another CPU if:
			 *	- There is unbound work on the run queue
			 *	- That work isn't a thread undergoing a
			 *	  context switch on an otherwise empty queue
			 *	- The CPU isn't running the idle loop.
			 */
			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
			    ocp->cpu_disp->disp_nrunnable == 1) &&
			    ocp->cpu_dispatch_pri != -1)
				return (1);
		}
	}
	return (0);
}

/*
 * Called when CPU enters the idle loop
 */
static void
idle_enter()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Called when CPU exits the idle loop
 */
static void
idle_exit()
{
	cpu_t		*cp = CPU;

	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
}

/*
 * Idle loop.
 */
void
idle()
{
	struct cpu	*cp = CPU;	/* pointer to this CPU */
	kthread_t	*t;		/* taken thread */

	idle_enter();

	/*
	 * Uniprocessor version of idle loop.
	 * Do this until notified that we're on an actual multiprocessor.
	 */
	while (ncpus == 1) {
		if (cp->cpu_disp->disp_nrunnable == 0) {
			(*idle_cpu)();
			continue;
		}
		idle_exit();
		swtch();

		idle_enter(); /* returned from swtch */
	}

	/*
	 * Multiprocessor idle loop.
	 */
	for (;;) {
		/*
		 * If CPU is completely quiesced by p_online(2), just wait
		 * here with minimal bus traffic until put online.
		 */
		while (cp->cpu_flags & CPU_QUIESCED)
			(*idle_cpu)();

		if (cp->cpu_disp->disp_nrunnable != 0) {
			idle_exit();
			swtch();
		} else {
			if (cp->cpu_flags & CPU_OFFLINE)
				continue;
			if ((t = disp_getwork(cp)) == NULL) {
				if (cp->cpu_chosen_level != -1) {
					disp_t *dp = cp->cpu_disp;
					disp_t *kpq;

					disp_lock_enter(&dp->disp_lock);
					/*
					 * Set kpq under lock to prevent
					 * migration between partitions.
					 */
					kpq = &cp->cpu_part->cp_kp_queue;
					if (kpq->disp_maxrunpri == -1)
						cp->cpu_chosen_level = -1;
					disp_lock_exit(&dp->disp_lock);
				}
				(*idle_cpu)();
				continue;
			}
			/*
			 * If there was a thread but we couldn't steal
			 * it, then keep trying.
			 */
			if (t == T_DONTSTEAL)
				continue;
			idle_exit();
			swtch_to(t);
		}
		idle_enter(); /* returned from swtch/swtch_to */
	}
}


/*
 * Preempt the currently running thread in favor of the highest
 * priority thread.  The class of the current thread controls
 * where it goes on the dispatcher queues.  If panicking, turn
 * preemption off.
 */
void
preempt()
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(curthread);

	if (panicstr)
		return;

	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");

	thread_lock(t);

	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
		/*
		 * this thread has already been chosen to be run on
		 * another CPU.  Clear kprunrun on this CPU since we're
		 * already headed for swtch().
		 */
		CPU->cpu_kprunrun = 0;
		thread_unlock_nopreempt(t);
		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
	} else {
		if (lwp != NULL)
			lwp->lwp_ru.nivcsw++;
		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
		THREAD_TRANSITION(t);
		CL_PREEMPT(t);
		DTRACE_SCHED(preempt);
		thread_unlock_nopreempt(t);

		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");

		swtch();		/* clears CPU->cpu_runrun via disp() */
	}
}

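/*
 * thread_unpin() releases the thread that the running interrupt thread
 * pinned when the interrupt arrived, and returns a pointer to it so it
 * can be resumed; see swtch() below for the only use in this file.
 */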
extern kthread_t *thread_unpin();

/*
 * disp() - find the highest priority thread for this processor to run, and
 * set it in TS_ONPROC state so that resume() can be called to run it.
 */
static kthread_t *
disp()
{
	cpu_t		*cpup;
	disp_t		*dp;
	kthread_t	*tp;
	dispq_t		*dq;
	int		maxrunword;
	pri_t		pri;
	disp_t		*kpq;

	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");

	cpup = CPU;
	/*
	 * Find the highest priority loaded, runnable thread.
	 */
	dp = cpup->cpu_disp;

reschedule:
	/*
	 * If there is more important work on the global queue with a better
	 * priority than the maximum on this CPU, take it now.
	 */
	kpq = &cpup->cpu_part->cp_kp_queue;
	while ((pri = kpq->disp_maxrunpri) >= 0 &&
	    pri >= dp->disp_maxrunpri &&
	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
	    (tp = disp_getbest(kpq)) != NULL) {
		if (disp_ratify(tp, kpq) != NULL) {
			TRACE_1(TR_FAC_DISP, TR_DISP_END,
			    "disp_end:tid %p", tp);
			return (tp);
		}
	}

	disp_lock_enter(&dp->disp_lock);
	pri = dp->disp_maxrunpri;

	/*
	 * If there is nothing to run, look at what's runnable on other queues.
	 * Choose the idle thread if the CPU is quiesced.
	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
	 * interrupt threads, which will be the only threads on the CPU's own
	 * queue, but cannot run threads from other queues.
	 */
	if (pri == -1) {
		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
			disp_lock_exit(&dp->disp_lock);
			if ((tp = disp_getwork(cpup)) == NULL ||
			    tp == T_DONTSTEAL) {
				tp = cpup->cpu_idle_thread;
				(void) splhigh();
				THREAD_ONPROC(tp, cpup);
				cpup->cpu_dispthread = tp;
				cpup->cpu_dispatch_pri = -1;
				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
				cpup->cpu_chosen_level = -1;
			}
		} else {
			disp_lock_exit_high(&dp->disp_lock);
			tp = cpup->cpu_idle_thread;
			THREAD_ONPROC(tp, cpup);
			cpup->cpu_dispthread = tp;
			cpup->cpu_dispatch_pri = -1;
			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
			cpup->cpu_chosen_level = -1;
		}
		TRACE_1(TR_FAC_DISP, TR_DISP_END,
		    "disp_end:tid %p", tp);
		return (tp);
	}

	dq = &dp->disp_q[pri];
	tp = dq->dq_first;

	ASSERT(tp != NULL);

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	dp->disp_nrunnable--;
	dq->dq_sruncnt--;
	if ((dq->dq_first = tp->t_link) == NULL) {
		ulong_t	*dqactmap = dp->disp_qactmap;

		ASSERT(dq->dq_sruncnt == 0);
		dq->dq_last = NULL;

		/*
		 * The queue is empty, so the corresponding bit needs to be
		 * turned off in dqactmap.  If nrunnable != 0, we just took
		 * the last runnable thread off the highest-priority queue,
		 * so recompute disp_maxrunpri.
		 */
		maxrunword = pri >> BT_ULSHIFT;
		dqactmap[maxrunword] &= ~BT_BIW(pri);

		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else {
			int ipri;

			ipri = bt_gethighbit(dqactmap, maxrunword);
			dp->disp_maxrunpri = ipri;
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
		}
	} else {
		tp->t_link = NULL;
	}

	cpup->cpu_dispthread = tp;		/* protected by spl only */
	cpup->cpu_dispatch_pri = pri;
	ASSERT(pri == DISP_PRIO(tp));
	thread_onproc(tp, cpup);		/* set t_state to TS_ONPROC */
	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */

	ASSERT(tp != NULL);
	TRACE_1(TR_FAC_DISP, TR_DISP_END,
	    "disp_end:tid %p", tp);

	if (disp_ratify(tp, kpq) == NULL)
		goto reschedule;

	return (tp);
}

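/*
 * disp() commits to each choice with disp_ratify(); if a better thread
 * appeared on this CPU's queue or the partition's kp queue while the
 * choice was being made, the chosen thread is put back (at the front)
 * and the whole selection is retried via the reschedule label above.
 */
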
/*
 * swtch()
 *	Find best runnable thread and run it.
 *	Called with the current thread already switched to a new state,
 *	on a sleep queue, run queue, stopped, and not zombied.
 *	May be called at any spl level less than or equal to LOCK_LEVEL.
 *	Always drops spl to the base level (spl0()).
 */
void
swtch()
{
	kthread_t	*t = curthread;
	kthread_t	*next;
	cpu_t		*cp;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	if (t->t_flag & T_INTR_THREAD)
		cpu_intr_swtch_enter(t);

	if (t->t_intr != NULL) {
		/*
		 * We are an interrupt thread.  Setup and return
		 * the interrupted thread to be resumed.
		 */
		(void) splhigh();	/* block other scheduler action */
		cp = CPU;		/* now protected against migration */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
		next = thread_unpin();
		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
		resume_from_intr(next);
	} else {
#ifdef	DEBUG
		if (t->t_state == TS_ONPROC &&
		    t->t_disp_queue->disp_cpu == CPU &&
		    t->t_preempt == 0) {
			thread_lock(t);
			ASSERT(t->t_state != TS_ONPROC ||
			    t->t_disp_queue->disp_cpu != CPU ||
			    t->t_preempt != 0);	/* cannot migrate */
			thread_unlock_nopreempt(t);
		}
#endif	/* DEBUG */
		cp = CPU;
		next = disp();		/* returns with spl high */
		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */

		/* OK to steal anything left on run queue */
		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

		if (next != t) {
			hrtime_t now;

			now = gethrtime_unscaled();
			pg_ev_thread_swtch(cp, now, t, next);

			/*
			 * If t was previously in the TS_ONPROC state,
			 * setfrontdq and setbackdq won't have set its t_waitrq.
			 * Since we now finally know that we're switching away
			 * from this thread, set its t_waitrq if it is on a run
			 * queue.
			 */
			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
				t->t_waitrq = now;
			}

			/*
			 * restore mstate of thread that we are switching to
			 */
			restore_mstate(next);

			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
			cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

			if (dtrace_vtime_active)
				dtrace_vtime_switch(next);

			resume(next);
			/*
			 * The TR_RESUME_END and TR_SWTCH_END trace points
			 * appear at the end of resume(), because we may not
			 * return here
			 */
		} else {
			if (t->t_flag & T_INTR_THREAD)
				cpu_intr_swtch_exit(t);
			/*
			 * Threads that enqueue themselves on a run queue defer
			 * setting t_waitrq.  It is then either set in swtch()
			 * when the CPU is actually yielded, or not at all if it
			 * is remaining on the CPU.
			 * There is however a window between where the thread
			 * placed itself on a run queue, and where it selects
			 * itself in disp(), where a third party (eg. clock()
			 * doing tick processing) may have re-enqueued this
			 * thread, setting t_waitrq in the process.  We detect
			 * this race by noticing that despite switching to
			 * ourself, our t_waitrq has been set, and should be
			 * cleared.
			 */
			if (t->t_waitrq != 0)
				t->t_waitrq = 0;

			pg_ev_thread_remain(cp, t);

			DTRACE_SCHED(remain__cpu);
			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
			(void) spl0();
		}
	}
}

/*
 * swtch_from_zombie()
 *	Special case of swtch(), which allows checks for TS_ZOMB to be
 *	eliminated from normal resume.
 *	Find best runnable thread and run it.
 *	Called with the current thread zombied.
 *	Zombies cannot migrate, so CPU references are safe.
 */
void
swtch_from_zombie()
{
	kthread_t	*next;
	cpu_t		*cpu = CPU;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	ASSERT(curthread->t_state == TS_ZOMB);

	next = disp();			/* returns with spl high */
	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
	ASSERT(next != curthread);
	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);

	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume_from_zombie(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we certainly will not
	 * return here
	 */
}

#if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))

/*
 * search_disp_queues()
 *	Search the given dispatch queues for thread tp.
 *	Return 1 if tp is found, otherwise return 0.
 */
static int
search_disp_queues(disp_t *dp, kthread_t *tp)
{
	dispq_t	*dq;
	dispq_t	*eq;

	disp_lock_enter_high(&dp->disp_lock);

	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
		kthread_t	*rp;

		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

		for (rp = dq->dq_first; rp; rp = rp->t_link)
			if (tp == rp) {
				disp_lock_exit_high(&dp->disp_lock);
				return (1);
			}
	}
	disp_lock_exit_high(&dp->disp_lock);

	return (0);
}

/*
 * thread_on_queue()
 *	Search all per-CPU dispatch queues and all partition-wide kpreempt
 *	queues for thread tp.  Return 1 if tp is found, otherwise return 0.
 */
static int
thread_on_queue(kthread_t *tp)
{
	cpu_t		*cp;
	struct cpupart	*part;

	ASSERT(getpil() >= DISP_LEVEL);

	/*
	 * Search the per-CPU dispatch queues for tp.
	 */
	cp = CPU;
	do {
		if (search_disp_queues(cp->cpu_disp, tp))
			return (1);
	} while ((cp = cp->cpu_next_onln) != CPU);

	/*
	 * Search the partition-wide kpreempt queues for tp.
	 */
	part = CPU->cpu_part;
	do {
		if (search_disp_queues(&part->cp_kp_queue, tp))
			return (1);
	} while ((part = part->cp_next) != CPU->cpu_part);

	return (0);
}

#else

#define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */

#endif  /* DEBUG */

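/*
 * swtch_to() is the path the idle loop uses once disp_getwork() has
 * already chosen and dequeued a thread from another CPU: unlike swtch(),
 * it takes the thread to run as an argument instead of calling disp().
 */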
/*
 * like swtch(), but switch to a specified thread taken from another CPU.
 *	called with spl high.
 */
void
swtch_to(kthread_t *next)
{
	cpu_t		*cp = CPU;
	hrtime_t	now;

	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");

	/*
	 * Update context switch statistics.
	 */
	CPU_STATS_ADDQ(cp, sys, pswitch, 1);

	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");

	now = gethrtime_unscaled();
	pg_ev_thread_swtch(cp, now, curthread, next);

	/* OK to steal anything left on run queue */
	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;

	/* record last execution time */
	cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();

	/*
	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
	 * won't have set its t_waitrq.  Since we now finally know that we're
	 * switching away from this thread, set its t_waitrq if it is on a run
	 * queue.
	 */
	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
		curthread->t_waitrq = now;
	}

	/* restore next thread to previously running microstate */
	restore_mstate(next);

	if (dtrace_vtime_active)
		dtrace_vtime_switch(next);

	resume(next);
	/*
	 * The TR_RESUME_END and TR_SWTCH_END trace points
	 * appear at the end of resume(), because we may not
	 * return here
	 */
}

#define	CPU_IDLING(pri)	((pri) == -1)

static void
cpu_resched(cpu_t *cp, pri_t tpri)
{
	int	call_poke_cpu = 0;
	pri_t	cpupri = cp->cpu_dispatch_pri;

	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
			cp->cpu_runrun = 1;
			aston(cp->cpu_dispthread);
			if (tpri < kpreemptpri && cp != CPU)
				call_poke_cpu = 1;
		}
		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
			cp->cpu_kprunrun = 1;
			if (cp != CPU)
				call_poke_cpu = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	if (call_poke_cpu)
		poke_cpu(cp->cpu_id);
}

/*
 * setbackdq() keeps runqs balanced such that the difference in length
 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
 * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
 * try to keep runqs perfectly balanced regardless of the thread priority.
 */
#define	RUNQ_MATCH_PRI	16	/* pri below which queue lengths must match */
#define	RUNQ_MAX_DIFF	2	/* maximum runq length difference */
#define	RUNQ_LEN(cp, pri)	((cp)->cpu_disp->disp_q[pri].dq_sruncnt)

/*
 * Macro that evaluates to true if it is likely that the thread has cache
 * warmth.  This is based on the amount of time that has elapsed since the
 * thread last ran.  If that amount of time is less than "rechoose_interval"
 * ticks, then we decide that the thread has enough cache warmth to warrant
 * some affinity for t->t_cpu.
 */
#define	THREAD_HAS_CACHE_WARMTH(thread)	\
	((thread == curthread) ||	\
	((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))

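/*
 * Example: with the default rechoose_interval of 3 ticks and the common
 * hz of 100, a thread keeps its claimed cache warmth for up to roughly
 * 30ms after it last came off a CPU; within that window setbackdq()
 * below prefers t_cpu, beyond it the thread is fair game for a less
 * loaded CPU.
 */
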
/*
 * Put the specified thread on the back of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setbackdq(kthread_t *tp)
{
	dispq_t		*dq;
	disp_t		*dp;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;
	boolean_t	self;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	self = (tp == curthread);

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_BACK);
			return;
		}

		/*
		 * We'll generally let this thread continue to run where
		 * it last ran...but will consider migration if:
		 * - The thread probably doesn't have much cache warmth.
		 * - The CPU where it last ran is the target of an offline
		 *   request.
		 * - The thread last ran outside its home lgroup.
		 */
		if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
		    (tp->t_cpu == cpu_inmotion)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
		} else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
			cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
			    self ? tp->t_cpu : NULL);
		} else {
			cp = tp->t_cpu;
		}

		if (tp->t_cpupart == cp->cpu_part) {
			int	qlen;

			/*
			 * Perform any CMT load balancing
			 */
			cp = cmt_balance(tp, cp);

			/*
			 * Balance across the run queues
			 */
			qlen = RUNQ_LEN(cp, tpri);
			if (tpri >= RUNQ_MATCH_PRI &&
			    !(tp->t_schedflag & TS_RUNQMATCH))
				qlen -= RUNQ_MAX_DIFF;
			if (qlen > 0) {
				cpu_t *newcp;

				if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
					newcp = cp->cpu_next_part;
				} else if ((newcp = cp->cpu_next_lpl) == cp) {
					newcp = cp->cpu_next_part;
				}

				if (RUNQ_LEN(newcp, tpri) < qlen) {
					DTRACE_PROBE3(runq__balance,
					    kthread_t *, tp,
					    cpu_t *, cp, cpu_t *, newcp);
					cp = newcp;
				}
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}
	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((!self) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
	TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
	    tpri, cp, tp);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	tp->t_link = NULL;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_first != NULL);
		dq->dq_last->t_link = tp;
		dq->dq_last = tp;
	} else {
		ASSERT(dq->dq_first == NULL);
		ASSERT(dq->dq_last == NULL);
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

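/*
 * setbackdq() and setfrontdq() differ only in which end of the selected
 * queue receives the thread: the back is the normal case (e.g. a thread
 * made runnable after sleeping), while the front preserves a thread's
 * place when it is put back after a tentative choice, as disp_ratify()
 * does below.  The CPU selection logic is otherwise parallel.
 */
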
/*
 * Put the specified thread on the front of the dispatcher
 * queue corresponding to its current priority.
 *
 * Called with the thread in transition, onproc or stopped state
 * and locked (transition implies locked) and at high spl.
 * Returns with the thread in TS_RUN state and still locked.
 */
void
setfrontdq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	cpu_t		*cp;
	pri_t		tpri;
	int		bound;

	ASSERT(THREAD_LOCK_HELD(tp));
	ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
	ASSERT(!thread_on_queue(tp));	/* make sure tp isn't on a runq */

	if (tp->t_bound_cpu || tp->t_weakbound_cpu)
		bound = 1;
	else
		bound = 0;

	tpri = DISP_PRIO(tp);
	if (ncpus == 1)
		cp = tp->t_cpu;
	else if (!bound) {
		if (tpri >= kpqpri) {
			setkpdq(tp, SETKP_FRONT);
			return;
		}
		cp = tp->t_cpu;
		if (tp->t_cpupart == cp->cpu_part) {
			/*
			 * We'll generally let this thread continue to run
			 * where it last ran, but will consider migration if:
			 * - The thread last ran outside its home lgroup.
			 * - The CPU where it last ran is the target of an
			 *   offline request (a thread_nomigrate() on the in
			 *   motion CPU relies on this when forcing a preempt).
			 * - The thread isn't the highest priority thread where
			 *   it last ran, and it is considered not likely to
			 *   have significant cache warmth.
			 */
			if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
			    (cp == cpu_inmotion)) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    (tp == curthread) ? cp : NULL);
			} else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
			    (!THREAD_HAS_CACHE_WARMTH(tp))) {
				cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
				    NULL);
			}
		} else {
			/*
			 * Migrate to a cpu in the new partition.
			 */
			cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
			    tp->t_lpl, tp->t_pri, NULL);
		}
		ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
	} else {
		/*
		 * It is possible that t_weakbound_cpu != t_bound_cpu (for
		 * a short time until weak binding that existed when the
		 * strong binding was established has dropped) so we must
		 * favour weak binding over strong.
		 */
		cp = tp->t_weakbound_cpu ?
		    tp->t_weakbound_cpu : tp->t_bound_cpu;
	}

	/*
	 * A thread that is ONPROC may be temporarily placed on the run queue
	 * but then chosen to run again by disp.  If the thread we're placing on
	 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
	 * replacement process is actually scheduled in swtch().  In this
	 * situation, curthread is the only thread that could be in the ONPROC
	 * state.
	 */
	if ((tp != curthread) && (tp->t_waitrq == 0)) {
		hrtime_t curtime;

		curtime = gethrtime_unscaled();
		(void) cpu_update_pct(tp, curtime);
		tp->t_waitrq = curtime;
	} else {
		(void) cpu_update_pct(tp, gethrtime_unscaled());
	}

	dp = cp->cpu_disp;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);

	THREAD_RUN(tp, &dp->disp_lock);		/* set TS_RUN state and lock */
	tp->t_disp_queue = dp;

	dq = &dp->disp_q[tpri];
	dp->disp_nrunnable++;
	if (!bound)
		dp->disp_steal = 0;
	membar_enter();

	if (dq->dq_sruncnt++ != 0) {
		ASSERT(dq->dq_last != NULL);
		tp->t_link = dq->dq_first;
		dq->dq_first = tp;
	} else {
		ASSERT(dq->dq_last == NULL);
		ASSERT(dq->dq_first == NULL);
		tp->t_link = NULL;
		dq->dq_first = dq->dq_last = tp;
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
			cpu_resched(cp, tpri);
		}
	}

	if (!bound && tpri > dp->disp_max_unbound_pri) {
		if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
		    cp == CPU) {
			/*
			 * If there are no other unbound threads on the
			 * run queue, don't allow other CPUs to steal
			 * this thread while we are in the middle of a
			 * context switch.  We may just switch to it
			 * again right away.  CPU_DISP_DONTSTEAL is cleared
			 * in swtch and swtch_to.
			 */
			cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
		}
		dp->disp_max_unbound_pri = tpri;
	}
	(*disp_enq_thread)(cp, bound);
}

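/*
 * Threads at or above kpqpri are queued on the partition-wide kp queue
 * rather than on any single CPU's queue (see the KPQPRI comment near the
 * top of this file); this deliberately gives up CPU affinity so that the
 * highest priority work is picked up by whichever CPU gets to it first.
 */
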
/*
 * Put a high-priority unbound thread on the kp queue
 */
static void
setkpdq(kthread_t *tp, int borf)
{
	dispq_t	*dq;
	disp_t	*dp;
	cpu_t	*cp;
	pri_t	tpri;

	tpri = DISP_PRIO(tp);

	dp = &tp->t_cpupart->cp_kp_queue;
	disp_lock_enter_high(&dp->disp_lock);

	TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);

	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
	THREAD_RUN(tp, &dp->disp_lock);		/* set t_state to TS_RUN */
	tp->t_disp_queue = dp;
	dp->disp_nrunnable++;
	dq = &dp->disp_q[tpri];

	if (dq->dq_sruncnt++ != 0) {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first != NULL);
			tp->t_link = NULL;
			dq->dq_last->t_link = tp;
			dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last != NULL);
			tp->t_link = dq->dq_first;
			dq->dq_first = tp;
		}
	} else {
		if (borf == SETKP_BACK) {
			ASSERT(dq->dq_first == NULL);
			ASSERT(dq->dq_last == NULL);
			dq->dq_first = dq->dq_last = tp;
		} else {
			ASSERT(dq->dq_last == NULL);
			ASSERT(dq->dq_first == NULL);
			tp->t_link = NULL;
			dq->dq_first = dq->dq_last = tp;
		}
		BT_SET(dp->disp_qactmap, tpri);
		if (tpri > dp->disp_max_unbound_pri)
			dp->disp_max_unbound_pri = tpri;
		if (tpri > dp->disp_maxrunpri) {
			dp->disp_maxrunpri = tpri;
			membar_enter();
		}
	}

	cp = tp->t_cpu;
	if (tp->t_cpupart != cp->cpu_part) {
		/* migrate to a cpu in the new partition */
		cp = tp->t_cpupart->cp_cpulist;
	}
	cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active)
		tnf_thread_queue(tp, cp, tpri);
#endif /* NPROBE */

	if (cp->cpu_chosen_level < tpri)
		cp->cpu_chosen_level = tpri;
	cpu_resched(cp, tpri);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);
	(*disp_enq_thread)(cp, 0);
}

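/*
 * Note the two-step pattern above: the thread first goes on the
 * partition-wide queue, and only then is a low priority CPU chosen and
 * nudged with cpu_resched().  cpu_chosen_level records the priority that
 * CPU has already been asked to service; disp_lowpri_cpu() (defined
 * elsewhere) takes it into account, which keeps several back-to-back
 * setkpdq() calls from all picking the same CPU.
 */
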
/*
 * Remove a thread from the dispatcher queue if it is on it.
 * It is not an error if it is not found but we return whether
 * or not it was found in case the caller wants to check.
 */
int
dispdeq(kthread_t *tp)
{
	disp_t		*dp;
	dispq_t		*dq;
	kthread_t	*rp;
	kthread_t	*trp;
	kthread_t	**ptp;
	int		tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_RUN)
		return (0);

	tpri = DISP_PRIO(tp);
	dp = tp->t_disp_queue;
	ASSERT(tpri < dp->disp_npri);
	dq = &dp->disp_q[tpri];
	ptp = &dq->dq_first;
	rp = *ptp;
	trp = NULL;

	ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);

	/*
	 * Search for thread in queue.
	 * Double links would simplify this at the expense of disp/setrun.
	 */
	while (rp != tp && rp != NULL) {
		trp = rp;
		ptp = &trp->t_link;
		rp = trp->t_link;
	}

	if (rp == NULL) {
		panic("dispdeq: thread not on queue");
	}

	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);

	/*
	 * Found it so remove it from queue.
	 */
	if ((*ptp = rp->t_link) == NULL)
		dq->dq_last = trp;

	dp->disp_nrunnable--;
	if (--dq->dq_sruncnt == 0) {
		dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
		if (dp->disp_nrunnable == 0) {
			dp->disp_max_unbound_pri = -1;
			dp->disp_maxrunpri = -1;
		} else if (tpri == dp->disp_maxrunpri) {
			int ipri;

			ipri = bt_gethighbit(dp->disp_qactmap,
			    dp->disp_maxrunpri >> BT_ULSHIFT);
			if (ipri < dp->disp_max_unbound_pri)
				dp->disp_max_unbound_pri = ipri;
			dp->disp_maxrunpri = ipri;
		}
	}
	tp->t_link = NULL;
	THREAD_TRANSITION(tp);		/* put in intermediate state */
	return (1);
}

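/*
 * dispdeq() leaves the thread in the transition state rather than making
 * it runnable again; the caller is expected to follow up with something
 * like setbackdq()/setfrontdq() or a state change, exactly as
 * disp_getbest() below does after stealing a thread.
 */
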
/*
 * Make a thread give up its processor.  Find the processor on
 * which this thread is executing, and have that processor
 * preempt.
 *
 * We allow System Duty Cycle (SDC) threads to be preempted even if
 * they are running at kernel priorities.  To implement this, we always
 * set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
 * calls cpu_surrender() very often, we only preempt if there is anyone
 * competing with us.
 */
void
cpu_surrender(kthread_t *tp)
{
	cpu_t	*cpup;
	int	max_pri;
	int	max_run_pri;
	klwp_t	*lwp;

	ASSERT(THREAD_LOCK_HELD(tp));

	if (tp->t_state != TS_ONPROC)
		return;
	cpup = tp->t_disp_queue->disp_cpu;	/* CPU thread dispatched to */
	max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
	max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
	if (max_pri < max_run_pri)
		max_pri = max_run_pri;

	if (tp->t_cid == sysdccid) {
		uint_t t_pri = DISP_PRIO(tp);
		if (t_pri > max_pri)
			return;		/* we are not competing w/ anyone */
		cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
	} else {
		cpup->cpu_runrun = 1;
		if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
			cpup->cpu_kprunrun = 1;
		}
	}

	/*
	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
	 */
	membar_enter();

	DTRACE_SCHED1(surrender, kthread_t *, tp);

	/*
	 * Make the target thread take an excursion through trap()
	 * to do preempt() (unless we're already in trap or post_syscall,
	 * calling cpu_surrender via CL_TRAPRET).
	 */
	if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
	    lwp->lwp_state != LWP_USER) {
		aston(tp);
		if (cpup != CPU)
			poke_cpu(cpup->cpu_id);
	}
	TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
	    "cpu_surrender:tid %p cpu %p", tp, cpup);
}

/*
 * Commit to and ratify a scheduling decision
 */
/*ARGSUSED*/
static kthread_t *
disp_ratify(kthread_t *tp, disp_t *kpq)
{
	pri_t	tpri, maxpri;
	pri_t	maxkpri;
	cpu_t	*cpup;

	ASSERT(tp != NULL);
	/*
	 * Commit to, then ratify scheduling decision
	 */
	cpup = CPU;
	if (cpup->cpu_runrun != 0)
		cpup->cpu_runrun = 0;
	if (cpup->cpu_kprunrun != 0)
		cpup->cpu_kprunrun = 0;
	if (cpup->cpu_chosen_level != -1)
		cpup->cpu_chosen_level = -1;
	membar_enter();
	tpri = DISP_PRIO(tp);
	maxpri = cpup->cpu_disp->disp_maxrunpri;
	maxkpri = kpq->disp_maxrunpri;
	if (maxpri < maxkpri)
		maxpri = maxkpri;
	if (tpri < maxpri) {
		/*
		 * should have done better
		 * put this one back and indicate to try again
		 */
		cpup->cpu_dispthread = curthread;	/* fixup dispthread */
		cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
		thread_lock_high(tp);
		THREAD_TRANSITION(tp);
		setfrontdq(tp);
		thread_unlock_nopreempt(tp);

		tp = NULL;
	}
	return (tp);
}

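/*
 * The ordering above is the commit: the preemption hints are cleared and
 * flushed to global visibility (membar_enter()) before disp_maxrunpri is
 * re-read.  An enqueue on another CPU does the mirror image (it updates
 * disp_maxrunpri, membars, then sets the hints in cpu_resched()), so a
 * higher priority arrival is caught either by the re-read here or by a
 * fresh preemption request, and cannot be lost between the two.
 */
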
/*
 * See if there is any work on the dispatcher queue for other CPUs.
 * If there is, dequeue the best thread and return.
 */
static kthread_t *
disp_getwork(cpu_t *cp)
{
	cpu_t		*ocp;		/* other CPU */
	cpu_t		*ocp_start;
	cpu_t		*tcp;		/* target local CPU */
	kthread_t	*tp;
	kthread_t	*retval = NULL;
	pri_t		maxpri;
	disp_t		*kpq;		/* kp queue for this partition */
	lpl_t		*lpl, *lpl_leaf;
	int		leafidx, startidx;
	hrtime_t	stealtime;
	lgrp_id_t	local_id;

	maxpri = -1;
	tcp = NULL;

	kpq = &cp->cpu_part->cp_kp_queue;
	while (kpq->disp_maxrunpri >= 0) {
		/*
		 * Try to take a thread from the kp_queue.
		 */
		tp = (disp_getbest(kpq));
		if (tp)
			return (disp_ratify(tp, kpq));
	}

	kpreempt_disable();		/* protect the cpu_active list */

	/*
	 * Try to find something to do on another CPU's run queue.
	 * Loop through all other CPUs looking for the one with the highest
	 * priority unbound thread.
	 *
	 * On NUMA machines, the partition's CPUs are consulted in order of
	 * distance from the current CPU.  This way, the first available
	 * work found is also the closest, and will suffer the least
	 * from being migrated.
	 */
	lpl = lpl_leaf = cp->cpu_lpl;
	local_id = lpl_leaf->lpl_lgrpid;
	leafidx = startidx = 0;

	/*
	 * This loop traverses the lpl hierarchy.  Higher level lpls represent
	 * broader levels of locality
	 */
	do {
		/* This loop iterates over the lpl's leaves */
		do {
			if (lpl_leaf != cp->cpu_lpl)
				ocp = lpl_leaf->lpl_cpus;
			else
				ocp = cp->cpu_next_lpl;

			/* This loop iterates over the CPUs in the leaf */
			ocp_start = ocp;
			do {
				pri_t pri;

				ASSERT(CPU_ACTIVE(ocp));

				/*
				 * End our stroll around this lpl if:
				 *
				 * - Something became runnable on the local
				 *   queue...which also ends our stroll around
				 *   the partition.
				 *
				 * - We happen across another idle CPU.
				 *   Since it is patrolling the next portion
				 *   of the lpl's list (assuming it's not
				 *   halted, or busy servicing an interrupt),
				 *   move to the next higher level of locality.
				 */
				if (cp->cpu_disp->disp_nrunnable != 0) {
					kpreempt_enable();
					return (NULL);
				}
				if (ocp->cpu_dispatch_pri == -1) {
					if (ocp->cpu_disp_flags &
					    CPU_DISP_HALTED ||
					    ocp->cpu_intr_actv != 0)
						continue;
					else
						goto next_level;
				}

				/*
				 * If there's only one thread and the CPU
				 * is in the middle of a context switch,
				 * or it's currently running the idle thread,
				 * don't steal it.
				 */
				if ((ocp->cpu_disp_flags &
				    CPU_DISP_DONTSTEAL) &&
				    ocp->cpu_disp->disp_nrunnable == 1)
					continue;

				pri = ocp->cpu_disp->disp_max_unbound_pri;
				if (pri > maxpri) {
					/*
					 * Don't steal threads that we attempted
					 * to steal recently until they're ready
					 * to be stolen again.
					 */
					stealtime = ocp->cpu_disp->disp_steal;
					if (stealtime == 0 ||
					    stealtime - gethrtime() <= 0) {
						maxpri = pri;
						tcp = ocp;
					} else {
						/*
						 * Don't update tcp, just set
						 * the retval to T_DONTSTEAL, so
						 * that if no acceptable CPUs
						 * are found the return value
						 * will be T_DONTSTEAL rather
						 * than NULL.
						 */
						retval = T_DONTSTEAL;
					}
				}
			} while ((ocp = ocp->cpu_next_lpl) != ocp_start);

			/*
			 * Iterate to the next leaf lpl in the resource set
			 * at this level of locality.  If we hit the end of
			 * the set, wrap back around to the beginning.
			 *
			 * Note: This iteration is NULL terminated for a reason
			 * see lpl_topo_bootstrap() in lgrp.c for details.
			 */
			if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
				leafidx = 0;
				lpl_leaf = lpl->lpl_rset[leafidx];
			}
		} while (leafidx != startidx);

next_level:
		/*
		 * Expand the search to include farther away CPUs (next
		 * locality level).  The closer CPUs that have already been
		 * checked will be checked again.  In doing so, idle CPUs
		 * will tend to be more aggressive about stealing from CPUs
		 * that are closer (since the closer CPUs will be considered
		 * more often).
		 * Begin at this level with the CPUs local leaf lpl.
		 */
		if ((lpl = lpl->lpl_parent) != NULL) {
			leafidx = startidx = lpl->lpl_id2rset[local_id];
			lpl_leaf = lpl->lpl_rset[leafidx];
		}
	} while (!tcp && lpl);

	kpreempt_enable();

	/*
	 * If another queue looks good, and there is still nothing on
	 * the local queue, try to transfer one or more threads
	 * from it to our queue.
	 */
	if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
		tp = disp_getbest(tcp->cpu_disp);
		if (tp == NULL || tp == T_DONTSTEAL)
			return (tp);
		return (disp_ratify(tp, kpq));
	}
	return (retval);
}

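/*
 * To summarize disp_getwork()'s contract with the idle loop: it returns
 * a ready-to-run thread on success, NULL when nothing stealable exists
 * anywhere in the partition, and T_DONTSTEAL when the only candidates
 * are threads still inside their nosteal window, in which case the
 * caller should simply try again shortly.
 */
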
/*
 * disp_fix_unbound_pri()
 *	Determines the maximum priority of unbound threads on the queue.
 *	The priority is kept for the queue, but is only increased, never
 *	reduced unless some CPU is looking for something on that queue.
 *
 *	The priority argument is the known upper limit.
 *
 *	Perhaps this should be kept accurately, but that probably means
 *	separate bitmaps for bound and unbound threads.  Since only idled
 *	CPUs will have to do this recalculation, it seems better this way.
 */
static void
disp_fix_unbound_pri(disp_t *dp, pri_t pri)
{
	kthread_t	*tp;
	dispq_t		*dq;
	ulong_t		*dqactmap = dp->disp_qactmap;
	ulong_t		mapword;
	int		wx;

	ASSERT(DISP_LOCK_HELD(&dp->disp_lock));

	ASSERT(pri >= 0);			/* checked by caller */

	/*
	 * Start the search at the next lowest priority below the supplied
	 * priority.  This depends on the bitmap implementation.
	 */
	do {
		wx = pri >> BT_ULSHIFT;		/* index of word in map */

		/*
		 * Form mask for all lower priorities in the word.
		 */
		mapword = dqactmap[wx] & (BT_BIW(pri) - 1);

		/*
		 * Get next lower active priority.
		 */
		if (mapword != 0) {
			pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
		} else if (wx > 0) {
			pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
			if (pri < 0)
				break;
		} else {
			pri = -1;
			break;
		}

		/*
		 * Search the queue for unbound, runnable threads.
		 */
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
			tp = tp->t_link;
		}

		/*
		 * If a thread was found, set the priority and return.
		 */
	} while (tp == NULL);

	/*
	 * pri holds the maximum unbound thread priority or -1.
	 */
	if (dp->disp_max_unbound_pri != pri)
		dp->disp_max_unbound_pri = pri;
}

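/*
 * Worked example of the bitmap arithmetic above on an LP64 kernel
 * (BT_ULSHIFT == 6, 64 bits per word): for pri == 75, wx == 1 and
 * BT_BIW(75) - 1 masks bits 0-10 of word 1, i.e. priorities 64-74.  If
 * the only bit set there corresponds to priority 70, highbit() returns 7
 * and (1 << 6) + 7 - 1 == 70 recovers the priority.
 */
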
/*
 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
 *	check if the CPU to which it was previously bound should have
 *	its disp_max_unbound_pri increased.
 */
void
disp_adjust_unbound_pri(kthread_t *tp)
{
	disp_t *dp;
	pri_t tpri;

	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * Don't do anything if the thread is not bound, or
	 * currently not runnable.
	 */
	if (tp->t_bound_cpu == NULL ||
	    tp->t_state != TS_RUN)
		return;

	tpri = DISP_PRIO(tp);
	dp = tp->t_bound_cpu->cpu_disp;
	ASSERT(tpri >= 0 && tpri < dp->disp_npri);
	if (tpri > dp->disp_max_unbound_pri)
		dp->disp_max_unbound_pri = tpri;
}

/*
 * disp_getbest()
 *	De-queue the highest priority unbound runnable thread.
 *	Returns with the thread unlocked and onproc but at splhigh (like
 *	disp()).
 *	Returns NULL if nothing found.
 *	Returns T_DONTSTEAL if the thread was not stealable, so that the
 *	caller will try again later.
 *
 *	Passed a pointer to a dispatch queue not associated with this CPU.
 */
static kthread_t *
disp_getbest(disp_t *dp)
{
	kthread_t	*tp;
	dispq_t		*dq;
	pri_t		pri;
	cpu_t		*cp, *tcp;
	boolean_t	allbound;

	disp_lock_enter(&dp->disp_lock);

	/*
	 * If there is nothing to run, or the CPU is in the middle of a
	 * context switch of the only thread, return NULL.
	 */
	tcp = dp->disp_cpu;
	cp = CPU;
	pri = dp->disp_max_unbound_pri;
	if (pri == -1 ||
	    (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
	    tcp->cpu_disp->disp_nrunnable == 1)) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (NULL);
	}

	dq = &dp->disp_q[pri];

	/*
	 * Assume that all threads are bound on this queue, and change it
	 * later when we find out that it is not the case.
	 */
	allbound = B_TRUE;
	for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
		hrtime_t now, nosteal, rqtime;

		/*
		 * Skip over bound threads which could be here even
		 * though disp_max_unbound_pri indicated this level.
		 */
		if (tp->t_bound_cpu || tp->t_weakbound_cpu)
			continue;

		/*
		 * We've got some unbound threads on this queue, so turn
		 * the allbound flag off now.
		 */
		allbound = B_FALSE;

		/*
		 * The thread is a candidate for stealing from its run queue.
		 * We don't want to steal threads that became runnable just a
		 * moment ago.  This improves CPU affinity for threads that
		 * get preempted for short periods of time and go back on the
		 * run queue.
		 *
		 * We want to let it stay on its run queue if it was only
		 * placed there recently and it was running on the same CPU
		 * before that, to preserve its cache investment.  For the
		 * thread to remain on its run queue, ALL of the following
		 * conditions must be satisfied:
		 *
		 * - the disp queue should not be the kernel preemption queue
		 * - delayed idle stealing should not be disabled
		 * - nosteal_nsec should be non-zero
		 * - it should run with user priority
		 * - it should be on the run queue of the CPU where it was
		 *   running before being placed on the run queue
		 * - it should be the only thread on the run queue (to prevent
		 *   extra scheduling latency for other threads)
		 * - it should sit on the run queue for less than the per-chip
		 *   nosteal interval or the global nosteal interval
		 * - in case of CPUs with shared cache it should sit on the
		 *   run queue of a CPU from a different chip
		 *
		 * The checks are arranged so that the ones that are faster are
		 * placed earlier.
		 */
		if (tcp == NULL ||
		    pri >= minclsyspri ||
		    tp->t_cpu != tcp)
			break;

		/*
		 * Steal immediately if, due to the CMT processor architecture,
		 * migration between cp and tcp would incur no performance
		 * penalty.
		 */
		if (pg_cmt_can_migrate(cp, tcp))
			break;

		nosteal = nosteal_nsec;
		if (nosteal == 0)
			break;

		/*
		 * Calculate time spent sitting on run queue
		 */
		now = gethrtime_unscaled();
		rqtime = now - tp->t_waitrq;
		scalehrtime(&rqtime);

		/*
		 * Steal immediately if the time spent on this run queue is
		 * more than the allowed nosteal delay.
		 *
		 * Negative rqtime check is needed here to avoid infinite
		 * stealing delays caused by unlikely but not impossible
		 * drifts between CPU times on different CPUs.
		 */
		if (rqtime > nosteal || rqtime < 0)
			break;

		DTRACE_PROBE4(nosteal, kthread_t *, tp,
		    cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
		scalehrtime(&now);

		/*
		 * Calculate when this thread becomes stealable
		 */
		now += (nosteal - rqtime);

		/*
		 * Calculate time when some thread becomes stealable
		 */
		if (now < dp->disp_steal)
			dp->disp_steal = now;
	}
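	/*
	 * Worked example of the delayed-steal window above: suppose
	 * nosteal_nsec is 100000 (100us) and the candidate thread went back
	 * on tcp's run queue 40us ago.  rqtime (40000) is neither negative
	 * nor greater than nosteal, so the thread is left alone and
	 * disp_steal records now + 60000, the time at which it becomes
	 * stealable.  disp_getwork() compares disp_steal against gethrtime()
	 * before retrying this queue.
	 */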
	/*
	 * If there were no unbound threads on this queue, recompute
	 * disp_max_unbound_pri before returning.  Its value is not always
	 * accurate because it isn't reduced until another idle CPU looks
	 * for work.
	 */
	if (allbound)
		disp_fix_unbound_pri(dp, pri);

	/*
	 * If we reached the end of the queue and found no unbound threads
	 * then return NULL so that other CPUs will be considered.  If there
	 * are unbound threads but they cannot yet be stolen, then
	 * return T_DONTSTEAL and try again later.
	 */
	if (tp == NULL) {
		disp_lock_exit_nopreempt(&dp->disp_lock);
		return (allbound ? NULL : T_DONTSTEAL);
	}

	/*
	 * Found a runnable, unbound thread, so remove it from the queue.
	 * dispdeq() requires that we have the thread locked, and we do,
	 * by virtue of holding the dispatch queue lock.  dispdeq() will
	 * put the thread in transition state, thereby dropping the dispq
	 * lock.
	 */

#ifdef DEBUG
	{
		int	thread_was_on_queue;

		thread_was_on_queue = dispdeq(tp);	/* drops disp_lock */
		ASSERT(thread_was_on_queue);
	}

#else /* DEBUG */
	(void) dispdeq(tp);			/* drops disp_lock */
#endif /* DEBUG */

	/*
	 * Reset the dispatch queue's steal time; we no longer know what the
	 * smallest value across the queue is.
	 */
	dp->disp_steal = 0;

	/*
	 * Set up the thread to run on the current CPU.
	 */
	tp->t_disp_queue = cp->cpu_disp;

	cp->cpu_dispthread = tp;		/* protected by spl only */
	cp->cpu_dispatch_pri = pri;

	/*
	 * There can be a memory synchronization race between disp_getbest()
	 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
	 * to preempt the current thread to run the enqueued thread while
	 * disp_getbest() and disp_ratify() are changing the current thread
	 * to the stolen thread.  This may lead to a situation where
	 * cpu_resched() tries to preempt the wrong thread and the
	 * stolen thread continues to run on the CPU which has been tagged
	 * for preemption.
	 * Later the clock thread gets enqueued but doesn't get to run on the
	 * CPU causing the system to hang.
	 *
	 * To avoid this, grabbing and dropping the disp_lock (which does
	 * a memory barrier) is needed to synchronize the execution of
	 * cpu_resched() with disp_getbest() and disp_ratify() and
	 * synchronize the memory read and written by cpu_resched(),
	 * disp_getbest(), and disp_ratify() with each other.
	 * (see CR#6482861 for more details).
	 */
	disp_lock_enter_high(&cp->cpu_disp->disp_lock);
	disp_lock_exit_high(&cp->cpu_disp->disp_lock);

	ASSERT(pri == DISP_PRIO(tp));

	DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);

	thread_onproc(tp, cp);			/* set t_state to TS_ONPROC */

	/*
	 * Return with spl high so that swtch() won't need to raise it.
	 * The disp_lock was dropped by dispdeq().
	 */

	return (tp);
}
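/*
 * Observability note: the DTRACE_PROBE3(steal) and DTRACE_PROBE4(nosteal)
 * sites above surface as SDT probes, so stealing behavior can be watched
 * from userland without modifying this file.  An illustrative dtrace(1M)
 * one-liner that counts successful and deferred steals system-wide:
 *
 *	dtrace -n 'sdt:::steal { @stolen = count(); }
 *	    sdt:::nosteal { @deferred = count(); }'
 */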
/*
 * disp_bound_common() - common routine for higher level functions
 *	that check for bound threads under certain conditions.
 *	If 'threadlistsafe' is set then there is no need to acquire
 *	pidlock to stop the thread list from changing (e.g., if
 *	disp_bound_* is called with cpus paused).
 */
static int
disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
{
	int		found = 0;
	kthread_t	*tp;

	ASSERT(flag);

	if (!threadlistsafe)
		mutex_enter(&pidlock);
	tp = curthread;		/* faster than allthreads */
	do {
		if (tp->t_state != TS_FREE) {
			/*
			 * If an interrupt thread is busy, but the
			 * caller doesn't care (i.e. BOUND_INTR is off),
			 * then just ignore it and continue through.
			 */
			if ((tp->t_flag & T_INTR_THREAD) &&
			    !(flag & BOUND_INTR))
				continue;

			/*
			 * Skip the idle thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_idle_thread)
				continue;

			/*
			 * Skip the pause thread for the CPU
			 * we're about to set offline.
			 */
			if (tp == cp->cpu_pause_thread)
				continue;

			if ((flag & BOUND_CPU) &&
			    (tp->t_bound_cpu == cp ||
			    tp->t_bind_cpu == cp->cpu_id ||
			    tp->t_weakbound_cpu == cp)) {
				found = 1;
				break;
			}

			if ((flag & BOUND_PARTITION) &&
			    (tp->t_cpupart == cp->cpu_part)) {
				found = 1;
				break;
			}
		}
	} while ((tp = tp->t_next) != curthread && found == 0);
	if (!threadlistsafe)
		mutex_exit(&pidlock);
	return (found);
}

/*
 * disp_bound_threads - return nonzero if threads are bound to the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_threads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
}

/*
 * disp_bound_anythreads - return nonzero if _any_ threads are bound
 *	to the given processor, including interrupt threads.
 */
int
disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
}

/*
 * disp_bound_partition - return nonzero if threads are bound to the same
 *	partition as the processor.
 *	Called infrequently.  Keep this simple.
 *	Includes threads that are asleep or stopped but not onproc.
 */
int
disp_bound_partition(cpu_t *cp, int threadlistsafe)
{
	return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
}
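/*
 * Illustrative sketch (not built into the kernel): how an offline path
 * might use the predicates above.  A CPU with threads hard-bound to it
 * cannot be evacuated, so such a request is typically refused; the real
 * checks live in the cpu_offline() path and differ in detail.
 */
#if 0
static int
offline_check_sketch(cpu_t *cp)
{
	/*
	 * The thread list is not known to be stable here, so pass
	 * threadlistsafe == 0 and let disp_bound_common() take pidlock.
	 */
	if (disp_bound_threads(cp, 0))
		return (EBUSY);		/* bound threads present */
	return (0);
}
#endif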
/*
 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
 *	threads to other CPUs.
 */
void
disp_cpu_inactive(cpu_t *cp)
{
	kthread_t	*tp;
	disp_t		*dp = cp->cpu_disp;
	dispq_t		*dq;
	pri_t		pri;
	int		wasonq;

	disp_lock_enter(&dp->disp_lock);
	while ((pri = dp->disp_max_unbound_pri) != -1) {
		dq = &dp->disp_q[pri];
		tp = dq->dq_first;

		/*
		 * Skip over bound threads.
		 */
		while (tp != NULL && tp->t_bound_cpu != NULL) {
			tp = tp->t_link;
		}

		if (tp == NULL) {
			/* disp_max_unbound_pri must be inaccurate, so fix it */
			disp_fix_unbound_pri(dp, pri);
			continue;
		}

		wasonq = dispdeq(tp);		/* drops disp_lock */
		ASSERT(wasonq);
		ASSERT(tp->t_weakbound_cpu == NULL);

		setbackdq(tp);
		/*
		 * Called from cpu_offline:
		 *
		 * cp has already been removed from the list of active cpus
		 * and tp->t_cpu has been changed so there is no risk of
		 * tp ending up back on cp.
		 *
		 * Called from cpupart_move_cpu:
		 *
		 * The cpu has moved to a new cpupart.  Any threads that
		 * were on its dispatch queues before the move remain
		 * in the old partition and can't run in the new partition.
		 */
		ASSERT(tp->t_cpu != cp);
		thread_unlock(tp);

		disp_lock_enter(&dp->disp_lock);
	}
	disp_lock_exit(&dp->disp_lock);
}
/*
 * disp_lowpri_cpu - find CPU running the lowest priority thread.
 *	The hint passed in is used as a starting point so we don't favor
 *	CPU 0 or any other CPU.  The caller should pass in the most recently
 *	used CPU for the thread.
 *
 *	The lgroup and priority are used to determine the best CPU to run on
 *	in a NUMA machine.  The lgroup specifies which CPUs are closest while
 *	the thread priority will indicate whether the thread will actually
 *	run there.  To pick the best CPU, the CPUs inside and outside of the
 *	given lgroup which are running the lowest priority threads are found.
 *	The remote CPU is chosen only if the thread will not run locally on
 *	a CPU within the lgroup, but will run on the remote CPU.  If the
 *	thread cannot immediately run on any CPU, the best local CPU will
 *	be chosen.
 *
 *	The lpl specified also identifies the cpu partition from which
 *	disp_lowpri_cpu should select a CPU.
 *
 *	curcpu is used to indicate that disp_lowpri_cpu is being called on
 *	behalf of the current thread (curthread is looking for a new cpu).
 *	In this case, cpu_dispatch_pri for this thread's cpu should be
 *	ignored.
 *
 *	If a cpu is the target of an offline request then try to avoid it.
 *
 *	This function must be called at either high SPL, or with preemption
 *	disabled, so that the "hint" CPU cannot be removed from the online
 *	CPU list while we are traversing it.
 */
cpu_t *
disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
{
	cpu_t	*bestcpu;
	cpu_t	*besthomecpu;
	cpu_t	*cp, *cpstart;

	pri_t	bestpri;
	pri_t	cpupri;

	klgrpset_t	done;
	klgrpset_t	cur_set;

	lpl_t		*lpl_iter, *lpl_leaf;
	int		i;

	/*
	 * Scan for a CPU currently running the lowest priority thread.
	 * Cannot get cpu_lock here because it is adaptive.
	 * We do not require lock on CPU list.
	 */
	ASSERT(hint != NULL);
	ASSERT(lpl != NULL);
	ASSERT(lpl->lpl_ncpu > 0);

	/*
	 * First examine local CPUs.  Note that it's possible the hint CPU
	 * passed in is remote to the specified home lgroup.  If our priority
	 * isn't high enough to run immediately at home, then examine CPUs
	 * remote to our home lgroup.
	 * We would like to give preference to CPUs closest to "home".
	 * If we can't find a CPU where we'll run at a given level
	 * of locality, we expand our search to include the next level.
	 */
	bestcpu = besthomecpu = NULL;
	klgrpset_clear(done);
	/* start with lpl we were passed */

	lpl_iter = lpl;

	do {

		bestpri = SHRT_MAX;
		klgrpset_clear(cur_set);

		for (i = 0; i < lpl_iter->lpl_nrset; i++) {
			lpl_leaf = lpl_iter->lpl_rset[i];
			if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
				continue;

			klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);

			if (hint->cpu_lpl == lpl_leaf)
				cp = cpstart = hint;
			else
				cp = cpstart = lpl_leaf->lpl_cpus;

			do {
				/*
				 * Compute the effective priority a newcomer
				 * must beat on cp: the caller's own CPU is
				 * treated as free, a CPU being moved offline
				 * is avoided, and queued work or a pending
				 * chooser's priority (cpu_chosen_level)
				 * raises it.
				 */
				if (cp == curcpu)
					cpupri = -1;
				else if (cp == cpu_inmotion)
					cpupri = SHRT_MAX;
				else
					cpupri = cp->cpu_dispatch_pri;
				if (cp->cpu_disp->disp_maxrunpri > cpupri)
					cpupri = cp->cpu_disp->disp_maxrunpri;
				if (cp->cpu_chosen_level > cpupri)
					cpupri = cp->cpu_chosen_level;
				if (cpupri < bestpri) {
					if (CPU_IDLING(cpupri)) {
						ASSERT((cp->cpu_flags &
						    CPU_QUIESCED) == 0);
						return (cp);
					}
					bestcpu = cp;
					bestpri = cpupri;
				}
			} while ((cp = cp->cpu_next_lpl) != cpstart);
		}

		if (bestcpu && (tpri > bestpri)) {
			ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
			return (bestcpu);
		}
		if (besthomecpu == NULL)
			besthomecpu = bestcpu;

		/*
		 * Add the lgrps we just considered to the "done" set
		 */
		klgrpset_or(done, cur_set);

	} while ((lpl_iter = lpl_iter->lpl_parent) != NULL);

	/*
	 * The specified priority isn't high enough to run immediately
	 * anywhere, so just return the best CPU from the home lgroup.
	 */
	ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
	return (besthomecpu);
}

/*
 * This routine provides the generic idle cpu function for all processors.
 * If a processor has some specific code to execute when idle (say, to stop
 * the pipeline and save power) then that routine should be defined in the
 * processor-specific code (module_xx.c) and the global variable idle_cpu
 * set to that function.
 */
static void
generic_idle_cpu(void)
{
}

/*ARGSUSED*/
static void
generic_enq_thread(cpu_t *cpu, int bound)
{
}
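/*
 * Illustrative sketch (not built into the kernel): how platform code can
 * replace the generic no-op handlers above by assigning the idle_cpu and
 * disp_enq_thread function pointers during platform startup.  The mach_*
 * names below are hypothetical; real implementations live in the
 * platform-specific modules.
 */
#if 0
static void
mach_idle_halt(void)
{
	/* e.g. halt the pipeline in a low-power state until interrupted */
}

static void
mach_enq_kick(cpu_t *cp, int bound)
{
	/* e.g. wake cp if it halted, now that a thread is enqueued on it */
}

void
mach_disp_setup(void)
{
	idle_cpu = mach_idle_halt;
	disp_enq_thread = mach_enq_kick;
}
#endif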