1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/var.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/inline.h>
42 #include <sys/disp.h>
43 #include <sys/class.h>
44 #include <sys/bitmap.h>
45 #include <sys/kmem.h>
46 #include <sys/cpuvar.h>
47 #include <sys/vtrace.h>
48 #include <sys/tnf.h>
49 #include <sys/cpupart.h>
50 #include <sys/lgrp.h>
51 #include <sys/pg.h>
52 #include <sys/cmt.h>
53 #include <sys/bitset.h>
54 #include <sys/schedctl.h>
55 #include <sys/atomic.h>
56 #include <sys/dtrace.h>
57 #include <sys/sdt.h>
58 #include <sys/archsystm.h>
59
60 #include <vm/as.h>
61
62 #define BOUND_CPU 0x1
63 #define BOUND_PARTITION 0x2
64 #define BOUND_INTR 0x4
65
66 /* Dispatch queue allocation structure and functions */
67 struct disp_queue_info {
68 disp_t *dp;
69 dispq_t *olddispq;
70 dispq_t *newdispq;
71 ulong_t *olddqactmap;
72 ulong_t *newdqactmap;
73 int oldnglobpris;
74 };
75 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
76 disp_t *dp);
77 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
78 static void disp_dq_free(struct disp_queue_info *dptr);
79
80 /* platform-specific routine to call when processor is idle */
81 static void generic_idle_cpu();
82 void (*idle_cpu)() = generic_idle_cpu;
83
84 /* routines invoked when a CPU enters/exits the idle loop */
85 static void idle_enter();
86 static void idle_exit();
87
88 /* platform-specific routine to call when thread is enqueued */
89 static void generic_enq_thread(cpu_t *, int);
90 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
91
92 pri_t kpreemptpri; /* priority where kernel preemption applies */
93 pri_t upreemptpri = 0; /* priority where normal preemption applies */
94 pri_t intr_pri; /* interrupt thread priority base level */
95
96 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
97 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
98 disp_t cpu0_disp; /* boot CPU's dispatch queue */
99 int nswapped; /* total number of swapped threads */
100 static void disp_swapped_setrun(kthread_t *tp);
101 static void cpu_resched(cpu_t *cp, pri_t tpri);
102
103 /*
104 * If this is set, only interrupt threads will cause kernel preemptions.
105 * This is done by changing the value of kpreemptpri. kpreemptpri
106 * will either be the max sysclass pri + 1 or the min interrupt pri.
107 */
108 int only_intr_kpreempt;
109
110 extern void set_idle_cpu(int cpun);
111 extern void unset_idle_cpu(int cpun);
112 static void setkpdq(kthread_t *tp, int borf);
113 #define SETKP_BACK 0
114 #define SETKP_FRONT 1
115 /*
116 * Parameter that determines how recently a thread must have run
117 * on the CPU to be considered loosely-bound to that CPU to reduce
118 * cold cache effects. The interval is in hertz.
119 */
120 #define RECHOOSE_INTERVAL 3
121 int rechoose_interval = RECHOOSE_INTERVAL;
122
123 /*
124 * Parameter that determines how long (in nanoseconds) a thread must
125 * be sitting on a run queue before it can be stolen by another CPU
126 * to reduce migrations. The interval is in nanoseconds.
127 *
128 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
129 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED
130 * here indicating it is uninitiallized.
131 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
132 *
133 */
134 #define NOSTEAL_UNINITIALIZED (-1)
135 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
136 extern void cmp_set_nosteal_interval(void);
137
138 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
139
140 disp_lock_t transition_lock; /* lock on transitioning threads */
141 disp_lock_t stop_lock; /* lock on stopped threads */
142
143 static void cpu_dispqalloc(int numpris);
144
145 /*
146 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
147 * a thread because it was sitting on its run queue for a very short
148 * period of time.
149 */
150 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
151
152 static kthread_t *disp_getwork(cpu_t *to);
153 static kthread_t *disp_getbest(disp_t *from);
154 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
155
156 void swtch_to(kthread_t *);
157
158 /*
159 * dispatcher and scheduler initialization
160 */
161
162 /*
163 * disp_setup - Common code to calculate and allocate dispatcher
164 * variables and structures based on the maximum priority.
165 */
166 static void
167 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
168 {
169 pri_t newnglobpris;
170
171 ASSERT(MUTEX_HELD(&cpu_lock));
172
173 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
174
175 if (newnglobpris > oldnglobpris) {
176 /*
177 * Allocate new kp queues for each CPU partition.
178 */
179 cpupart_kpqalloc(newnglobpris);
180
181 /*
182 * Allocate new dispatch queues for each CPU.
183 */
184 cpu_dispqalloc(newnglobpris);
185
186 /*
187 * compute new interrupt thread base priority
188 */
189 intr_pri = maxglobpri;
190 if (only_intr_kpreempt) {
191 kpreemptpri = intr_pri + 1;
192 if (kpqpri == KPQPRI)
193 kpqpri = kpreemptpri;
194 }
195 v.v_nglobpris = newnglobpris;
196 }
197 }
198
199 /*
200 * dispinit - Called to initialize all loaded classes and the
201 * dispatcher framework.
202 */
203 void
204 dispinit(void)
205 {
206 id_t cid;
207 pri_t maxglobpri;
208 pri_t cl_maxglobpri;
209
210 maxglobpri = -1;
211
212 /*
213 * Initialize transition lock, which will always be set.
214 */
215 DISP_LOCK_INIT(&transition_lock);
216 disp_lock_enter_high(&transition_lock);
217 DISP_LOCK_INIT(&stop_lock);
218
219 mutex_enter(&cpu_lock);
220 CPU->cpu_disp->disp_maxrunpri = -1;
221 CPU->cpu_disp->disp_max_unbound_pri = -1;
222
223 /*
224 * Initialize the default CPU partition.
225 */
226 cpupart_initialize_default();
227 /*
228 * Call the class specific initialization functions for
229 * all pre-installed schedulers.
230 *
231 * We pass the size of a class specific parameter
232 * buffer to each of the initialization functions
233 * to try to catch problems with backward compatibility
234 * of class modules.
235 *
236 * For example a new class module running on an old system
237 * which didn't provide sufficiently large parameter buffers
238 * would be bad news. Class initialization modules can check for
239 * this and take action if they detect a problem.
240 */
241
242 for (cid = 0; cid < nclass; cid++) {
243 sclass_t *sc;
244
245 sc = &sclass[cid];
246 if (SCHED_INSTALLED(sc)) {
247 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
248 &sc->cl_funcs);
249 if (cl_maxglobpri > maxglobpri)
250 maxglobpri = cl_maxglobpri;
251 }
252 }
253 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
254 if (kpqpri == KPQPRI)
255 kpqpri = kpreemptpri;
256
257 ASSERT(maxglobpri >= 0);
258 disp_setup(maxglobpri, 0);
259
260 mutex_exit(&cpu_lock);
261
262 /*
263 * Platform specific sticky scheduler setup.
264 */
265 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
266 cmp_set_nosteal_interval();
267
268 /*
269 * Get the default class ID; this may be later modified via
270 * dispadmin(1M). This will load the class (normally TS) and that will
271 * call disp_add(), which is why we had to drop cpu_lock first.
272 */
273 if (getcid(defaultclass, &defaultcid) != 0) {
274 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
275 defaultclass);
276 }
277 }
278
279 /*
280 * disp_add - Called with class pointer to initialize the dispatcher
281 * for a newly loaded class.
282 */
283 void
284 disp_add(sclass_t *clp)
285 {
286 pri_t maxglobpri;
287 pri_t cl_maxglobpri;
288
289 mutex_enter(&cpu_lock);
290 /*
291 * Initialize the scheduler class.
292 */
293 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
294 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
295 if (cl_maxglobpri > maxglobpri)
296 maxglobpri = cl_maxglobpri;
297
298 /*
299 * Save old queue information. Since we're initializing a
300 * new scheduling class which has just been loaded, then
301 * the size of the dispq may have changed. We need to handle
302 * that here.
303 */
304 disp_setup(maxglobpri, v.v_nglobpris);
305
306 mutex_exit(&cpu_lock);
307 }
308
309
310 /*
311 * For each CPU, allocate new dispatch queues
312 * with the stated number of priorities.
313 */
314 static void
315 cpu_dispqalloc(int numpris)
316 {
317 cpu_t *cpup;
318 struct disp_queue_info *disp_mem;
319 int i, num;
320
321 ASSERT(MUTEX_HELD(&cpu_lock));
322
323 disp_mem = kmem_zalloc(NCPU *
324 sizeof (struct disp_queue_info), KM_SLEEP);
325
326 /*
327 * This routine must allocate all of the memory before stopping
328 * the cpus because it must not sleep in kmem_alloc while the
329 * CPUs are stopped. Locks they hold will not be freed until they
330 * are restarted.
331 */
332 i = 0;
333 cpup = cpu_list;
334 do {
335 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
336 i++;
337 cpup = cpup->cpu_next;
338 } while (cpup != cpu_list);
339 num = i;
340
341 pause_cpus(NULL, NULL);
342 for (i = 0; i < num; i++)
343 disp_dq_assign(&disp_mem[i], numpris);
344 start_cpus();
345
346 /*
347 * I must free all of the memory after starting the cpus because
348 * I can not risk sleeping in kmem_free while the cpus are stopped.
349 */
350 for (i = 0; i < num; i++)
351 disp_dq_free(&disp_mem[i]);
352
353 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
354 }
355
356 static void
357 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
358 {
359 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
360 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
361 sizeof (long), KM_SLEEP);
362 dptr->dp = dp;
363 }
364
365 static void
366 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
367 {
368 disp_t *dp;
369
370 dp = dptr->dp;
371 dptr->olddispq = dp->disp_q;
372 dptr->olddqactmap = dp->disp_qactmap;
373 dptr->oldnglobpris = dp->disp_npri;
374
375 ASSERT(dptr->oldnglobpris < numpris);
376
377 if (dptr->olddispq != NULL) {
378 /*
379 * Use kcopy because bcopy is platform-specific
380 * and could block while we might have paused the cpus.
381 */
382 (void) kcopy(dptr->olddispq, dptr->newdispq,
383 dptr->oldnglobpris * sizeof (dispq_t));
384 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
385 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
386 sizeof (long));
387 }
388 dp->disp_q = dptr->newdispq;
389 dp->disp_qactmap = dptr->newdqactmap;
390 dp->disp_q_limit = &dptr->newdispq[numpris];
391 dp->disp_npri = numpris;
392 }
393
394 static void
395 disp_dq_free(struct disp_queue_info *dptr)
396 {
397 if (dptr->olddispq != NULL)
398 kmem_free(dptr->olddispq,
399 dptr->oldnglobpris * sizeof (dispq_t));
400 if (dptr->olddqactmap != NULL)
401 kmem_free(dptr->olddqactmap,
402 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
403 }
404
405 /*
406 * For a newly created CPU, initialize the dispatch queue.
407 * This is called before the CPU is known through cpu[] or on any lists.
408 */
409 void
410 disp_cpu_init(cpu_t *cp)
411 {
412 disp_t *dp;
413 dispq_t *newdispq;
414 ulong_t *newdqactmap;
415
416 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
417
418 if (cp == cpu0_disp.disp_cpu)
419 dp = &cpu0_disp;
420 else
421 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
422 bzero(dp, sizeof (disp_t));
423 cp->cpu_disp = dp;
424 dp->disp_cpu = cp;
425 dp->disp_maxrunpri = -1;
426 dp->disp_max_unbound_pri = -1;
427 DISP_LOCK_INIT(&cp->cpu_thread_lock);
428 /*
429 * Allocate memory for the dispatcher queue headers
430 * and the active queue bitmap.
431 */
432 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
433 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
434 sizeof (long), KM_SLEEP);
435 dp->disp_q = newdispq;
436 dp->disp_qactmap = newdqactmap;
437 dp->disp_q_limit = &newdispq[v.v_nglobpris];
438 dp->disp_npri = v.v_nglobpris;
439 }
440
441 void
442 disp_cpu_fini(cpu_t *cp)
443 {
444 ASSERT(MUTEX_HELD(&cpu_lock));
445
446 disp_kp_free(cp->cpu_disp);
447 if (cp->cpu_disp != &cpu0_disp)
448 kmem_free(cp->cpu_disp, sizeof (disp_t));
449 }
450
451 /*
452 * Allocate new, larger kpreempt dispatch queue to replace the old one.
453 */
454 void
455 disp_kp_alloc(disp_t *dq, pri_t npri)
456 {
457 struct disp_queue_info mem_info;
458
459 if (npri > dq->disp_npri) {
460 /*
461 * Allocate memory for the new array.
462 */
463 disp_dq_alloc(&mem_info, npri, dq);
464
465 /*
466 * We need to copy the old structures to the new
467 * and free the old.
468 */
469 disp_dq_assign(&mem_info, npri);
470 disp_dq_free(&mem_info);
471 }
472 }
473
474 /*
475 * Free dispatch queue.
476 * Used for the kpreempt queues for a removed CPU partition and
477 * for the per-CPU queues of deleted CPUs.
478 */
479 void
480 disp_kp_free(disp_t *dq)
481 {
482 struct disp_queue_info mem_info;
483
484 mem_info.olddispq = dq->disp_q;
485 mem_info.olddqactmap = dq->disp_qactmap;
486 mem_info.oldnglobpris = dq->disp_npri;
487 disp_dq_free(&mem_info);
488 }
489
490 /*
491 * End dispatcher and scheduler initialization.
492 */
493
494 /*
495 * See if there's anything to do other than remain idle.
496 * Return non-zero if there is.
497 *
498 * This function must be called with high spl, or with
499 * kernel preemption disabled to prevent the partition's
500 * active cpu list from changing while being traversed.
501 *
502 * This is essentially a simpler version of disp_getwork()
503 * to be called by CPUs preparing to "halt".
504 */
505 int
506 disp_anywork(void)
507 {
508 cpu_t *cp = CPU;
509 cpu_t *ocp;
510 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
511
512 if (!(cp->cpu_flags & CPU_OFFLINE)) {
513 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
514 return (1);
515
516 for (ocp = cp->cpu_next_part; ocp != cp;
517 ocp = ocp->cpu_next_part) {
518 ASSERT(CPU_ACTIVE(ocp));
519
520 /*
521 * Something has appeared on the local run queue.
522 */
523 if (*local_nrunnable > 0)
524 return (1);
525 /*
526 * If we encounter another idle CPU that will
527 * soon be trolling around through disp_anywork()
528 * terminate our walk here and let this other CPU
529 * patrol the next part of the list.
530 */
531 if (ocp->cpu_dispatch_pri == -1 &&
532 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
533 return (0);
534 /*
535 * Work can be taken from another CPU if:
536 * - There is unbound work on the run queue
537 * - That work isn't a thread undergoing a
538 * - context switch on an otherwise empty queue.
539 * - The CPU isn't running the idle loop.
540 */
541 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
542 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
543 ocp->cpu_disp->disp_nrunnable == 1) &&
544 ocp->cpu_dispatch_pri != -1)
545 return (1);
546 }
547 }
548 return (0);
549 }
550
551 /*
552 * Called when CPU enters the idle loop
553 */
554 static void
555 idle_enter()
556 {
557 cpu_t *cp = CPU;
558
559 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
560 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
561 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
562 }
563
564 /*
565 * Called when CPU exits the idle loop
566 */
567 static void
568 idle_exit()
569 {
570 cpu_t *cp = CPU;
571
572 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
573 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
574 }
575
576 /*
577 * Idle loop.
578 */
579 void
580 idle()
581 {
582 struct cpu *cp = CPU; /* pointer to this CPU */
583 kthread_t *t; /* taken thread */
584
585 idle_enter();
586
587 /*
588 * Uniprocessor version of idle loop.
589 * Do this until notified that we're on an actual multiprocessor.
590 */
591 while (ncpus == 1) {
592 if (cp->cpu_disp->disp_nrunnable == 0) {
593 (*idle_cpu)();
594 continue;
595 }
596 idle_exit();
597 swtch();
598
599 idle_enter(); /* returned from swtch */
600 }
601
602 /*
603 * Multiprocessor idle loop.
604 */
605 for (;;) {
606 /*
607 * If CPU is completely quiesced by p_online(2), just wait
608 * here with minimal bus traffic until put online.
609 */
610 while (cp->cpu_flags & CPU_QUIESCED)
611 (*idle_cpu)();
612
613 if (cp->cpu_disp->disp_nrunnable != 0) {
614 idle_exit();
615 swtch();
616 } else {
617 if (cp->cpu_flags & CPU_OFFLINE)
618 continue;
619 if ((t = disp_getwork(cp)) == NULL) {
620 if (cp->cpu_chosen_level != -1) {
621 disp_t *dp = cp->cpu_disp;
622 disp_t *kpq;
623
624 disp_lock_enter(&dp->disp_lock);
625 /*
626 * Set kpq under lock to prevent
627 * migration between partitions.
628 */
629 kpq = &cp->cpu_part->cp_kp_queue;
630 if (kpq->disp_maxrunpri == -1)
631 cp->cpu_chosen_level = -1;
632 disp_lock_exit(&dp->disp_lock);
633 }
634 (*idle_cpu)();
635 continue;
636 }
637 /*
638 * If there was a thread but we couldn't steal
639 * it, then keep trying.
640 */
641 if (t == T_DONTSTEAL)
642 continue;
643 idle_exit();
644 swtch_to(t);
645 }
646 idle_enter(); /* returned from swtch/swtch_to */
647 }
648 }
649
650
651 /*
652 * Preempt the currently running thread in favor of the highest
653 * priority thread. The class of the current thread controls
654 * where it goes on the dispatcher queues. If panicking, turn
655 * preemption off.
656 */
657 void
658 preempt()
659 {
660 kthread_t *t = curthread;
661 klwp_t *lwp = ttolwp(curthread);
662
663 if (panicstr)
664 return;
665
666 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
667
668 thread_lock(t);
669
670 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
671 /*
672 * this thread has already been chosen to be run on
673 * another CPU. Clear kprunrun on this CPU since we're
674 * already headed for swtch().
675 */
676 CPU->cpu_kprunrun = 0;
677 thread_unlock_nopreempt(t);
678 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
679 } else {
680 if (lwp != NULL)
681 lwp->lwp_ru.nivcsw++;
682 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
683 THREAD_TRANSITION(t);
684 CL_PREEMPT(t);
685 DTRACE_SCHED(preempt);
686 thread_unlock_nopreempt(t);
687
688 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
689
690 swtch(); /* clears CPU->cpu_runrun via disp() */
691 }
692 }
693
694 extern kthread_t *thread_unpin();
695
696 /*
697 * disp() - find the highest priority thread for this processor to run, and
698 * set it in TS_ONPROC state so that resume() can be called to run it.
699 */
700 static kthread_t *
701 disp()
702 {
703 cpu_t *cpup;
704 disp_t *dp;
705 kthread_t *tp;
706 dispq_t *dq;
707 int maxrunword;
708 pri_t pri;
709 disp_t *kpq;
710
711 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
712
713 cpup = CPU;
714 /*
715 * Find the highest priority loaded, runnable thread.
716 */
717 dp = cpup->cpu_disp;
718
719 reschedule:
720 /*
721 * If there is more important work on the global queue with a better
722 * priority than the maximum on this CPU, take it now.
723 */
724 kpq = &cpup->cpu_part->cp_kp_queue;
725 while ((pri = kpq->disp_maxrunpri) >= 0 &&
726 pri >= dp->disp_maxrunpri &&
727 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
728 (tp = disp_getbest(kpq)) != NULL) {
729 if (disp_ratify(tp, kpq) != NULL) {
730 TRACE_1(TR_FAC_DISP, TR_DISP_END,
731 "disp_end:tid %p", tp);
732 return (tp);
733 }
734 }
735
736 disp_lock_enter(&dp->disp_lock);
737 pri = dp->disp_maxrunpri;
738
739 /*
740 * If there is nothing to run, look at what's runnable on other queues.
741 * Choose the idle thread if the CPU is quiesced.
742 * Note that CPUs that have the CPU_OFFLINE flag set can still run
743 * interrupt threads, which will be the only threads on the CPU's own
744 * queue, but cannot run threads from other queues.
745 */
746 if (pri == -1) {
747 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
748 disp_lock_exit(&dp->disp_lock);
749 if ((tp = disp_getwork(cpup)) == NULL ||
750 tp == T_DONTSTEAL) {
751 tp = cpup->cpu_idle_thread;
752 (void) splhigh();
753 THREAD_ONPROC(tp, cpup);
754 cpup->cpu_dispthread = tp;
755 cpup->cpu_dispatch_pri = -1;
756 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
757 cpup->cpu_chosen_level = -1;
758 }
759 } else {
760 disp_lock_exit_high(&dp->disp_lock);
761 tp = cpup->cpu_idle_thread;
762 THREAD_ONPROC(tp, cpup);
763 cpup->cpu_dispthread = tp;
764 cpup->cpu_dispatch_pri = -1;
765 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
766 cpup->cpu_chosen_level = -1;
767 }
768 TRACE_1(TR_FAC_DISP, TR_DISP_END,
769 "disp_end:tid %p", tp);
770 return (tp);
771 }
772
773 dq = &dp->disp_q[pri];
774 tp = dq->dq_first;
775
776 ASSERT(tp != NULL);
777
778 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
779
780 /*
781 * Found it so remove it from queue.
782 */
783 dp->disp_nrunnable--;
784 dq->dq_sruncnt--;
785 if ((dq->dq_first = tp->t_link) == NULL) {
786 ulong_t *dqactmap = dp->disp_qactmap;
787
788 ASSERT(dq->dq_sruncnt == 0);
789 dq->dq_last = NULL;
790
791 /*
792 * The queue is empty, so the corresponding bit needs to be
793 * turned off in dqactmap. If nrunnable != 0 just took the
794 * last runnable thread off the
795 * highest queue, so recompute disp_maxrunpri.
796 */
797 maxrunword = pri >> BT_ULSHIFT;
798 dqactmap[maxrunword] &= ~BT_BIW(pri);
799
800 if (dp->disp_nrunnable == 0) {
801 dp->disp_max_unbound_pri = -1;
802 dp->disp_maxrunpri = -1;
803 } else {
804 int ipri;
805
806 ipri = bt_gethighbit(dqactmap, maxrunword);
807 dp->disp_maxrunpri = ipri;
808 if (ipri < dp->disp_max_unbound_pri)
809 dp->disp_max_unbound_pri = ipri;
810 }
811 } else {
812 tp->t_link = NULL;
813 }
814
815 cpup->cpu_dispthread = tp; /* protected by spl only */
816 cpup->cpu_dispatch_pri = pri;
817 ASSERT(pri == DISP_PRIO(tp));
818 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
819 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
820
821 ASSERT(tp != NULL);
822 TRACE_1(TR_FAC_DISP, TR_DISP_END,
823 "disp_end:tid %p", tp);
824
825 if (disp_ratify(tp, kpq) == NULL)
826 goto reschedule;
827
828 return (tp);
829 }
830
831 /*
832 * swtch()
833 * Find best runnable thread and run it.
834 * Called with the current thread already switched to a new state,
835 * on a sleep queue, run queue, stopped, and not zombied.
836 * May be called at any spl level less than or equal to LOCK_LEVEL.
837 * Always drops spl to the base level (spl0()).
838 */
839 void
840 swtch()
841 {
842 kthread_t *t = curthread;
843 kthread_t *next;
844 cpu_t *cp;
845
846 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
847
848 if (t->t_flag & T_INTR_THREAD)
849 cpu_intr_swtch_enter(t);
850
851 if (t->t_intr != NULL) {
852 /*
853 * We are an interrupt thread. Setup and return
854 * the interrupted thread to be resumed.
855 */
856 (void) splhigh(); /* block other scheduler action */
857 cp = CPU; /* now protected against migration */
858 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
859 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
860 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
861 next = thread_unpin();
862 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
863 resume_from_intr(next);
864 } else {
865 #ifdef DEBUG
866 if (t->t_state == TS_ONPROC &&
867 t->t_disp_queue->disp_cpu == CPU &&
868 t->t_preempt == 0) {
869 thread_lock(t);
870 ASSERT(t->t_state != TS_ONPROC ||
871 t->t_disp_queue->disp_cpu != CPU ||
872 t->t_preempt != 0); /* cannot migrate */
873 thread_unlock_nopreempt(t);
874 }
875 #endif /* DEBUG */
876 cp = CPU;
877 next = disp(); /* returns with spl high */
878 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
879
880 /* OK to steal anything left on run queue */
881 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
882
883 if (next != t) {
884 hrtime_t now;
885
886 now = gethrtime_unscaled();
887 pg_ev_thread_swtch(cp, now, t, next);
888
889 /*
890 * If t was previously in the TS_ONPROC state,
891 * setfrontdq and setbackdq won't have set its t_waitrq.
892 * Since we now finally know that we're switching away
893 * from this thread, set its t_waitrq if it is on a run
894 * queue.
895 */
896 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
897 t->t_waitrq = now;
898 }
899
900 /*
901 * restore mstate of thread that we are switching to
902 */
903 restore_mstate(next);
904
905 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
906 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
907 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
908
909 if (dtrace_vtime_active)
910 dtrace_vtime_switch(next);
911
912 resume(next);
913 /*
914 * The TR_RESUME_END and TR_SWTCH_END trace points
915 * appear at the end of resume(), because we may not
916 * return here
917 */
918 } else {
919 if (t->t_flag & T_INTR_THREAD)
920 cpu_intr_swtch_exit(t);
921 /*
922 * Threads that enqueue themselves on a run queue defer
923 * setting t_waitrq. It is then either set in swtch()
924 * when the CPU is actually yielded, or not at all if it
925 * is remaining on the CPU.
926 * There is however a window between where the thread
927 * placed itself on a run queue, and where it selects
928 * itself in disp(), where a third party (eg. clock()
929 * doing tick processing) may have re-enqueued this
930 * thread, setting t_waitrq in the process. We detect
931 * this race by noticing that despite switching to
932 * ourself, our t_waitrq has been set, and should be
933 * cleared.
934 */
935 if (t->t_waitrq != 0)
936 t->t_waitrq = 0;
937
938 pg_ev_thread_remain(cp, t);
939
940 DTRACE_SCHED(remain__cpu);
941 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
942 (void) spl0();
943 }
944 }
945 }
946
947 /*
948 * swtch_from_zombie()
949 * Special case of swtch(), which allows checks for TS_ZOMB to be
950 * eliminated from normal resume.
951 * Find best runnable thread and run it.
952 * Called with the current thread zombied.
953 * Zombies cannot migrate, so CPU references are safe.
954 */
955 void
956 swtch_from_zombie()
957 {
958 kthread_t *next;
959 cpu_t *cpu = CPU;
960
961 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
962
963 ASSERT(curthread->t_state == TS_ZOMB);
964
965 next = disp(); /* returns with spl high */
966 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
967 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
968 ASSERT(next != curthread);
969 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
970
971 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
972
973 restore_mstate(next);
974
975 if (dtrace_vtime_active)
976 dtrace_vtime_switch(next);
977
978 resume_from_zombie(next);
979 /*
980 * The TR_RESUME_END and TR_SWTCH_END trace points
981 * appear at the end of resume(), because we certainly will not
982 * return here
983 */
984 }
985
986 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
987
988 /*
989 * search_disp_queues()
990 * Search the given dispatch queues for thread tp.
991 * Return 1 if tp is found, otherwise return 0.
992 */
993 static int
994 search_disp_queues(disp_t *dp, kthread_t *tp)
995 {
996 dispq_t *dq;
997 dispq_t *eq;
998
999 disp_lock_enter_high(&dp->disp_lock);
1000
1001 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1002 kthread_t *rp;
1003
1004 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1005
1006 for (rp = dq->dq_first; rp; rp = rp->t_link)
1007 if (tp == rp) {
1008 disp_lock_exit_high(&dp->disp_lock);
1009 return (1);
1010 }
1011 }
1012 disp_lock_exit_high(&dp->disp_lock);
1013
1014 return (0);
1015 }
1016
1017 /*
1018 * thread_on_queue()
1019 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1020 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1021 */
1022 static int
1023 thread_on_queue(kthread_t *tp)
1024 {
1025 cpu_t *cp;
1026 struct cpupart *part;
1027
1028 ASSERT(getpil() >= DISP_LEVEL);
1029
1030 /*
1031 * Search the per-CPU dispatch queues for tp.
1032 */
1033 cp = CPU;
1034 do {
1035 if (search_disp_queues(cp->cpu_disp, tp))
1036 return (1);
1037 } while ((cp = cp->cpu_next_onln) != CPU);
1038
1039 /*
1040 * Search the partition-wide kpreempt queues for tp.
1041 */
1042 part = CPU->cpu_part;
1043 do {
1044 if (search_disp_queues(&part->cp_kp_queue, tp))
1045 return (1);
1046 } while ((part = part->cp_next) != CPU->cpu_part);
1047
1048 return (0);
1049 }
1050
1051 #else
1052
1053 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1054
1055 #endif /* DEBUG */
1056
1057 /*
1058 * like swtch(), but switch to a specified thread taken from another CPU.
1059 * called with spl high..
1060 */
1061 void
1062 swtch_to(kthread_t *next)
1063 {
1064 cpu_t *cp = CPU;
1065 hrtime_t now;
1066
1067 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1068
1069 /*
1070 * Update context switch statistics.
1071 */
1072 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1073
1074 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1075
1076 now = gethrtime_unscaled();
1077 pg_ev_thread_swtch(cp, now, curthread, next);
1078
1079 /* OK to steal anything left on run queue */
1080 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1081
1082 /* record last execution time */
1083 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1084
1085 /*
1086 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1087 * won't have set its t_waitrq. Since we now finally know that we're
1088 * switching away from this thread, set its t_waitrq if it is on a run
1089 * queue.
1090 */
1091 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1092 curthread->t_waitrq = now;
1093 }
1094
1095 /* restore next thread to previously running microstate */
1096 restore_mstate(next);
1097
1098 if (dtrace_vtime_active)
1099 dtrace_vtime_switch(next);
1100
1101 resume(next);
1102 /*
1103 * The TR_RESUME_END and TR_SWTCH_END trace points
1104 * appear at the end of resume(), because we may not
1105 * return here
1106 */
1107 }
1108
1109 #define CPU_IDLING(pri) ((pri) == -1)
1110
1111 static void
1112 cpu_resched(cpu_t *cp, pri_t tpri)
1113 {
1114 int call_poke_cpu = 0;
1115 pri_t cpupri = cp->cpu_dispatch_pri;
1116
1117 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1118 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1119 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1120 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1121 cp->cpu_runrun = 1;
1122 aston(cp->cpu_dispthread);
1123 if (tpri < kpreemptpri && cp != CPU)
1124 call_poke_cpu = 1;
1125 }
1126 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1127 cp->cpu_kprunrun = 1;
1128 if (cp != CPU)
1129 call_poke_cpu = 1;
1130 }
1131 }
1132
1133 /*
1134 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1135 */
1136 membar_enter();
1137
1138 if (call_poke_cpu)
1139 poke_cpu(cp->cpu_id);
1140 }
1141
1142 /*
1143 * setbackdq() keeps runqs balanced such that the difference in length
1144 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1145 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1146 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1147 * try to keep runqs perfectly balanced regardless of the thread priority.
1148 */
1149 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1150 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1151 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1152
1153 /*
1154 * Macro that evaluates to true if it is likely that the thread has cache
1155 * warmth. This is based on the amount of time that has elapsed since the
1156 * thread last ran. If that amount of time is less than "rechoose_interval"
1157 * ticks, then we decide that the thread has enough cache warmth to warrant
1158 * some affinity for t->t_cpu.
1159 */
1160 #define THREAD_HAS_CACHE_WARMTH(thread) \
1161 ((thread == curthread) || \
1162 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1163 /*
1164 * Put the specified thread on the back of the dispatcher
1165 * queue corresponding to its current priority.
1166 *
1167 * Called with the thread in transition, onproc or stopped state
1168 * and locked (transition implies locked) and at high spl.
1169 * Returns with the thread in TS_RUN state and still locked.
1170 */
1171 void
1172 setbackdq(kthread_t *tp)
1173 {
1174 dispq_t *dq;
1175 disp_t *dp;
1176 cpu_t *cp;
1177 pri_t tpri;
1178 int bound;
1179 boolean_t self;
1180
1181 ASSERT(THREAD_LOCK_HELD(tp));
1182 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1183 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1184
1185 self = (tp == curthread);
1186
1187 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1188 bound = 1;
1189 else
1190 bound = 0;
1191
1192 tpri = DISP_PRIO(tp);
1193 if (ncpus == 1)
1194 cp = tp->t_cpu;
1195 else if (!bound) {
1196 if (tpri >= kpqpri) {
1197 setkpdq(tp, SETKP_BACK);
1198 return;
1199 }
1200
1201 /*
1202 * We'll generally let this thread continue to run where
1203 * it last ran...but will consider migration if:
1204 * - We thread probably doesn't have much cache warmth.
1205 * - The CPU where it last ran is the target of an offline
1206 * request.
1207 * - The thread last ran outside it's home lgroup.
1208 */
1209 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1210 (tp->t_cpu == cpu_inmotion)) {
1211 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1212 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1213 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1214 self ? tp->t_cpu : NULL);
1215 } else {
1216 cp = tp->t_cpu;
1217 }
1218
1219 if (tp->t_cpupart == cp->cpu_part) {
1220 int qlen;
1221
1222 /*
1223 * Perform any CMT load balancing
1224 */
1225 cp = cmt_balance(tp, cp);
1226
1227 /*
1228 * Balance across the run queues
1229 */
1230 qlen = RUNQ_LEN(cp, tpri);
1231 if (tpri >= RUNQ_MATCH_PRI &&
1232 !(tp->t_schedflag & TS_RUNQMATCH))
1233 qlen -= RUNQ_MAX_DIFF;
1234 if (qlen > 0) {
1235 cpu_t *newcp;
1236
1237 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1238 newcp = cp->cpu_next_part;
1239 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1240 newcp = cp->cpu_next_part;
1241 }
1242
1243 if (RUNQ_LEN(newcp, tpri) < qlen) {
1244 DTRACE_PROBE3(runq__balance,
1245 kthread_t *, tp,
1246 cpu_t *, cp, cpu_t *, newcp);
1247 cp = newcp;
1248 }
1249 }
1250 } else {
1251 /*
1252 * Migrate to a cpu in the new partition.
1253 */
1254 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1255 tp->t_lpl, tp->t_pri, NULL);
1256 }
1257 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1258 } else {
1259 /*
1260 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1261 * a short time until weak binding that existed when the
1262 * strong binding was established has dropped) so we must
1263 * favour weak binding over strong.
1264 */
1265 cp = tp->t_weakbound_cpu ?
1266 tp->t_weakbound_cpu : tp->t_bound_cpu;
1267 }
1268 /*
1269 * A thread that is ONPROC may be temporarily placed on the run queue
1270 * but then chosen to run again by disp. If the thread we're placing on
1271 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1272 * replacement process is actually scheduled in swtch(). In this
1273 * situation, curthread is the only thread that could be in the ONPROC
1274 * state.
1275 */
1276 if ((!self) && (tp->t_waitrq == 0)) {
1277 hrtime_t curtime;
1278
1279 curtime = gethrtime_unscaled();
1280 (void) cpu_update_pct(tp, curtime);
1281 tp->t_waitrq = curtime;
1282 } else {
1283 (void) cpu_update_pct(tp, gethrtime_unscaled());
1284 }
1285
1286 dp = cp->cpu_disp;
1287 disp_lock_enter_high(&dp->disp_lock);
1288
1289 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1290 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1291 tpri, cp, tp);
1292
1293 #ifndef NPROBE
1294 /* Kernel probe */
1295 if (tnf_tracing_active)
1296 tnf_thread_queue(tp, cp, tpri);
1297 #endif /* NPROBE */
1298
1299 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1300
1301 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1302 tp->t_disp_queue = dp;
1303 tp->t_link = NULL;
1304
1305 dq = &dp->disp_q[tpri];
1306 dp->disp_nrunnable++;
1307 if (!bound)
1308 dp->disp_steal = 0;
1309 membar_enter();
1310
1311 if (dq->dq_sruncnt++ != 0) {
1312 ASSERT(dq->dq_first != NULL);
1313 dq->dq_last->t_link = tp;
1314 dq->dq_last = tp;
1315 } else {
1316 ASSERT(dq->dq_first == NULL);
1317 ASSERT(dq->dq_last == NULL);
1318 dq->dq_first = dq->dq_last = tp;
1319 BT_SET(dp->disp_qactmap, tpri);
1320 if (tpri > dp->disp_maxrunpri) {
1321 dp->disp_maxrunpri = tpri;
1322 membar_enter();
1323 cpu_resched(cp, tpri);
1324 }
1325 }
1326
1327 if (!bound && tpri > dp->disp_max_unbound_pri) {
1328 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1329 /*
1330 * If there are no other unbound threads on the
1331 * run queue, don't allow other CPUs to steal
1332 * this thread while we are in the middle of a
1333 * context switch. We may just switch to it
1334 * again right away. CPU_DISP_DONTSTEAL is cleared
1335 * in swtch and swtch_to.
1336 */
1337 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1338 }
1339 dp->disp_max_unbound_pri = tpri;
1340 }
1341 (*disp_enq_thread)(cp, bound);
1342 }
1343
1344 /*
1345 * Put the specified thread on the front of the dispatcher
1346 * queue corresponding to its current priority.
1347 *
1348 * Called with the thread in transition, onproc or stopped state
1349 * and locked (transition implies locked) and at high spl.
1350 * Returns with the thread in TS_RUN state and still locked.
1351 */
1352 void
1353 setfrontdq(kthread_t *tp)
1354 {
1355 disp_t *dp;
1356 dispq_t *dq;
1357 cpu_t *cp;
1358 pri_t tpri;
1359 int bound;
1360
1361 ASSERT(THREAD_LOCK_HELD(tp));
1362 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1363 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1364
1365 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1366 bound = 1;
1367 else
1368 bound = 0;
1369
1370 tpri = DISP_PRIO(tp);
1371 if (ncpus == 1)
1372 cp = tp->t_cpu;
1373 else if (!bound) {
1374 if (tpri >= kpqpri) {
1375 setkpdq(tp, SETKP_FRONT);
1376 return;
1377 }
1378 cp = tp->t_cpu;
1379 if (tp->t_cpupart == cp->cpu_part) {
1380 /*
1381 * We'll generally let this thread continue to run
1382 * where it last ran, but will consider migration if:
1383 * - The thread last ran outside it's home lgroup.
1384 * - The CPU where it last ran is the target of an
1385 * offline request (a thread_nomigrate() on the in
1386 * motion CPU relies on this when forcing a preempt).
1387 * - The thread isn't the highest priority thread where
1388 * it last ran, and it is considered not likely to
1389 * have significant cache warmth.
1390 */
1391 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1392 (cp == cpu_inmotion)) {
1393 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1394 (tp == curthread) ? cp : NULL);
1395 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1396 (!THREAD_HAS_CACHE_WARMTH(tp))) {
1397 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1398 NULL);
1399 }
1400 } else {
1401 /*
1402 * Migrate to a cpu in the new partition.
1403 */
1404 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1405 tp->t_lpl, tp->t_pri, NULL);
1406 }
1407 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1408 } else {
1409 /*
1410 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1411 * a short time until weak binding that existed when the
1412 * strong binding was established has dropped) so we must
1413 * favour weak binding over strong.
1414 */
1415 cp = tp->t_weakbound_cpu ?
1416 tp->t_weakbound_cpu : tp->t_bound_cpu;
1417 }
1418
1419 /*
1420 * A thread that is ONPROC may be temporarily placed on the run queue
1421 * but then chosen to run again by disp. If the thread we're placing on
1422 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1423 * replacement process is actually scheduled in swtch(). In this
1424 * situation, curthread is the only thread that could be in the ONPROC
1425 * state.
1426 */
1427 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1428 hrtime_t curtime;
1429
1430 curtime = gethrtime_unscaled();
1431 (void) cpu_update_pct(tp, curtime);
1432 tp->t_waitrq = curtime;
1433 } else {
1434 (void) cpu_update_pct(tp, gethrtime_unscaled());
1435 }
1436
1437 dp = cp->cpu_disp;
1438 disp_lock_enter_high(&dp->disp_lock);
1439
1440 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1441 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1442
1443 #ifndef NPROBE
1444 /* Kernel probe */
1445 if (tnf_tracing_active)
1446 tnf_thread_queue(tp, cp, tpri);
1447 #endif /* NPROBE */
1448
1449 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1450
1451 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1452 tp->t_disp_queue = dp;
1453
1454 dq = &dp->disp_q[tpri];
1455 dp->disp_nrunnable++;
1456 if (!bound)
1457 dp->disp_steal = 0;
1458 membar_enter();
1459
1460 if (dq->dq_sruncnt++ != 0) {
1461 ASSERT(dq->dq_last != NULL);
1462 tp->t_link = dq->dq_first;
1463 dq->dq_first = tp;
1464 } else {
1465 ASSERT(dq->dq_last == NULL);
1466 ASSERT(dq->dq_first == NULL);
1467 tp->t_link = NULL;
1468 dq->dq_first = dq->dq_last = tp;
1469 BT_SET(dp->disp_qactmap, tpri);
1470 if (tpri > dp->disp_maxrunpri) {
1471 dp->disp_maxrunpri = tpri;
1472 membar_enter();
1473 cpu_resched(cp, tpri);
1474 }
1475 }
1476
1477 if (!bound && tpri > dp->disp_max_unbound_pri) {
1478 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1479 cp == CPU) {
1480 /*
1481 * If there are no other unbound threads on the
1482 * run queue, don't allow other CPUs to steal
1483 * this thread while we are in the middle of a
1484 * context switch. We may just switch to it
1485 * again right away. CPU_DISP_DONTSTEAL is cleared
1486 * in swtch and swtch_to.
1487 */
1488 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1489 }
1490 dp->disp_max_unbound_pri = tpri;
1491 }
1492 (*disp_enq_thread)(cp, bound);
1493 }
1494
1495 /*
1496 * Put a high-priority unbound thread on the kp queue
1497 */
1498 static void
1499 setkpdq(kthread_t *tp, int borf)
1500 {
1501 dispq_t *dq;
1502 disp_t *dp;
1503 cpu_t *cp;
1504 pri_t tpri;
1505
1506 tpri = DISP_PRIO(tp);
1507
1508 dp = &tp->t_cpupart->cp_kp_queue;
1509 disp_lock_enter_high(&dp->disp_lock);
1510
1511 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1512
1513 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1514 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1515 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1516 tp->t_disp_queue = dp;
1517 dp->disp_nrunnable++;
1518 dq = &dp->disp_q[tpri];
1519
1520 if (dq->dq_sruncnt++ != 0) {
1521 if (borf == SETKP_BACK) {
1522 ASSERT(dq->dq_first != NULL);
1523 tp->t_link = NULL;
1524 dq->dq_last->t_link = tp;
1525 dq->dq_last = tp;
1526 } else {
1527 ASSERT(dq->dq_last != NULL);
1528 tp->t_link = dq->dq_first;
1529 dq->dq_first = tp;
1530 }
1531 } else {
1532 if (borf == SETKP_BACK) {
1533 ASSERT(dq->dq_first == NULL);
1534 ASSERT(dq->dq_last == NULL);
1535 dq->dq_first = dq->dq_last = tp;
1536 } else {
1537 ASSERT(dq->dq_last == NULL);
1538 ASSERT(dq->dq_first == NULL);
1539 tp->t_link = NULL;
1540 dq->dq_first = dq->dq_last = tp;
1541 }
1542 BT_SET(dp->disp_qactmap, tpri);
1543 if (tpri > dp->disp_max_unbound_pri)
1544 dp->disp_max_unbound_pri = tpri;
1545 if (tpri > dp->disp_maxrunpri) {
1546 dp->disp_maxrunpri = tpri;
1547 membar_enter();
1548 }
1549 }
1550
1551 cp = tp->t_cpu;
1552 if (tp->t_cpupart != cp->cpu_part) {
1553 /* migrate to a cpu in the new partition */
1554 cp = tp->t_cpupart->cp_cpulist;
1555 }
1556 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1557 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1558 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1559
1560 #ifndef NPROBE
1561 /* Kernel probe */
1562 if (tnf_tracing_active)
1563 tnf_thread_queue(tp, cp, tpri);
1564 #endif /* NPROBE */
1565
1566 if (cp->cpu_chosen_level < tpri)
1567 cp->cpu_chosen_level = tpri;
1568 cpu_resched(cp, tpri);
1569 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1570 (*disp_enq_thread)(cp, 0);
1571 }
1572
1573 /*
1574 * Remove a thread from the dispatcher queue if it is on it.
1575 * It is not an error if it is not found but we return whether
1576 * or not it was found in case the caller wants to check.
1577 */
1578 int
1579 dispdeq(kthread_t *tp)
1580 {
1581 disp_t *dp;
1582 dispq_t *dq;
1583 kthread_t *rp;
1584 kthread_t *trp;
1585 kthread_t **ptp;
1586 int tpri;
1587
1588 ASSERT(THREAD_LOCK_HELD(tp));
1589
1590 if (tp->t_state != TS_RUN)
1591 return (0);
1592
1593 tpri = DISP_PRIO(tp);
1594 dp = tp->t_disp_queue;
1595 ASSERT(tpri < dp->disp_npri);
1596 dq = &dp->disp_q[tpri];
1597 ptp = &dq->dq_first;
1598 rp = *ptp;
1599 trp = NULL;
1600
1601 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1602
1603 /*
1604 * Search for thread in queue.
1605 * Double links would simplify this at the expense of disp/setrun.
1606 */
1607 while (rp != tp && rp != NULL) {
1608 trp = rp;
1609 ptp = &trp->t_link;
1610 rp = trp->t_link;
1611 }
1612
1613 if (rp == NULL) {
1614 panic("dispdeq: thread not on queue");
1615 }
1616
1617 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1618
1619 /*
1620 * Found it so remove it from queue.
1621 */
1622 if ((*ptp = rp->t_link) == NULL)
1623 dq->dq_last = trp;
1624
1625 dp->disp_nrunnable--;
1626 if (--dq->dq_sruncnt == 0) {
1627 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1628 if (dp->disp_nrunnable == 0) {
1629 dp->disp_max_unbound_pri = -1;
1630 dp->disp_maxrunpri = -1;
1631 } else if (tpri == dp->disp_maxrunpri) {
1632 int ipri;
1633
1634 ipri = bt_gethighbit(dp->disp_qactmap,
1635 dp->disp_maxrunpri >> BT_ULSHIFT);
1636 if (ipri < dp->disp_max_unbound_pri)
1637 dp->disp_max_unbound_pri = ipri;
1638 dp->disp_maxrunpri = ipri;
1639 }
1640 }
1641 tp->t_link = NULL;
1642 THREAD_TRANSITION(tp); /* put in intermediate state */
1643 return (1);
1644 }
1645
1646 /*
1647 * Make a thread give up its processor. Find the processor on
1648 * which this thread is executing, and have that processor
1649 * preempt.
1650 *
1651 * We allow System Duty Cycle (SDC) threads to be preempted even if
1652 * they are running at kernel priorities. To implement this, we always
1653 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1654 * calls cpu_surrender() very often, we only preempt if there is anyone
1655 * competing with us.
1656 */
1657 void
1658 cpu_surrender(kthread_t *tp)
1659 {
1660 cpu_t *cpup;
1661 int max_pri;
1662 int max_run_pri;
1663 klwp_t *lwp;
1664
1665 ASSERT(THREAD_LOCK_HELD(tp));
1666
1667 if (tp->t_state != TS_ONPROC)
1668 return;
1669 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1670 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1671 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1672 if (max_pri < max_run_pri)
1673 max_pri = max_run_pri;
1674
1675 if (tp->t_cid == sysdccid) {
1676 uint_t t_pri = DISP_PRIO(tp);
1677 if (t_pri > max_pri)
1678 return; /* we are not competing w/ anyone */
1679 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1680 } else {
1681 cpup->cpu_runrun = 1;
1682 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1683 cpup->cpu_kprunrun = 1;
1684 }
1685 }
1686
1687 /*
1688 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1689 */
1690 membar_enter();
1691
1692 DTRACE_SCHED1(surrender, kthread_t *, tp);
1693
1694 /*
1695 * Make the target thread take an excursion through trap()
1696 * to do preempt() (unless we're already in trap or post_syscall,
1697 * calling cpu_surrender via CL_TRAPRET).
1698 */
1699 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1700 lwp->lwp_state != LWP_USER) {
1701 aston(tp);
1702 if (cpup != CPU)
1703 poke_cpu(cpup->cpu_id);
1704 }
1705 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1706 "cpu_surrender:tid %p cpu %p", tp, cpup);
1707 }
1708
1709 /*
1710 * Commit to and ratify a scheduling decision
1711 */
1712 /*ARGSUSED*/
1713 static kthread_t *
1714 disp_ratify(kthread_t *tp, disp_t *kpq)
1715 {
1716 pri_t tpri, maxpri;
1717 pri_t maxkpri;
1718 cpu_t *cpup;
1719
1720 ASSERT(tp != NULL);
1721 /*
1722 * Commit to, then ratify scheduling decision
1723 */
1724 cpup = CPU;
1725 if (cpup->cpu_runrun != 0)
1726 cpup->cpu_runrun = 0;
1727 if (cpup->cpu_kprunrun != 0)
1728 cpup->cpu_kprunrun = 0;
1729 if (cpup->cpu_chosen_level != -1)
1730 cpup->cpu_chosen_level = -1;
1731 membar_enter();
1732 tpri = DISP_PRIO(tp);
1733 maxpri = cpup->cpu_disp->disp_maxrunpri;
1734 maxkpri = kpq->disp_maxrunpri;
1735 if (maxpri < maxkpri)
1736 maxpri = maxkpri;
1737 if (tpri < maxpri) {
1738 /*
1739 * should have done better
1740 * put this one back and indicate to try again
1741 */
1742 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1743 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1744 thread_lock_high(tp);
1745 THREAD_TRANSITION(tp);
1746 setfrontdq(tp);
1747 thread_unlock_nopreempt(tp);
1748
1749 tp = NULL;
1750 }
1751 return (tp);
1752 }
1753
1754 /*
1755 * See if there is any work on the dispatcher queue for other CPUs.
1756 * If there is, dequeue the best thread and return.
1757 */
1758 static kthread_t *
1759 disp_getwork(cpu_t *cp)
1760 {
1761 cpu_t *ocp; /* other CPU */
1762 cpu_t *ocp_start;
1763 cpu_t *tcp; /* target local CPU */
1764 kthread_t *tp;
1765 kthread_t *retval = NULL;
1766 pri_t maxpri;
1767 disp_t *kpq; /* kp queue for this partition */
1768 lpl_t *lpl, *lpl_leaf;
1769 int leafidx, startidx;
1770 hrtime_t stealtime;
1771 lgrp_id_t local_id;
1772
1773 maxpri = -1;
1774 tcp = NULL;
1775
1776 kpq = &cp->cpu_part->cp_kp_queue;
1777 while (kpq->disp_maxrunpri >= 0) {
1778 /*
1779 * Try to take a thread from the kp_queue.
1780 */
1781 tp = (disp_getbest(kpq));
1782 if (tp)
1783 return (disp_ratify(tp, kpq));
1784 }
1785
1786 kpreempt_disable(); /* protect the cpu_active list */
1787
1788 /*
1789 * Try to find something to do on another CPU's run queue.
1790 * Loop through all other CPUs looking for the one with the highest
1791 * priority unbound thread.
1792 *
1793 * On NUMA machines, the partition's CPUs are consulted in order of
1794 * distance from the current CPU. This way, the first available
1795 * work found is also the closest, and will suffer the least
1796 * from being migrated.
1797 */
1798 lpl = lpl_leaf = cp->cpu_lpl;
1799 local_id = lpl_leaf->lpl_lgrpid;
1800 leafidx = startidx = 0;
1801
1802 /*
1803 * This loop traverses the lpl hierarchy. Higher level lpls represent
1804 * broader levels of locality
1805 */
1806 do {
1807 /* This loop iterates over the lpl's leaves */
1808 do {
1809 if (lpl_leaf != cp->cpu_lpl)
1810 ocp = lpl_leaf->lpl_cpus;
1811 else
1812 ocp = cp->cpu_next_lpl;
1813
1814 /* This loop iterates over the CPUs in the leaf */
1815 ocp_start = ocp;
1816 do {
1817 pri_t pri;
1818
1819 ASSERT(CPU_ACTIVE(ocp));
1820
1821 /*
1822 * End our stroll around this lpl if:
1823 *
1824 * - Something became runnable on the local
1825 * queue...which also ends our stroll around
1826 * the partition.
1827 *
1828 * - We happen across another idle CPU.
1829 * Since it is patrolling the next portion
1830 * of the lpl's list (assuming it's not
1831 * halted, or busy servicing an interrupt),
1832 * move to the next higher level of locality.
1833 */
1834 if (cp->cpu_disp->disp_nrunnable != 0) {
1835 kpreempt_enable();
1836 return (NULL);
1837 }
1838 if (ocp->cpu_dispatch_pri == -1) {
1839 if (ocp->cpu_disp_flags &
1840 CPU_DISP_HALTED ||
1841 ocp->cpu_intr_actv != 0)
1842 continue;
1843 else
1844 goto next_level;
1845 }
1846
1847 /*
1848 * If there's only one thread and the CPU
1849 * is in the middle of a context switch,
1850 * or it's currently running the idle thread,
1851 * don't steal it.
1852 */
1853 if ((ocp->cpu_disp_flags &
1854 CPU_DISP_DONTSTEAL) &&
1855 ocp->cpu_disp->disp_nrunnable == 1)
1856 continue;
1857
1858 pri = ocp->cpu_disp->disp_max_unbound_pri;
1859 if (pri > maxpri) {
1860 /*
1861 * Don't steal threads that we attempted
1862 * to steal recently until they're ready
1863 * to be stolen again.
1864 */
1865 stealtime = ocp->cpu_disp->disp_steal;
1866 if (stealtime == 0 ||
1867 stealtime - gethrtime() <= 0) {
1868 maxpri = pri;
1869 tcp = ocp;
1870 } else {
1871 /*
1872 * Don't update tcp, just set
1873 * the retval to T_DONTSTEAL, so
1874 * that if no acceptable CPUs
1875 * are found the return value
1876 * will be T_DONTSTEAL rather
1877 * then NULL.
1878 */
1879 retval = T_DONTSTEAL;
1880 }
1881 }
1882 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
1883
1884 /*
1885 * Iterate to the next leaf lpl in the resource set
1886 * at this level of locality. If we hit the end of
1887 * the set, wrap back around to the beginning.
1888 *
1889 * Note: This iteration is NULL terminated for a reason
1890 * see lpl_topo_bootstrap() in lgrp.c for details.
1891 */
1892 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
1893 leafidx = 0;
1894 lpl_leaf = lpl->lpl_rset[leafidx];
1895 }
1896 } while (leafidx != startidx);
1897
1898 next_level:
1899 /*
1900 * Expand the search to include farther away CPUs (next
1901 * locality level). The closer CPUs that have already been
1902 * checked will be checked again. In doing so, idle CPUs
1903 * will tend to be more aggresive about stealing from CPUs
1904 * that are closer (since the closer CPUs will be considered
1905 * more often).
1906 * Begin at this level with the CPUs local leaf lpl.
1907 */
1908 if ((lpl = lpl->lpl_parent) != NULL) {
1909 leafidx = startidx = lpl->lpl_id2rset[local_id];
1910 lpl_leaf = lpl->lpl_rset[leafidx];
1911 }
1912 } while (!tcp && lpl);
1913
1914 kpreempt_enable();
1915
1916 /*
1917 * If another queue looks good, and there is still nothing on
1918 * the local queue, try to transfer one or more threads
1919 * from it to our queue.
1920 */
1921 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
1922 tp = disp_getbest(tcp->cpu_disp);
1923 if (tp == NULL || tp == T_DONTSTEAL)
1924 return (tp);
1925 return (disp_ratify(tp, kpq));
1926 }
1927 return (retval);
1928 }
1929
1930
1931 /*
1932 * disp_fix_unbound_pri()
1933 * Determines the maximum priority of unbound threads on the queue.
1934 * The priority is kept for the queue, but is only increased, never
1935 * reduced unless some CPU is looking for something on that queue.
1936 *
1937 * The priority argument is the known upper limit.
1938 *
1939 * Perhaps this should be kept accurately, but that probably means
1940 * separate bitmaps for bound and unbound threads. Since only idled
1941 * CPUs will have to do this recalculation, it seems better this way.
1942 */
1943 static void
1944 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
1945 {
1946 kthread_t *tp;
1947 dispq_t *dq;
1948 ulong_t *dqactmap = dp->disp_qactmap;
1949 ulong_t mapword;
1950 int wx;
1951
1952 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
1953
1954 ASSERT(pri >= 0); /* checked by caller */
1955
1956 /*
1957 * Start the search at the next lowest priority below the supplied
1958 * priority. This depends on the bitmap implementation.
1959 */
1960 do {
1961 wx = pri >> BT_ULSHIFT; /* index of word in map */
1962
1963 /*
1964 * Form mask for all lower priorities in the word.
1965 */
1966 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
1967
1968 /*
1969 * Get next lower active priority.
1970 */
1971 if (mapword != 0) {
1972 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
1973 } else if (wx > 0) {
1974 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
1975 if (pri < 0)
1976 break;
1977 } else {
1978 pri = -1;
1979 break;
1980 }
1981
1982 /*
1983 * Search the queue for unbound, runnable threads.
1984 */
1985 dq = &dp->disp_q[pri];
1986 tp = dq->dq_first;
1987
1988 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
1989 tp = tp->t_link;
1990 }
1991
1992 /*
1993 * If a thread was found, set the priority and return.
1994 */
1995 } while (tp == NULL);
1996
1997 /*
1998 * pri holds the maximum unbound thread priority or -1.
1999 */
2000 if (dp->disp_max_unbound_pri != pri)
2001 dp->disp_max_unbound_pri = pri;
2002 }
2003
2004 /*
2005 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2006 * check if the CPU to which is was previously bound should have
2007 * its disp_max_unbound_pri increased.
2008 */
2009 void
2010 disp_adjust_unbound_pri(kthread_t *tp)
2011 {
2012 disp_t *dp;
2013 pri_t tpri;
2014
2015 ASSERT(THREAD_LOCK_HELD(tp));
2016
2017 /*
2018 * Don't do anything if the thread is not bound, or
2019 * currently not runnable.
2020 */
2021 if (tp->t_bound_cpu == NULL ||
2022 tp->t_state != TS_RUN)
2023 return;
2024
2025 tpri = DISP_PRIO(tp);
2026 dp = tp->t_bound_cpu->cpu_disp;
2027 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2028 if (tpri > dp->disp_max_unbound_pri)
2029 dp->disp_max_unbound_pri = tpri;
2030 }
2031
2032 /*
2033 * disp_getbest()
2034 * De-queue the highest priority unbound runnable thread.
2035 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2036 * Returns NULL if nothing found.
2037 * Returns T_DONTSTEAL if the thread was not stealable.
2038 * so that the caller will try again later.
2039 *
2040 * Passed a pointer to a dispatch queue not associated with this CPU, and
2041 * its type.
2042 */
2043 static kthread_t *
2044 disp_getbest(disp_t *dp)
2045 {
2046 kthread_t *tp;
2047 dispq_t *dq;
2048 pri_t pri;
2049 cpu_t *cp, *tcp;
2050 boolean_t allbound;
2051
2052 disp_lock_enter(&dp->disp_lock);
2053
2054 /*
2055 * If there is nothing to run, or the CPU is in the middle of a
2056 * context switch of the only thread, return NULL.
2057 */
2058 tcp = dp->disp_cpu;
2059 cp = CPU;
2060 pri = dp->disp_max_unbound_pri;
2061 if (pri == -1 ||
2062 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2063 tcp->cpu_disp->disp_nrunnable == 1)) {
2064 disp_lock_exit_nopreempt(&dp->disp_lock);
2065 return (NULL);
2066 }
2067
2068 dq = &dp->disp_q[pri];
2069
2070
2071 /*
2072 * Assume that all threads are bound on this queue, and change it
2073 * later when we find out that it is not the case.
2074 */
2075 allbound = B_TRUE;
2076 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2077 hrtime_t now, nosteal, rqtime;
2078
2079 /*
2080 * Skip over bound threads which could be here even
2081 * though disp_max_unbound_pri indicated this level.
2082 */
2083 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2084 continue;
2085
2086 /*
2087 * We've got some unbound threads on this queue, so turn
2088 * the allbound flag off now.
2089 */
2090 allbound = B_FALSE;
2091
2092 /*
2093 * The thread is a candidate for stealing from its run queue. We
2094 * don't want to steal threads that became runnable just a
2095 * moment ago. This improves CPU affinity for threads that get
2096 * preempted for short periods of time and go back on the run
2097 * queue.
2098 *
2099 * We want to let it stay on its run queue if it was only placed
2100 * there recently and it was running on the same CPU before that
2101 * to preserve its cache investment. For the thread to remain on
2102 * its run queue, ALL of the following conditions must be
2103 * satisfied:
2104 *
2105 * - the disp queue should not be the kernel preemption queue
2106 * - delayed idle stealing should not be disabled
2107 * - nosteal_nsec should be non-zero
2108 * - it should run with user priority
2109 * - it should be on the run queue of the CPU where it was
2110 * running before being placed on the run queue
2111 * - it should be the only thread on the run queue (to prevent
2112 * extra scheduling latency for other threads)
2113 * - it should sit on the run queue for less than per-chip
2114 * nosteal interval or global nosteal interval
2115 * - in case of CPUs with shared cache it should sit in a run
2116 * queue of a CPU from a different chip
2117 *
2118 * The checks are arranged so that the ones that are faster are
2119 * placed earlier.
2120 */
2121 if (tcp == NULL ||
2122 pri >= minclsyspri ||
2123 tp->t_cpu != tcp)
2124 break;
2125
2126 /*
2127 * Steal immediately if, due to CMT processor architecture
2128 * migraiton between cp and tcp would incur no performance
2129 * penalty.
2130 */
2131 if (pg_cmt_can_migrate(cp, tcp))
2132 break;
2133
2134 nosteal = nosteal_nsec;
2135 if (nosteal == 0)
2136 break;
2137
2138 /*
2139 * Calculate time spent sitting on run queue
2140 */
2141 now = gethrtime_unscaled();
2142 rqtime = now - tp->t_waitrq;
2143 scalehrtime(&rqtime);
2144
2145 /*
2146 * Steal immediately if the time spent on this run queue is more
2147 * than allowed nosteal delay.
2148 *
2149 * Negative rqtime check is needed here to avoid infinite
2150 * stealing delays caused by unlikely but not impossible
2151 * drifts between CPU times on different CPUs.
2152 */
2153 if (rqtime > nosteal || rqtime < 0)
2154 break;
2155
2156 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2157 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2158 scalehrtime(&now);
2159 /*
2160 * Calculate when this thread becomes stealable
2161 */
2162 now += (nosteal - rqtime);
2163
2164 /*
2165 * Calculate time when some thread becomes stealable
2166 */
2167 if (now < dp->disp_steal)
2168 dp->disp_steal = now;
2169 }
2170
2171 /*
2172 * If there were no unbound threads on this queue, find the queue
2173 * where they are and then return later. The value of
2174 * disp_max_unbound_pri is not always accurate because it isn't
2175 * reduced until another idle CPU looks for work.
2176 */
2177 if (allbound)
2178 disp_fix_unbound_pri(dp, pri);
2179
2180 /*
2181 * If we reached the end of the queue and found no unbound threads
2182 * then return NULL so that other CPUs will be considered. If there
2183 * are unbound threads but they cannot yet be stolen, then
2184 * return T_DONTSTEAL and try again later.
2185 */
2186 if (tp == NULL) {
2187 disp_lock_exit_nopreempt(&dp->disp_lock);
2188 return (allbound ? NULL : T_DONTSTEAL);
2189 }
2190
2191 /*
2192 * Found a runnable, unbound thread, so remove it from queue.
2193 * dispdeq() requires that we have the thread locked, and we do,
2194 * by virtue of holding the dispatch queue lock. dispdeq() will
2195 * put the thread in transition state, thereby dropping the dispq
2196 * lock.
2197 */
2198
2199 #ifdef DEBUG
2200 {
2201 int thread_was_on_queue;
2202
2203 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2204 ASSERT(thread_was_on_queue);
2205 }
2206
2207 #else /* DEBUG */
2208 (void) dispdeq(tp); /* drops disp_lock */
2209 #endif /* DEBUG */
2210
2211 /*
2212 * Reset the disp_queue steal time - we do not know what is the smallest
2213 * value across the queue is.
2214 */
2215 dp->disp_steal = 0;
2216
2217 /*
2218 * Setup thread to run on the current CPU.
2219 */
2220 tp->t_disp_queue = cp->cpu_disp;
2221
2222 cp->cpu_dispthread = tp; /* protected by spl only */
2223 cp->cpu_dispatch_pri = pri;
2224
2225 /*
2226 * There can be a memory synchronization race between disp_getbest()
2227 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2228 * to preempt the current thread to run the enqueued thread while
2229 * disp_getbest() and disp_ratify() are changing the current thread
2230 * to the stolen thread. This may lead to a situation where
2231 * cpu_resched() tries to preempt the wrong thread and the
2232 * stolen thread continues to run on the CPU which has been tagged
2233 * for preemption.
2234 * Later the clock thread gets enqueued but doesn't get to run on the
2235 * CPU causing the system to hang.
2236 *
2237 * To avoid this, grabbing and dropping the disp_lock (which does
2238 * a memory barrier) is needed to synchronize the execution of
2239 * cpu_resched() with disp_getbest() and disp_ratify() and
2240 * synchronize the memory read and written by cpu_resched(),
2241 * disp_getbest(), and disp_ratify() with each other.
2242 * (see CR#6482861 for more details).
2243 */
2244 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2245 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2246
2247 ASSERT(pri == DISP_PRIO(tp));
2248
2249 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2250
2251 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2252
2253 /*
2254 * Return with spl high so that swtch() won't need to raise it.
2255 * The disp_lock was dropped by dispdeq().
2256 */
2257
2258 return (tp);
2259 }
2260
2261 /*
2262 * disp_bound_common() - common routine for higher level functions
2263 * that check for bound threads under certain conditions.
2264 * If 'threadlistsafe' is set then there is no need to acquire
2265 * pidlock to stop the thread list from changing (eg, if
2266 * disp_bound_* is called with cpus paused).
2267 */
2268 static int
2269 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2270 {
2271 int found = 0;
2272 kthread_t *tp;
2273
2274 ASSERT(flag);
2275
2276 if (!threadlistsafe)
2277 mutex_enter(&pidlock);
2278 tp = curthread; /* faster than allthreads */
2279 do {
2280 if (tp->t_state != TS_FREE) {
2281 /*
2282 * If an interrupt thread is busy, but the
2283 * caller doesn't care (i.e. BOUND_INTR is off),
2284 * then just ignore it and continue through.
2285 */
2286 if ((tp->t_flag & T_INTR_THREAD) &&
2287 !(flag & BOUND_INTR))
2288 continue;
2289
2290 /*
2291 * Skip the idle thread for the CPU
2292 * we're about to set offline.
2293 */
2294 if (tp == cp->cpu_idle_thread)
2295 continue;
2296
2297 /*
2298 * Skip the pause thread for the CPU
2299 * we're about to set offline.
2300 */
2301 if (tp == cp->cpu_pause_thread)
2302 continue;
2303
2304 if ((flag & BOUND_CPU) &&
2305 (tp->t_bound_cpu == cp ||
2306 tp->t_bind_cpu == cp->cpu_id ||
2307 tp->t_weakbound_cpu == cp)) {
2308 found = 1;
2309 break;
2310 }
2311
2312 if ((flag & BOUND_PARTITION) &&
2313 (tp->t_cpupart == cp->cpu_part)) {
2314 found = 1;
2315 break;
2316 }
2317 }
2318 } while ((tp = tp->t_next) != curthread && found == 0);
2319 if (!threadlistsafe)
2320 mutex_exit(&pidlock);
2321 return (found);
2322 }
2323
2324 /*
2325 * disp_bound_threads - return nonzero if threads are bound to the processor.
2326 * Called infrequently. Keep this simple.
2327 * Includes threads that are asleep or stopped but not onproc.
2328 */
2329 int
2330 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2331 {
2332 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2333 }
2334
2335 /*
2336 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2337 * to the given processor, including interrupt threads.
2338 */
2339 int
2340 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2341 {
2342 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2343 }
2344
2345 /*
2346 * disp_bound_partition - return nonzero if threads are bound to the same
2347 * partition as the processor.
2348 * Called infrequently. Keep this simple.
2349 * Includes threads that are asleep or stopped but not onproc.
2350 */
2351 int
2352 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2353 {
2354 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2355 }
2356
2357 /*
2358 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2359 * threads to other CPUs.
2360 */
2361 void
2362 disp_cpu_inactive(cpu_t *cp)
2363 {
2364 kthread_t *tp;
2365 disp_t *dp = cp->cpu_disp;
2366 dispq_t *dq;
2367 pri_t pri;
2368 int wasonq;
2369
2370 disp_lock_enter(&dp->disp_lock);
2371 while ((pri = dp->disp_max_unbound_pri) != -1) {
2372 dq = &dp->disp_q[pri];
2373 tp = dq->dq_first;
2374
2375 /*
2376 * Skip over bound threads.
2377 */
2378 while (tp != NULL && tp->t_bound_cpu != NULL) {
2379 tp = tp->t_link;
2380 }
2381
2382 if (tp == NULL) {
2383 /* disp_max_unbound_pri must be inaccurate, so fix it */
2384 disp_fix_unbound_pri(dp, pri);
2385 continue;
2386 }
2387
2388 wasonq = dispdeq(tp); /* drops disp_lock */
2389 ASSERT(wasonq);
2390 ASSERT(tp->t_weakbound_cpu == NULL);
2391
2392 setbackdq(tp);
2393 /*
2394 * Called from cpu_offline:
2395 *
2396 * cp has already been removed from the list of active cpus
2397 * and tp->t_cpu has been changed so there is no risk of
2398 * tp ending up back on cp.
2399 *
2400 * Called from cpupart_move_cpu:
2401 *
2402 * The cpu has moved to a new cpupart. Any threads that
2403 * were on it's dispatch queues before the move remain
2404 * in the old partition and can't run in the new partition.
2405 */
2406 ASSERT(tp->t_cpu != cp);
2407 thread_unlock(tp);
2408
2409 disp_lock_enter(&dp->disp_lock);
2410 }
2411 disp_lock_exit(&dp->disp_lock);
2412 }
2413
2414 /*
2415 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2416 * The hint passed in is used as a starting point so we don't favor
2417 * CPU 0 or any other CPU. The caller should pass in the most recently
2418 * used CPU for the thread.
2419 *
2420 * The lgroup and priority are used to determine the best CPU to run on
2421 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2422 * the thread priority will indicate whether the thread will actually run
2423 * there. To pick the best CPU, the CPUs inside and outside of the given
2424 * lgroup which are running the lowest priority threads are found. The
2425 * remote CPU is chosen only if the thread will not run locally on a CPU
2426 * within the lgroup, but will run on the remote CPU. If the thread
2427 * cannot immediately run on any CPU, the best local CPU will be chosen.
2428 *
2429 * The lpl specified also identifies the cpu partition from which
2430 * disp_lowpri_cpu should select a CPU.
2431 *
2432 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2433 * behalf of the current thread. (curthread is looking for a new cpu)
2434 * In this case, cpu_dispatch_pri for this thread's cpu should be
2435 * ignored.
2436 *
2437 * If a cpu is the target of an offline request then try to avoid it.
2438 *
2439 * This function must be called at either high SPL, or with preemption
2440 * disabled, so that the "hint" CPU cannot be removed from the online
2441 * CPU list while we are traversing it.
2442 */
2443 cpu_t *
2444 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2445 {
2446 cpu_t *bestcpu;
2447 cpu_t *besthomecpu;
2448 cpu_t *cp, *cpstart;
2449
2450 pri_t bestpri;
2451 pri_t cpupri;
2452
2453 klgrpset_t done;
2454 klgrpset_t cur_set;
2455
2456 lpl_t *lpl_iter, *lpl_leaf;
2457 int i;
2458
2459 /*
2460 * Scan for a CPU currently running the lowest priority thread.
2461 * Cannot get cpu_lock here because it is adaptive.
2462 * We do not require lock on CPU list.
2463 */
2464 ASSERT(hint != NULL);
2465 ASSERT(lpl != NULL);
2466 ASSERT(lpl->lpl_ncpu > 0);
2467
2468 /*
2469 * First examine local CPUs. Note that it's possible the hint CPU
2470 * passed in in remote to the specified home lgroup. If our priority
2471 * isn't sufficient enough such that we can run immediately at home,
2472 * then examine CPUs remote to our home lgroup.
2473 * We would like to give preference to CPUs closest to "home".
2474 * If we can't find a CPU where we'll run at a given level
2475 * of locality, we expand our search to include the next level.
2476 */
2477 bestcpu = besthomecpu = NULL;
2478 klgrpset_clear(done);
2479 /* start with lpl we were passed */
2480
2481 lpl_iter = lpl;
2482
2483 do {
2484
2485 bestpri = SHRT_MAX;
2486 klgrpset_clear(cur_set);
2487
2488 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2489 lpl_leaf = lpl_iter->lpl_rset[i];
2490 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2491 continue;
2492
2493 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2494
2495 if (hint->cpu_lpl == lpl_leaf)
2496 cp = cpstart = hint;
2497 else
2498 cp = cpstart = lpl_leaf->lpl_cpus;
2499
2500 do {
2501 if (cp == curcpu)
2502 cpupri = -1;
2503 else if (cp == cpu_inmotion)
2504 cpupri = SHRT_MAX;
2505 else
2506 cpupri = cp->cpu_dispatch_pri;
2507 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2508 cpupri = cp->cpu_disp->disp_maxrunpri;
2509 if (cp->cpu_chosen_level > cpupri)
2510 cpupri = cp->cpu_chosen_level;
2511 if (cpupri < bestpri) {
2512 if (CPU_IDLING(cpupri)) {
2513 ASSERT((cp->cpu_flags &
2514 CPU_QUIESCED) == 0);
2515 return (cp);
2516 }
2517 bestcpu = cp;
2518 bestpri = cpupri;
2519 }
2520 } while ((cp = cp->cpu_next_lpl) != cpstart);
2521 }
2522
2523 if (bestcpu && (tpri > bestpri)) {
2524 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2525 return (bestcpu);
2526 }
2527 if (besthomecpu == NULL)
2528 besthomecpu = bestcpu;
2529 /*
2530 * Add the lgrps we just considered to the "done" set
2531 */
2532 klgrpset_or(done, cur_set);
2533
2534 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2535
2536 /*
2537 * The specified priority isn't high enough to run immediately
2538 * anywhere, so just return the best CPU from the home lgroup.
2539 */
2540 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2541 return (besthomecpu);
2542 }
2543
2544 /*
2545 * This routine provides the generic idle cpu function for all processors.
2546 * If a processor has some specific code to execute when idle (say, to stop
2547 * the pipeline and save power) then that routine should be defined in the
2548 * processors specific code (module_xx.c) and the global variable idle_cpu
2549 * set to that function.
2550 */
2551 static void
2552 generic_idle_cpu(void)
2553 {
2554 }
2555
2556 /*ARGSUSED*/
2557 static void
2558 generic_enq_thread(cpu_t *cpu, int bound)
2559 {
2560 }