XXXX pass in cpu_pause_func via pause_cpus
--- old/usr/src/uts/i86xpv/os/mp_xen.c
+++ new/usr/src/uts/i86xpv/os/mp_xen.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Virtual CPU management.
29 29 *
30 30 * VCPUs can be controlled in one of two ways; through the domain itself
31 31 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
32 32 * Unfortunately, the terminology is used in different ways; they work out as
33 33 * follows:
34 34 *
35 35 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
36 36 *
37 37 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
38 38 * hypervisor on the idle thread). It must be up since a downed VCPU cannot
39 39 * receive interrupts, and we require this for offline CPUs in Solaris.
40 40 *
41 41 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
42 42 * xen_vcpu_down() for it). It can't take interrupts or run anything, though
43 43 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
44 44 * event channels, etc.) will still exist.
45 45 *
46 46 * The hypervisor has two notions of CPU states as represented in the store:
47 47 *
48 48 * "offline": the VCPU is down. Corresponds to P_POWEROFF.
49 49 *
50 50 * "online": the VCPU is running. Corresponds to a CPU state other than
51 51 * P_POWEROFF.
52 52 *
53 53 * Currently, only a notification via xenstore can bring a CPU into a
54 54 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
55 55 * P_OFFLINE, etc. We need to be careful to treat xenstore notifications
56 56 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
57 57 *
58 58 * Note that the xenstore configuration is strictly advisory, in that a domain
59 59 * can choose to ignore it and still power up a VCPU in the offline state. To
60 60 * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
61 61 * ENOTSUP from within Solaris.
62 62 *
63 63 * Powering off a VCPU and suspending the domain use similar code. The
64 64 * difficulty here is that we must ensure that each VCPU is in a stable
65 65 * state: it must have a saved PCB, and not be responding to interrupts
66 66 * (since we are just about to remove its ability to run on a real CPU,
67 67 * possibly forever). However, an offline CPU in Solaris can take
68 68 * cross-call interrupts, as mentioned, so we must go through a
69 69 * two-stage process. First, we use the standard Solaris pause_cpus().
70 70 * This ensures that all CPUs are either in mach_cpu_pause() or
71 71 * mach_cpu_idle(), and nothing will cross-call them.
72 72 *
73 73 * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
74 74 * bring them back up, and in state CPU_PHASE_POWERED_OFF.
75 75 *
76 76 * Running CPUs are spinning in mach_cpu_pause() waiting for either
77 77 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
78 78 *
79 79 * Offline CPUs are either running the idle thread and periodically
80 80 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
81 81 *
82 82 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
83 83 * poking them to make sure they're not blocked[1]. When every CPU has
84 84 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
85 85 * know we can suspend, or power-off a CPU, without problems.
86 86 *
87 87 * [1] note that we have to repeatedly poke offline CPUs: it's the only
88 88 * way to ensure that the CPU doesn't miss the state change before
89 89 * dropping into HYPERVISOR_block().
90 90 */
91 91
92 92 #include <sys/types.h>
93 93 #include <sys/systm.h>
94 94 #include <sys/param.h>
95 95 #include <sys/taskq.h>
96 96 #include <sys/cmn_err.h>
97 97 #include <sys/archsystm.h>
98 98 #include <sys/machsystm.h>
99 99 #include <sys/segments.h>
100 100 #include <sys/cpuvar.h>
101 101 #include <sys/x86_archext.h>
102 102 #include <sys/controlregs.h>
103 103 #include <sys/hypervisor.h>
104 104 #include <sys/xpv_panic.h>
105 105 #include <sys/mman.h>
106 106 #include <sys/psw.h>
107 107 #include <sys/cpu.h>
108 108 #include <sys/sunddi.h>
109 109 #include <util/sscanf.h>
110 110 #include <vm/hat_i86.h>
111 111 #include <vm/hat.h>
112 112 #include <vm/as.h>
113 113
114 114 #include <xen/public/io/xs_wire.h>
115 115 #include <xen/sys/xenbus_impl.h>
116 116 #include <xen/public/vcpu.h>
117 117
118 118 extern cpuset_t cpu_ready_set;
119 119
120 120 #define CPU_PHASE_NONE 0
121 121 #define CPU_PHASE_WAIT_SAFE 1
122 122 #define CPU_PHASE_SAFE 2
123 123 #define CPU_PHASE_POWERED_OFF 3
124 124
125 125 /*
126 126 * We can only poke CPUs during barrier enter 256 times a second at
127 127 * most.
128 128 */
129 129 #define POKE_TIMEOUT (NANOSEC / 256)
130 130
131 131 static taskq_t *cpu_config_tq;
132 132 static int cpu_phase[NCPU];
133 133
134 134 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
135 135 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
136 136
137 137 /*
138 138 * Return whether or not the vcpu is actually running on a pcpu
139 139 */
140 140 int
141 141 vcpu_on_pcpu(processorid_t cpu)
142 142 {
143 143 struct vcpu_runstate_info runstate;
144 144 int ret = VCPU_STATE_UNKNOWN;
145 145
146 146 ASSERT(cpu < NCPU);
147 147 /*
148 148 * Don't bother with hypercall if we are asking about ourself
149 149 */
150 150 if (cpu == CPU->cpu_id)
151 151 return (VCPU_ON_PCPU);
152 152 if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
153 153 goto out;
154 154
155 155 switch (runstate.state) {
156 156 case RUNSTATE_running:
157 157 ret = VCPU_ON_PCPU;
158 158 break;
159 159
160 160 case RUNSTATE_runnable:
161 161 case RUNSTATE_offline:
162 162 case RUNSTATE_blocked:
163 163 ret = VCPU_NOT_ON_PCPU;
164 164 break;
165 165
166 166 default:
167 167 break;
168 168 }
169 169
170 170 out:
171 171 return (ret);
172 172 }
173 173
174 174 /*
175 175 * These routines allocate any global state that might be needed
176 176 * while starting cpus. For virtual cpus, there is no such state.
177 177 */
178 178 int
179 179 mach_cpucontext_init(void)
180 180 {
181 181 return (0);
182 182 }
183 183
184 184 void
185 185 do_cpu_config_watch(int state)
186 186 {
187 187 static struct xenbus_watch cpu_config_watch;
188 188
189 189 if (state != XENSTORE_UP)
190 190 return;
191 191 cpu_config_watch.node = "cpu";
192 192 cpu_config_watch.callback = vcpu_config_event;
193 193 if (register_xenbus_watch(&cpu_config_watch)) {
194 194 taskq_destroy(cpu_config_tq);
195 195 cmn_err(CE_WARN, "do_cpu_config_watch: "
196 196 "failed to set vcpu config watch");
197 197 }
198 198
199 199 }
200 200
201 201 /*
202 202 * This routine is called after all the "normal" MP startup has
203 203 * been done; a good place to start watching xen store for virtual
204 204 * cpu hot plug events.
205 205 */
206 206 void
207 207 mach_cpucontext_fini(void)
208 208 {
209 209
210 210 cpu_config_tq = taskq_create("vcpu config taskq", 1,
211 211 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
212 212
213 213 (void) xs_register_xenbus_callback(do_cpu_config_watch);
214 214 }
215 215
216 216 /*
217 217 * Fill in the remaining CPU context and initialize it.
218 218 */
219 219 static int
220 220 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
221 221 {
222 222 uint_t vec, iopl;
223 223
224 224 vgc->flags = VGCF_IN_KERNEL;
225 225
226 226 /*
227 227 * fpu_ctx we leave as zero; on first fault we'll store
228 228 * sse_initial into it anyway.
229 229 */
230 230
231 231 #if defined(__amd64)
232 232 vgc->user_regs.cs = KCS_SEL | SEL_KPL; /* force to ring 3 */
233 233 #else
234 234 vgc->user_regs.cs = KCS_SEL;
235 235 #endif
236 236 vgc->user_regs.ds = KDS_SEL;
237 237 vgc->user_regs.es = KDS_SEL;
238 238 vgc->user_regs.ss = KDS_SEL;
239 239 vgc->kernel_ss = KDS_SEL;
240 240
241 241 /*
242 242 * Allow I/O privilege level for Dom0 kernel.
243 243 */
244 244 if (DOMAIN_IS_INITDOMAIN(xen_info))
245 245 iopl = (PS_IOPL & 0x1000); /* ring 1 */
246 246 else
247 247 iopl = 0;
248 248
249 249 #if defined(__amd64)
250 250 vgc->user_regs.fs = 0;
251 251 vgc->user_regs.gs = 0;
252 252 vgc->user_regs.rflags = F_OFF | iopl;
253 253 #elif defined(__i386)
254 254 vgc->user_regs.fs = KFS_SEL;
255 255 vgc->user_regs.gs = KGS_SEL;
256 256 vgc->user_regs.eflags = F_OFF | iopl;
257 257 vgc->event_callback_cs = vgc->user_regs.cs;
258 258 vgc->failsafe_callback_cs = vgc->user_regs.cs;
259 259 #endif
260 260
261 261 /*
262 262 * Initialize the trap_info_t from the IDT
263 263 */
264 264 #if !defined(__lint)
265 265 ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
266 266 #endif
267 267 for (vec = 0; vec < NIDT; vec++) {
268 268 trap_info_t *ti = &vgc->trap_ctxt[vec];
269 269
270 270 if (xen_idt_to_trap_info(vec,
271 271 &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
272 272 ti->cs = KCS_SEL;
273 273 ti->vector = vec;
274 274 }
275 275 }
276 276
277 277 /*
278 278 * No LDT
279 279 */
280 280
281 281 /*
282 282 * (We assert in various places that the GDT is (a) aligned on a
283 283 * page boundary and (b) one page long, so this really should fit..)
284 284 */
285 285 #ifdef CRASH_XEN
286 286 vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
287 287 #else
288 288 vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
289 289 #endif
290 290 vgc->gdt_ents = NGDT;
291 291
292 292 vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
293 293
294 294 #if defined(__i386)
295 295 if (mmu.pae_hat)
296 296 vgc->ctrlreg[3] =
297 297 xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
298 298 else
299 299 #endif
300 300 vgc->ctrlreg[3] =
301 301 pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
302 302
303 303 vgc->ctrlreg[4] = getcr4();
304 304
305 305 vgc->event_callback_eip = (uintptr_t)xen_callback;
306 306 vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
307 307 vgc->flags |= VGCF_failsafe_disables_events;
308 308
309 309 #if defined(__amd64)
310 310 /*
311 311 * XXPV should this be moved to init_cpu_syscall?
312 312 */
313 313 vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
314 314 vgc->flags |= VGCF_syscall_disables_events;
315 315
316 316 ASSERT(vgc->user_regs.gs == 0);
317 317 vgc->gs_base_kernel = (uintptr_t)cp;
318 318 #endif
319 319
320 320 return (xen_vcpu_initialize(cp->cpu_id, vgc));
321 321 }
322 322
323 323 /*
324 324 * Create a guest virtual cpu context so that the virtual cpu
325 325 * springs into life in the domain just about to call mp_startup()
326 326 *
327 327 * Virtual CPUs must be initialized once in the lifetime of the domain;
328 328 * after that subsequent attempts to start them will fail with X_EEXIST.
329 329 *
330 330 * Thus 'alloc' -really- creates and initializes the virtual
331 331 * CPU context just once. Once the initialisation succeeds, we never
332 332 * free it, nor the regular cpu_t to which it refers.
333 333 */
334 334 void *
335 335 mach_cpucontext_alloc(struct cpu *cp)
336 336 {
337 337 kthread_t *tp = cp->cpu_thread;
338 338 vcpu_guest_context_t vgc;
339 339
340 340 int err = 1;
341 341
342 342 /*
343 343 * First, augment the incoming cpu structure
344 344 * - vcpu pointer reference
345 345 * - pending event storage area
346 346 * - physical address of GDT
347 347 */
348 348 cp->cpu_m.mcpu_vcpu_info =
349 349 &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
350 350 cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
351 351 sizeof (struct xen_evt_data), KM_SLEEP);
352 352 cp->cpu_m.mcpu_gdtpa =
353 353 mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
354 354
355 355 if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
356 356 goto done;
357 357
358 358 /*
359 359 * Now set up the vcpu context so that we can start this vcpu
360 360 * in the kernel at tp->t_pc (mp_startup). Note that the
361 361 * thread will thread_exit() shortly after performing the
362 362 * initialization; in particular, we will *never* take a
363 363 * privilege transition on this thread.
364 364 */
365 365
366 366 bzero(&vgc, sizeof (vgc));
367 367
368 368 #ifdef __amd64
369 369 vgc.user_regs.rip = tp->t_pc;
370 370 vgc.user_regs.rsp = tp->t_sp;
371 371 vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
372 372 #else
373 373 vgc.user_regs.eip = tp->t_pc;
374 374 vgc.user_regs.esp = tp->t_sp;
375 375 vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
376 376 #endif
377 377 /*
378 378 * XXPV Fix resume, if Russ didn't already fix it.
379 379 *
380 380 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
381 381 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
382 382 * that only lwps take traps that switch to the kernel stack;
383 383 * part of creating an lwp adjusts the stack by subtracting
384 384 * sizeof (struct regs) off t_stk.
385 385 *
386 386 * The more interesting question is, why do we do all the work
387 387 * of a fully fledged lwp for a plain thread? In particular
388 388 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
389 389 * or futz with the LDT. This should probably all be done with
390 390 * an lwp context operator to keep pure thread context switch fast.
391 391 */
392 392 vgc.kernel_sp = (ulong_t)tp->t_stk;
393 393
394 394 err = mp_set_cpu_context(&vgc, cp);
395 395
396 396 done:
397 397 if (err) {
398 398 mach_cpucontext_free(cp, NULL, err);
399 399 return (NULL);
400 400 }
401 401 return (cp);
402 402 }
403 403
404 404 /*
405 405 * By the time we are called either we have successfully started
406 406 * the cpu, or our attempt to start it has failed.
407 407 */
408 408
409 409 /*ARGSUSED*/
410 410 void
411 411 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
412 412 {
413 413 switch (err) {
414 414 case 0:
415 415 break;
416 416 case ETIMEDOUT:
417 417 /*
418 418 * The vcpu context is loaded into the hypervisor, and
419 419 * we've tried to start it, but the vcpu has not been set
420 420 * running yet, for whatever reason. We arrange to -not-
421 421 * free any data structures it may be referencing. In
422 422 * particular, we've already told the hypervisor about
423 423 * the GDT, and so we can't map it read-write again.
424 424 */
425 425 break;
426 426 default:
427 427 (void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
428 428 kmem_free(cp->cpu_m.mcpu_evt_pend,
429 429 sizeof (struct xen_evt_data));
430 430 break;
431 431 }
432 432 }
433 433
434 434 /*
435 435 * Reset this CPU's context. Clear out any pending evtchn data, since event
436 436 * channel numbers will all change when we resume.
437 437 */
438 438 void
439 439 mach_cpucontext_reset(cpu_t *cp)
440 440 {
441 441 bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
442 442 /* mcpu_intr_pending ? */
443 443 }
444 444
445 445 static void
446 446 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
447 447 {
448 448 #ifdef __amd64
449 449 vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
450 450 vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
451 451 vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
452 452 vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
453 453 vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
454 454 vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
455 455 vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
456 456 vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
457 457 #else /* __amd64 */
458 458 vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
459 459 vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
460 460 vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
461 461 vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
462 462 vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
463 463 vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
464 464 #endif /* __amd64 */
465 465 }
466 466
467 467 /*
468 468 * Restore the context of a CPU during resume. This context is always
469 469 * inside enter_safe_phase(), below.
470 470 */
471 471 void
472 472 mach_cpucontext_restore(cpu_t *cp)
473 473 {
474 474 vcpu_guest_context_t vgc;
475 475 int err;
476 476
477 477 ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
478 478 cp->cpu_thread == cp->cpu_idle_thread);
479 479
480 480 bzero(&vgc, sizeof (vgc));
481 481
482 482 pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
483 483
484 484 /*
485 485 * We're emulating a longjmp() here: in particular, we need to bump the
486 486 * stack pointer to account for the pop of xIP that returning from
487 487 * longjmp() normally would do, and set the return value in xAX to 1.
488 488 */
489 489 #ifdef __amd64
490 490 vgc.user_regs.rax = 1;
491 491 vgc.user_regs.rsp += sizeof (ulong_t);
492 492 #else
493 493 vgc.user_regs.eax = 1;
494 494 vgc.user_regs.esp += sizeof (ulong_t);
495 495 #endif
496 496
497 497 vgc.kernel_sp = cp->cpu_thread->t_sp;
498 498
499 499 err = mp_set_cpu_context(&vgc, cp);
500 500
501 501 ASSERT(err == 0);
502 502 }
503 503
504 504 /*
505 505 * Reach a point at which the CPU can be safely powered-off or
506 506 * suspended. Nothing can wake this CPU out of the loop.
507 507 */
508 508 static void
509 509 enter_safe_phase(void)
510 510 {
511 511 ulong_t flags = intr_clear();
512 512
513 513 if (setjmp(&curthread->t_pcb) == 0) {
514 514 cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
515 515 while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
516 516 SMT_PAUSE();
517 517 }
518 518
519 519 ASSERT(!interrupts_enabled());
520 520
521 521 intr_restore(flags);
522 522 }
523 523
524 524 /*
525 525 * Offline CPUs run this code even under a pause_cpus(), so we must
526 526 * check if we need to enter the safe phase.
527 527 */
528 528 void
529 529 mach_cpu_idle(void)
530 530 {
531 531 if (IN_XPV_PANIC()) {
532 532 xpv_panic_halt();
533 533 } else {
534 534 (void) HYPERVISOR_block();
535 535 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
536 536 enter_safe_phase();
537 537 }
538 538 }
539 539
540 540 /*
541 541 * Spin until either start_cpus() wakes us up, or we get a request to
542 542 * enter the safe phase (followed by a later start_cpus()).
543 543 */
544 544 void
545 545 mach_cpu_pause(volatile char *safe)
546 546 {
547 547 *safe = PAUSE_WAIT;
548 548 membar_enter();
549 549
550 550 while (*safe != PAUSE_IDLE) {
551 551 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
552 552 enter_safe_phase();
553 553 SMT_PAUSE();
554 554 }
555 555 }
556 556
557 557 void
558 558 mach_cpu_halt(char *msg)
559 559 {
560 560 if (msg)
561 561 prom_printf("%s\n", msg);
562 562 (void) xen_vcpu_down(CPU->cpu_id);
563 563 }
564 564
565 565 /*ARGSUSED*/
566 566 int
567 567 mp_cpu_poweron(struct cpu *cp)
568 568 {
569 569 return (ENOTSUP);
570 570 }
571 571
572 572 /*ARGSUSED*/
573 573 int
574 574 mp_cpu_poweroff(struct cpu *cp)
575 575 {
576 576 return (ENOTSUP);
577 577 }
578 578
579 579 void
580 580 mp_enter_barrier(void)
581 581 {
582 582 hrtime_t last_poke_time = 0;
583 583 int poke_allowed = 0;
584 584 int done = 0;
585 585 int i;
586 586
587 587 ASSERT(MUTEX_HELD(&cpu_lock));
588 588
589 - pause_cpus(NULL);
589 + pause_cpus(NULL, NULL);
590 590
591 591 while (!done) {
592 592 done = 1;
593 593 poke_allowed = 0;
594 594
595 595 if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
596 596 last_poke_time = xpv_gethrtime();
597 597 poke_allowed = 1;
598 598 }
599 599
600 600 for (i = 0; i < NCPU; i++) {
601 601 cpu_t *cp = cpu_get(i);
602 602
603 603 if (cp == NULL || cp == CPU)
604 604 continue;
605 605
606 606 switch (cpu_phase[i]) {
607 607 case CPU_PHASE_NONE:
608 608 cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
609 609 poke_cpu(i);
610 610 done = 0;
611 611 break;
612 612
613 613 case CPU_PHASE_WAIT_SAFE:
614 614 if (poke_allowed)
615 615 poke_cpu(i);
616 616 done = 0;
617 617 break;
618 618
619 619 case CPU_PHASE_SAFE:
620 620 case CPU_PHASE_POWERED_OFF:
621 621 break;
622 622 }
623 623 }
624 624
625 625 SMT_PAUSE();
626 626 }
627 627 }
628 628
629 629 void
630 630 mp_leave_barrier(void)
631 631 {
632 632 int i;
633 633
634 634 ASSERT(MUTEX_HELD(&cpu_lock));
635 635
636 636 for (i = 0; i < NCPU; i++) {
637 637 cpu_t *cp = cpu_get(i);
638 638
639 639 if (cp == NULL || cp == CPU)
640 640 continue;
641 641
642 642 switch (cpu_phase[i]) {
643 643 /*
644 644 * If we see a CPU in one of these phases, something has
645 645 * gone badly wrong with the guarantees
646 646 * mp_enter_barrier() is supposed to provide. Rather
647 647 * than attempt to stumble along (and since we can't
648 648 * panic properly in this context), we tell the
649 649 * hypervisor we've crashed.
650 650 */
651 651 case CPU_PHASE_NONE:
652 652 case CPU_PHASE_WAIT_SAFE:
653 653 (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
654 654 break;
655 655
656 656 case CPU_PHASE_POWERED_OFF:
657 657 break;
658 658
659 659 case CPU_PHASE_SAFE:
660 660 cpu_phase[i] = CPU_PHASE_NONE;
661 661 }
662 662 }
663 663
664 664 start_cpus();
665 665 }
666 666
667 667 static int
668 668 poweroff_vcpu(struct cpu *cp)
669 669 {
670 670 int error;
671 671
672 672 ASSERT(MUTEX_HELD(&cpu_lock));
673 673
674 674 ASSERT(CPU->cpu_id != cp->cpu_id);
675 675 ASSERT(cp->cpu_flags & CPU_QUIESCED);
676 676
677 677 mp_enter_barrier();
678 678
679 679 if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
680 680 ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
681 681
682 682 CPUSET_DEL(cpu_ready_set, cp->cpu_id);
683 683
684 684 cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
685 685 cp->cpu_flags &=
686 686 ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
687 687
688 688 cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
689 689
690 690 cpu_set_state(cp);
691 691 }
692 692
693 693 mp_leave_barrier();
694 694
695 695 return (error);
696 696 }
697 697
698 698 static int
699 699 vcpu_config_poweroff(processorid_t id)
700 700 {
701 701 int oldstate;
702 702 int error;
703 703 cpu_t *cp;
704 704
705 705 mutex_enter(&cpu_lock);
706 706
707 707 if ((cp = cpu_get(id)) == NULL) {
708 708 mutex_exit(&cpu_lock);
709 709 return (ESRCH);
710 710 }
711 711
712 712 if (cpu_get_state(cp) == P_POWEROFF) {
713 713 mutex_exit(&cpu_lock);
714 714 return (0);
715 715 }
716 716
717 717 mutex_exit(&cpu_lock);
718 718
719 719 do {
720 720 error = p_online_internal(id, P_OFFLINE,
721 721 &oldstate);
722 722
723 723 if (error != 0)
724 724 break;
725 725
726 726 /*
727 727 * So we just changed it to P_OFFLINE. But then we dropped
728 728 * cpu_lock, so now it is possible for another thread to change
729 729 * the cpu back to a different, non-quiesced state e.g.
730 730 * P_ONLINE.
731 731 */
732 732 mutex_enter(&cpu_lock);
733 733 if ((cp = cpu_get(id)) == NULL)
734 734 error = ESRCH;
735 735 else {
736 736 if (cp->cpu_flags & CPU_QUIESCED)
737 737 error = poweroff_vcpu(cp);
738 738 else
739 739 error = EBUSY;
740 740 }
741 741 mutex_exit(&cpu_lock);
742 742 } while (error == EBUSY);
743 743
744 744 return (error);
745 745 }
746 746
747 747 /*
748 748 * Add a new virtual cpu to the domain.
749 749 */
750 750 static int
751 751 vcpu_config_new(processorid_t id)
752 752 {
753 753 extern int start_cpu(processorid_t);
754 754 int error;
755 755
756 756 if (ncpus == 1) {
757 757 printf("cannot (yet) add cpus to a single-cpu domain\n");
758 758 return (ENOTSUP);
759 759 }
760 760
761 761 affinity_set(CPU_CURRENT);
762 762 error = start_cpu(id);
763 763 affinity_clear();
764 764 return (error);
765 765 }
766 766
767 767 static int
768 768 poweron_vcpu(struct cpu *cp)
769 769 {
770 770 int error;
771 771
772 772 ASSERT(MUTEX_HELD(&cpu_lock));
773 773
774 774 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
775 775 printf("poweron_vcpu: vcpu%d is not available!\n",
776 776 cp->cpu_id);
777 777 return (ENXIO);
778 778 }
779 779
780 780 if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
781 781 CPUSET_ADD(cpu_ready_set, cp->cpu_id);
782 782 cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
783 783 cp->cpu_flags &= ~CPU_POWEROFF;
784 784 /*
785 785 * There are some nasty races possible here.
786 786 * Tell the vcpu it's up one more time.
787 787 * XXPV Is this enough? Is this safe?
788 788 */
789 789 (void) xen_vcpu_up(cp->cpu_id);
790 790
791 791 cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
792 792
793 793 cpu_set_state(cp);
794 794 }
795 795 return (error);
796 796 }
797 797
798 798 static int
799 799 vcpu_config_poweron(processorid_t id)
800 800 {
801 801 cpu_t *cp;
802 802 int oldstate;
803 803 int error;
804 804
805 805 if (id >= ncpus)
806 806 return (vcpu_config_new(id));
807 807
808 808 mutex_enter(&cpu_lock);
809 809
810 810 if ((cp = cpu_get(id)) == NULL) {
811 811 mutex_exit(&cpu_lock);
812 812 return (ESRCH);
813 813 }
814 814
815 815 if (cpu_get_state(cp) != P_POWEROFF) {
816 816 mutex_exit(&cpu_lock);
817 817 return (0);
818 818 }
819 819
820 820 if ((error = poweron_vcpu(cp)) != 0) {
821 821 mutex_exit(&cpu_lock);
822 822 return (error);
823 823 }
824 824
825 825 mutex_exit(&cpu_lock);
826 826
827 827 return (p_online_internal(id, P_ONLINE, &oldstate));
828 828 }
829 829
830 830 #define REPORT_LEN 128
831 831
832 832 static void
833 833 vcpu_config_report(processorid_t id, uint_t newstate, int error)
834 834 {
835 835 char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
836 836 size_t len;
837 837 char *ps;
838 838
839 839 switch (newstate) {
840 840 case P_ONLINE:
841 841 ps = PS_ONLINE;
842 842 break;
843 843 case P_POWEROFF:
844 844 ps = PS_POWEROFF;
845 845 break;
846 846 default:
847 847 cmn_err(CE_PANIC, "unknown state %u\n", newstate);
848 848 break;
849 849 }
850 850
851 851 len = snprintf(report, REPORT_LEN,
852 852 "cpu%d: externally initiated %s", id, ps);
853 853
854 854 if (!error) {
855 855 cmn_err(CE_CONT, "!%s\n", report);
856 856 kmem_free(report, REPORT_LEN);
857 857 return;
858 858 }
859 859
860 860 len += snprintf(report + len, REPORT_LEN - len,
861 861 " failed, error %d: ", error);
862 862 switch (error) {
863 863 case EEXIST:
864 864 len += snprintf(report + len, REPORT_LEN - len,
865 865 "cpu already %s", ps ? ps : "?");
866 866 break;
867 867 case ESRCH:
868 868 len += snprintf(report + len, REPORT_LEN - len,
869 869 "cpu not found");
870 870 break;
871 871 case EINVAL:
872 872 case EALREADY:
873 873 break;
874 874 case EPERM:
875 875 len += snprintf(report + len, REPORT_LEN - len,
876 876 "insufficient privilege (0x%x)", id);
877 877 break;
878 878 case EBUSY:
879 879 switch (newstate) {
880 880 case P_ONLINE:
881 881 /*
882 882 * This return comes from mp_cpu_start -
883 883 * we cannot 'start' the boot CPU.
884 884 */
885 885 len += snprintf(report + len, REPORT_LEN - len,
886 886 "already running");
887 887 break;
888 888 case P_POWEROFF:
889 889 len += snprintf(report + len, REPORT_LEN - len,
890 890 "bound lwps?");
891 891 break;
892 892 default:
893 893 break;
894 894 }
895 895 default:
896 896 break;
897 897 }
898 898
899 899 cmn_err(CE_CONT, "%s\n", report);
900 900 kmem_free(report, REPORT_LEN);
901 901 }
902 902
903 903 static void
904 904 vcpu_config(void *arg)
905 905 {
906 906 int id = (int)(uintptr_t)arg;
907 907 int error;
908 908 char dir[16];
909 909 char *state;
910 910
911 911 if ((uint_t)id >= max_ncpus) {
912 912 cmn_err(CE_WARN,
913 913 "vcpu_config: cpu%d does not fit in this domain", id);
914 914 return;
915 915 }
916 916
917 917 (void) snprintf(dir, sizeof (dir), "cpu/%d", id);
918 918 state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
919 919 if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
920 920 if (strcmp(state, "online") == 0) {
921 921 error = vcpu_config_poweron(id);
922 922 vcpu_config_report(id, P_ONLINE, error);
923 923 } else if (strcmp(state, "offline") == 0) {
924 924 error = vcpu_config_poweroff(id);
925 925 vcpu_config_report(id, P_POWEROFF, error);
926 926 } else {
927 927 cmn_err(CE_WARN,
928 928 "cpu%d: unknown target state '%s'", id, state);
929 929 }
930 930 } else
931 931 cmn_err(CE_WARN,
932 932 "cpu%d: unable to read target state from xenstore", id);
933 933
934 934 kmem_free(state, MAXPATHLEN);
935 935 }
936 936
937 937 /*ARGSUSED*/
938 938 static void
939 939 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
940 940 {
941 941 const char *path = vec[XS_WATCH_PATH];
942 942 processorid_t id;
943 943 char *s;
944 944
945 945 if ((s = strstr(path, "cpu/")) != NULL &&
946 946 sscanf(s, "cpu/%d", &id) == 1) {
947 947 /*
948 948 * Run the virtual CPU configuration on a separate thread to
949 949 * avoid blocking on this event for too long (and for now,
950 950 * to ensure configuration requests are serialized.)
951 951 */
952 952 (void) taskq_dispatch(cpu_config_tq,
953 953 vcpu_config, (void *)(uintptr_t)id, 0);
954 954 }
955 955 }
956 956
957 957 static int
958 958 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
959 959 {
960 960 int err;
961 961
962 962 if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
963 963 char *str;
964 964 int level = CE_WARN;
965 965
966 966 switch (err) {
967 967 case -X_EINVAL:
968 968 /*
969 969 * This interface squashes multiple error sources
970 970 * to one error code. In particular, an X_EINVAL
971 971 * code can mean:
972 972 *
973 973 * - the vcpu id is out of range
974 974 * - cs or ss are in ring 0
975 975 * - cr3 is wrong
976 976 * - an entry in the new gdt is above the
977 977 * reserved entry
978 978 * - a frame underneath the new gdt is bad
979 979 */
980 980 str = "something is wrong :(";
981 981 break;
982 982 case -X_ENOENT:
983 983 str = "no such cpu";
984 984 break;
985 985 case -X_ENOMEM:
986 986 str = "no mem to copy ctxt";
987 987 break;
988 988 case -X_EFAULT:
989 989 str = "bad address";
990 990 break;
991 991 case -X_EEXIST:
992 992 /*
993 993 * Hmm. This error is returned if the vcpu has already
994 994 * been initialized once before in the lifetime of this
995 995 * domain. This is a logic error in the kernel.
996 996 */
997 997 level = CE_PANIC;
998 998 str = "already initialized";
999 999 break;
1000 1000 default:
1001 1001 level = CE_PANIC;
1002 1002 str = "<unexpected>";
1003 1003 break;
1004 1004 }
1005 1005
1006 1006 cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1007 1007 id, -err, str);
1008 1008 }
1009 1009 return (err);
1010 1010 }
1011 1011
1012 1012 long
1013 1013 xen_vcpu_up(processorid_t id)
1014 1014 {
1015 1015 long err;
1016 1016
1017 1017 if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1018 1018 char *str;
1019 1019
1020 1020 switch (err) {
1021 1021 case -X_ENOENT:
1022 1022 str = "no such cpu";
1023 1023 break;
1024 1024 case -X_EINVAL:
1025 1025 /*
1026 1026 * Perhaps this is diagnostic overkill.
1027 1027 */
1028 1028 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1029 1029 str = "bad cpuid";
1030 1030 else
1031 1031 str = "not initialized";
1032 1032 break;
1033 1033 default:
1034 1034 str = "<unexpected>";
1035 1035 break;
1036 1036 }
1037 1037
1038 1038 printf("vcpu%d: failed to start: error %d: %s\n",
1039 1039 id, -(int)err, str);
1040 1040 return (EBFONT); /* deliberately silly */
1041 1041 }
1042 1042 return (err);
1043 1043 }
1044 1044
1045 1045 long
1046 1046 xen_vcpu_down(processorid_t id)
1047 1047 {
1048 1048 long err;
1049 1049
1050 1050 if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1051 1051 /*
1052 1052 * X_ENOENT: no such cpu
1053 1053 * X_EINVAL: bad cpuid
1054 1054 */
1055 1055 panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1056 1056 }
1057 1057
1058 1058 return (err);
1059 1059 }