XXXX pass in cpu_pause_func via pause_cpus
--- old/usr/src/uts/i86xpv/os/mp_xen.c
+++ new/usr/src/uts/i86xpv/os/mp_xen.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Virtual CPU management.
29 29 *
30 30 * VCPUs can be controlled in one of two ways; through the domain itself
31 31 * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
32 32 * Unfortunately, the terminology is used in different ways; they work out as
33 33 * follows:
34 34 *
35 35 * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
36 36 *
37 37 * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
38 38 * hypervisor on the idle thread). It must be up since a downed VCPU cannot
39 39 * receive interrupts, and we require this for offline CPUs in Solaris.
40 40 *
41 41 * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
42 42 * xen_vcpu_down() for it). It can't take interrupts or run anything, though
43 43 * if it has run previously, its software state (cpu_t, machcpu structures, IPI
44 44 * event channels, etc.) will still exist.
45 45 *
46 46 * The hypervisor has two notions of CPU states as represented in the store:
47 47 *
48 48 * "offline": the VCPU is down. Corresponds to P_POWEROFF.
49 49 *
50 50 * "online": the VCPU is running. Corresponds to a CPU state other than
51 51 * P_POWEROFF.
52 52 *
53 53 * Currently, only a notification via xenstore can bring a CPU into a
54 54 * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
55 55 * P_OFFLINE, etc. We need to be careful to treat xenstore notifications
56 56 * idempotently, as we'll get 'duplicate' entries when we resume a domain.
57 57 *
58 58 * Note that the xenstore configuration is strictly advisory, in that a domain
59 59 * can choose to ignore it and still power up a VCPU in the offline state. To
60 60 * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
61 61 * ENOTSUP from within Solaris.
62 62 *
63 63 * Powering off a VCPU and suspending the domain use similar code. The
64 64 * difficulty here is that we must ensure that each VCPU is in a stable
65 65 * state: it must have a saved PCB, and not be responding to interrupts
66 66 * (since we are just about to remove its ability to run on a real CPU,
67 67 * possibly forever). However, an offline CPU in Solaris can take
68 68 * cross-call interrupts, as mentioned, so we must go through a
69 69 * two-stage process. First, we use the standard Solaris pause_cpus().
70 70 * This ensures that all CPUs are either in mach_cpu_pause() or
71 71 * mach_cpu_idle(), and nothing will cross-call them.
72 72 *
73 73 * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
74 74 * bring them back up, and in state CPU_PHASE_POWERED_OFF.
75 75 *
76 76 * Running CPUs are spinning in mach_cpu_pause() waiting for either
77 77 * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
78 78 *
79 79 * Offline CPUs are either running the idle thread and periodically
80 80 * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
81 81 *
82 82 * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
83 83 * poking them to make sure they're not blocked[1]. When every CPU has
84 84 * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
85 85 * know we can suspend, or power-off a CPU, without problems.
86 86 *
87 87 * [1] note that we have to repeatedly poke offline CPUs: it's the only
88 88 * way to ensure that the CPU doesn't miss the state change before
89 89 * dropping into HYPERVISOR_block().
90 90 */
91 91
92 92 #include <sys/types.h>
93 93 #include <sys/systm.h>
94 94 #include <sys/param.h>
95 95 #include <sys/taskq.h>
96 96 #include <sys/cmn_err.h>
97 97 #include <sys/archsystm.h>
98 98 #include <sys/machsystm.h>
99 99 #include <sys/segments.h>
100 100 #include <sys/cpuvar.h>
101 101 #include <sys/x86_archext.h>
102 102 #include <sys/controlregs.h>
103 103 #include <sys/hypervisor.h>
104 104 #include <sys/xpv_panic.h>
105 105 #include <sys/mman.h>
106 106 #include <sys/psw.h>
107 107 #include <sys/cpu.h>
108 108 #include <sys/sunddi.h>
109 109 #include <util/sscanf.h>
110 110 #include <vm/hat_i86.h>
111 111 #include <vm/hat.h>
112 112 #include <vm/as.h>
113 113
114 114 #include <xen/public/io/xs_wire.h>
115 115 #include <xen/sys/xenbus_impl.h>
116 116 #include <xen/public/vcpu.h>
117 117
118 118 extern cpuset_t cpu_ready_set;
119 119
120 120 #define CPU_PHASE_NONE 0
121 121 #define CPU_PHASE_WAIT_SAFE 1
122 122 #define CPU_PHASE_SAFE 2
123 123 #define CPU_PHASE_POWERED_OFF 3
124 124
125 125 /*
126 126 * We can only poke CPUs during barrier enter 256 times a second at
127 127 * most.
128 128 */
129 129 #define POKE_TIMEOUT (NANOSEC / 256)
130 130
131 131 static taskq_t *cpu_config_tq;
132 132 static int cpu_phase[NCPU];
133 133
134 134 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
135 135 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
136 136
137 137 /*
138 138 * Return whether or not the vcpu is actually running on a pcpu
139 139 */
140 140 int
141 141 vcpu_on_pcpu(processorid_t cpu)
142 142 {
143 143 struct vcpu_runstate_info runstate;
144 144 int ret = VCPU_STATE_UNKNOWN;
145 145
146 146 ASSERT(cpu < NCPU);
147 147 /*
148 148 * Don't bother with hypercall if we are asking about ourself
149 149 */
150 150 if (cpu == CPU->cpu_id)
151 151 return (VCPU_ON_PCPU);
152 152 if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
153 153 goto out;
154 154
155 155 switch (runstate.state) {
156 156 case RUNSTATE_running:
157 157 ret = VCPU_ON_PCPU;
158 158 break;
159 159
160 160 case RUNSTATE_runnable:
161 161 case RUNSTATE_offline:
162 162 case RUNSTATE_blocked:
163 163 ret = VCPU_NOT_ON_PCPU;
164 164 break;
165 165
166 166 default:
167 167 break;
168 168 }
169 169
170 170 out:
171 171 return (ret);
172 172 }
173 173
174 174 /*
175 175 * These routines allocate any global state that might be needed
176 176 * while starting cpus. For virtual cpus, there is no such state.
177 177 */
178 178 int
179 179 mach_cpucontext_init(void)
180 180 {
181 181 return (0);
182 182 }
183 183
184 184 void
185 185 do_cpu_config_watch(int state)
186 186 {
187 187 static struct xenbus_watch cpu_config_watch;
188 188
189 189 if (state != XENSTORE_UP)
190 190 return;
191 191 cpu_config_watch.node = "cpu";
192 192 cpu_config_watch.callback = vcpu_config_event;
193 193 if (register_xenbus_watch(&cpu_config_watch)) {
194 194 taskq_destroy(cpu_config_tq);
195 195 cmn_err(CE_WARN, "do_cpu_config_watch: "
196 196 "failed to set vcpu config watch");
197 197 }
198 198
199 199 }
200 200
201 201 /*
202 202 * This routine is called after all the "normal" MP startup has
203 203 * been done; a good place to start watching xen store for virtual
204 204 * cpu hot plug events.
205 205 */
206 206 void
207 207 mach_cpucontext_fini(void)
208 208 {
209 209
210 210 cpu_config_tq = taskq_create("vcpu config taskq", 1,
211 211 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
212 212
213 213 (void) xs_register_xenbus_callback(do_cpu_config_watch);
214 214 }
215 215
216 216 /*
217 217 * Fill in the remaining CPU context and initialize it.
218 218 */
219 219 static int
220 220 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
221 221 {
222 222 uint_t vec, iopl;
223 223
224 224 vgc->flags = VGCF_IN_KERNEL;
225 225
226 226 /*
227 227 * fpu_ctx we leave as zero; on first fault we'll store
228 228 * sse_initial into it anyway.
229 229 */
230 230
231 231 #if defined(__amd64)
232 232 vgc->user_regs.cs = KCS_SEL | SEL_KPL; /* force to ring 3 */
233 233 #else
234 234 vgc->user_regs.cs = KCS_SEL;
235 235 #endif
236 236 vgc->user_regs.ds = KDS_SEL;
237 237 vgc->user_regs.es = KDS_SEL;
238 238 vgc->user_regs.ss = KDS_SEL;
239 239 vgc->kernel_ss = KDS_SEL;
240 240
241 241 /*
242 242 * Allow I/O privilege level for Dom0 kernel.
243 243 */
244 244 if (DOMAIN_IS_INITDOMAIN(xen_info))
245 245 iopl = (PS_IOPL & 0x1000); /* ring 1 */
246 246 else
247 247 iopl = 0;
248 248
249 249 #if defined(__amd64)
250 250 vgc->user_regs.fs = 0;
251 251 vgc->user_regs.gs = 0;
252 252 vgc->user_regs.rflags = F_OFF | iopl;
253 253 #elif defined(__i386)
254 254 vgc->user_regs.fs = KFS_SEL;
255 255 vgc->user_regs.gs = KGS_SEL;
256 256 vgc->user_regs.eflags = F_OFF | iopl;
257 257 vgc->event_callback_cs = vgc->user_regs.cs;
258 258 vgc->failsafe_callback_cs = vgc->user_regs.cs;
259 259 #endif
260 260
261 261 /*
262 262 * Initialize the trap_info_t from the IDT
263 263 */
264 264 #if !defined(__lint)
265 265 ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
266 266 #endif
267 267 for (vec = 0; vec < NIDT; vec++) {
268 268 trap_info_t *ti = &vgc->trap_ctxt[vec];
269 269
270 270 if (xen_idt_to_trap_info(vec,
271 271 &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
272 272 ti->cs = KCS_SEL;
273 273 ti->vector = vec;
274 274 }
275 275 }
276 276
277 277 /*
278 278 * No LDT
279 279 */
280 280
281 281 /*
282 282 * (We assert in various places that the GDT is (a) aligned on a
283 283 * page boundary and (b) one page long, so this really should fit..)
284 284 */
285 285 #ifdef CRASH_XEN
286 286 vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
287 287 #else
288 288 vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
289 289 #endif
290 290 vgc->gdt_ents = NGDT;
291 291
292 292 vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
293 293
294 294 #if defined(__i386)
295 295 if (mmu.pae_hat)
296 296 vgc->ctrlreg[3] =
297 297 xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
298 298 else
299 299 #endif
300 300 vgc->ctrlreg[3] =
301 301 pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
302 302
303 303 vgc->ctrlreg[4] = getcr4();
304 304
305 305 vgc->event_callback_eip = (uintptr_t)xen_callback;
306 306 vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
307 307 vgc->flags |= VGCF_failsafe_disables_events;
308 308
309 309 #if defined(__amd64)
310 310 /*
311 311 * XXPV should this be moved to init_cpu_syscall?
312 312 */
313 313 vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
314 314 vgc->flags |= VGCF_syscall_disables_events;
315 315
316 316 ASSERT(vgc->user_regs.gs == 0);
317 317 vgc->gs_base_kernel = (uintptr_t)cp;
318 318 #endif
319 319
320 320 return (xen_vcpu_initialize(cp->cpu_id, vgc));
321 321 }
322 322
323 323 /*
324 324 * Create a guest virtual cpu context so that the virtual cpu
325 325 * springs into life in the domain just about to call mp_startup()
326 326 *
327 327 * Virtual CPUs must be initialized once in the lifetime of the domain;
328 328 * after that subsequent attempts to start them will fail with X_EEXIST.
329 329 *
330 330 * Thus 'alloc' -really- creates and initializes the virtual
331 331 * CPU context just once. Once the initialisation succeeds, we never
332 332 * free it, nor the regular cpu_t to which it refers.
333 333 */
334 334 void *
335 335 mach_cpucontext_alloc(struct cpu *cp)
336 336 {
337 337 kthread_t *tp = cp->cpu_thread;
338 338 vcpu_guest_context_t vgc;
339 339
340 340 int err = 1;
341 341
342 342 /*
343 343 * First, augment the incoming cpu structure
344 344 * - vcpu pointer reference
345 345 * - pending event storage area
346 346 * - physical address of GDT
347 347 */
348 348 cp->cpu_m.mcpu_vcpu_info =
349 349 &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
350 350 cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
351 351 sizeof (struct xen_evt_data), KM_SLEEP);
352 352 cp->cpu_m.mcpu_gdtpa =
353 353 mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
354 354
355 355 if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
356 356 goto done;
357 357
358 358 /*
359 359 * Now set up the vcpu context so that we can start this vcpu
360 360 * in the kernel at tp->t_pc (mp_startup). Note that the
361 361 * thread will thread_exit() shortly after performing the
362 362 * initialization; in particular, we will *never* take a
363 363 * privilege transition on this thread.
364 364 */
365 365
366 366 bzero(&vgc, sizeof (vgc));
367 367
368 368 #ifdef __amd64
369 369 vgc.user_regs.rip = tp->t_pc;
370 370 vgc.user_regs.rsp = tp->t_sp;
371 371 vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
372 372 #else
373 373 vgc.user_regs.eip = tp->t_pc;
374 374 vgc.user_regs.esp = tp->t_sp;
375 375 vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
376 376 #endif
377 377 /*
378 378 * XXPV Fix resume, if Russ didn't already fix it.
379 379 *
380 380 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
381 381 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
382 382 * that only lwps take traps that switch to the kernel stack;
383 383 * part of creating an lwp adjusts the stack by subtracting
384 384 * sizeof (struct regs) off t_stk.
385 385 *
386 386 * The more interesting question is, why do we do all the work
387 387 * of a fully fledged lwp for a plain thread? In particular
388 388 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
389 389 * or futz with the LDT. This should probably all be done with
390 390 * an lwp context operator to keep pure thread context switch fast.
391 391 */
392 392 vgc.kernel_sp = (ulong_t)tp->t_stk;
393 393
394 394 err = mp_set_cpu_context(&vgc, cp);
395 395
396 396 done:
397 397 if (err) {
398 398 mach_cpucontext_free(cp, NULL, err);
399 399 return (NULL);
400 400 }
401 401 return (cp);
402 402 }
403 403
404 404 /*
405 405 * By the time we are called either we have successfully started
406 406 * the cpu, or our attempt to start it has failed.
407 407 */
408 408
409 409 /*ARGSUSED*/
410 410 void
411 411 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
412 412 {
413 413 switch (err) {
414 414 case 0:
415 415 break;
416 416 case ETIMEDOUT:
417 417 /*
418 418 * The vcpu context is loaded into the hypervisor, and
419 419 * we've tried to start it, but the vcpu has not been set
420 420 * running yet, for whatever reason. We arrange to -not-
421 421 * free any data structures it may be referencing. In
422 422 * particular, we've already told the hypervisor about
423 423 * the GDT, and so we can't map it read-write again.
424 424 */
425 425 break;
426 426 default:
427 427 (void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
428 428 kmem_free(cp->cpu_m.mcpu_evt_pend,
429 429 sizeof (struct xen_evt_data));
430 430 break;
431 431 }
432 432 }
433 433
434 434 /*
435 435 * Reset this CPU's context. Clear out any pending evtchn data, since event
436 436 * channel numbers will all change when we resume.
437 437 */
438 438 void
439 439 mach_cpucontext_reset(cpu_t *cp)
440 440 {
441 441 bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
442 442 /* mcpu_intr_pending ? */
443 443 }
444 444
445 445 static void
446 446 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
447 447 {
448 448 #ifdef __amd64
449 449 vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
450 450 vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
451 451 vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
452 452 vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
453 453 vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
454 454 vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
455 455 vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
456 456 vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
457 457 #else /* __amd64 */
458 458 vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
459 459 vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
460 460 vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
461 461 vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
462 462 vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
463 463 vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
464 464 #endif /* __amd64 */
465 465 }
466 466
467 467 /*
468 468 * Restore the context of a CPU during resume. This context is always
469 469 * inside enter_safe_phase(), below.
470 470 */
471 471 void
472 472 mach_cpucontext_restore(cpu_t *cp)
473 473 {
474 474 vcpu_guest_context_t vgc;
475 475 int err;
476 476
477 477 ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
478 478 cp->cpu_thread == cp->cpu_idle_thread);
479 479
480 480 bzero(&vgc, sizeof (vgc));
481 481
482 482 pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
483 483
484 484 /*
485 485 * We're emulating a longjmp() here: in particular, we need to bump the
486 486 * stack pointer to account for the pop of xIP that returning from
487 487 * longjmp() normally would do, and set the return value in xAX to 1.
488 488 */
489 489 #ifdef __amd64
490 490 vgc.user_regs.rax = 1;
491 491 vgc.user_regs.rsp += sizeof (ulong_t);
492 492 #else
493 493 vgc.user_regs.eax = 1;
494 494 vgc.user_regs.esp += sizeof (ulong_t);
495 495 #endif
496 496
497 497 vgc.kernel_sp = cp->cpu_thread->t_sp;
498 498
499 499 err = mp_set_cpu_context(&vgc, cp);
500 500
501 501 ASSERT(err == 0);
502 502 }
503 503
504 504 /*
505 505 * Reach a point at which the CPU can be safely powered-off or
506 506 * suspended. Nothing can wake this CPU out of the loop.
507 507 */
508 508 static void
509 509 enter_safe_phase(void)
510 510 {
511 511 ulong_t flags = intr_clear();
512 512
513 513 if (setjmp(&curthread->t_pcb) == 0) {
514 514 cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
515 515 while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
516 516 SMT_PAUSE();
517 517 }
518 518
519 519 ASSERT(!interrupts_enabled());
520 520
521 521 intr_restore(flags);
522 522 }
523 523
524 524 /*
525 525 * Offline CPUs run this code even under a pause_cpus(), so we must
526 526 * check if we need to enter the safe phase.
527 527 */
528 528 void
529 529 mach_cpu_idle(void)
530 530 {
531 531 if (IN_XPV_PANIC()) {
532 532 xpv_panic_halt();
533 533 } else {
534 534 (void) HYPERVISOR_block();
535 535 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
536 536 enter_safe_phase();
537 537 }
538 538 }
539 539
540 540 /*
541 541 * Spin until either start_cpus() wakes us up, or we get a request to
542 542 * enter the safe phase (followed by a later start_cpus()).
543 543 */
544 544 void
545 545 mach_cpu_pause(volatile char *safe)
546 546 {
547 547 *safe = PAUSE_WAIT;
548 548 membar_enter();
549 549
550 550 while (*safe != PAUSE_IDLE) {
551 551 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
552 552 enter_safe_phase();
553 553 SMT_PAUSE();
554 554 }
555 555 }
556 556
557 557 void
558 558 mach_cpu_halt(char *msg)
559 559 {
560 560 if (msg)
561 561 prom_printf("%s\n", msg);
562 562 (void) xen_vcpu_down(CPU->cpu_id);
563 563 }
564 564
565 565 /*ARGSUSED*/
566 566 int
567 567 mp_cpu_poweron(struct cpu *cp)
568 568 {
569 569 return (ENOTSUP);
570 570 }
571 571
572 572 /*ARGSUSED*/
573 573 int
574 574 mp_cpu_poweroff(struct cpu *cp)
575 575 {
576 576 return (ENOTSUP);
577 577 }
578 578
579 579 void
580 580 mp_enter_barrier(void)
581 581 {
582 582 hrtime_t last_poke_time = 0;
583 583 int poke_allowed = 0;
584 584 int done = 0;
585 585 int i;
586 586
587 587 ASSERT(MUTEX_HELD(&cpu_lock));
588 588
589 - pause_cpus(NULL);
589 + pause_cpus(NULL, NULL);
590 590
591 591 while (!done) {
592 592 done = 1;
593 593 poke_allowed = 0;
594 594
595 595 if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
596 596 last_poke_time = xpv_gethrtime();
597 597 poke_allowed = 1;
598 598 }
599 599
600 600 for (i = 0; i < NCPU; i++) {
601 601 cpu_t *cp = cpu_get(i);
602 602
603 603 if (cp == NULL || cp == CPU)
604 604 continue;
605 605
606 606 switch (cpu_phase[i]) {
607 607 case CPU_PHASE_NONE:
608 608 cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
609 609 poke_cpu(i);
610 610 done = 0;
611 611 break;
612 612
613 613 case CPU_PHASE_WAIT_SAFE:
614 614 if (poke_allowed)
615 615 poke_cpu(i);
616 616 done = 0;
617 617 break;
618 618
619 619 case CPU_PHASE_SAFE:
620 620 case CPU_PHASE_POWERED_OFF:
621 621 break;
622 622 }
623 623 }
624 624
625 625 SMT_PAUSE();
626 626 }
627 627 }
628 628
629 629 void
630 630 mp_leave_barrier(void)
631 631 {
632 632 int i;
633 633
634 634 ASSERT(MUTEX_HELD(&cpu_lock));
635 635
636 636 for (i = 0; i < NCPU; i++) {
637 637 cpu_t *cp = cpu_get(i);
638 638
639 639 if (cp == NULL || cp == CPU)
640 640 continue;
641 641
642 642 switch (cpu_phase[i]) {
643 643 /*
644 644 * If we see a CPU in one of these phases, something has
645 645 * gone badly wrong with the guarantees
646 646 * mp_enter_barrier() is supposed to provide. Rather
647 647 * than attempt to stumble along (and since we can't
648 648 * panic properly in this context), we tell the
649 649 * hypervisor we've crashed.
650 650 */
651 651 case CPU_PHASE_NONE:
652 652 case CPU_PHASE_WAIT_SAFE:
653 653 (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
654 654 break;
655 655
656 656 case CPU_PHASE_POWERED_OFF:
657 657 break;
658 658
659 659 case CPU_PHASE_SAFE:
660 660 cpu_phase[i] = CPU_PHASE_NONE;
661 661 }
662 662 }
663 663
664 664 start_cpus();
665 665 }
666 666
667 667 static int
668 668 poweroff_vcpu(struct cpu *cp)
669 669 {
670 670 int error;
671 671
672 672 ASSERT(MUTEX_HELD(&cpu_lock));
673 673
674 674 ASSERT(CPU->cpu_id != cp->cpu_id);
675 675 ASSERT(cp->cpu_flags & CPU_QUIESCED);
676 676
677 677 mp_enter_barrier();
678 678
679 679 if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
680 680 ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
681 681
682 682 CPUSET_DEL(cpu_ready_set, cp->cpu_id);
683 683
684 684 cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
685 685 cp->cpu_flags &=
686 686 ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
687 687
688 688 cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
689 689
690 690 cpu_set_state(cp);
691 691 }
692 692
693 693 mp_leave_barrier();
694 694
695 695 return (error);
696 696 }
697 697
698 698 static int
699 699 vcpu_config_poweroff(processorid_t id)
700 700 {
701 701 int oldstate;
702 702 int error;
703 703 cpu_t *cp;
704 704
705 705 mutex_enter(&cpu_lock);
706 706
707 707 if ((cp = cpu_get(id)) == NULL) {
708 708 mutex_exit(&cpu_lock);
709 709 return (ESRCH);
710 710 }
711 711
712 712 if (cpu_get_state(cp) == P_POWEROFF) {
713 713 mutex_exit(&cpu_lock);
714 714 return (0);
715 715 }
716 716
717 717 mutex_exit(&cpu_lock);
718 718
719 719 do {
720 720 error = p_online_internal(id, P_OFFLINE,
721 721 &oldstate);
722 722
723 723 if (error != 0)
724 724 break;
725 725
726 726 /*
727 727 * So we just changed it to P_OFFLINE. But then we dropped
728 728 * cpu_lock, so now it is possible for another thread to change
729 729 * the cpu back to a different, non-quiesced state e.g.
730 730 * P_ONLINE.
731 731 */
732 732 mutex_enter(&cpu_lock);
733 733 if ((cp = cpu_get(id)) == NULL)
734 734 error = ESRCH;
735 735 else {
736 736 if (cp->cpu_flags & CPU_QUIESCED)
737 737 error = poweroff_vcpu(cp);
738 738 else
739 739 error = EBUSY;
740 740 }
741 741 mutex_exit(&cpu_lock);
742 742 } while (error == EBUSY);
743 743
744 744 return (error);
745 745 }
746 746
747 747 /*
748 748 * Add a new virtual cpu to the domain.
749 749 */
750 750 static int
751 751 vcpu_config_new(processorid_t id)
752 752 {
753 753 extern int start_cpu(processorid_t);
754 754 int error;
755 755
756 756 if (ncpus == 1) {
757 757 printf("cannot (yet) add cpus to a single-cpu domain\n");
758 758 return (ENOTSUP);
759 759 }
760 760
761 761 affinity_set(CPU_CURRENT);
762 762 error = start_cpu(id);
763 763 affinity_clear();
764 764 return (error);
765 765 }
766 766
767 767 static int
768 768 poweron_vcpu(struct cpu *cp)
769 769 {
770 770 int error;
771 771
772 772 ASSERT(MUTEX_HELD(&cpu_lock));
773 773
774 774 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
775 775 printf("poweron_vcpu: vcpu%d is not available!\n",
776 776 cp->cpu_id);
777 777 return (ENXIO);
778 778 }
779 779
780 780 if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
781 781 CPUSET_ADD(cpu_ready_set, cp->cpu_id);
782 782 cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
783 783 cp->cpu_flags &= ~CPU_POWEROFF;
784 784 /*
785 785 * There are some nasty races possible here.
786 786 * Tell the vcpu it's up one more time.
787 787 * XXPV Is this enough? Is this safe?
788 788 */
789 789 (void) xen_vcpu_up(cp->cpu_id);
790 790
791 791 cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
792 792
793 793 cpu_set_state(cp);
794 794 }
795 795 return (error);
796 796 }
797 797
798 798 static int
799 799 vcpu_config_poweron(processorid_t id)
800 800 {
801 801 cpu_t *cp;
802 802 int oldstate;
803 803 int error;
804 804
805 805 if (id >= ncpus)
806 806 return (vcpu_config_new(id));
807 807
808 808 mutex_enter(&cpu_lock);
809 809
810 810 if ((cp = cpu_get(id)) == NULL) {
811 811 mutex_exit(&cpu_lock);
812 812 return (ESRCH);
813 813 }
814 814
815 815 if (cpu_get_state(cp) != P_POWEROFF) {
816 816 mutex_exit(&cpu_lock);
817 817 return (0);
818 818 }
819 819
820 820 if ((error = poweron_vcpu(cp)) != 0) {
821 821 mutex_exit(&cpu_lock);
822 822 return (error);
823 823 }
824 824
825 825 mutex_exit(&cpu_lock);
826 826
827 827 return (p_online_internal(id, P_ONLINE, &oldstate));
828 828 }
829 829
830 830 #define REPORT_LEN 128
831 831
832 832 static void
833 833 vcpu_config_report(processorid_t id, uint_t newstate, int error)
834 834 {
835 835 char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
836 836 size_t len;
837 837 char *ps;
838 838
839 839 switch (newstate) {
840 840 case P_ONLINE:
841 841 ps = PS_ONLINE;
842 842 break;
843 843 case P_POWEROFF:
844 844 ps = PS_POWEROFF;
845 845 break;
846 846 default:
847 847 cmn_err(CE_PANIC, "unknown state %u\n", newstate);
848 848 break;
849 849 }
850 850
851 851 len = snprintf(report, REPORT_LEN,
852 852 "cpu%d: externally initiated %s", id, ps);
853 853
854 854 if (!error) {
855 855 cmn_err(CE_CONT, "!%s\n", report);
856 856 kmem_free(report, REPORT_LEN);
857 857 return;
858 858 }
859 859
860 860 len += snprintf(report + len, REPORT_LEN - len,
861 861 " failed, error %d: ", error);
862 862 switch (error) {
863 863 case EEXIST:
864 864 len += snprintf(report + len, REPORT_LEN - len,
865 865 "cpu already %s", ps ? ps : "?");
866 866 break;
867 867 case ESRCH:
868 868 len += snprintf(report + len, REPORT_LEN - len,
869 869 "cpu not found");
870 870 break;
871 871 case EINVAL:
872 872 case EALREADY:
873 873 break;
874 874 case EPERM:
875 875 len += snprintf(report + len, REPORT_LEN - len,
876 876 "insufficient privilege (0x%x)", id);
877 877 break;
878 878 case EBUSY:
879 879 switch (newstate) {
880 880 case P_ONLINE:
881 881 /*
882 882 * This return comes from mp_cpu_start -
883 883 * we cannot 'start' the boot CPU.
884 884 */
885 885 len += snprintf(report + len, REPORT_LEN - len,
886 886 "already running");
887 887 break;
888 888 case P_POWEROFF:
889 889 len += snprintf(report + len, REPORT_LEN - len,
890 890 "bound lwps?");
891 891 break;
892 892 default:
893 893 break;
894 894 }
895 895 default:
896 896 break;
897 897 }
898 898
899 899 cmn_err(CE_CONT, "%s\n", report);
900 900 kmem_free(report, REPORT_LEN);
901 901 }
902 902
903 903 static void
904 904 vcpu_config(void *arg)
905 905 {
906 906 int id = (int)(uintptr_t)arg;
907 907 int error;
908 908 char dir[16];
909 909 char *state;
910 910
911 911 if ((uint_t)id >= max_ncpus) {
912 912 cmn_err(CE_WARN,
913 913 "vcpu_config: cpu%d does not fit in this domain", id);
914 914 return;
915 915 }
916 916
917 917 (void) snprintf(dir, sizeof (dir), "cpu/%d", id);
918 918 state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
919 919 if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
920 920 if (strcmp(state, "online") == 0) {
921 921 error = vcpu_config_poweron(id);
922 922 vcpu_config_report(id, P_ONLINE, error);
923 923 } else if (strcmp(state, "offline") == 0) {
924 924 error = vcpu_config_poweroff(id);
925 925 vcpu_config_report(id, P_POWEROFF, error);
926 926 } else {
927 927 cmn_err(CE_WARN,
928 928 "cpu%d: unknown target state '%s'", id, state);
929 929 }
930 930 } else
931 931 cmn_err(CE_WARN,
932 932 "cpu%d: unable to read target state from xenstore", id);
933 933
934 934 kmem_free(state, MAXPATHLEN);
935 935 }
936 936
937 937 /*ARGSUSED*/
938 938 static void
939 939 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
940 940 {
941 941 const char *path = vec[XS_WATCH_PATH];
942 942 processorid_t id;
943 943 char *s;
944 944
945 945 if ((s = strstr(path, "cpu/")) != NULL &&
946 946 sscanf(s, "cpu/%d", &id) == 1) {
947 947 /*
948 948 * Run the virtual CPU configuration on a separate thread to
949 949 * avoid blocking on this event for too long (and for now,
950 950 * to ensure configuration requests are serialized.)
951 951 */
952 952 (void) taskq_dispatch(cpu_config_tq,
953 953 vcpu_config, (void *)(uintptr_t)id, 0);
954 954 }
955 955 }
956 956
957 957 static int
958 958 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
959 959 {
960 960 int err;
961 961
962 962 if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
963 963 char *str;
964 964 int level = CE_WARN;
965 965
966 966 switch (err) {
967 967 case -X_EINVAL:
968 968 /*
969 969 * This interface squashes multiple error sources
970 970 * to one error code. In particular, an X_EINVAL
971 971 * code can mean:
972 972 *
973 973 * - the vcpu id is out of range
974 974 * - cs or ss are in ring 0
975 975 * - cr3 is wrong
976 976 * - an entry in the new gdt is above the
977 977 * reserved entry
978 978 * - a frame underneath the new gdt is bad
979 979 */
980 980 str = "something is wrong :(";
981 981 break;
982 982 case -X_ENOENT:
983 983 str = "no such cpu";
984 984 break;
985 985 case -X_ENOMEM:
986 986 str = "no mem to copy ctxt";
987 987 break;
988 988 case -X_EFAULT:
989 989 str = "bad address";
990 990 break;
991 991 case -X_EEXIST:
992 992 /*
993 993 * Hmm. This error is returned if the vcpu has already
994 994 * been initialized once before in the lifetime of this
995 995 * domain. This is a logic error in the kernel.
996 996 */
997 997 level = CE_PANIC;
998 998 str = "already initialized";
999 999 break;
1000 1000 default:
1001 1001 level = CE_PANIC;
1002 1002 str = "<unexpected>";
1003 1003 break;
1004 1004 }
1005 1005
1006 1006 cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1007 1007 id, -err, str);
1008 1008 }
1009 1009 return (err);
1010 1010 }
1011 1011
1012 1012 long
1013 1013 xen_vcpu_up(processorid_t id)
1014 1014 {
1015 1015 long err;
1016 1016
1017 1017 if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1018 1018 char *str;
1019 1019
1020 1020 switch (err) {
1021 1021 case -X_ENOENT:
1022 1022 str = "no such cpu";
1023 1023 break;
1024 1024 case -X_EINVAL:
1025 1025 /*
1026 1026 * Perhaps this is diagnostic overkill.
1027 1027 */
1028 1028 if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1029 1029 str = "bad cpuid";
1030 1030 else
1031 1031 str = "not initialized";
1032 1032 break;
1033 1033 default:
1034 1034 str = "<unexpected>";
1035 1035 break;
1036 1036 }
1037 1037
1038 1038 printf("vcpu%d: failed to start: error %d: %s\n",
1039 1039 id, -(int)err, str);
1040 1040 return (EBFONT); /* deliberately silly */
1041 1041 }
1042 1042 return (err);
1043 1043 }
1044 1044
1045 1045 long
1046 1046 xen_vcpu_down(processorid_t id)
1047 1047 {
1048 1048 long err;
1049 1049
1050 1050 if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1051 1051 /*
1052 1052 * X_ENOENT: no such cpu
1053 1053 * X_EINVAL: bad cpuid
1054 1054 */
1055 1055 panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1056 1056 }
1057 1057
1058 1058 return (err);
1059 1059 }