/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/mutex.h>
#include <sys/cpuvar.h>
#include <sys/cyclic.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/wdt.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/hypervisor_api.h>
#include <sys/membar.h>
#include <sys/x_call.h>
#include <sys/promif.h>
#include <sys/systm.h>
#include <sys/mach_descrip.h>
#include <sys/cpu_module.h>
#include <sys/pg.h>
#include <sys/lgrp.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/cpupart.h>
#include <sys/hsvc.h>
#include <sys/mpo.h>
#include <vm/hat_sfmmu.h>
#include <sys/time.h>
#include <sys/clock.h>

/*
 * Sun4v OS Suspend
 *
 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
 * calling into the HV to initiate a suspension. Suspension is sequenced
 * externally by calling suspend_pre, suspend_start, and suspend_post.
 * suspend_pre and suspend_post are meant to perform any special operations
 * that should be done before or after a suspend/resume operation, e.g.,
 * callbacks into cluster software to disable heartbeat monitoring before the
 * system is suspended. suspend_start prepares kernel services to be suspended
 * and then suspends the domain by calling hv_guest_suspend.
 *
 * Special Handling for %tick and %stick Registers
 *
 * After a suspend/resume operation, the %tick and %stick registers may have
 * jumped forwards or backwards. The delta is assumed to be consistent across
 * all CPUs, within the negligible level of %tick and %stick variation
 * acceptable on a cold boot. In order to maintain increasing %tick and %stick
 * counter values without exposing large positive or negative jumps to kernel
 * or user code, a %tick and %stick offset is used. Kernel reads of these
 * counters return the sum of the hardware register counter and the offset
 * variable. After a suspend/resume operation, user reads of %tick or %stick
 * are emulated. Suspend code enables emulation by setting the
 * %{tick,stick}.NPT fields, which trigger a privileged instruction access
 * trap whenever the registers are read from user mode. If emulation has been
 * enabled, the trap handler emulates the instruction. Emulation is only
 * enabled as part of a successful suspend/resume operation. When emulation
 * is enabled, CPUs that are DR'd into the system will have their
 * %{tick,stick}.NPT bits set to 1 as well.
 */
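
/*
 * Illustrative sketch only (not code from this file): conceptually, an
 * adjusted kernel read of %stick is the hardware register plus the global
 * offset maintained below. hw_stick_register is a hypothetical stand-in
 * for the raw register; the real read paths (e.g., gettick()) fold the
 * offset in at the assembly level.
 *
 *	adjusted_stick = hw_stick_register + native_stick_offset;
 *
 * User-mode reads never compute this sum directly; once %{tick,stick}.NPT
 * is set, the privileged instruction access trap handler performs the
 * equivalent emulation on the user's behalf.
 */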

extern u_longlong_t gettick(void);	/* returns %stick */
extern uint64_t gettick_counter(void);	/* returns %tick */
extern uint64_t gettick_npt(void);
extern uint64_t getstick_npt(void);
extern int mach_descrip_update(void);
extern cpuset_t cpu_ready_set;
extern uint64_t native_tick_offset;
extern uint64_t native_stick_offset;
extern uint64_t sys_tick_freq;

/*
 * Global Sun Cluster pre/post callbacks.
 */
const char *(*cl_suspend_error_decode)(int);
int (*cl_suspend_pre_callback)(void);
int (*cl_suspend_post_callback)(void);
#define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
#define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
#define	SC_FAIL_STR_MAX		256

/*
 * The minimum major and minor version of the HSVC_GROUP_CORE API group
 * required in order to use OS suspend.
 */
#define	SUSPEND_CORE_MAJOR	1
#define	SUSPEND_CORE_MINOR	2

/*
 * By default, sun4v OS suspend is supported if the required HV version
 * is present. suspend_disabled should be set on platforms that do not
 * allow OS suspend regardless of whether or not the HV supports it.
 * It can also be set in /etc/system.
 */
static int suspend_disabled = 0;

/*
 * Controls whether or not user-land tick and stick register emulation
 * will be enabled following a successful suspend operation.
 */
static int enable_user_tick_stick_emulation = 1;

/*
 * Indicates whether or not tick and stick emulation is currently active.
 * After a successful suspend operation, if emulation is enabled, this
 * variable is set to B_TRUE. Global scope to allow emulation code to
 * check if emulation is active.
 */
boolean_t tick_stick_emulation_active = B_FALSE;

/*
 * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
 * sharing data structures, and processor groups will be updated using
 * information from the updated MD.
 */
static int suspend_update_cpu_mappings = 1;

/*
 * The maximum number of microseconds by which the %tick or %stick register
 * can vary between any two CPUs in the system. To calculate the
 * native_stick_offset and native_tick_offset, we measure the change in these
 * registers on one CPU over a suspend/resume. Other CPUs may experience
 * slightly larger or smaller changes. %tick and %stick should be synchronized
 * between CPUs, but there may be some variation, so we add an additional
 * value derived from this variable to ensure that these registers always
 * increase over a suspend/resume operation, assuming all %tick and %stick
 * registers are synchronized (within a certain limit) across CPUs in the
 * system. The delta between %sticks on different CPUs should be a small
 * number of cycles, not perceptible to readers of %stick that migrate between
 * CPUs. We set this value to 1 millisecond, which means that over a
 * suspend/resume operation, every CPU's %tick and %stick will advance
 * forwards as long as, across all CPUs, the %tick and %stick are synchronized
 * to within 1 ms. This applies to CPUs before the suspend and CPUs after the
 * resume. 1 ms is conservative, but small enough to not trigger TOD faults.
 */
static uint64_t suspend_tick_stick_max_delta = 1000;	/* microseconds */
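
/*
 * A worked instance of the padding above, assuming a hypothetical 1 GHz
 * sys_tick_freq: 1000 us converts to 1,000,000,000 * 1000 / 1,000,000 =
 * 1,000,000 cycles, and set_tick_offsets() below adds twice that
 * (2,000,000 cycles, i.e. 2 ms of %stick) to each offset.
 */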

/*
 * DBG and DBG_PROM() macros.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	/* Bound the formatted message to the buffer size */
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */

/*
 * Return true if the HV supports OS suspend and if suspend has not been
 * disabled on this platform.
 */
boolean_t
suspend_supported(void)
{
	uint64_t major, minor;

	if (suspend_disabled)
		return (B_FALSE);

	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
		return (B_FALSE);

	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
	    (major > SUSPEND_CORE_MAJOR));
}
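
/*
 * For example, with the 1.2 minimum defined above, a negotiated
 * HSVC_GROUP_CORE version of 1.2, 1.5, or 2.0 reports suspend as
 * supported, while 1.1 does not.
 */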

/*
 * Memory DR is not permitted if the system has been suspended and resumed.
 * It is the responsibility of the caller of suspend_start and the DR
 * subsystem to serialize DR operations and suspend_memdr_allowed() checks.
 */
boolean_t
suspend_memdr_allowed(void)
{
	return (suspend_count == 0);
}

/*
 * Given a source tick, stick, and tod value, set the tick and stick offsets
 * such that (current physical register value) + offset == (source value),
 * and in addition account for some variation between the %tick/%stick on
 * different CPUs. We account for this variation by adding in double the value
 * of suspend_tick_stick_max_delta. The following is an explanation of why
 * suspend_tick_stick_max_delta must be multiplied by two and added to
 * native_stick_offset.
 *
 * Consider a guest instance that is yet to be suspended with CPUs p0 and p1
 * with physical "source" %stick values s0 and s1 respectively. When the guest
 * is first resumed, the physical "target" %stick values are t0 and t1
 * respectively. The virtual %stick values after the resume are v0 and v1
 * respectively. Let x be the maximum difference between any two CPUs' %stick
 * registers at a given point in time and let the %stick values be assigned
 * such that
 *
 *     s1 = s0 + x and
 *     t1 = t0 - x
 *
 * Let us assume that p0 is driving the suspend and resume. Then, we will
 * calculate the stick offset f and the virtual %stick on p0 after the
 * resume as follows.
 *
 *      f = s0 - t0 and
 *     v0 = t0 + f
 *
 * We calculate the virtual %stick v1 on p1 after the resume as
 *
 *     v1 = t1 + f
 *
 * Substitution yields
 *
 *     v1 = t1 + (s0 - t0)
 *     v1 = (t0 - x) + (s0 - t0)
 *     v1 = -x + s0
 *     v1 = s0 - x
 *     v1 = (s1 - x) - x
 *     v1 = s1 - 2x
 *
 * Therefore, in this scenario, without accounting for %stick variation in
 * the calculation of the native_stick_offset f, the virtual %stick on p1
 * is less than the value of the %stick on p1 before the suspend, which is
 * unacceptable. By adding 2x to v1, we guarantee it will be equal to s1,
 * which means the %stick on p1 after the resume will always be greater
 * than or equal to the %stick on p1 before the suspend. Since v1 = t1 + f
 * at any point in time, we can accomplish this by adding 2x to f. This
 * guarantees that any processes bound to CPU p0 or p1 will not see a %stick
 * decrease across a suspend/resume. Hence, in the code below, we multiply
 * suspend_tick_stick_max_delta by two in the calculation for
 * native_stick_offset, native_tick_offset, and target_hrtime.
 */
static void
set_tick_offsets(uint64_t source_tick, uint64_t source_stick, timestruc_t *tsp)
{
	uint64_t target_tick;
	uint64_t target_stick;
	hrtime_t source_hrtime;
	hrtime_t target_hrtime;

	/*
	 * Temporarily set the offsets to zero so that the following reads
	 * of the registers will yield physical unadjusted counter values.
	 */
	native_tick_offset = 0;
	native_stick_offset = 0;

	target_tick = gettick_counter();	/* returns %tick */
	target_stick = gettick();		/* returns %stick */

	/*
	 * Calculate the new offsets. In addition to the delta observed on
	 * this CPU, add an additional value. Multiply the %tick/%stick
	 * frequency by suspend_tick_stick_max_delta (us). Then, multiply by 2
	 * to account for a delta between CPUs before the suspend and a
	 * delta between CPUs after the resume.
	 */
	native_tick_offset = (source_tick - target_tick) +
	    (CPU->cpu_curr_clock * suspend_tick_stick_max_delta * 2 / MICROSEC);
	native_stick_offset = (source_stick - target_stick) +
	    (sys_tick_freq * suspend_tick_stick_max_delta * 2 / MICROSEC);

	/*
	 * We've effectively increased %stick and %tick by twice the value
	 * of suspend_tick_stick_max_delta to account for variation across
	 * CPUs. Now adjust the preserved TOD by the same amount.
	 */
	source_hrtime = ts2hrt(tsp);
	target_hrtime = source_hrtime +
	    (suspend_tick_stick_max_delta * 2 * (NANOSEC/MICROSEC));
	hrt2ts(target_hrtime, tsp);
}
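
/*
 * A worked instance of the arithmetic above, under assumed values
 * (illustrative only): with source_stick = 1,000,000, target_stick =
 * 400,000, and a hypothetical 1 GHz sys_tick_freq,
 *
 *     native_stick_offset = (1,000,000 - 400,000)
 *         + (1,000,000,000 * 1000 * 2 / 1,000,000)
 *         = 600,000 + 2,000,000
 *
 * so the driving CPU's adjusted %stick resumes at 3,000,000, a full 2 ms
 * past its source value, absorbing up to 1 ms of pre-suspend skew plus
 * up to 1 ms of post-resume skew on any other CPU.
 */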

/*
 * Set the {tick,stick}.NPT field to 1 on this CPU.
 */
static void
enable_tick_stick_npt(void)
{
	(void) hv_stick_set_npt(1);
	(void) hv_tick_set_npt(1);
}

/*
 * Synchronize a CPU's {tick,stick}.NPT fields with the current state
 * of the system. This is used when a CPU is DR'd into the system.
 */
void
suspend_sync_tick_stick_npt(void)
{
	if (tick_stick_emulation_active) {
		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
		(void) hv_stick_set_npt(1);
		(void) hv_tick_set_npt(1);
	} else {
		ASSERT(gettick_npt() == 0);
		ASSERT(getstick_npt() == 0);
	}
}

/*
 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
 * sharing data structures, and processor groups.
 */
static void
update_cpu_mappings(void)
{
	md_t		*mdp;
	processorid_t	id;
	cpu_t		*cp;
	cpu_pg_t	*pgps[NCPU];

	if ((mdp = md_get_handle()) == NULL) {
		DBG("suspend: md_get_handle failed");
		return;
	}

	DBG("suspend: updating CPU mappings");

	mutex_enter(&cpu_lock);

	setup_chip_mappings(mdp);
	setup_exec_unit_mappings(mdp);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cpu_map_exec_units(cp);
	}

	/*
	 * Re-calculate processor groups.
	 *
	 * First tear down all PG information before adding any new PG
	 * information derived from the MD we just downloaded. We must
	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
	 * we want to minimize the number of times pause_cpus is called.
	 * Inactivating all CPUs would leave PGs without any active CPUs,
	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
	 * bootstrap PG structure saving the original PG structure to be
	 * fini'd afterwards. This prevents the dispatcher from encountering
	 * PGs in which all CPUs are inactive. Offline CPUs are already
	 * inactive in their PGs and shouldn't be reactivated, so we must
	 * not call pg_cpu_inactive or pg_cpu_active for those CPUs.
	 */
	pause_cpus(NULL, NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		if ((cp->cpu_flags & CPU_OFFLINE) == 0)
			pg_cpu_inactive(cp);
		pgps[id] = cp->cpu_pg;
		pg_cpu_bootstrap(cp);
	}
	start_cpus();

	/*
	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
	 * not paused. Use two separate loops here so that we do not
	 * initialize PG data for CPUs until all the old PG data structures
	 * are torn down.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_fini(cp, pgps[id]);
		mpo_cpu_remove(id);
	}

	/*
	 * Initialize PG data for each CPU, but leave the bootstrapped
	 * PG structure in place to avoid running with any PGs containing
	 * nothing but inactive CPUs.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		mpo_cpu_add(mdp, id);
		pgps[id] = pg_cpu_init(cp, B_TRUE);
	}

	/*
	 * Now that PG data has been initialized for all CPUs in the
	 * system, replace the bootstrapped PG structure with the
	 * initialized PG structure and call pg_cpu_active for each CPU.
	 */
	pause_cpus(NULL, NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cp->cpu_pg = pgps[id];
		if ((cp->cpu_flags & CPU_OFFLINE) == 0)
			pg_cpu_active(cp);
	}
	start_cpus();

	mutex_exit(&cpu_lock);

	(void) md_fini_handle(mdp);
}

/*
 * Wrapper for the Sun Cluster error decoding function.
 */
static int
cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
{
	const char	*decoded;
	size_t		decoded_len;

	ASSERT(error_reason != NULL);
	ASSERT(max_reason_len > 0);

	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);

	if (cl_suspend_error_decode == NULL)
		return (-1);

	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
		return (-1);

	/* Get the number of non-NUL bytes */
	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
		return (-1);

	bcopy(decoded, error_reason, decoded_len);

	/*
	 * The error string returned from cl_suspend_error_decode
	 * should be NUL-terminated, but set the terminator here
	 * because we only copied the non-NUL bytes. If the decoded
	 * string was not NUL-terminated, this guarantees that
	 * error_reason will be.
	 */
	error_reason[decoded_len] = '\0';

	return (0);
}

/*
 * Wrapper for the Sun Cluster pre-suspend callback.
 */
static int
cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_pre_callback != NULL) {
		rv = (*cl_suspend_pre_callback)();
		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason, max_reason_len,
				    SC_PRE_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Wrapper for the Sun Cluster post-suspend callback.
 */
static int
cluster_post_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_post_callback != NULL) {
		rv = (*cl_suspend_post_callback)();
		DBG("suspend: cl_suspend_post_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason,
				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Execute pre-suspend callbacks, preparing the system for a suspend
 * operation. Returns zero on success, non-zero on failure. Sets the
 * recovered argument to indicate whether or not callbacks could be undone
 * in the event of a failure: if callbacks were successfully undone,
 * *recovered is set to B_TRUE; otherwise it is set to B_FALSE. *recovered
 * is only meaningful when a non-zero value is returned. Must be called
 * successfully before suspend_start can be called. Callers should first
 * call suspend_supported to determine if OS suspend is supported.
 */
int
suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
{
	int rv;

	ASSERT(recovered != NULL);

	/*
	 * Return an error if suspend_pre is erroneously called
	 * when OS suspend is not supported.
	 */
	ASSERT(suspend_supported());
	if (!suspend_supported()) {
		DBG("suspend: suspend_pre called without suspend support");
		*recovered = B_TRUE;
		return (ENOTSUP);
	}
	DBG("suspend: %s", __func__);

	rv = cluster_pre_wrapper(error_reason, max_reason_len);

	/*
	 * At present, only one pre-suspend operation exists.
	 * If it fails, no recovery needs to be done.
	 */
	if (rv != 0)
		*recovered = B_TRUE;

	return (rv);
}

/*
 * Execute post-suspend callbacks. Returns zero on success, non-zero on
 * failure. Must be called after suspend_start is called, regardless of
 * whether or not suspend_start is successful.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);
	return (cluster_post_wrapper(error_reason, max_reason_len));
}
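
/*
 * Illustrative sequencing sketch (not code from this file): an external
 * sequencer is expected to drive the entry points in this order, calling
 * suspend_post whether or not suspend_start succeeds.
 *
 *	char reason[SC_FAIL_STR_MAX];
 *	boolean_t recovered;
 *
 *	if (suspend_supported() &&
 *	    suspend_pre(reason, sizeof (reason), &recovered) == 0) {
 *		(void) suspend_start(reason, sizeof (reason));
 *		(void) suspend_post(reason, sizeof (reason));
 *	}
 */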

/*
 * Suspends the OS by pausing CPUs and calling into the HV to initiate
 * the suspend. When the HV routine hv_guest_suspend returns, the system
 * will be resumed. Must be called after a successful call to suspend_pre.
 * suspend_post must be called after suspend_start, whether or not
 * suspend_start returns an error.
 */
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
	uint64_t	source_tick;
	uint64_t	source_stick;
	uint64_t	rv;
	timestruc_t	source_tod;
	int		spl;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	sfmmu_ctxdoms_lock();

	mutex_enter(&cpu_lock);

	/* Suspend the watchdog */
	watchdog_suspend();

	/* Record the TOD */
	mutex_enter(&tod_lock);
	source_tod = tod_get();
	mutex_exit(&tod_lock);

	/* Pause all other CPUs */
	pause_cpus(NULL, NULL);
	DBG_PROM("suspend: CPUs paused\n");

	/* Suspend cyclics */
	cyclic_suspend();
	DBG_PROM("suspend: cyclics suspended\n");

	/* Disable interrupts */
	spl = spl8();
	DBG_PROM("suspend: spl8()\n");

	source_tick = gettick_counter();
	source_stick = gettick();
	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

	/*
	 * Call into the HV to initiate the suspend. hv_guest_suspend()
	 * returns after the guest has been resumed or if the suspend
	 * operation failed or was cancelled. After a successful suspend,
	 * the %tick and %stick registers may have changed by an amount
	 * that is not proportional to the amount of time that has passed.
	 * They may have jumped forwards or backwards. Some variation is
	 * allowed and accounted for using suspend_tick_stick_max_delta,
	 * but otherwise this jump must be uniform across all CPUs and we
	 * operate under the assumption that it is (we maintain two global
	 * offset variables, one for %tick and one for %stick).
	 */
	DBG_PROM("suspend: suspending...\n");
	rv = hv_guest_suspend();
	if (rv != 0) {
		splx(spl);
		cyclic_resume();
		start_cpus();
		watchdog_resume();
		mutex_exit(&cpu_lock);
		sfmmu_ctxdoms_unlock();
		DBG("suspend: failed, rv: %lu\n", rv);
		return (rv);
	}

	suspend_count++;

	/* Update the global tick and stick offsets and the preserved TOD */
	set_tick_offsets(source_tick, source_stick, &source_tod);

	/* Ensure new offsets are globally visible before resuming CPUs */
	membar_sync();

	/* Enable interrupts */
	splx(spl);

	/* Set the {%tick,%stick}.NPT bits on all CPUs */
	if (enable_user_tick_stick_emulation) {
		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
		xt_sync(cpu_ready_set);
		ASSERT(gettick_npt() != 0);
		ASSERT(getstick_npt() != 0);
	}

	/* If emulation is enabled, but not currently active, enable it */
	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
		tick_stick_emulation_active = B_TRUE;
	}

	sfmmu_ctxdoms_remove();

	/* Resume cyclics, unpause CPUs */
	cyclic_resume();
	start_cpus();

	/* Set the TOD */
	mutex_enter(&tod_lock);
	tod_set(source_tod);
	mutex_exit(&tod_lock);

	/* Re-enable the watchdog */
	watchdog_resume();

	mutex_exit(&cpu_lock);

	/* Download the latest MD */
	if ((rv = mach_descrip_update()) != 0)
		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %lu",
		    rv);

	sfmmu_ctxdoms_update();
	sfmmu_ctxdoms_unlock();

	/* Get the new MD, update CPU mappings/relationships */
	if (suspend_update_cpu_mappings)
		update_cpu_mappings();

	DBG("suspend: target tick: 0x%lx", gettick_counter());
	DBG("suspend: target stick: 0x%llx", gettick());
	DBG("suspend: user %%tick/%%stick emulation is %d",
	    tick_stick_emulation_active);
	DBG("suspend: finished");

	return (0);
}