1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  27  */
  28 
  29 /*
  30  * When the operating system detects that it is in an invalid state, a panic
  31  * is initiated in order to minimize potential damage to user data and to
  32  * facilitate debugging.  There are three major tasks to be performed in
  33  * a system panic: recording information about the panic in memory (and thus
  34  * making it part of the crash dump), synchronizing the file systems to
  35  * preserve user file data, and generating the crash dump.  We define the
  36  * system to be in one of four states with respect to the panic code:
  37  *
  38  * CALM    - the state of the system prior to any thread initiating a panic
  39  *
  40  * QUIESCE - the state of the system when the first thread to initiate
  41  *           a system panic records information about the cause of the panic
  42  *           and renders the system quiescent by stopping other processors
  43  *
 * SYNC    - the state of the system when we synchronize the file systems
 *
 * DUMP    - the state of the system when we generate the crash dump
  46  *
  47  * The transitions between these states are irreversible: once we begin
  48  * panicking, we only make one attempt to perform the actions associated with
  49  * each state.
  50  *
  51  * The panic code itself must be re-entrant because actions taken during any
  52  * state may lead to another system panic.  Additionally, any Solaris
  53  * thread may initiate a panic at any time, and so we must have synchronization
  54  * between threads which attempt to initiate a state transition simultaneously.
  55  * The panic code makes use of a special locking primitive, a trigger, to
  56  * perform this synchronization.  A trigger is simply a word which is set
  57  * atomically and can only be set once.  We declare three triggers, one for
  58  * each transition between the four states.  When a thread enters the panic
  59  * code it attempts to set each trigger; if it fails it moves on to the
  60  * next trigger.  A special case is the first trigger: if two threads race
  61  * to perform the transition to QUIESCE, the losing thread may execute before
  62  * the winner has a chance to stop its CPU.  To solve this problem, we have
  63  * the loser look ahead to see if any other triggers are set; if not, it
  64  * presumes a panic is underway and simply spins.  Unfortunately, since we
  65  * are panicking, it is not possible to know this with absolute certainty.
  66  *
  67  * There are two common reasons for re-entering the panic code once a panic
  68  * has been initiated: (1) after we debug_enter() at the end of QUIESCE,
  69  * the operator may type "sync" instead of "go", and the PROM's sync callback
  70  * routine will invoke panic(); (2) if the clock routine decides that sync
  71  * or dump is not making progress, it will invoke panic() to force a timeout.
  72  * The design assumes that a third possibility, another thread causing an
  73  * unrelated panic while sync or dump is still underway, is extremely unlikely.
  74  * If this situation occurs, we may end up triggering dump while sync is
  75  * still in progress.  This third case is considered extremely unlikely because
  76  * all other CPUs are stopped and low-level interrupts have been blocked.
  77  *
  78  * The panic code is entered via a call directly to the vpanic() function,
  79  * or its varargs wrappers panic() and cmn_err(9F).  The vpanic routine
  80  * is implemented in assembly language to record the current machine
  81  * registers, attempt to set the trigger for the QUIESCE state, and
  82  * if successful, switch stacks on to the panic_stack before calling into
  83  * the common panicsys() routine.  The first thread to initiate a panic
  84  * is allowed to make use of the reserved panic_stack so that executing
  85  * the panic code itself does not overwrite valuable data on that thread's
  86  * stack *ahead* of the current stack pointer.  This data will be preserved
  87  * in the crash dump and may prove invaluable in determining what this
  88  * thread has previously been doing.  The first thread, saved in panic_thread,
  89  * is also responsible for stopping the other CPUs as quickly as possible,
  90  * and then setting the various panic_* variables.  Most important among
  91  * these is panicstr, which allows threads to subsequently bypass held
  92  * locks so that we can proceed without ever blocking.  We must stop the
  93  * other CPUs *prior* to setting panicstr in case threads running there are
  94  * currently spinning to acquire a lock; we want that state to be preserved.
  95  * Every thread which initiates a panic has its T_PANIC flag set so we can
  96  * identify all such threads in the crash dump.
  97  *
  98  * The panic_thread is also allowed to make use of the special memory buffer
  99  * panicbuf, which on machines with appropriate hardware is preserved across
 100  * reboots.  We allow the panic_thread to store its register set and panic
 101  * message in this buffer, so even if we fail to obtain a crash dump we will
 102  * be able to examine the machine after reboot and determine some of the
 103  * state at the time of the panic.  If we do get a dump, the panic buffer
 104  * data is structured so that a debugger can easily consume the information
 105  * therein (see <sys/panic.h>).
 106  *
 * Each platform or architecture is required to implement the functions
 * panic_enter_hw() to perform platform tasks on entry to panicsys(),
 * panic_savetrap() to record trap-specific information to panicbuf,
 * panic_saveregs() to record a register set to panicbuf, panic_stopcpus()
 * to halt all CPUs but the panicking CPU, panic_quiesce_hw() to perform
 * miscellaneous platform-specific tasks *after* panicstr is set,
 * panic_showtrap() to print trap-specific information to the console,
 * and panic_dump_hw() to perform platform tasks prior to calling dumpsys().
 114  *
 115  * A Note on Word Formation, courtesy of the Oxford Guide to English Usage:
 116  *
 117  * Words ending in -c interpose k before suffixes which otherwise would
 118  * indicate a soft c, and thus the verb and adjective forms of 'panic' are
 119  * spelled "panicked", "panicking", and "panicky" respectively.  Use of
 120  * the ill-conceived "panicing" and "panic'd" is discouraged.
 121  */
 122 
 123 #include <sys/types.h>
 124 #include <sys/varargs.h>
 125 #include <sys/sysmacros.h>
 126 #include <sys/cmn_err.h>
 127 #include <sys/cpuvar.h>
 128 #include <sys/thread.h>
 129 #include <sys/t_lock.h>
 130 #include <sys/cred.h>
 131 #include <sys/systm.h>
 132 #include <sys/archsystm.h>
 133 #include <sys/uadmin.h>
 134 #include <sys/callb.h>
 135 #include <sys/vfs.h>
 136 #include <sys/log.h>
 137 #include <sys/disp.h>
 138 #include <sys/param.h>
 139 #include <sys/dumphdr.h>
 140 #include <sys/ftrace.h>
 141 #include <sys/reboot.h>
 142 #include <sys/debug.h>
 143 #include <sys/stack.h>
 144 #include <sys/spl.h>
 145 #include <sys/errorq.h>
 146 #include <sys/panic.h>
 147 #include <sys/fm/util.h>
 148 #include <sys/clock_impl.h>
 149 
/*
 * Panic variables which are set once during the QUIESCE state by the
 * first thread to initiate a panic.  These are examined by post-mortem
 * debugging tools; the inconsistent use of 'panic' versus 'panic_' in
 * the variable naming is historical and allows legacy tools to work.
 *
 * With the exception of panic_stack, all of these are assigned in the
 * on_panic_stack branch of panicsys(), after panic_stopcpus() has been
 * called.  panic_schedflag, panic_bound_cpu, panic_preempt and panic_pcb
 * hold the values the thread had on entry to panicsys(), i.e. before
 * panicsys() itself modified the corresponding thread fields.
 */
/* Align panic_stack on a STACK_ALIGN boundary. */
#pragma align STACK_ALIGN(panic_stack)
char panic_stack[PANICSTKSIZE];         /* reserved stack for panic_thread */
kthread_t *panic_thread;                /* first thread to call panicsys() */
cpu_t panic_cpu;                        /* cpu from first call to panicsys() */
label_t panic_regs;                     /* setjmp label from panic_thread */
label_t panic_pcb;                      /* t_pcb at time of panic */
struct regs *panic_reg;                 /* regs struct from first panicsys() */
char *volatile panicstr;                /* format string to first panicsys() */
va_list panicargs;                      /* arguments to first panicsys() */
clock_t panic_lbolt;                    /* lbolt at time of panic */
int64_t panic_lbolt64;                  /* lbolt64 at time of panic */
hrtime_t panic_hrtime;                  /* hrtime at time of panic */
timespec_t panic_hrestime;              /* hrestime at time of panic */
int panic_ipl;                          /* ipl on panic_cpu at time of panic */
ushort_t panic_schedflag;               /* t_schedflag for panic_thread */
cpu_t *panic_bound_cpu;                 /* t_bound_cpu for panic_thread */
char panic_preempt;                     /* t_preempt for panic_thread */
 173 
/*
 * Panic variables which can be set via /etc/system or patched while
 * the system is in operation.  Again, the stupid names are historic.
 *
 * panicsys() consumes them as follows: halt_on_panic selects
 * mdboot(A_REBOOT, AD_HALT, ...) over mdboot(A_REBOOT, panic_bootfcn,
 * panic_bootstr, ...); nopanicdebug suppresses every debug_enter() call;
 * a non-zero in_sync causes the vfs_syncall() step to be skipped.
 */
char *panic_bootstr = NULL;             /* mdboot string to use after panic */
int panic_bootfcn = AD_BOOT;            /* mdboot function to use after panic */
int halt_on_panic = 0;                  /* halt after dump instead of reboot? */
int nopanicdebug = 0;                   /* reboot instead of call debugger? */
int in_sync = 0;                        /* skip vfs_syncall() and just dump? */

/*
 * The do_polled_io flag is set by the panic code to inform the SCSI subsystem
 * to use polled mode instead of interrupt-driven i/o.  panicsys() sets it to 1
 * immediately before calling vfs_syncall() and again before calling dumpsys().
 */
int do_polled_io = 0;

/*
 * The panic_forced flag is set by the uadmin A_DUMP code to inform the
 * panic subsystem that it should not attempt an initial debug_enter.
 * Only the debug_enter() at the end of QUIESCE tests this flag; the later
 * debug_enter() taken when the dump trigger is already set does not.
 */
int panic_forced = 0;
 195 
/*
 * Triggers for panic state transitions:
 *
 * panic_sync and panic_dump are set via panic_trigger() in panicsys().
 * panic_quiesce is not set by panicsys(); per the block comment at the
 * top of this file, vpanic() attempts to set it before calling panicsys().
 */
int panic_quiesce;                      /* trigger for CALM    -> QUIESCE */
int panic_sync;                         /* trigger for QUIESCE -> SYNC */
int panic_dump;                         /* trigger for SYNC    -> DUMP */

/*
 * Variable signifying quiesce(9E) is in progress.
 * NOTE(review): not read or written by panicsys() or panic(); presumably
 * maintained and consumed elsewhere -- confirm against its users.
 */
volatile int quiesce_active = 0;
 207 
/*
 * Common panic routine, reached from vpanic() (see the block comment at the
 * top of this file).  It carries the system through the QUIESCE, SYNC and
 * DUMP states and then reboots or halts via mdboot().
 *
 *      format, alist   - printf-style panic message and its arguments
 *      rp              - register set of the panicking thread; saved to
 *                        panicbuf by panic_saveregs() when the thread has
 *                        no t_panic_trap, and passed to traceregs()
 *      on_panic_stack  - non-zero for the caller that is to perform the
 *                        one-time QUIESCE work (record the panic_* variables,
 *                        fill panicbuf, stop the other CPUs); per the file
 *                        header this is the thread that won the QUIESCE
 *                        trigger and was switched on to panic_stack
 *
 * A caller with on_panic_stack == 0 either continues with SYNC/DUMP (if a
 * later trigger or panicstr is already set, i.e. this is a re-entry into an
 * established panic) or goes straight to the spin loop.
 *
 * This function has no return path: all flows end in the infinite loop at
 * the 'spin' label, which is also reached if mdboot() returns.
 */
void
panicsys(const char *format, va_list alist, struct regs *rp, int on_panic_stack)
{
        /*
         * s is the priority level in effect before spl8(); it is later
         * recorded as panic_ipl and handed to the platform hooks.
         */
        int s = spl8();
        kthread_t *t = curthread;
        cpu_t *cp = CPU;

        /* Original interrupt stack state; only valid if intr_stack != NULL. */
        caddr_t intr_stack = NULL;
        uint_t intr_actv;

        /*
         * Snapshot the thread fields that are about to be modified below so
         * that the pre-panic values can be stored in the panic_* variables.
         */
        ushort_t schedflag = t->t_schedflag;
        cpu_t *bound_cpu = t->t_bound_cpu;
        char preempt = t->t_preempt;
        label_t pcb = t->t_pcb;

        /* Record the current context in t_pcb (copied to panic_regs below). */
        (void) setjmp(&t->t_pcb);
        /* Mark every thread that initiates a panic (see file header). */
        t->t_flag |= T_PANIC;

        /*
         * Bind the thread to the current CPU and bump its preemption count.
         * NOTE(review): presumably this keeps the panicking thread resident,
         * on this CPU and non-preemptible -- confirm against thread.h.
         */
        t->t_schedflag |= TS_DONT_SWAP;
        t->t_bound_cpu = cp;
        t->t_preempt++;

        panic_enter_hw(s);

        /*
         * If we're on the interrupt stack and an interrupt thread is available
         * in this CPU's pool, preserve the interrupt stack by detaching an
         * interrupt thread and making its stack the intr_stack.
         */
        if (CPU_ON_INTR(cp) && cp->cpu_intr_thread != NULL) {
                kthread_t *it = cp->cpu_intr_thread;

                /* Remember the originals; restored into panic_cpu below. */
                intr_stack = cp->cpu_intr_stack;
                intr_actv = cp->cpu_intr_actv;

                cp->cpu_intr_stack = thread_stk_init(it->t_stk);
                cp->cpu_intr_thread = it->t_link;

                /*
                 * Clear only the high level bits of cpu_intr_actv.
                 * We want to indicate that high-level interrupts are
                 * not active without destroying the low-level interrupt
                 * information stored there.
                 */
                cp->cpu_intr_actv &= ((1 << (LOCK_LEVEL + 1)) - 1);
        }

        /*
         * Record one-time panic information and quiesce the other CPUs.
         * Then print out the panic message and stack trace.
         */
        if (on_panic_stack) {
                panic_data_t *pdp = (panic_data_t *)panicbuf;

                /*
                 * Fill in the panicbuf header; the formatted panic message
                 * is written at offset pd_msgoff further below.
                 */
                pdp->pd_version = PANICBUFVERS;
                pdp->pd_msgoff = sizeof (panic_data_t) - sizeof (panic_nv_t);

                (void) strncpy(pdp->pd_uuid, dump_get_uuid(),
                    sizeof (pdp->pd_uuid));

                /* Prefer trap-specific state when the thread recorded a trap. */
                if (t->t_panic_trap != NULL)
                        panic_savetrap(pdp, t->t_panic_trap);
                else
                        panic_saveregs(pdp, rp);

                (void) vsnprintf(&panicbuf[pdp->pd_msgoff],
                    PANICBUFSIZE - pdp->pd_msgoff, format, alist);

                /*
                 * Call into the platform code to stop the other CPUs.
                 * We currently have all interrupts blocked, and expect that
                 * the platform code will lower ipl only as far as needed to
                 * perform cross-calls, and will acquire as *few* locks as is
                 * possible -- panicstr is not set so we can still deadlock.
                 */
                panic_stopcpus(cp, t, s);

                /*
                 * The other CPUs have been asked to stop; now publish the
                 * one-time panic state.  panicstr is deliberately set only
                 * after panic_stopcpus() (see file header).
                 */
                panicstr = (char *)format;
                va_copy(panicargs, alist);
                panic_lbolt = LBOLT_NO_ACCOUNT;
                panic_lbolt64 = LBOLT_NO_ACCOUNT64;
                panic_hrestime = hrestime;
                panic_hrtime = gethrtime_waitfree();
                panic_thread = t;
                panic_regs = t->t_pcb;
                panic_reg = rp;
                panic_cpu = *cp;
                panic_ipl = spltoipl(s);
                /* These four are the values captured on entry, not current. */
                panic_schedflag = schedflag;
                panic_bound_cpu = bound_cpu;
                panic_preempt = preempt;
                panic_pcb = pcb;

                /*
                 * If the interrupt stack was swapped out above, make the
                 * panic_cpu copy reflect the original interrupt state.
                 */
                if (intr_stack != NULL) {
                        panic_cpu.cpu_intr_stack = intr_stack;
                        panic_cpu.cpu_intr_actv = intr_actv;
                }

                /*
                 * Lower ipl to 10 to keep clock() from running, but allow
                 * keyboard interrupts to enter the debugger.  These callbacks
                 * are executed with panicstr set so they can bypass locks.
                 */
                splx(ipltospl(CLOCK_LEVEL));
                panic_quiesce_hw(pdp);
                (void) FTRACE_STOP();
                (void) callb_execute_class(CB_CL_PANIC, NULL);

                if (log_intrq != NULL)
                        log_flushq(log_intrq);

                /*
                 * If log_consq has been initialized and syslogd has started,
                 * print any messages in log_consq that haven't been consumed.
                 */
                if (log_consq != NULL && log_consq != log_backlogq)
                        log_printq(log_consq);

                fm_banner();

                /*
                 * Note that on x86 only the "panic[cpu]/thread" prefix below
                 * is conditional; the message itself is always printed.
                 */
#if defined(__x86)
                /*
                 * A hypervisor panic originates outside of Solaris, so we
                 * don't want to prepend the panic message with misleading
                 * pointers from within Solaris.
                 */
                if (!IN_XPV_PANIC())
#endif
                        printf("\n\rpanic[cpu%d]/thread=%p: ", cp->cpu_id,
                            (void *)t);
                vprintf(format, alist);
                printf("\n\n");

                if (t->t_panic_trap != NULL) {
                        panic_showtrap(t->t_panic_trap);
                        printf("\n");
                }

                traceregs(rp);
                printf("\n");

                /*
                 * Initial debugger entry at the end of QUIESCE; skipped when
                 * nopanicdebug or panic_forced is set.
                 */
                if (((boothowto & RB_DEBUG) || obpdebug) &&
                    !nopanicdebug && !panic_forced) {
                        if (dumpvp != NULL) {
                                debug_enter("panic: entering debugger "
                                    "(continue to save dump)");
                        } else {
                                debug_enter("panic: entering debugger "
                                    "(no dump device, continue to reboot)");
                        }
                }

        } else if (panic_dump != 0 || panic_sync != 0 || panicstr != NULL) {
                /*
                 * Re-entry into a panic that is already established: just
                 * report this panic and carry on with SYNC/DUMP below.
                 */
                printf("\n\rpanic[cpu%d]/thread=%p: ", cp->cpu_id, (void *)t);
                vprintf(format, alist);
                printf("\n");
        } else
                /*
                 * Not the QUIESCE winner and no sign yet of an established
                 * panic: presume one is underway and spin (see file header).
                 */
                goto spin;

        /*
         * Prior to performing sync or dump, we make sure that do_polled_io is
         * set, but we'll leave ipl at 10; deadman(), a CY_HIGH_LEVEL cyclic,
         * will re-enter panic if we are not making progress with sync or dump.
         */

        /*
         * Sync the filesystems.  Reset t_cred if not set because much of
         * the filesystem code depends on CRED() being valid.
         */
        if (!in_sync && panic_trigger(&panic_sync)) {
                if (t->t_cred == NULL)
                        t->t_cred = kcred;
                splx(ipltospl(CLOCK_LEVEL));
                do_polled_io = 1;
                vfs_syncall();
        }

        /*
         * Take the crash dump.  If the dump trigger is already set, try to
         * enter the debugger again before rebooting the system.
         */
        if (panic_trigger(&panic_dump)) {
                panic_dump_hw(s);
                splx(ipltospl(CLOCK_LEVEL));
                errorq_panic();
                do_polled_io = 1;
                dumpsys();
        } else if (((boothowto & RB_DEBUG) || obpdebug) && !nopanicdebug) {
                debug_enter("panic: entering debugger (continue to reboot)");
        } else
                printf("dump aborted: please record the above information!\n");

        /* Halt or reboot; if mdboot() returns we fall into the spin loop. */
        if (halt_on_panic)
                mdboot(A_REBOOT, AD_HALT, NULL, B_FALSE);
        else
                mdboot(A_REBOOT, panic_bootfcn, panic_bootstr, B_FALSE);
spin:
        /*
         * Restore ipl to at most CLOCK_LEVEL so we don't end up spinning
         * and unable to jump into the debugger.
         */
        splx(MIN(s, ipltospl(CLOCK_LEVEL)));
        for (;;)
                ;
}
 413 
 414 void
 415 panic(const char *format, ...)
 416 {
 417         va_list alist;
 418 
 419         va_start(alist, format);
 420         vpanic(format, alist);
 421         va_end(alist);
 422 }