1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Kernel Error Queues
  28  *
  29  * A common problem when handling hardware error traps and interrupts is that
  30  * these errors frequently must be handled at high interrupt level, where
  31  * reliably producing error messages and safely examining and manipulating
  32  * other kernel state may not be possible.  The kernel error queue primitive is
  33  * a common set of routines that allow a subsystem to maintain a queue of
  34  * errors that can be processed by an explicit call from a safe context or by a
  35  * soft interrupt that fires at a specific lower interrupt level.  The queue
  36  * management code also ensures that if the system panics, all in-transit
  37  * errors are logged prior to reset.  Each queue has an associated kstat for
  38  * observing the number of errors dispatched and logged, and mdb(1) debugging
  39  * support is provided for live and post-mortem observability.
  40  *
  41  * Memory Allocation
  42  *
  43  *      All of the queue data structures are allocated in advance as part of
  44  *      the errorq_create() call.  No additional memory allocations are
  45  *      performed as part of errorq_dispatch(), errorq_reserve(),
  46  *      errorq_commit() or errorq_drain().  This design
  47  *      facilitates reliable error queue processing even when the system is low
  48  *      on memory, and ensures that errorq_dispatch() can be called from any
  49  *      context.  When the queue is created, the maximum queue length is
  50  *      specified as a parameter to errorq_create() and errorq_nvcreate().  This
  51  *      length should represent a reasonable upper bound on the number of
  52  *      simultaneous errors.  If errorq_dispatch() or errorq_reserve() is
  53  *      invoked and no free queue elements are available, the error is
  54  *      dropped and will not be logged.  Typically, the queue will only be
  55  *      exhausted by an error storm, and in this case
  56  *      the earlier errors provide the most important data for analysis.
  57  *      When a new error is dispatched, the error data is copied into the
  58  *      preallocated queue element so that the caller's buffer can be reused.
  59  *
  60  *      When a new error is reserved, an element is moved from the free pool
  61  *      and returned to the caller.  The element buffer data, eqe_data, may be
  62  *      managed by the caller and dispatched to the errorq by calling
  63  *      errorq_commit().  This is useful for additions to errorqs created
  64  *      with errorq_nvcreate() to handle name-value pair (nvpair) data.
  65  *      See below for a discussion of nvlist errorqs.
  66  *
  67  * Queue Drain Callback
  68  *
  69  *      When the error queue is drained, the caller's queue drain callback is
  70  *      invoked with a pointer to the saved error data.  This function may be
  71  *      called from passive kernel context or soft interrupt context at or
  72  *      below LOCK_LEVEL, or as part of panic().  As such, the callback
  73  *      should do little more than call cmn_err() (but NOT with CE_PANIC).
  74  *      The callback must not call panic(), attempt to allocate memory, or wait
  75  *      on a condition variable.  The callback may not call errorq_destroy()
  76  *      or errorq_drain() on the same error queue that called it.
  77  *
  78  *      The queue drain callback will always be called for each pending error
  79  *      in the order in which errors were enqueued (oldest to newest).  The
  80  *      queue drain callback is guaranteed to provide at-*least*-once semantics
  81  *      for all errors that are successfully dispatched (i.e. for which
  82  *      errorq_dispatch() has successfully completed).  If an unrelated panic
  83  *      occurs while the queue drain callback is running on a vital queue, the
  84  *      panic subsystem will continue the queue drain and the callback may be
  85  *      invoked again for the same error.  Therefore, the callback should
  86  *      restrict itself to logging messages and taking other actions that are
  87  *      not destructive if repeated.
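      *
      *      As an illustration only, a drain callback that follows these rules
      *      might be sketched as follows.  The xx_errdata_t structure, the
      *      xx_drain() name and the message text are hypothetical, and the
      *      argument list assumes the (private, data, element) callback shape
      *      used by errorq_drain() below; the points of interest are that the
      *      callback merely formats and logs the saved data, allocates nothing,
      *      and is harmless if repeated:
      *
      *          typedef struct xx_errdata {
      *              uint64_t xe_addr;
      *              uint32_t xe_synd;
      *          } xx_errdata_t;
      *
      *          static void
      *          xx_drain(void *private, const void *data,
      *              const errorq_elem_t *eqep)
      *          {
      *              const xx_errdata_t *xep = data;
      *
      *              cmn_err(CE_WARN, "xx: error at address %llx, syndrome %x",
      *                  (u_longlong_t)xep->xe_addr, xep->xe_synd);
      *          }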
  88  *
  89  * Name-Value Pair Error Queues
  90  *
  91  *      During error handling, it may be more convenient to store error
  92  *      queue element data as a fixed buffer of name-value pairs.  The
  93  *      nvpair library allows construction and destruction of nvlists
  94  *      in pre-allocated memory buffers.
  95  *
  96  *      Error queues created via errorq_nvcreate() store queue element
  97  *      data as fixed buffer nvlists (ereports).  errorq_reserve()
  98  *      allocates an errorq element from eqp->eq_bitmap and returns a valid
  99  *      pointer to an errorq_elem_t (queue element) and a pre-allocated
 100  *      fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
 101  *      to the nvlist to add name-value ereport members prior to
 102  *      dispatching the error queue element in errorq_commit().
 103  *
 104  *      Once dispatched, the drain function will return the element to
 105  *      eqp->eq_bitmap and reset the associated nv_alloc structure.
 106  *      errorq_cancel() may be called to cancel the reservation of an
 107  *      element that was never dispatched (committed).  This is useful in
 108  *      cases where a programming error prevents a queue element from being
 109  *      dispatched.
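      *
      *      As a hedged sketch, the usual flow for an nvlist errorq is to
      *      reserve an element, obtain its nvlist, add name-value members and
      *      then commit the element.  The xx_nv_queue variable, the member
      *      names, and the addr value below are hypothetical; nvlist_add_string()
      *      and nvlist_add_uint64() are the standard kernel nvpair interfaces:
      *
      *          errorq_elem_t *eqep;
      *          nvlist_t *nvl;
      *
      *          if ((eqep = errorq_reserve(xx_nv_queue)) == NULL)
      *              return;
      *
      *          nvl = errorq_elem_nvl(xx_nv_queue, eqep);
      *          (void) nvlist_add_string(nvl, "class", "ereport.xx.example");
      *          (void) nvlist_add_uint64(nvl, "addr", addr);
      *          errorq_commit(xx_nv_queue, eqep, ERRORQ_ASYNC);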
 110  *
 111  * Queue Management
 112  *
 113  *      The queue element structures and error data buffers are allocated in
 114  *      two contiguous chunks as part of errorq_create() or errorq_nvcreate().
 115  *      Each queue element structure contains a next pointer,
 116  *      a previous pointer, and a pointer to the corresponding error data
 117  *      buffer.  The data buffer for a nvlist errorq is a shared buffer
 118  *      for the allocation of name-value pair lists. The elements are kept on
 119  *      one of four lists:
 120  *
 121  *      Unused elements are kept in the free pool, managed by eqp->eq_bitmap.
 122  *      The eqe_prev and eqe_next pointers are not used while in the free pool
 123  *      and will be set to NULL.
 124  *
 125  *      Pending errors are kept on the pending list, a singly-linked list
 126  *      pointed to by eqp->eq_pend, and linked together using eqe_prev.  This
 127  *      list is maintained in order from newest error to oldest.  The eqe_next
 128  *      pointer is not used by the pending list and will be set to NULL.
 129  *
 130  *      The processing list is a doubly-linked list pointed to by eqp->eq_phead
 131  *      (the oldest element) and eqp->eq_ptail (the newest element).  The
 132  *      eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the
 133  *      eqe_prev pointer is used to traverse from eq_ptail to eq_phead.  Once a
 134  *      queue drain operation begins, the current pending list is moved to the
 135  *      processing list in a two-phase commit fashion (eq_ptail being cleared
 136  *      at the beginning but eq_phead only at the end), allowing the panic code
 137  *      to always locate and process all pending errors in the event that a
 138  *      panic occurs in the middle of queue processing.
 139  *
 140  *      A fourth list is maintained for nvlist errorqs.  The dump list,
 141  *      eq_dump, is used to link all errorq elements that should be stored
 142  *      in a crash dump file in the event of a system panic.  During
 143  *      errorq_panic(), the list is created and subsequently traversed
 144  *      in errorq_dump() during the final phases of a crash dump.
 145  *
 146  * Platform Considerations
 147  *
 148  *      In order to simplify their implementation, error queues make use of the
 149  *      C wrappers for compare-and-swap.  If the platform itself does not
 150  *      support compare-and-swap in hardware and the kernel emulation routines
 151  *      are used instead, then the context in which errorq_dispatch() can be
 152  *      safely invoked is further constrained by the implementation of the
 153  *      compare-and-swap emulation.  Specifically, if errorq_dispatch() is
 154  *      called from a code path that can be executed above ATOMIC_LEVEL on such
 155  *      a platform, the dispatch code could potentially deadlock unless the
 156  *      corresponding error interrupt is blocked or disabled prior to calling
 157  *      errorq_dispatch().  Error queues should therefore be deployed with
 158  *      caution on these platforms.
 159  *
 160  * Interfaces
 161  *
 162  * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags);
 163  * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags);
 164  *
 165  *      Create a new error queue with the specified name, callback, and
 166  *      properties.  A pointer to the new error queue is returned upon success,
 167  *      or NULL is returned to indicate that the queue could not be created.
 168  *      This function must be called from passive kernel context with no locks
 169  *      held that can prevent a sleeping memory allocation from occurring.
 170  *      errorq_create() will return failure if the queue kstats cannot be
 171  *      created, or if a soft interrupt handler cannot be registered.
 172  *
 173  *      The queue 'name' is a string that is recorded for live and post-mortem
 174  *      examination by a debugger.  The queue callback 'func' will be invoked
 175  *      for each error drained from the queue, and will receive the 'private'
 176  *      pointer as its first argument.  The callback must obey the rules for
 177  *      callbacks described above.  The queue will have maximum length 'qlen'
 178  *      and each element will be able to record up to 'eltsize' bytes of data.
 179  *      The queue's soft interrupt (see errorq_dispatch(), below) will fire
 180  *      at 'ipl', which should not exceed LOCK_LEVEL.  The queue 'flags' may
 181  *      include the following flag:
 182  *
 183  *      ERRORQ_VITAL    - This queue contains information that is considered
 184  *         vital to problem diagnosis.  Error queues that are marked vital will
 185  *         be automatically drained by the panic subsystem prior to printing
 186  *         the panic messages to the console.
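      *
      *      A hedged example of queue creation is sketched below; the queue
      *      name, length, drain callback, IPL and element type are all
      *      hypothetical, and only the call shape follows the description
      *      above:
      *
      *          static errorq_t *xx_queue;
      *
      *          xx_queue = errorq_create("xx_ce_queue", xx_drain, NULL,
      *              64, sizeof (xx_errdata_t), 2, ERRORQ_VITAL);
      *          if (xx_queue == NULL)
      *              cmn_err(CE_WARN, "xx: failed to create xx_ce_queue");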
 187  *
 188  * void errorq_destroy(errorq);
 189  *
 190  *      Destroy the specified error queue.  The queue is drained of any
 191  *      pending elements and these are logged before errorq_destroy returns.
 192  *      Once errorq_destroy() begins draining the queue, any simultaneous
 193  *      calls to dispatch errors will result in the errors being dropped.
 194  *      The caller must invoke a higher-level abstraction (e.g. disabling
 195  *      an error interrupt) to ensure that error handling code does not
 196  *      attempt to dispatch errors to the queue while it is being freed.
 197  *
 198  * void errorq_dispatch(errorq, data, len, flag);
 199  *
 200  *      Attempt to enqueue the specified error data.  If a free queue element
 201  *      is available, the data is copied into a free element and placed on a
 202  *      pending list.  If no free queue element is available, the error is
 203  *      dropped.  The data length (len) is specified in bytes and should not
 204  *      exceed the queue's maximum element size.  If the data length is less
 205  *      than the maximum element size, the remainder of the queue element is
 206  *      filled with zeroes.  The flag parameter should be one of:
 207  *
 208  *      ERRORQ_ASYNC    - Schedule a soft interrupt at the previously specified
 209  *         IPL to asynchronously drain the queue on behalf of the caller.
 210  *
 211  *      ERRORQ_SYNC     - Do not schedule a soft interrupt to drain the queue.
 212  *         The caller is presumed to be calling errorq_drain() or panic() in
 213  *         the near future in order to drain the queue and log the error.
 214  *
 215  *      The errorq_dispatch() function may be called from any context, subject
 216  *      to the Platform Considerations described above.
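      *
      *      For illustration (a sketch only, reusing the hypothetical names
      *      from the examples above; addr and synd stand for whatever state
      *      the handler has captured), an error handler might copy its data
      *      into a local structure and dispatch it for asynchronous logging:
      *
      *          xx_errdata_t xe;
      *
      *          xe.xe_addr = addr;
      *          xe.xe_synd = synd;
      *          errorq_dispatch(xx_queue, &xe, sizeof (xe), ERRORQ_ASYNC);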
 217  *
 218  * void errorq_drain(errorq);
 219  *
 220  *      Drain the error queue of all pending errors.  The queue's callback
 221  *      function is invoked for each error in order from oldest to newest.
 222  *      This function may be used at or below LOCK_LEVEL or from panic context.
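      *
      *      For example (again using the hypothetical names above), a caller
      *      that dispatches with ERRORQ_SYNC is expected to drain the queue
      *      itself once it has returned to a safe context:
      *
      *          errorq_dispatch(xx_queue, &xe, sizeof (xe), ERRORQ_SYNC);
      *          ...
      *          errorq_drain(xx_queue);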
 223  *
 224  * errorq_elem_t *errorq_reserve(errorq);
 225  *
 226  *      Reserve an error queue element for later processing and dispatching.
 227  *      The element is returned to the caller, who may add error-specific
 228  *      data to the element.  The element is returned to the free pool either
 229  *      after errorq_commit() is called and the element has been processed,
 230  *      or immediately when errorq_cancel() is called.
 231  *
 232  * void errorq_commit(errorq, errorq_elem, flag);
 233  *
 234  *      Commit an errorq element (eqep) for dispatching; see
 235  *      errorq_dispatch().
 236  *
 237  * void errorq_cancel(errorq, errorq_elem);
 238  *
 239  *      Cancel a pending errorq element reservation.  The errorq element is
 240  *      returned to the free pool upon cancelation.
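      *
      *      A hedged sketch of the cancellation path is shown below; the
      *      xx_nv_queue variable and the xx_fill_ereport() helper are
      *      hypothetical.  If the caller cannot complete the reserved element,
      *      it simply returns the element to the free pool instead of
      *      committing it:
      *
      *          errorq_elem_t *eqep;
      *
      *          if ((eqep = errorq_reserve(xx_nv_queue)) == NULL)
      *              return;
      *
      *          if (xx_fill_ereport(errorq_elem_nvl(xx_nv_queue, eqep)) != 0) {
      *              errorq_cancel(xx_nv_queue, eqep);
      *              return;
      *          }
      *
      *          errorq_commit(xx_nv_queue, eqep, ERRORQ_ASYNC);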
 241  */
 242 
 243 #include <sys/errorq_impl.h>
 244 #include <sys/sysmacros.h>
 245 #include <sys/machlock.h>
 246 #include <sys/cmn_err.h>
 247 #include <sys/atomic.h>
 248 #include <sys/systm.h>
 249 #include <sys/kmem.h>
 250 #include <sys/conf.h>
 251 #include <sys/ddi.h>
 252 #include <sys/sunddi.h>
 253 #include <sys/bootconf.h>
 254 #include <sys/spl.h>
 255 #include <sys/dumphdr.h>
 256 #include <sys/compress.h>
 257 #include <sys/time.h>
 258 #include <sys/panic.h>
 259 #include <sys/bitmap.h>
 260 #include <sys/fm/protocol.h>
 261 #include <sys/fm/util.h>
 262 
 263 static struct errorq_kstat errorq_kstat_template = {
 264         { "dispatched", KSTAT_DATA_UINT64 },
 265         { "dropped", KSTAT_DATA_UINT64 },
 266         { "logged", KSTAT_DATA_UINT64 },
 267         { "reserved", KSTAT_DATA_UINT64 },
 268         { "reserve_fail", KSTAT_DATA_UINT64 },
 269         { "committed", KSTAT_DATA_UINT64 },
 270         { "commit_fail", KSTAT_DATA_UINT64 },
 271         { "cancelled", KSTAT_DATA_UINT64 }
 272 };
 273 
 274 static uint64_t errorq_lost = 0;
 275 static errorq_t *errorq_list = NULL;
 276 static kmutex_t errorq_lock;
 277 static uint64_t errorq_vitalmin = 5;
 278 
 279 static uint_t
 280 errorq_intr(caddr_t eqp)
 281 {
 282         errorq_drain((errorq_t *)eqp);
 283         return (DDI_INTR_CLAIMED);
 284 }
 285 
 286 /*
 287  * Create a new error queue with the specified properties and add a software
 288  * interrupt handler and kstat for it.  This function must be called from
 289  * passive kernel context with no locks held that can prevent a sleeping
 290  * memory allocation from occurring.  This function will return NULL if the
 291  * softint or kstat for this queue cannot be created.
 292  */
 293 errorq_t *
 294 errorq_create(const char *name, errorq_func_t func, void *private,
 295     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
 296 {
 297         errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP);
 298         ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl);
 299         dev_info_t *dip = ddi_root_node();
 300 
 301         errorq_elem_t *eep;
 302         ddi_softintr_t id = NULL;
 303         caddr_t data;
 304 
 305         ASSERT(qlen != 0 && size != 0);
 306         ASSERT(ipl > 0 && ipl <= LOCK_LEVEL);
 307 
 308         /*
 309          * If a queue is created very early in boot before device tree services
 310          * are available, the queue softint handler cannot be created.  We
 311          * manually drain these queues and create their softint handlers when
 312          * it is safe to do so as part of errorq_init(), below.
 313          */
 314         if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id,
 315             &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
 316                 cmn_err(CE_WARN, "errorq_create: failed to register "
 317                     "IPL %u softint for queue %s", ipl, name);
 318                 kmem_free(eqp, sizeof (errorq_t));
 319                 return (NULL);
 320         }
 321 
 322         if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq",
 323             KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) /
 324             sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) {
 325                 cmn_err(CE_WARN, "errorq_create: failed to create kstat "
 326                     "for queue %s", name);
 327                 if (id != NULL)
 328                         ddi_remove_softintr(id);
 329                 kmem_free(eqp, sizeof (errorq_t));
 330                 return (NULL);
 331         }
 332 
 333         bcopy(&errorq_kstat_template, &eqp->eq_kstat,
 334             sizeof (struct errorq_kstat));
 335         eqp->eq_ksp->ks_data = &eqp->eq_kstat;
 336         eqp->eq_ksp->ks_private = eqp;
 337         kstat_install(eqp->eq_ksp);
 338 
 339         (void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN);
 340         eqp->eq_name[ERRORQ_NAMELEN] = '\0';
 341         eqp->eq_func = func;
 342         eqp->eq_private = private;
 343         eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP);
 344         eqp->eq_qlen = qlen;
 345         eqp->eq_size = size;
 346         eqp->eq_ipl = ipl;
 347         eqp->eq_flags = flags | ERRORQ_ACTIVE;
 348         eqp->eq_id = id;
 349         mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL);
 350         eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP);
 351         eqp->eq_phead = NULL;
 352         eqp->eq_ptail = NULL;
 353         eqp->eq_pend = NULL;
 354         eqp->eq_dump = NULL;
 355         eqp->eq_bitmap = kmem_zalloc(BT_SIZEOFMAP(qlen), KM_SLEEP);
 356         eqp->eq_rotor = 0;
 357 
 358         /*
 359          * Iterate over the array of errorq_elem_t structures and set each
 360          * element's data pointer.
 361          */
 362         for (eep = eqp->eq_elems, data = eqp->eq_data; qlen > 1; qlen--) {
 363                 eep->eqe_next = NULL;
 364                 eep->eqe_dump = NULL;
 365                 eep->eqe_prev = NULL;
 366                 eep->eqe_data = data;
 367                 data += size;
 368                 eep++;
 369         }
 370         eep->eqe_next = NULL;
 371         eep->eqe_prev = NULL;
 372         eep->eqe_data = data;
 373         eep->eqe_dump = NULL;
 374 
 375         /*
 376          * Once the errorq is initialized, add it to the global list of queues,
 377          * and then return a pointer to the new queue to the caller.
 378          */
 379         mutex_enter(&errorq_lock);
 380         eqp->eq_next = errorq_list;
 381         errorq_list = eqp;
 382         mutex_exit(&errorq_lock);
 383 
 384         return (eqp);
 385 }
 386 
 387 /*
 388  * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST
 389  * flag and initialize each element to have the start of its data region used
 390  * as an errorq_nvelem_t with a nvlist allocator that consumes the data region.
 391  */
 392 errorq_t *
 393 errorq_nvcreate(const char *name, errorq_func_t func, void *private,
 394     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
 395 {
 396         errorq_t *eqp;
 397         errorq_elem_t *eep;
 398 
 399         eqp = errorq_create(name, func, private, qlen,
 400             size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST);
 401 
 402         if (eqp == NULL)
 403                 return (NULL);
 404 
 405         mutex_enter(&eqp->eq_lock);
 406 
 407         for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) {
 408                 errorq_nvelem_t *eqnp = eep->eqe_data;
 409                 eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t);
 410                 eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size);
 411         }
 412 
 413         mutex_exit(&eqp->eq_lock);
 414         return (eqp);
 415 }
 416 
 417 /*
 418  * To destroy an error queue, we mark it as disabled and then explicitly drain
 419  * all pending errors.  Once the drain is complete, we can remove the queue
 420  * from the global list of queues examined by errorq_panic(), and then free
 421  * the various queue data structures.  The caller must use some higher-level
 422  * abstraction (e.g. disabling an error interrupt) to ensure that no one will
 423  * attempt to enqueue new errors while we are freeing this queue.
 424  */
 425 void
 426 errorq_destroy(errorq_t *eqp)
 427 {
 428         errorq_t *p, **pp;
 429         errorq_elem_t *eep;
 430         ulong_t i;
 431 
 432         ASSERT(eqp != NULL);
 433         eqp->eq_flags &= ~ERRORQ_ACTIVE;
 434         errorq_drain(eqp);
 435 
 436         mutex_enter(&errorq_lock);
 437         pp = &errorq_list;
 438 
 439         for (p = errorq_list; p != NULL; p = p->eq_next) {
 440                 if (p == eqp) {
 441                         *pp = p->eq_next;
 442                         break;
 443                 }
 444                 pp = &p->eq_next;
 445         }
 446 
 447         mutex_exit(&errorq_lock);
 448         ASSERT(p != NULL);
 449 
 450         if (eqp->eq_flags & ERRORQ_NVLIST) {
 451                 for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) {
 452                         errorq_nvelem_t *eqnp = eep->eqe_data;
 453                         fm_nva_xdestroy(eqnp->eqn_nva);
 454                 }
 455         }
 456 
 457         mutex_destroy(&eqp->eq_lock);
 458         kstat_delete(eqp->eq_ksp);
 459 
 460         if (eqp->eq_id != NULL)
 461                 ddi_remove_softintr(eqp->eq_id);
 462 
 463         kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t));
 464         kmem_free(eqp->eq_bitmap, BT_SIZEOFMAP(eqp->eq_qlen));
 465         kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size);
 466 
 467         kmem_free(eqp, sizeof (errorq_t));
 468 }
 469 
 470 /*
 471  * Private version of bt_availbit() which makes a best-effort attempt
 472  * at allocating in a round-robin fashion in order to facilitate post-mortem
 473  * diagnosis.
 474  */
 475 static index_t
 476 errorq_availbit(ulong_t *bitmap, size_t nbits, index_t curindex)
 477 {
 478         ulong_t bit, maxbit, bx;
 479         index_t rval, nextindex = curindex + 1;
 480         index_t nextword = nextindex >> BT_ULSHIFT;
 481         ulong_t nextbitindex = nextindex & BT_ULMASK;
 482         index_t maxindex = nbits - 1;
 483         index_t maxword = maxindex >> BT_ULSHIFT;
 484         ulong_t maxbitindex = maxindex & BT_ULMASK;
 485 
 486         /*
 487          * First check if there are still some bits remaining in the current
 488          * word, and see if any of those are available. We need to do this by
 489          * hand as the bt_availbit() function always starts at the beginning
 490          * of a word.
 491          */
 492         if (nextindex <= maxindex && nextbitindex != 0) {
 493                 maxbit = (nextword == maxword) ? maxbitindex : BT_ULMASK;
 494                 for (bx = 0, bit = 1; bx <= maxbit; bx++, bit <<= 1)
 495                         if (bx >= nextbitindex && !(bitmap[nextword] & bit))
 496                                 return ((nextword << BT_ULSHIFT) + bx);
 497                 nextword++;
 498         }
 499         /*
 500          * Now check if there are any words remaining before the end of the
 501          * bitmap. Use bt_availbit() to find any free bits.
 502          */
 503         if (nextword <= maxword)
 504                 if ((rval = bt_availbit(&bitmap[nextword],
 505                     nbits - (nextword << BT_ULSHIFT))) != -1)
 506                         return ((nextword << BT_ULSHIFT) + rval);
 507         /*
 508          * Finally loop back to the start and look for any free bits starting
 509          * from the beginning of the bitmap to the current rotor position.
 510          */
 511         return (bt_availbit(bitmap, nextindex));
 512 }
 513 
 514 /*
 515  * Dispatch a new error into the queue for later processing.  The specified
 516  * data buffer is copied into a preallocated queue element.  If 'len' is
 517  * smaller than the queue element size, the remainder of the queue element is
 518  * filled with zeroes.  This function may be called from any context subject
 519  * to the Platform Considerations described above.
 520  */
 521 void
 522 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag)
 523 {
 524         errorq_elem_t *eep, *old;
 525 
 526         if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
 527                 atomic_add_64(&errorq_lost, 1);
 528                 return; /* drop error if queue is uninitialized or disabled */
 529         }
 530 
 531         for (;;) {
 532                 int i, rval;
 533 
 534                 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
 535                     eqp->eq_rotor)) == -1) {
 536                         atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
 537                         return;
 538                 }
 539                 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
 540                 if (rval == 0) {
 541                         eqp->eq_rotor = i;
 542                         eep = &eqp->eq_elems[i];
 543                         break;
 544                 }
 545         }
 546 
 547         ASSERT(len <= eqp->eq_size);
 548         bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len));
 549 
 550         if (len < eqp->eq_size)
 551                 bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len);
 552 
 553         for (;;) {
 554                 old = eqp->eq_pend;
 555                 eep->eqe_prev = old;
 556                 membar_producer();
 557 
 558                 if (atomic_cas_ptr(&eqp->eq_pend, old, eep) == old)
 559                         break;
 560         }
 561 
 562         atomic_add_64(&eqp->eq_kstat.eqk_dispatched.value.ui64, 1);
 563 
 564         if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
 565                 ddi_trigger_softintr(eqp->eq_id);
 566 }
 567 
 568 /*
 569  * Drain the specified error queue by calling eq_func() for each pending error.
 570  * This function must be called at or below LOCK_LEVEL or from panic context.
 571  * In order to synchronize with other attempts to drain the queue, we acquire
 572  * the adaptive eq_lock, blocking other consumers.  Once this lock is held,
 573  * we must use compare-and-swap to move the pending list to the processing
 574  * list and to return elements to the free pool in order to synchronize
 575  * with producers, who do not acquire any locks and only use atomic set/clear.
 576  *
 577  * An additional constraint on this function is that if the system panics
 578  * while this function is running, the panic code must be able to detect and
 579  * handle all intermediate states and correctly dequeue all errors.  The
 580  * errorq_panic() function below will be used for detecting and handling
 581  * these intermediate states.  The comments in errorq_drain() below explain
 582  * how we make sure each intermediate state is distinct and consistent.
 583  */
 584 void
 585 errorq_drain(errorq_t *eqp)
 586 {
 587         errorq_elem_t *eep, *dep;
 588 
 589         ASSERT(eqp != NULL);
 590         mutex_enter(&eqp->eq_lock);
 591 
 592         /*
 593          * If there are one or more pending errors, set eq_ptail to point to
 594          * the first element on the pending list and then attempt to compare-
 595          * and-swap NULL to the pending list.  We use membar_producer() to
 596          * make sure that eq_ptail will be visible to errorq_panic() below
 597          * before the pending list is NULLed out.  This section is labeled
 598          * case (1) for errorq_panic, below.  If eq_ptail is not yet set (1A)
 599          * eq_pend has all the pending errors.  If atomic_cas_ptr fails or
 600          * has not been called yet (1B), eq_pend still has all the pending
 601          * errors.  If atomic_cas_ptr succeeds (1C), eq_ptail has all the
 602          * pending errors.
 603          */
 604         while ((eep = eqp->eq_pend) != NULL) {
 605                 eqp->eq_ptail = eep;
 606                 membar_producer();
 607 
 608                 if (atomic_cas_ptr(&eqp->eq_pend, eep, NULL) == eep)
 609                         break;
 610         }
 611 
 612         /*
 613          * If no errors were pending, assert that eq_ptail is set to NULL,
 614          * drop the consumer lock, and return without doing anything.
 615          */
 616         if (eep == NULL) {
 617                 ASSERT(eqp->eq_ptail == NULL);
 618                 mutex_exit(&eqp->eq_lock);
 619                 return;
 620         }
 621 
 622         /*
 623          * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the
 624          * oldest error, setting the eqe_next pointer so that we can iterate
 625          * over the errors from oldest to newest.  We use membar_producer()
 626          * to make sure that these stores are visible before we set eq_phead.
 627          * If we panic before, during, or just after this loop (case 2),
 628          * errorq_panic() will simply redo this work, as described below.
 629          */
 630         for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev)
 631                 eep->eqe_prev->eqe_next = eep;
 632         membar_producer();
 633 
 634         /*
 635          * Now set eq_phead to the head of the processing list (the oldest
 636          * error) and issue another membar_producer() to make sure that
 637          * eq_phead is seen as non-NULL before we clear eq_ptail.  If we panic
 638          * after eq_phead is set (case 3), we will detect and log these errors
 639          * in errorq_panic(), as described below.
 640          */
 641         eqp->eq_phead = eep;
 642         membar_producer();
 643 
 644         eqp->eq_ptail = NULL;
 645         membar_producer();
 646 
 647         /*
 648          * If we enter from errorq_panic_drain(), we may already have
 649          * errorq elements on the dump list.  Find the tail of
 650          * the list ready for append.
 651          */
 652         if (panicstr && (dep = eqp->eq_dump) != NULL) {
 653                 while (dep->eqe_dump != NULL)
 654                         dep = dep->eqe_dump;
 655         }
 656 
 657         /*
 658          * Now iterate over the processing list from oldest (eq_phead) to
 659          * newest and log each error.  Once an error is logged, we use
 660          * atomic clear to return it to the free pool.  If we panic before,
 661          * during, or after calling eq_func() (case 4), the error will still be
 662          * found on eq_phead and will be logged in errorq_panic below.
 663          */
 664 
 665         while ((eep = eqp->eq_phead) != NULL) {
 666                 eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
 667                 eqp->eq_kstat.eqk_logged.value.ui64++;
 668 
 669                 eqp->eq_phead = eep->eqe_next;
 670                 membar_producer();
 671 
 672                 eep->eqe_next = NULL;
 673 
 674                 /*
 675                  * On panic, we add the element to the dump list for each
 676                  * nvlist errorq.  Elements are stored oldest to newest.
 677                  * Then continue, so we don't free and subsequently overwrite
 678                  * any elements which we've put on the dump queue.
 679                  */
 680                 if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) {
 681                         if (eqp->eq_dump == NULL)
 682                                 dep = eqp->eq_dump = eep;
 683                         else
 684                                 dep = dep->eqe_dump = eep;
 685                         membar_producer();
 686                         continue;
 687                 }
 688 
 689                 eep->eqe_prev = NULL;
 690                 BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems);
 691         }
 692 
 693         mutex_exit(&eqp->eq_lock);
 694 }
 695 
 696 /*
 697  * Now that device tree services are available, set up the soft interrupt
 698  * handlers for any queues that were created early in boot.  We then
 699  * manually drain these queues to report any pending early errors.
 700  */
 701 void
 702 errorq_init(void)
 703 {
 704         dev_info_t *dip = ddi_root_node();
 705         ddi_softintr_t id;
 706         errorq_t *eqp;
 707 
 708         ASSERT(modrootloaded != 0);
 709         ASSERT(dip != NULL);
 710 
 711         mutex_enter(&errorq_lock);
 712 
 713         for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
 714                 ddi_iblock_cookie_t ibc =
 715                     (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl);
 716 
 717                 if (eqp->eq_id != NULL)
 718                         continue; /* softint already initialized */
 719 
 720                 if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL,
 721                     errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
 722                         panic("errorq_init: failed to register IPL %u softint "
 723                             "for queue %s", eqp->eq_ipl, eqp->eq_name);
 724                 }
 725 
 726                 eqp->eq_id = id;
 727                 errorq_drain(eqp);
 728         }
 729 
 730         mutex_exit(&errorq_lock);
 731 }
 732 
 733 /*
 734  * This function is designed to be called from panic context only, and
 735  * therefore does not need to acquire errorq_lock when iterating over
 736  * errorq_list.  This function must be called no more than once for each
 737  * 'what' value (if you change this, then review the manipulation of 'dep').
 738  */
 739 static uint64_t
 740 errorq_panic_drain(uint_t what)
 741 {
 742         errorq_elem_t *eep, *nep, *dep;
 743         errorq_t *eqp;
 744         uint64_t loggedtmp;
 745         uint64_t logged = 0;
 746 
 747         for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
 748                 if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what)
 749                         continue; /* do not drain this queue on this pass */
 750 
 751                 loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64;
 752 
 753                 /*
 754                  * In case (1B) above, eq_ptail may be set but the
 755                  * atomic_cas_ptr may not have been executed yet or may have
 756                  * failed.  Either way, we must log errors in chronological
 757                  * order.  So we search the pending list for the error
 758                  * pointed to by eq_ptail.  If it is found, we know that all
 759                  * subsequent errors are also still on the pending list, so
 760                  * just NULL out eq_ptail and let errorq_drain(), below,
 761                  * take care of the logging.
 762                  */
 763                 for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) {
 764                         if (eep == eqp->eq_ptail) {
 765                                 ASSERT(eqp->eq_phead == NULL);
 766                                 eqp->eq_ptail = NULL;
 767                                 break;
 768                         }
 769                 }
 770 
 771                 /*
 772                  * In cases (1C) and (2) above, eq_ptail will be set to the
 773                  * newest error on the processing list but eq_phead will still
 774                  * be NULL.  We set the eqe_next pointers so we can iterate
 775                  * over the processing list in order from oldest error to the
 776                  * newest error.  We then set eq_phead to point to the oldest
 777                  * error and fall into the for-loop below.
 778                  */
 779                 if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) {
 780                         for (eep->eqe_next = NULL; eep->eqe_prev != NULL;
 781                             eep = eep->eqe_prev)
 782                                 eep->eqe_prev->eqe_next = eep;
 783 
 784                         eqp->eq_phead = eep;
 785                         eqp->eq_ptail = NULL;
 786                 }
 787 
 788                 /*
 789                  * In cases (3) and (4) above (or after case (1C/2) handling),
 790                  * eq_phead will be set to the oldest error on the processing
 791                  * list.  We log each error and return it to the free pool.
 792                  *
 793                  * Unlike errorq_drain(), we don't need to worry about updating
 794                  * eq_phead because errorq_panic() will be called at most once.
 795                  * However, we must use atomic_cas_ptr to update the
 796                  * freelist in case errors are still being enqueued during
 797                  * panic.
 798                  */
 799                 for (eep = eqp->eq_phead; eep != NULL; eep = nep) {
 800                         eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
 801                         eqp->eq_kstat.eqk_logged.value.ui64++;
 802 
 803                         nep = eep->eqe_next;
 804                         eep->eqe_next = NULL;
 805 
 806                         /*
 807                          * On panic, we add the element to the dump list for
 808                          * each nvlist errorq, stored oldest to newest. Then
 809                          * continue, so we don't free and subsequently overwrite
 810                          * any elements which we've put on the dump queue.
 811                          */
 812                         if (eqp->eq_flags & ERRORQ_NVLIST) {
 813                                 if (eqp->eq_dump == NULL)
 814                                         dep = eqp->eq_dump = eep;
 815                                 else
 816                                         dep = dep->eqe_dump = eep;
 817                                 membar_producer();
 818                                 continue;
 819                         }
 820 
 821                         eep->eqe_prev = NULL;
 822                         BT_ATOMIC_CLEAR(eqp->eq_bitmap, eep - eqp->eq_elems);
 823                 }
 824 
 825                 /*
 826                  * Now go ahead and drain any other errors on the pending list.
 827                  * This call transparently handles case (1A) above, as well as
 828                  * any other errors that were dispatched after errorq_drain()
 829                  * completed its first compare-and-swap.
 830                  */
 831                 errorq_drain(eqp);
 832 
 833                 logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp;
 834         }
 835         return (logged);
 836 }
 837 
 838 /*
 839  * Drain all error queues - called only from panic context.  Some drain
 840  * functions may enqueue errors to ERRORQ_NVLIST error queues so that
 841  * they may be written out in the panic dump - so ERRORQ_NVLIST queues
 842  * must be drained last.  Drain ERRORQ_VITAL queues before nonvital queues
 843  * so that vital errors get to fill the ERRORQ_NVLIST queues first, and
 844  * do not drain the nonvital queues if there are many vital errors.
 845  */
 846 void
 847 errorq_panic(void)
 848 {
 849         ASSERT(panicstr != NULL);
 850 
 851         if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
 852                 (void) errorq_panic_drain(0);
 853         (void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
 854         (void) errorq_panic_drain(ERRORQ_NVLIST);
 855 }
 856 
 857 /*
 858  * Reserve an error queue element for later processing and dispatching.  The
 859  * element is returned to the caller, who may add error-specific data to
 860  * the element.  The element is returned to the free pool either after
 861  * errorq_commit() is called and the element has been processed, or
 862  * immediately when errorq_cancel() is called.
 863  */
 864 errorq_elem_t *
 865 errorq_reserve(errorq_t *eqp)
 866 {
 867         errorq_elem_t *eqep;
 868 
 869         if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
 870                 atomic_add_64(&errorq_lost, 1);
 871                 return (NULL);
 872         }
 873 
 874         for (;;) {
 875                 int i, rval;
 876 
 877                 if ((i = errorq_availbit(eqp->eq_bitmap, eqp->eq_qlen,
 878                     eqp->eq_rotor)) == -1) {
 879                         atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
 880                         return (NULL);
 881                 }
 882                 BT_ATOMIC_SET_EXCL(eqp->eq_bitmap, i, rval);
 883                 if (rval == 0) {
 884                         eqp->eq_rotor = i;
 885                         eqep = &eqp->eq_elems[i];
 886                         break;
 887                 }
 888         }
 889 
 890         if (eqp->eq_flags & ERRORQ_NVLIST) {
 891                 errorq_nvelem_t *eqnp = eqep->eqe_data;
 892                 nv_alloc_reset(eqnp->eqn_nva);
 893                 eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
 894         }
 895 
 896         atomic_add_64(&eqp->eq_kstat.eqk_reserved.value.ui64, 1);
 897         return (eqep);
 898 }
 899 
 900 /*
 901  * Commit an errorq element (eqep) for dispatching.
 902  * This function may be called from any context subject
 903  * to the Platform Considerations described above.
 904  */
 905 void
 906 errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
 907 {
 908         errorq_elem_t *old;
 909 
 910         if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
 911                 atomic_add_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64, 1);
 912                 return;
 913         }
 914 
 915         for (;;) {
 916                 old = eqp->eq_pend;
 917                 eqep->eqe_prev = old;
 918                 membar_producer();
 919 
 920                 if (atomic_cas_ptr(&eqp->eq_pend, old, eqep) == old)
 921                         break;
 922         }
 923 
 924         atomic_add_64(&eqp->eq_kstat.eqk_committed.value.ui64, 1);
 925 
 926         if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
 927                 ddi_trigger_softintr(eqp->eq_id);
 928 }
 929 
 930 /*
 931  * Cancel an errorq element reservation by returning the specified element
 932  * to the free pool.  Duplicate or invalid frees are not supported.
 933  */
 934 void
 935 errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
 936 {
 937         if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
 938                 return;
 939 
 940         BT_ATOMIC_CLEAR(eqp->eq_bitmap, eqep - eqp->eq_elems);
 941 
 942         atomic_add_64(&eqp->eq_kstat.eqk_cancelled.value.ui64, 1);
 943 }
 944 
 945 /*
 946  * Write elements on the dump list of each nvlist errorq to the dump device.
 947  * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
 948  */
 949 void
 950 errorq_dump(void)
 951 {
 952         errorq_elem_t *eep;
 953         errorq_t *eqp;
 954 
 955         if (ereport_dumpbuf == NULL)
 956                 return; /* reboot or panic before errorq is even set up */
 957 
 958         for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
 959                 if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
 960                     !(eqp->eq_flags & ERRORQ_ACTIVE))
 961                         continue; /* do not dump this queue on panic */
 962 
 963                 for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
 964                         errorq_nvelem_t *eqnp = eep->eqe_data;
 965                         size_t len = 0;
 966                         erpt_dump_t ed;
 967                         int err;
 968 
 969                         (void) nvlist_size(eqnp->eqn_nvl,
 970                             &len, NV_ENCODE_NATIVE);
 971 
 972                         if (len > ereport_dumplen || len == 0) {
 973                                 cmn_err(CE_WARN, "%s: unable to save error "
 974                                     "report %p due to size %lu\n",
 975                                     eqp->eq_name, (void *)eep, len);
 976                                 continue;
 977                         }
 978 
 979                         if ((err = nvlist_pack(eqnp->eqn_nvl,
 980                             (char **)&ereport_dumpbuf, &ereport_dumplen,
 981                             NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
 982                                 cmn_err(CE_WARN, "%s: unable to save error "
 983                                     "report %p due to pack error %d\n",
 984                                     eqp->eq_name, (void *)eep, err);
 985                                 continue;
 986                         }
 987 
 988                         ed.ed_magic = ERPT_MAGIC;
 989                         ed.ed_chksum = checksum32(ereport_dumpbuf, len);
 990                         ed.ed_size = (uint32_t)len;
 991                         ed.ed_pad = 0;
 992                         ed.ed_hrt_nsec = 0;
 993                         ed.ed_hrt_base = panic_hrtime;
 994                         ed.ed_tod_base.sec = panic_hrestime.tv_sec;
 995                         ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;
 996 
 997                         dumpvp_write(&ed, sizeof (ed));
 998                         dumpvp_write(ereport_dumpbuf, len);
 999                 }
1000         }
1001 }
1002 
1003 nvlist_t *
1004 errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
1005 {
1006         errorq_nvelem_t *eqnp = eqep->eqe_data;
1007 
1008         ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
1009 
1010         return (eqnp->eqn_nvl);
1011 }
1012 
1013 nv_alloc_t *
1014 errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
1015 {
1016         errorq_nvelem_t *eqnp = eqep->eqe_data;
1017 
1018         ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
1019 
1020         return (eqnp->eqn_nva);
1021 }
1022 
1023 /*
1024  * Reserve a new element and duplicate the data of the original into it.
1025  */
1026 void *
1027 errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
1028 {
1029         ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
1030         ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));
1031 
1032         if ((*neqep = errorq_reserve(eqp)) == NULL)
1033                 return (NULL);
1034 
1035         bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
1036         return ((*neqep)->eqe_data);
1037 }