1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * FMD Case Subsystem
  28  *
  29  * Diagnosis engines are expected to group telemetry events related to the
  30  * diagnosis of a particular problem on the system into a set of cases.  The
  31  * diagnosis engine may have any number of cases open at a given point in time.
  32  * Some cases may eventually be *solved* by associating a suspect list of one
  33  * or more problems with the case, at which point fmd publishes a list.suspect
  34  * event for the case and it becomes visible to administrators and agents.
  35  *
  36  * Every case is named using a UUID, and is globally visible in the case hash.
  37  * Cases are reference-counted, except for the reference from the case hash
  38  * itself.  Consumers of case references include modules, which store active
  39  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
  40  *
  41  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
  42  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
  43  * or transport) and the case is referenced by the mod_cases list.  Once the
  44  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
  45  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
  46  *
  47  *                      +------------+
  48  *           +----------|  UNSOLVED  |
  49  *           |          +------------+
  50  *           |                1 |
  51  *           |                  |
  52  *           |          +-------v----+
  53  *         2 |          |    SOLVED  |
  54  *           |          +------------+
  55  *           |              3 |  5 |
  56  *           +------------+   |    |
  57  *                        |   |    |
  58  *                      +-v---v----v-+
  59  *                      | CLOSE_WAIT |
  60  *                      +------------+
  61  *                        |   |    |
  62  *            +-----------+   |    +------------+
  63  *            |             4 |                 |
  64  *            v         +-----v------+          |
  65  *         discard      |   CLOSED   |        6 |
  66  *                      +------------+          |
  67  *                            |                 |
  68  *                            |    +------------+
  69  *                          7 |    |
  70  *                      +-----v----v-+
  71  *                      |  REPAIRED  |
  72  *                      +------------+
  73  *                            |
  74  *                          8 |
  75  *                      +-----v------+
  76  *                      |  RESOLVED  |
  77  *                      +------------+
  78  *                            |
  79  *                            v
  80  *                         discard
  81  *
  82  * The state machine changes are triggered by calls to fmd_case_transition()
  83  * from various locations inside of fmd, as described below:
  84  *
  85  * [1] Called by: fmd_case_solve()
  86  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
  87  *                conviction policy is applied to suspect list
  88  *                suspects convicted are marked faulty (F) in R$
  89  *                list.suspect event logged and dispatched
  90  *
  91  * [2] Called by: fmd_case_close(), fmd_case_uuclose()
  92  *       Actions: diagnosis engine fmdo_close() entry point scheduled
  93  *                case discarded upon exit from CLOSE_WAIT
  94  *
  95  * [3] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
  96  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
  97  *                suspects convicted (F) are marked unusable (U) in R$
  98  *                diagnosis engine fmdo_close() entry point scheduled
  99  *                case transitions to CLOSED [4] upon exit from CLOSE_WAIT
 100  *
 101  * [4] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 102  *       Actions: list.isolated event dispatched
 103  *                case deleted from module's list of open cases
 104  *
 105  * [5] Called by: fmd_case_repair(), fmd_case_update()
 106  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
 107  *                diagnosis engine fmdo_close() entry point scheduled
 108  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
 109  *
 110  * [6] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
 111  *       Actions: suspects convicted are marked non faulty (!F) in R$
 112  *                list.repaired or list.updated event dispatched
 113  *
 114  * [7] Called by: fmd_case_repair(), fmd_case_update()
 115  *       Actions: FMD_CF_REPAIR flag is set in ci_flags
 116  *                suspects convicted are marked non faulty (!F) in R$
 117  *                list.repaired or list.updated event dispatched
 118  *
 119  * [8] Called by: fmd_case_uuresolve()
 120  *       Actions: list.resolved event dispatched
 121  *                case is discarded
 122  */
 123 
 124 #include <sys/fm/protocol.h>
 125 #include <uuid/uuid.h>
 126 #include <alloca.h>
 127 
 128 #include <fmd_alloc.h>
 129 #include <fmd_module.h>
 130 #include <fmd_error.h>
 131 #include <fmd_conf.h>
 132 #include <fmd_case.h>
 133 #include <fmd_string.h>
 134 #include <fmd_subr.h>
 135 #include <fmd_protocol.h>
 136 #include <fmd_event.h>
 137 #include <fmd_eventq.h>
 138 #include <fmd_dispq.h>
 139 #include <fmd_buf.h>
 140 #include <fmd_log.h>
 141 #include <fmd_asru.h>
 142 #include <fmd_fmri.h>
 143 #include <fmd_xprt.h>
 144 
 145 #include <fmd.h>
 146 
/*
 * Printable names for the case states.  The entries are listed in the order
 * of the FMD_CASE_* constant noted beside each one; presumably the table is
 * indexed by ci_state -- confirm against the users of this array, which are
 * not part of this section of the file.
 */
static const char *const _fmd_case_snames[] = {
        "UNSOLVED",     /* FMD_CASE_UNSOLVED */
        "SOLVED",       /* FMD_CASE_SOLVED */
        "CLOSE_WAIT",   /* FMD_CASE_CLOSE_WAIT */
        "CLOSED",       /* FMD_CASE_CLOSED */
        "REPAIRED",     /* FMD_CASE_REPAIRED */
        "RESOLVED"      /* FMD_CASE_RESOLVED */
};

/*
 * Forward declaration: fmd_case_hash_apply() calls this before its definition
 * appears, and treats a NULL return as "skip this case".
 */
static fmd_case_impl_t *fmd_case_tryhold(fmd_case_impl_t *);
 157 
 158 fmd_case_hash_t *
 159 fmd_case_hash_create(void)
 160 {
 161         fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
 162 
 163         (void) pthread_rwlock_init(&chp->ch_lock, NULL);
 164         chp->ch_hashlen = fmd.d_str_buckets;
 165         chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
 166         chp->ch_code_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen,
 167             FMD_SLEEP);
 168         chp->ch_count = 0;
 169 
 170         return (chp);
 171 }
 172 
 173 /*
 174  * Destroy the case hash.  Unlike most of our hash tables, no active references
 175  * are kept by the case hash itself; all references come from other subsystems.
 176  * The hash must be destroyed after all modules are unloaded; if anything was
 177  * present in the hash it would be by definition a reference count leak.
 178  */
 179 void
 180 fmd_case_hash_destroy(fmd_case_hash_t *chp)
 181 {
 182         fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
 183         fmd_free(chp->ch_code_hash, sizeof (void *) * chp->ch_hashlen);
 184         fmd_free(chp, sizeof (fmd_case_hash_t));
 185 }
 186 
 187 /*
 188  * Take a snapshot of the case hash by placing an additional hold on each
 189  * member in an auxiliary array, and then call 'func' for each case.
 190  */
 191 void
 192 fmd_case_hash_apply(fmd_case_hash_t *chp,
 193     void (*func)(fmd_case_t *, void *), void *arg)
 194 {
 195         fmd_case_impl_t *cp, **cps, **cpp;
 196         uint_t cpc, i;
 197 
 198         (void) pthread_rwlock_rdlock(&chp->ch_lock);
 199 
 200         cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
 201         cpc = chp->ch_count;
 202 
 203         for (i = 0; i < chp->ch_hashlen; i++) {
 204                 for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next)
 205                         *cpp++ = fmd_case_tryhold(cp);
 206         }
 207 
 208         ASSERT(cpp == cps + cpc);
 209         (void) pthread_rwlock_unlock(&chp->ch_lock);
 210 
 211         for (i = 0; i < cpc; i++) {
 212                 if (cps[i] != NULL) {
 213                         func((fmd_case_t *)cps[i], arg);
 214                         fmd_case_rele((fmd_case_t *)cps[i]);
 215                 }
 216         }
 217 
 218         fmd_free(cps, cpc * sizeof (fmd_case_t *));
 219 }
 220 
 221 static void
 222 fmd_case_code_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
 223 {
 224         uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
 225 
 226         cip->ci_code_next = chp->ch_code_hash[h];
 227         chp->ch_code_hash[h] = cip;
 228 }
 229 
 230 static void
 231 fmd_case_code_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
 232 {
 233         fmd_case_impl_t **pp, *cp;
 234 
 235         if (cip->ci_code) {
 236                 uint_t h = fmd_strhash(cip->ci_code) % chp->ch_hashlen;
 237 
 238                 pp = &chp->ch_code_hash[h];
 239                 for (cp = *pp; cp != NULL; cp = cp->ci_code_next) {
 240                         if (cp != cip)
 241                                 pp = &cp->ci_code_next;
 242                         else
 243                                 break;
 244                 }
 245                 if (cp != NULL) {
 246                         *pp = cp->ci_code_next;
 247                         cp->ci_code_next = NULL;
 248                 }
 249         }
 250 }
 251 
 252 /*
 253  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
 254  * were defined for this case or if the lookup fails, the event dictionary or
 255  * module code is broken, and we set the event code to a precomputed default.
 256  */
 257 static const char *
 258 fmd_case_mkcode(fmd_case_t *cp)
 259 {
 260         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
 261         fmd_case_susp_t *cis;
 262         fmd_case_hash_t *chp = fmd.d_cases;
 263 
 264         char **keys, **keyp;
 265         const char *s;
 266 
 267         ASSERT(MUTEX_HELD(&cip->ci_lock));
 268         ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
 269 
 270         /*
 271          * delete any existing entry from code hash if it is on it
 272          */
 273         fmd_case_code_hash_delete(chp, cip);
 274 
 275         fmd_free(cip->ci_code, cip->ci_codelen);
 276         cip->ci_codelen = cip->ci_mod->mod_codelen;
 277         cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
 278         keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
 279 
 280         for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
 281                 if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
 282                         keyp++;
 283         }
 284 
 285         *keyp = NULL; /* mark end of keys[] array for libdiagcode */
 286 
 287         if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
 288             cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
 289                 (void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
 290                 fmd_free(cip->ci_code, cip->ci_codelen);
 291                 cip->ci_codelen = strlen(s) + 1;
 292                 cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
 293                 (void) strcpy(cip->ci_code, s);
 294         }
 295 
 296         /*
 297          * add into hash of solved cases
 298          */
 299         fmd_case_code_hash_insert(chp, cip);
 300 
 301         return (cip->ci_code);
 302 }
 303 
 304 typedef struct {
 305         int     *fcl_countp;
 306         int     fcl_maxcount;
 307         uint8_t *fcl_ba;
 308         nvlist_t **fcl_nva;
 309         int     *fcl_msgp;
 310 } fmd_case_lst_t;
 311 
 312 static void
 313 fmd_case_set_lst(fmd_asru_link_t *alp, void *arg)
 314 {
 315         fmd_case_lst_t *entryp = (fmd_case_lst_t *)arg;
 316         boolean_t b;
 317         int state;
 318 
 319         if (*entryp->fcl_countp >= entryp->fcl_maxcount)
 320                 return;
 321         if (nvlist_lookup_boolean_value(alp->al_event, FM_SUSPECT_MESSAGE,
 322             &b) == 0 && b == B_FALSE)
 323                 *entryp->fcl_msgp = B_FALSE;
 324         entryp->fcl_ba[*entryp->fcl_countp] = 0;
 325         state = fmd_asru_al_getstate(alp);
 326         if (state & FMD_ASRU_DEGRADED)
 327                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_DEGRADED;
 328         if (state & FMD_ASRU_UNUSABLE)
 329                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_UNUSABLE;
 330         if (state & FMD_ASRU_FAULTY)
 331                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_FAULTY;
 332         if (!(state & FMD_ASRU_PRESENT))
 333                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_NOT_PRESENT;
 334         if (alp->al_reason == FMD_ASRU_REPAIRED)
 335                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPAIRED;
 336         else if (alp->al_reason == FMD_ASRU_REPLACED)
 337                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_REPLACED;
 338         else if (alp->al_reason == FMD_ASRU_ACQUITTED)
 339                 entryp->fcl_ba[*entryp->fcl_countp] |= FM_SUSPECT_ACQUITTED;
 340         entryp->fcl_nva[*entryp->fcl_countp] = alp->al_event;
 341         (*entryp->fcl_countp)++;
 342 }
 343 
 344 static void
 345 fmd_case_faulty(fmd_asru_link_t *alp, void *arg)
 346 {
 347         int *faultyp = (int *)arg;
 348 
 349         *faultyp |= (alp->al_flags & FMD_ASRU_FAULTY);
 350 }
 351 
 352 static void
 353 fmd_case_usable(fmd_asru_link_t *alp, void *arg)
 354 {
 355         int *usablep = (int *)arg;
 356 
 357         *usablep |= !(fmd_asru_al_getstate(alp) & FMD_ASRU_UNUSABLE);
 358 }
 359 
 360 static void
 361 fmd_case_not_faulty(fmd_asru_link_t *alp, void *arg)
 362 {
 363         int *not_faultyp = (int *)arg;
 364 
 365         *not_faultyp |= !(alp->al_flags & FMD_ASRU_FAULTY);
 366 }
 367 
 368 /*
 369  * Have we got any suspects with an asru that are still unusable and present?
 370  */
 371 static void
 372 fmd_case_unusable_and_present(fmd_asru_link_t *alp, void *arg)
 373 {
 374         int *rvalp = (int *)arg;
 375         int state;
 376         nvlist_t *asru;
 377 
 378         /*
 379          * if this a proxy case and this suspect doesn't have an local asru
 380          * then state is unknown so we must assume it may still be unusable.
 381          */
 382         if ((alp->al_flags & FMD_ASRU_PROXY) &&
 383             !(alp->al_flags & FMD_ASRU_PROXY_WITH_ASRU)) {
 384                 *rvalp |= B_TRUE;
 385                 return;
 386         }
 387 
 388         state = fmd_asru_al_getstate(alp);
 389         if (nvlist_lookup_nvlist(alp->al_event, FM_FAULT_ASRU, &asru) != 0)
 390                 return;
 391         *rvalp |= ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_PRESENT));
 392 }
 393 
 394 nvlist_t *
 395 fmd_case_mkevent(fmd_case_t *cp, const char *class)
 396 {
 397         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
 398         nvlist_t **nva, *nvl;
 399         uint8_t *ba;
 400         int msg = B_TRUE;
 401         const char *code;
 402         fmd_case_lst_t fcl;
 403         int count = 0;
 404 
 405         (void) pthread_mutex_lock(&cip->ci_lock);
 406         ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
 407 
 408         nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
 409         ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
 410 
 411         /*
 412          * For each suspect associated with the case, store its fault event
 413          * nvlist in 'nva'.  We also look to see if any of the suspect faults
 414          * have asked not to be messaged.  If any of them have made such a
 415          * request, propagate that attribute to the composite list.* event.
 416          * Finally, store each suspect's faulty status into the bitmap 'ba'.
 417          */
 418         fcl.fcl_countp = &count;
 419         fcl.fcl_maxcount = cip->ci_nsuspects;
 420         fcl.fcl_msgp = &msg;
 421         fcl.fcl_ba = ba;
 422         fcl.fcl_nva = nva;
 423         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
 424 
 425         if (cip->ci_code == NULL)
 426                 (void) fmd_case_mkcode(cp);
 427         /*
 428          * For repair and updated event, we lookup diagcode from dict using key
 429          * "list.repaired" or "list.updated" or "list.resolved".
 430          */
 431         if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
 432                 (void) fmd_conf_getprop(fmd.d_conf, "repaircode", &code);
 433         else if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
 434                 (void) fmd_conf_getprop(fmd.d_conf, "resolvecode", &code);
 435         else if (strcmp(class, FM_LIST_UPDATED_CLASS) == 0)
 436                 (void) fmd_conf_getprop(fmd.d_conf, "updatecode", &code);
 437         else
 438                 code = cip->ci_code;
 439 
 440         if (msg == B_FALSE)
 441                 cip->ci_flags |= FMD_CF_INVISIBLE;
 442 
 443         /*
 444          * Use the ci_diag_de if one has been saved (eg for an injected fault).
 445          * Otherwise use the authority for the current module.
 446          */
 447         nvl = fmd_protocol_list(class, cip->ci_diag_de == NULL ?
 448             cip->ci_mod->mod_fmri : cip->ci_diag_de, cip->ci_uuid, code, count,
 449             nva, ba, msg, &cip->ci_tv, cip->ci_injected);
 450 
 451         (void) pthread_mutex_unlock(&cip->ci_lock);
 452         return (nvl);
 453 }
 454 
/*
 * Tunables consulted by the duplicate-case detection code in
 * fmd_case_check_for_dups().  All are enabled (non-zero) by default.
 */

/* treat overlapping faulty, non-isolated suspects as a match */
static int fmd_case_match_on_faulty_overlap = 1;
/* treat overlapping acquitted suspects as a match if no old suspect is faulty */
static int fmd_case_match_on_acquit_overlap = 1;
/* NOTE(review): not referenced in this part of the file; confirm usage */
static int fmd_case_auto_acquit_isolated = 1;
/* when the matched old case is repaired/resolved, adjust the new suspects */
static int fmd_case_auto_acquit_non_acquitted = 1;
/* NOTE(review): not referenced in this part of the file; confirm usage */
static int fmd_case_too_recent = 10; /* time in seconds */
 460 
 461 static boolean_t
 462 fmd_case_compare_elem(nvlist_t *nvl, nvlist_t *xnvl, const char *elem)
 463 {
 464         nvlist_t *new_rsrc;
 465         nvlist_t *rsrc;
 466         char *new_name = NULL;
 467         char *name = NULL;
 468         ssize_t new_namelen;
 469         ssize_t namelen;
 470         int fmri_present = 1;
 471         int new_fmri_present = 1;
 472         int match = B_FALSE;
 473         fmd_topo_t *ftp = fmd_topo_hold();
 474 
 475         if (nvlist_lookup_nvlist(xnvl, elem, &rsrc) != 0)
 476                 fmri_present = 0;
 477         else {
 478                 if ((namelen = fmd_fmri_nvl2str(rsrc, NULL, 0)) == -1)
 479                         goto done;
 480                 name = fmd_alloc(namelen + 1, FMD_SLEEP);
 481                 if (fmd_fmri_nvl2str(rsrc, name, namelen + 1) == -1)
 482                         goto done;
 483         }
 484         if (nvlist_lookup_nvlist(nvl, elem, &new_rsrc) != 0)
 485                 new_fmri_present = 0;
 486         else {
 487                 if ((new_namelen = fmd_fmri_nvl2str(new_rsrc, NULL, 0)) == -1)
 488                         goto done;
 489                 new_name = fmd_alloc(new_namelen + 1, FMD_SLEEP);
 490                 if (fmd_fmri_nvl2str(new_rsrc, new_name, new_namelen + 1) == -1)
 491                         goto done;
 492         }
 493         match = (fmri_present == new_fmri_present &&
 494             (fmri_present == 0 ||
 495             topo_fmri_strcmp(ftp->ft_hdl, name, new_name)));
 496 done:
 497         if (name != NULL)
 498                 fmd_free(name, namelen + 1);
 499         if (new_name != NULL)
 500                 fmd_free(new_name, new_namelen + 1);
 501         fmd_topo_rele(ftp);
 502         return (match);
 503 }
 504 
 505 static int
 506 fmd_case_match_suspect(nvlist_t *nvl1, nvlist_t *nvl2)
 507 {
 508         char *class, *new_class;
 509 
 510         if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_ASRU))
 511                 return (0);
 512         if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_RESOURCE))
 513                 return (0);
 514         if (!fmd_case_compare_elem(nvl1, nvl2, FM_FAULT_FRU))
 515                 return (0);
 516         (void) nvlist_lookup_string(nvl2, FM_CLASS, &class);
 517         (void) nvlist_lookup_string(nvl1, FM_CLASS, &new_class);
 518         return (strcmp(class, new_class) == 0);
 519 }
 520 
 521 typedef struct {
 522         int     *fcms_countp;
 523         int     fcms_maxcount;
 524         fmd_case_impl_t *fcms_cip;
 525         uint8_t *fcms_new_susp_state;
 526         uint8_t *fcms_old_susp_state;
 527         uint8_t *fcms_old_match_state;
 528 } fcms_t;
 529 #define SUSPECT_STATE_FAULTY                            0x1
 530 #define SUSPECT_STATE_ISOLATED                          0x2
 531 #define SUSPECT_STATE_REMOVED                           0x4
 532 #define SUSPECT_STATE_ACQUITED                          0x8
 533 #define SUSPECT_STATE_REPAIRED                          0x10
 534 #define SUSPECT_STATE_REPLACED                          0x20
 535 #define SUSPECT_STATE_NO_MATCH                          0x1
 536 
 537 /*
 538  * This is called for each suspect in the old case. Compare it against each
 539  * suspect in the new case, setting fcms_old_susp_state and fcms_new_susp_state
 540  * as appropriate. fcms_new_susp_state will left as 0 if the suspect is not
 541  * found in the old case.
 542  */
 543 static void
 544 fmd_case_match_suspects(fmd_asru_link_t *alp, void *arg)
 545 {
 546         fcms_t *fcmsp = (fcms_t *)arg;
 547         fmd_case_impl_t *cip = fcmsp->fcms_cip;
 548         fmd_case_susp_t *cis;
 549         int i = 0;
 550         int state = fmd_asru_al_getstate(alp);
 551 
 552         if (*fcmsp->fcms_countp >= fcmsp->fcms_maxcount)
 553                 return;
 554 
 555         if (!(state & FMD_ASRU_PRESENT) || (!(state & FMD_ASRU_FAULTY) &&
 556             alp->al_reason == FMD_ASRU_REMOVED))
 557                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 558                     SUSPECT_STATE_REMOVED;
 559         else if ((state & FMD_ASRU_UNUSABLE) && (state & FMD_ASRU_FAULTY))
 560                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 561                     SUSPECT_STATE_ISOLATED;
 562         else if (state & FMD_ASRU_FAULTY)
 563                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 564                     SUSPECT_STATE_FAULTY;
 565         else if (alp->al_reason == FMD_ASRU_REPLACED)
 566                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 567                     SUSPECT_STATE_REPLACED;
 568         else if (alp->al_reason == FMD_ASRU_ACQUITTED)
 569                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 570                     SUSPECT_STATE_ACQUITED;
 571         else
 572                 fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp] =
 573                     SUSPECT_STATE_REPAIRED;
 574 
 575         for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next, i++)
 576                 if (fmd_case_match_suspect(cis->cis_nvl, alp->al_event) == 1)
 577                         break;
 578         if (cis != NULL)
 579                 fcmsp->fcms_new_susp_state[i] =
 580                     fcmsp->fcms_old_susp_state[*fcmsp->fcms_countp];
 581         else
 582                 fcmsp->fcms_old_match_state[*fcmsp->fcms_countp] |=
 583                     SUSPECT_STATE_NO_MATCH;
 584         (*fcmsp->fcms_countp)++;
 585 }
 586 
 587 typedef struct {
 588         int     *fca_do_update;
 589         fmd_case_impl_t *fca_cip;
 590 } fca_t;
 591 
 592 /*
 593  * Re-fault all acquitted suspects that are still present in the new list.
 594  */
 595 static void
 596 fmd_case_fault_acquitted_matching(fmd_asru_link_t *alp, void *arg)
 597 {
 598         fca_t *fcap = (fca_t *)arg;
 599         fmd_case_impl_t *cip = fcap->fca_cip;
 600         fmd_case_susp_t *cis;
 601         int state = fmd_asru_al_getstate(alp);
 602 
 603         if (!(state & FMD_ASRU_FAULTY) &&
 604             alp->al_reason == FMD_ASRU_ACQUITTED) {
 605                 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
 606                         if (fmd_case_match_suspect(cis->cis_nvl,
 607                             alp->al_event) == 1)
 608                                 break;
 609                 if (cis != NULL) {
 610                         (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
 611                         *fcap->fca_do_update = 1;
 612                 }
 613         }
 614 }
 615 
 616 /*
 617  * Re-fault all suspects that are still present in the new list.
 618  */
 619 static void
 620 fmd_case_fault_all_matching(fmd_asru_link_t *alp, void *arg)
 621 {
 622         fca_t *fcap = (fca_t *)arg;
 623         fmd_case_impl_t *cip = fcap->fca_cip;
 624         fmd_case_susp_t *cis;
 625         int state = fmd_asru_al_getstate(alp);
 626 
 627         if (!(state & FMD_ASRU_FAULTY)) {
 628                 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
 629                         if (fmd_case_match_suspect(cis->cis_nvl,
 630                             alp->al_event) == 1)
 631                                 break;
 632                 if (cis != NULL) {
 633                         (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
 634                         *fcap->fca_do_update = 1;
 635                 }
 636         }
 637 }
 638 
 639 /*
 640  * Acquit all suspects that are no longer present in the new list.
 641  */
 642 static void
 643 fmd_case_acquit_no_match(fmd_asru_link_t *alp, void *arg)
 644 {
 645         fca_t *fcap = (fca_t *)arg;
 646         fmd_case_impl_t *cip = fcap->fca_cip;
 647         fmd_case_susp_t *cis;
 648         int state = fmd_asru_al_getstate(alp);
 649 
 650         if (state & FMD_ASRU_FAULTY) {
 651                 for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next)
 652                         if (fmd_case_match_suspect(cis->cis_nvl,
 653                             alp->al_event) == 1)
 654                                 break;
 655                 if (cis == NULL) {
 656                         (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
 657                             FMD_ASRU_ACQUITTED);
 658                         *fcap->fca_do_update = 1;
 659                 }
 660         }
 661 }
 662 
 663 /*
 664  * Acquit all isolated suspects.
 665  */
 666 static void
 667 fmd_case_acquit_isolated(fmd_asru_link_t *alp, void *arg)
 668 {
 669         int *do_update = (int *)arg;
 670         int state = fmd_asru_al_getstate(alp);
 671 
 672         if ((state & FMD_ASRU_PRESENT) && (state & FMD_ASRU_UNUSABLE) &&
 673             (state & FMD_ASRU_FAULTY)) {
 674                 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
 675                     FMD_ASRU_ACQUITTED);
 676                 *do_update = 1;
 677         }
 678 }
 679 
 680 /*
 681  * Acquit suspect which matches specified nvlist
 682  */
 683 static void
 684 fmd_case_acquit_suspect(fmd_asru_link_t *alp, void *arg)
 685 {
 686         nvlist_t *nvl = (nvlist_t *)arg;
 687         int state = fmd_asru_al_getstate(alp);
 688 
 689         if ((state & FMD_ASRU_FAULTY) &&
 690             fmd_case_match_suspect(nvl, alp->al_event) == 1)
 691                 (void) fmd_asru_clrflags(alp, FMD_ASRU_FAULTY,
 692                     FMD_ASRU_ACQUITTED);
 693 }
 694 
/*
 * Argument block for fmd_case_check_for_dups(), which is applied to each
 * existing case while looking for duplicates of a new case.
 */
typedef struct {
        /* the new case being checked against existing cases */
        fmd_case_impl_t *fccd_cip;
        /* per new suspect: accumulated SUSPECT_STATE_* bits from matches */
        uint8_t *fccd_new_susp_state;
        /* NOTE(review): not used in the visible part of the file; confirm */
        uint8_t *fccd_new_match_state;
        /* NOTE(review): presumably set when the new case is a dup; confirm */
        int *fccd_discard_new;
        /* set to 1 when the new case's suspects should be adjusted */
        int *fccd_adjust_new;
} fccd_t;
 702 
 703 /*
 704  * see if a matching suspect list already exists in the cache
 705  */
 706 static void
 707 fmd_case_check_for_dups(fmd_case_t *old_cp, void *arg)
 708 {
 709         fccd_t *fccdp = (fccd_t *)arg;
 710         fmd_case_impl_t *new_cip = fccdp->fccd_cip;
 711         fmd_case_impl_t *old_cip = (fmd_case_impl_t *)old_cp;
 712         int i, count = 0, do_update = 0, got_isolated_overlap = 0;
 713         int got_faulty_overlap = 0;
 714         int got_acquit_overlap = 0;
 715         boolean_t too_recent;
 716         uint64_t most_recent = 0;
 717         fcms_t fcms;
 718         fca_t fca;
 719         uint8_t *new_susp_state;
 720         uint8_t *old_susp_state;
 721         uint8_t *old_match_state;
 722 
 723         new_susp_state = alloca(new_cip->ci_nsuspects * sizeof (uint8_t));
 724         for (i = 0; i < new_cip->ci_nsuspects; i++)
 725                 new_susp_state[i] = 0;
 726         old_susp_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
 727         for (i = 0; i < old_cip->ci_nsuspects; i++)
 728                 old_susp_state[i] = 0;
 729         old_match_state = alloca(old_cip->ci_nsuspects * sizeof (uint8_t));
 730         for (i = 0; i < old_cip->ci_nsuspects; i++)
 731                 old_match_state[i] = 0;
 732 
 733         /*
 734          * Compare with each suspect in the existing case.
 735          */
 736         fcms.fcms_countp = &count;
 737         fcms.fcms_maxcount = old_cip->ci_nsuspects;
 738         fcms.fcms_cip = new_cip;
 739         fcms.fcms_new_susp_state = new_susp_state;
 740         fcms.fcms_old_susp_state = old_susp_state;
 741         fcms.fcms_old_match_state = old_match_state;
 742         fmd_asru_hash_apply_by_case(fmd.d_asrus, (fmd_case_t *)old_cip,
 743             fmd_case_match_suspects, &fcms);
 744 
 745         /*
 746          * If we have some faulty, non-isolated suspects that overlap, then most
 747          * likely it is the suspects that overlap in the suspect lists that are
 748          * to blame. So we can consider this to be a match.
 749          */
 750         for (i = 0; i < new_cip->ci_nsuspects; i++)
 751                 if (new_susp_state[i] == SUSPECT_STATE_FAULTY)
 752                         got_faulty_overlap = 1;
 753         if (got_faulty_overlap && fmd_case_match_on_faulty_overlap)
 754                 goto got_match;
 755 
 756         /*
 757          * If we have no faulty, non-isolated suspects in the old case, but we
 758          * do have some acquitted suspects that overlap, then most likely it is
 759          * the acquitted suspects that overlap in the suspect lists that are
 760          * to blame. So we can consider this to be a match.
 761          */
 762         for (i = 0; i < new_cip->ci_nsuspects; i++)
 763                 if (new_susp_state[i] == SUSPECT_STATE_ACQUITED)
 764                         got_acquit_overlap = 1;
 765         for (i = 0; i < old_cip->ci_nsuspects; i++)
 766                 if (old_susp_state[i] == SUSPECT_STATE_FAULTY)
 767                         got_acquit_overlap = 0;
 768         if (got_acquit_overlap && fmd_case_match_on_acquit_overlap)
 769                 goto got_match;
 770 
 771         /*
 772          * Check that all suspects in the new list are present in the old list.
 773          * Return if we find one that isn't.
 774          */
 775         for (i = 0; i < new_cip->ci_nsuspects; i++)
 776                 if (new_susp_state[i] == 0)
 777                         return;
 778 
 779         /*
 780          * Check that all suspects in the old list are present in the new list
 781          * *or* they are isolated or removed/replaced (which would explain why
 782          * they are not present in the new list). Return if we find one that is
 783          * faulty and unisolated or repaired or acquitted, and that is not
 784          * present in the new case.
 785          */
 786         for (i = 0; i < old_cip->ci_nsuspects; i++)
 787                 if (old_match_state[i] == SUSPECT_STATE_NO_MATCH &&
 788                     (old_susp_state[i] == SUSPECT_STATE_FAULTY ||
 789                     old_susp_state[i] == SUSPECT_STATE_ACQUITED ||
 790                     old_susp_state[i] == SUSPECT_STATE_REPAIRED))
 791                         return;
 792 
 793 got_match:
 794         /*
 795          * If the old case is already in repaired/resolved state, we can't
 796          * do anything more with it, so keep the new case, but acquit some
 797          * of the suspects if appropriate.
 798          */
 799         if (old_cip->ci_state >= FMD_CASE_REPAIRED) {
 800                 if (fmd_case_auto_acquit_non_acquitted) {
 801                         *fccdp->fccd_adjust_new = 1;
 802                         for (i = 0; i < new_cip->ci_nsuspects; i++) {
 803                                 fccdp->fccd_new_susp_state[i] |=
 804                                     new_susp_state[i];
 805                                 if (new_susp_state[i] == 0)
 806                                         fccdp->fccd_new_susp_state[i] =
 807                                             SUSPECT_STATE_NO_MATCH;
 808                         }
 809                 }
 810                 return;
 811         }
 812 
 813         /*
 814          * Otherwise discard the new case and keep the old, again updating the
 815          * state of the suspects as appropriate
 816          */
 817         *fccdp->fccd_discard_new = 1;
 818         fca.fca_cip = new_cip;
 819         fca.fca_do_update = &do_update;
 820 
 821         /*
 822          * See if new case occurred within fmd_case_too_recent seconds of the
 823          * most recent modification to the old case and if so don't do
 824          * auto-acquit. This avoids problems if a flood of ereports come in and
 825          * they don't all get diagnosed before the first case causes some of
 826          * the devices to be isolated making it appear that an isolated device
 827          * was in the suspect list.
 828          */
 829         fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 830             fmd_asru_most_recent, &most_recent);
 831         too_recent = (new_cip->ci_tv.tv_sec - most_recent <
 832             fmd_case_too_recent);
 833 
 834         if (got_faulty_overlap) {
 835                 /*
 836                  * Acquit any suspects not present in the new list, plus
		 * any that are present but are isolated.
 838                  */
 839                 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 840                     fmd_case_acquit_no_match, &fca);
 841                 if (fmd_case_auto_acquit_isolated && !too_recent)
 842                         fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 843                             fmd_case_acquit_isolated, &do_update);
 844         } else if (got_acquit_overlap) {
 845                 /*
 846                  * Re-fault the acquitted matching suspects and acquit all
 847                  * isolated suspects.
 848                  */
 849                 if (fmd_case_auto_acquit_isolated && !too_recent) {
 850                         fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 851                             fmd_case_fault_acquitted_matching, &fca);
 852                         fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 853                             fmd_case_acquit_isolated, &do_update);
 854                 }
 855         } else if (fmd_case_auto_acquit_isolated) {
 856                 /*
 857                  * To get here, there must be no faulty or acquitted suspects,
 858                  * but there must be at least one isolated suspect. Just acquit
 859                  * non-matching isolated suspects. If there are no matching
 860                  * isolated suspects, then re-fault all matching suspects.
 861                  */
 862                 for (i = 0; i < new_cip->ci_nsuspects; i++)
 863                         if (new_susp_state[i] == SUSPECT_STATE_ISOLATED)
 864                                 got_isolated_overlap = 1;
 865                 if (!got_isolated_overlap)
 866                         fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 867                             fmd_case_fault_all_matching, &fca);
 868                 fmd_asru_hash_apply_by_case(fmd.d_asrus, old_cp,
 869                     fmd_case_acquit_no_match, &fca);
 870         }
 871 
 872         /*
 873          * If we've updated anything in the old case, call fmd_case_update()
 874          */
 875         if (do_update)
 876                 fmd_case_update(old_cp);
 877 }
 878 
 879 /*
 880  * Convict suspects in a case by applying a conviction policy and updating the
 881  * resource cache prior to emitting the list.suspect event for the given case.
 882  * At present, our policy is very simple: convict every suspect in the case.
 883  * In the future, this policy can be extended and made configurable to permit:
 884  *
 885  * - convicting the suspect with the highest FIT rate
 886  * - convicting the suspect with the cheapest FRU
 887  * - convicting the suspect with the FRU that is in a depot's inventory
 888  * - convicting the suspect with the longest lifetime
 889  *
 * and so forth.  A word to the wise: this problem is significantly harder than
 * it seems at first glance.  Future work should heed the following advice:
 892  *
 893  * Hacking the policy into C code here is a very bad idea.  The policy needs to
 894  * be decided upon very carefully and fundamentally encodes knowledge of what
 895  * suspect list combinations can be emitted by what diagnosis engines.  As such
 896  * fmd's code is the wrong location, because that would require fmd itself to
 897  * be updated for every diagnosis engine change, defeating the entire design.
 898  * The FMA Event Registry knows the suspect list combinations: policy inputs
 899  * can be derived from it and used to produce per-module policy configuration.
 900  *
 901  * If the policy needs to be dynamic and not statically fixed at either fmd
 902  * startup or module load time, any implementation of dynamic policy retrieval
 903  * must employ some kind of caching mechanism or be part of a built-in module.
 904  * The fmd_case_convict() function is called with locks held inside of fmd and
 905  * is not a place where unbounded blocking on some inter-process or inter-
 906  * system communication to another service (e.g. another daemon) can occur.
 907  */
 908 static int
 909 fmd_case_convict(fmd_case_t *cp)
 910 {
 911         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
 912         fmd_asru_hash_t *ahp = fmd.d_asrus;
 913         int discard_new = 0, i;
 914         fmd_case_susp_t *cis;
 915         fmd_asru_link_t *alp;
 916         uint8_t *new_susp_state;
 917         uint8_t *new_match_state;
 918         int adjust_new = 0;
 919         fccd_t fccd;
 920         fmd_case_impl_t *ncp, **cps, **cpp;
 921         uint_t cpc;
 922         fmd_case_hash_t *chp;
 923 
 924         /*
 925          * First we must see if any matching cases already exist.
 926          */
 927         new_susp_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
 928         for (i = 0; i < cip->ci_nsuspects; i++)
 929                 new_susp_state[i] = 0;
 930         new_match_state = alloca(cip->ci_nsuspects * sizeof (uint8_t));
 931         for (i = 0; i < cip->ci_nsuspects; i++)
 932                 new_match_state[i] = 0;
 933         fccd.fccd_cip = cip;
 934         fccd.fccd_adjust_new = &adjust_new;
 935         fccd.fccd_new_susp_state = new_susp_state;
 936         fccd.fccd_new_match_state = new_match_state;
 937         fccd.fccd_discard_new = &discard_new;
 938 
 939         /*
 940          * Hold all cases
 941          */
 942         chp = fmd.d_cases;
 943         (void) pthread_rwlock_rdlock(&chp->ch_lock);
 944         cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
 945         cpc = chp->ch_count;
 946         for (i = 0; i < chp->ch_hashlen; i++)
 947                 for (ncp = chp->ch_hash[i]; ncp != NULL; ncp = ncp->ci_next)
 948                         *cpp++ = fmd_case_tryhold(ncp);
 949         ASSERT(cpp == cps + cpc);
 950         (void) pthread_rwlock_unlock(&chp->ch_lock);
 951 
 952         /*
 953          * Run fmd_case_check_for_dups() on all cases except the current one.
 954          */
 955         for (i = 0; i < cpc; i++) {
 956                 if (cps[i] != NULL) {
 957                         if (cps[i] != (fmd_case_impl_t *)cp)
 958                                 fmd_case_check_for_dups((fmd_case_t *)cps[i],
 959                                     &fccd);
 960                         fmd_case_rele((fmd_case_t *)cps[i]);
 961                 }
 962         }
 963         fmd_free(cps, cpc * sizeof (fmd_case_t *));
 964 
 965         (void) pthread_mutex_lock(&cip->ci_lock);
 966         if (cip->ci_code == NULL)
 967                 (void) fmd_case_mkcode(cp);
 968         else if (cip->ci_precanned)
 969                 fmd_case_code_hash_insert(fmd.d_cases, cip);
 970 
 971         if (discard_new) {
 972                 /*
 973                  * We've found an existing case that is a match and it is not
 974                  * already in repaired or resolved state. So we can close this
 975                  * one as a duplicate.
 976                  */
 977                 (void) pthread_mutex_unlock(&cip->ci_lock);
 978                 return (1);
 979         }
 980 
 981         /*
 982          * Allocate new cache entries
 983          */
 984         for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
 985                 if ((alp = fmd_asru_hash_create_entry(ahp,
 986                     cp, cis->cis_nvl)) == NULL) {
 987                         fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
 988                             "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
 989                         continue;
 990                 }
 991                 alp->al_flags |= FMD_ASRU_PRESENT;
 992                 alp->al_asru->asru_flags |= FMD_ASRU_PRESENT;
 993                 (void) fmd_asru_clrflags(alp, FMD_ASRU_UNUSABLE, 0);
 994                 (void) fmd_asru_setflags(alp, FMD_ASRU_FAULTY);
 995         }
 996 
 997         if (adjust_new) {
 998                 int some_suspect = 0, some_not_suspect = 0;
 999 
1000                 /*
1001                  * There is one or more matching case but they are already in
1002                  * repaired or resolved state. So we need to keep the new
1003                  * case, but we can adjust it. Repaired/removed/replaced
1004                  * suspects are unlikely to be to blame (unless there are
1005                  * actually two separate faults). So if we have a combination of
1006                  * repaired/replaced/removed suspects and acquitted suspects in
1007                  * the old lists, then we should acquit in the new list those
1008                  * that were repaired/replaced/removed in the old.
1009                  */
1010                 for (i = 0; i < cip->ci_nsuspects; i++) {
1011                         if ((new_susp_state[i] & SUSPECT_STATE_REPLACED) ||
1012                             (new_susp_state[i] & SUSPECT_STATE_REPAIRED) ||
1013                             (new_susp_state[i] & SUSPECT_STATE_REMOVED) ||
1014                             (new_match_state[i] & SUSPECT_STATE_NO_MATCH))
1015                                 some_not_suspect = 1;
1016                         else
1017                                 some_suspect = 1;
1018                 }
1019                 if (some_suspect && some_not_suspect) {
1020                         for (cis = cip->ci_suspects, i = 0; cis != NULL;
1021                             cis = cis->cis_next, i++)
1022                                 if ((new_susp_state[i] &
1023                                     SUSPECT_STATE_REPLACED) ||
1024                                     (new_susp_state[i] &
1025                                     SUSPECT_STATE_REPAIRED) ||
1026                                     (new_susp_state[i] &
1027                                     SUSPECT_STATE_REMOVED) ||
1028                                     (new_match_state[i] &
1029                                     SUSPECT_STATE_NO_MATCH))
1030                                         fmd_asru_hash_apply_by_case(fmd.d_asrus,
1031                                             cp, fmd_case_acquit_suspect,
1032                                             cis->cis_nvl);
1033                 }
1034         }
1035 
1036         (void) pthread_mutex_unlock(&cip->ci_lock);
1037         return (0);
1038 }
1039 
1040 void
1041 fmd_case_publish(fmd_case_t *cp, uint_t state)
1042 {
1043         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1044         fmd_event_t *e;
1045         nvlist_t *nvl;
1046         char *class;
1047 
1048         if (state == FMD_CASE_CURRENT)
1049                 state = cip->ci_state; /* use current state */
1050 
1051         switch (state) {
1052         case FMD_CASE_SOLVED:
1053                 (void) pthread_mutex_lock(&cip->ci_lock);
1054 
1055                 /*
1056                  * If we already have a code, then case is already solved.
1057                  */
1058                 if (cip->ci_precanned == 0 && cip->ci_xprt == NULL &&
1059                     cip->ci_code != NULL) {
1060                         (void) pthread_mutex_unlock(&cip->ci_lock);
1061                         break;
1062                 }
1063 
1064                 if (cip->ci_tv_valid == 0) {
1065                         fmd_time_gettimeofday(&cip->ci_tv);
1066                         cip->ci_tv_valid = 1;
1067                 }
1068                 (void) pthread_mutex_unlock(&cip->ci_lock);
1069 
1070                 if (fmd_case_convict(cp) == 1) { /* dupclose */
1071                         cip->ci_flags &= ~FMD_CF_SOLVED;
1072                         fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, 0);
1073                         break;
1074                 }
1075                 if (cip->ci_xprt != NULL) {
1076                         /*
1077                          * For proxy, save some information about the transport
1078                          * in the resource cache.
1079                          */
1080                         int count = 0;
1081                         fmd_asru_set_on_proxy_t fasp;
1082                         fmd_xprt_impl_t *xip = (fmd_xprt_impl_t *)cip->ci_xprt;
1083 
1084                         fasp.fasp_countp = &count;
1085                         fasp.fasp_maxcount = cip->ci_nsuspects;
1086                         fasp.fasp_proxy_asru = cip->ci_proxy_asru;
1087                         fasp.fasp_proxy_external = xip->xi_flags &
1088                             FMD_XPRT_EXTERNAL;
1089                         fasp.fasp_proxy_rdonly = ((xip->xi_flags &
1090                             FMD_XPRT_RDWR) == FMD_XPRT_RDONLY);
1091                         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1092                             fmd_asru_set_on_proxy, &fasp);
1093                 }
1094                 nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
1095                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1096 
1097                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1098                 (void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1099                 fmd_log_append(fmd.d_fltlog, e, cp);
1100                 (void) pthread_rwlock_unlock(&fmd.d_log_lock);
1101                 fmd_dispq_dispatch(fmd.d_disp, e, class);
1102 
1103                 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1104                 cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
1105                 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1106 
1107                 break;
1108 
1109         case FMD_CASE_CLOSE_WAIT:
1110                 fmd_case_hold(cp);
1111                 e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
1112                 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1113 
1114                 (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1115                 cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
1116                 (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1117 
1118                 break;
1119 
1120         case FMD_CASE_CLOSED:
1121                 nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
1122                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1123                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1124                 fmd_dispq_dispatch(fmd.d_disp, e, class);
1125                 break;
1126 
1127         case FMD_CASE_REPAIRED:
1128                 nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
1129                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1130                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1131                 (void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1132                 fmd_log_append(fmd.d_fltlog, e, cp);
1133                 (void) pthread_rwlock_unlock(&fmd.d_log_lock);
1134                 fmd_dispq_dispatch(fmd.d_disp, e, class);
1135                 break;
1136 
1137         case FMD_CASE_RESOLVED:
1138                 nvl = fmd_case_mkevent(cp, FM_LIST_RESOLVED_CLASS);
1139                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
1140                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
1141                 (void) pthread_rwlock_rdlock(&fmd.d_log_lock);
1142                 fmd_log_append(fmd.d_fltlog, e, cp);
1143                 (void) pthread_rwlock_unlock(&fmd.d_log_lock);
1144                 fmd_dispq_dispatch(fmd.d_disp, e, class);
1145                 break;
1146         }
1147 }
1148 
1149 fmd_case_t *
1150 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
1151 {
1152         fmd_case_impl_t *cip;
1153         uint_t h;
1154 
1155         (void) pthread_rwlock_rdlock(&chp->ch_lock);
1156         h = fmd_strhash(uuid) % chp->ch_hashlen;
1157 
1158         for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
1159                 if (strcmp(cip->ci_uuid, uuid) == 0)
1160                         break;
1161         }
1162 
1163         /*
1164          * If deleting bit is set, treat the case as if it doesn't exist.
1165          */
1166         if (cip != NULL)
1167                 cip = fmd_case_tryhold(cip);
1168 
1169         if (cip == NULL)
1170                 (void) fmd_set_errno(EFMD_CASE_INVAL);
1171 
1172         (void) pthread_rwlock_unlock(&chp->ch_lock);
1173         return ((fmd_case_t *)cip);
1174 }
1175 
1176 static fmd_case_impl_t *
1177 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1178 {
1179         fmd_case_impl_t *eip;
1180         uint_t h;
1181 
1182         (void) pthread_rwlock_wrlock(&chp->ch_lock);
1183         h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1184 
1185         for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
1186                 if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0 &&
1187                     fmd_case_tryhold(eip) != NULL) {
1188                         (void) pthread_rwlock_unlock(&chp->ch_lock);
1189                         return (eip); /* uuid already present */
1190                 }
1191         }
1192 
1193         cip->ci_next = chp->ch_hash[h];
1194         chp->ch_hash[h] = cip;
1195 
1196         chp->ch_count++;
1197         ASSERT(chp->ch_count != 0);
1198 
1199         (void) pthread_rwlock_unlock(&chp->ch_lock);
1200         return (cip);
1201 }
1202 
1203 static void
1204 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
1205 {
1206         fmd_case_impl_t *cp, **pp;
1207         uint_t h;
1208 
1209         ASSERT(MUTEX_HELD(&cip->ci_lock));
1210 
1211         cip->ci_flags |= FMD_CF_DELETING;
1212         (void) pthread_mutex_unlock(&cip->ci_lock);
1213 
1214         (void) pthread_rwlock_wrlock(&chp->ch_lock);
1215 
1216         h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
1217         pp = &chp->ch_hash[h];
1218 
1219         for (cp = *pp; cp != NULL; cp = cp->ci_next) {
1220                 if (cp != cip)
1221                         pp = &cp->ci_next;
1222                 else
1223                         break;
1224         }
1225 
1226         if (cp == NULL) {
1227                 fmd_panic("case %p (%s) not found on hash chain %u\n",
1228                     (void *)cip, cip->ci_uuid, h);
1229         }
1230 
1231         *pp = cp->ci_next;
1232         cp->ci_next = NULL;
1233 
1234         /*
1235          * delete from code hash if it is on it
1236          */
1237         fmd_case_code_hash_delete(chp, cip);
1238 
1239         ASSERT(chp->ch_count != 0);
1240         chp->ch_count--;
1241 
1242         (void) pthread_rwlock_unlock(&chp->ch_lock);
1243 
1244         (void) pthread_mutex_lock(&cip->ci_lock);
1245         ASSERT(cip->ci_flags & FMD_CF_DELETING);
1246 }
1247 
1248 fmd_case_t *
1249 fmd_case_create(fmd_module_t *mp, const char *uuidstr, void *data)
1250 {
1251         fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1252         fmd_case_impl_t *eip = NULL;
1253         uuid_t uuid;
1254 
1255         (void) pthread_mutex_init(&cip->ci_lock, NULL);
1256         fmd_buf_hash_create(&cip->ci_bufs);
1257 
1258         fmd_module_hold(mp);
1259         cip->ci_mod = mp;
1260         cip->ci_refs = 1;
1261         cip->ci_state = FMD_CASE_UNSOLVED;
1262         cip->ci_flags = FMD_CF_DIRTY;
1263         cip->ci_data = data;
1264 
1265         /*
1266          * Calling libuuid: get a clue.  The library interfaces cleverly do not
1267          * define any constant for the length of an unparse string, and do not
1268          * permit the caller to specify a buffer length for safety.  The spec
1269          * says it will be 36 bytes, but we make it tunable just in case.
1270          */
1271         (void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
1272         cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
1273 
1274         if (uuidstr == NULL) {
1275                 /*
1276                  * We expect this loop to execute only once, but code it
1277                  * defensively against the possibility of libuuid bugs.
1278                  * Keep generating uuids and attempting to do a hash insert
1279                  * until we get a unique one.
1280                  */
1281                 do {
1282                         if (eip != NULL)
1283                                 fmd_case_rele((fmd_case_t *)eip);
1284                         uuid_generate(uuid);
1285                         uuid_unparse(uuid, cip->ci_uuid);
1286                 } while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
1287         } else {
1288                 /*
1289                  * If a uuid was specified we must succeed with that uuid,
1290                  * or return NULL indicating a case with that uuid already
1291                  * exists.
1292                  */
1293                 (void) strncpy(cip->ci_uuid, uuidstr, cip->ci_uuidlen + 1);
1294                 if (fmd_case_hash_insert(fmd.d_cases, cip) != cip) {
1295                         fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1296                         (void) fmd_buf_hash_destroy(&cip->ci_bufs);
1297                         fmd_module_rele(mp);
1298                         pthread_mutex_destroy(&cip->ci_lock);
1299                         fmd_free(cip, sizeof (*cip));
1300                         return (NULL);
1301                 }
1302         }
1303 
1304         ASSERT(fmd_module_locked(mp));
1305         fmd_list_append(&mp->mod_cases, cip);
1306         fmd_module_setcdirty(mp);
1307 
1308         (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1309         cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1310         (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1311 
1312         return ((fmd_case_t *)cip);
1313 }
1314 
1315 static void
1316 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
1317 {
1318         fmd_case_susp_t *cis, *ncis;
1319 
1320         ASSERT(MUTEX_HELD(&cip->ci_lock));
1321 
1322         if (cip->ci_proxy_asru)
1323                 fmd_free(cip->ci_proxy_asru, sizeof (uint8_t) *
1324                     cip->ci_nsuspects);
1325         nvlist_free(cip->ci_diag_de);
1326         if (cip->ci_diag_asru)
1327                 fmd_free(cip->ci_diag_asru, sizeof (uint8_t) *
1328                     cip->ci_nsuspects);
1329 
1330         for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
1331                 ncis = cis->cis_next;
1332                 nvlist_free(cis->cis_nvl);
1333                 fmd_free(cis, sizeof (fmd_case_susp_t));
1334         }
1335 
1336         cip->ci_suspects = NULL;
1337         cip->ci_nsuspects = 0;
1338 }
1339 
1340 fmd_case_t *
1341 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
1342     uint_t state, const char *uuid, const char *code)
1343 {
1344         fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
1345         fmd_case_impl_t *eip;
1346 
1347         (void) pthread_mutex_init(&cip->ci_lock, NULL);
1348         fmd_buf_hash_create(&cip->ci_bufs);
1349 
1350         fmd_module_hold(mp);
1351         cip->ci_mod = mp;
1352         cip->ci_xprt = xp;
1353         cip->ci_refs = 1;
1354         cip->ci_state = state;
1355         cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
1356         cip->ci_uuidlen = strlen(cip->ci_uuid);
1357         cip->ci_code = fmd_strdup(code, FMD_SLEEP);
1358         cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
1359 
1360         if (state > FMD_CASE_CLOSE_WAIT)
1361                 cip->ci_flags |= FMD_CF_SOLVED;
1362 
1363         /*
1364          * Insert the case into the global case hash.  If the specified UUID is
1365          * already present, check to see if it is an orphan: if so, reclaim it;
1366          * otherwise if it is owned by a different module then return NULL.
1367          */
1368         if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
1369                 (void) pthread_mutex_lock(&cip->ci_lock);
1370                 cip->ci_refs--; /* decrement to zero */
1371                 fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
1372 
1373                 cip = eip; /* switch 'cip' to the existing case */
1374                 (void) pthread_mutex_lock(&cip->ci_lock);
1375 
1376                 /*
1377                  * If the ASRU cache is trying to recreate an orphan, then just
1378                  * return the existing case that we found without changing it.
1379                  */
1380                 if (mp == fmd.d_rmod) {
1381                         /*
1382                          * In case the case has already been created from
1383                          * a checkpoint file we need to set up code now.
1384                          */
1385                         if (cip->ci_state < FMD_CASE_CLOSED) {
1386                                 if (code != NULL && cip->ci_code == NULL) {
1387                                         cip->ci_code = fmd_strdup(code,
1388                                             FMD_SLEEP);
1389                                         cip->ci_codelen = cip->ci_code ?
1390                                             strlen(cip->ci_code) + 1 : 0;
1391                                         fmd_case_code_hash_insert(fmd.d_cases,
1392                                             cip);
1393                                 }
1394                         }
1395 
1396                         /*
1397                          * When recreating an orphan case, state passed in may
1398                          * be CLOSED (faulty) or REPAIRED/RESOLVED (!faulty). If
1399                          * any suspects are still CLOSED (faulty) then the
1400                          * overall state needs to be CLOSED.
1401                          */
1402                         if ((cip->ci_state == FMD_CASE_REPAIRED ||
1403                             cip->ci_state == FMD_CASE_RESOLVED) &&
1404                             state == FMD_CASE_CLOSED)
1405                                 cip->ci_state = FMD_CASE_CLOSED;
1406                         (void) pthread_mutex_unlock(&cip->ci_lock);
1407                         fmd_case_rele((fmd_case_t *)cip);
1408                         return ((fmd_case_t *)cip);
1409                 }
1410 
1411                 /*
1412                  * If the existing case isn't an orphan or is being proxied,
1413                  * then we have a UUID conflict: return failure to the caller.
1414                  */
1415                 if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
1416                         (void) pthread_mutex_unlock(&cip->ci_lock);
1417                         fmd_case_rele((fmd_case_t *)cip);
1418                         return (NULL);
1419                 }
1420 
1421                 /*
1422                  * If the new module is reclaiming an orphaned case, remove
1423                  * the case from the root module, switch ci_mod, and then fall
1424                  * through to adding the case to the new owner module 'mp'.
1425                  */
1426                 fmd_module_lock(cip->ci_mod);
1427                 fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1428                 fmd_module_unlock(cip->ci_mod);
1429 
1430                 fmd_module_rele(cip->ci_mod);
1431                 cip->ci_mod = mp;
1432                 fmd_module_hold(mp);
1433 
1434                 /*
1435                  * It's possible that fmd crashed or was restarted during a
1436                  * previous solve operation between the asru cache being created
1437                  * and the ckpt file being updated to SOLVED. Thus when the DE
1438                  * recreates the case here from the checkpoint file, the state
1439                  * will be UNSOLVED and yet we are having to reclaim because
1440                  * the case was in the asru cache. If this happens, revert the
1441                  * case back to the UNSOLVED state and let the DE solve it again
1442                  */
1443                 if (state == FMD_CASE_UNSOLVED) {
1444                         fmd_asru_hash_delete_case(fmd.d_asrus,
1445                             (fmd_case_t *)cip);
1446                         fmd_case_destroy_suspects(cip);
1447                         fmd_case_code_hash_delete(fmd.d_cases, cip);
1448                         fmd_free(cip->ci_code, cip->ci_codelen);
1449                         cip->ci_code = NULL;
1450                         cip->ci_codelen = 0;
1451                         cip->ci_tv_valid = 0;
1452                 }
1453 
1454                 cip->ci_state = state;
1455 
1456                 (void) pthread_mutex_unlock(&cip->ci_lock);
1457                 fmd_case_rele((fmd_case_t *)cip);
1458         } else {
1459                 /*
1460                  * add into hash of solved cases
1461                  */
1462                 if (cip->ci_code)
1463                         fmd_case_code_hash_insert(fmd.d_cases, cip);
1464         }
1465 
1466         ASSERT(fmd_module_locked(mp));
1467         fmd_list_append(&mp->mod_cases, cip);
1468 
1469         (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
1470         cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
1471         (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
1472 
1473         return ((fmd_case_t *)cip);
1474 }
1475 
1476 void
1477 fmd_case_destroy(fmd_case_t *cp, int visible)
1478 {
1479         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1480         fmd_case_item_t *cit, *ncit;
1481 
1482         ASSERT(MUTEX_HELD(&cip->ci_lock));
1483         ASSERT(cip->ci_refs == 0);
1484 
1485         if (visible) {
1486                 TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
1487                 fmd_case_hash_delete(fmd.d_cases, cip);
1488         }
1489 
1490         for (cit = cip->ci_items; cit != NULL; cit = ncit) {
1491                 ncit = cit->cit_next;
1492                 fmd_event_rele(cit->cit_event);
1493                 fmd_free(cit, sizeof (fmd_case_item_t));
1494         }
1495 
1496         fmd_case_destroy_suspects(cip);
1497 
1498         if (cip->ci_principal != NULL)
1499                 fmd_event_rele(cip->ci_principal);
1500 
1501         fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
1502         fmd_free(cip->ci_code, cip->ci_codelen);
1503         (void) fmd_buf_hash_destroy(&cip->ci_bufs);
1504 
1505         fmd_module_rele(cip->ci_mod);
1506         fmd_free(cip, sizeof (fmd_case_impl_t));
1507 }
1508 
1509 void
1510 fmd_case_hold(fmd_case_t *cp)
1511 {
1512         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1513 
1514         (void) pthread_mutex_lock(&cip->ci_lock);
1515         fmd_case_hold_locked(cp);
1516         (void) pthread_mutex_unlock(&cip->ci_lock);
1517 }
1518 
1519 void
1520 fmd_case_hold_locked(fmd_case_t *cp)
1521 {
1522         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1523 
1524         ASSERT(MUTEX_HELD(&cip->ci_lock));
1525         if (cip->ci_flags & FMD_CF_DELETING)
1526                 fmd_panic("attempt to hold a deleting case %p (%s)\n",
1527                     (void *)cip, cip->ci_uuid);
1528         cip->ci_refs++;
1529         ASSERT(cip->ci_refs != 0);
1530 }
1531 
1532 static fmd_case_impl_t *
1533 fmd_case_tryhold(fmd_case_impl_t *cip)
1534 {
1535         /*
1536          * If the case's "deleting" bit is unset, hold and return case,
1537          * otherwise, return NULL.
1538          */
1539         (void) pthread_mutex_lock(&cip->ci_lock);
1540         if (cip->ci_flags & FMD_CF_DELETING) {
1541                 (void) pthread_mutex_unlock(&cip->ci_lock);
1542                 cip = NULL;
1543         } else {
1544                 fmd_case_hold_locked((fmd_case_t *)cip);
1545                 (void) pthread_mutex_unlock(&cip->ci_lock);
1546         }
1547         return (cip);
1548 }
1549 
1550 void
1551 fmd_case_rele(fmd_case_t *cp)
1552 {
1553         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1554 
1555         (void) pthread_mutex_lock(&cip->ci_lock);
1556         ASSERT(cip->ci_refs != 0);
1557 
1558         if (--cip->ci_refs == 0)
1559                 fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
1560         else
1561                 (void) pthread_mutex_unlock(&cip->ci_lock);
1562 }
1563 
1564 void
1565 fmd_case_rele_locked(fmd_case_t *cp)
1566 {
1567         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1568 
1569         ASSERT(MUTEX_HELD(&cip->ci_lock));
1570         --cip->ci_refs;
1571         ASSERT(cip->ci_refs != 0);
1572 }
1573 
1574 int
1575 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
1576 {
1577         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1578         fmd_case_item_t *cit;
1579         fmd_event_t *oep;
1580         uint_t state;
1581         int new;
1582 
1583         fmd_event_hold(ep);
1584         (void) pthread_mutex_lock(&cip->ci_lock);
1585 
1586         if (cip->ci_flags & FMD_CF_SOLVED)
1587                 state = FMD_EVS_DIAGNOSED;
1588         else
1589                 state = FMD_EVS_ACCEPTED;
1590 
1591         oep = cip->ci_principal;
1592         cip->ci_principal = ep;
1593 
1594         for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1595                 if (cit->cit_event == ep)
1596                         break;
1597         }
1598 
1599         cip->ci_flags |= FMD_CF_DIRTY;
1600         new = cit == NULL && ep != oep;
1601 
1602         (void) pthread_mutex_unlock(&cip->ci_lock);
1603 
1604         fmd_module_setcdirty(cip->ci_mod);
1605         fmd_event_transition(ep, state);
1606 
1607         if (oep != NULL)
1608                 fmd_event_rele(oep);
1609 
1610         return (new);
1611 }
1612 
1613 int
1614 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
1615 {
1616         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1617         fmd_case_item_t *cit;
1618         uint_t state;
1619         int new;
1620         boolean_t injected;
1621 
1622         (void) pthread_mutex_lock(&cip->ci_lock);
1623 
1624         if (cip->ci_flags & FMD_CF_SOLVED)
1625                 state = FMD_EVS_DIAGNOSED;
1626         else
1627                 state = FMD_EVS_ACCEPTED;
1628 
1629         for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
1630                 if (cit->cit_event == ep)
1631                         break;
1632         }
1633 
1634         new = cit == NULL && ep != cip->ci_principal;
1635 
1636         /*
1637          * If the event is already in the case or the case is already solved,
1638          * there is no reason to save it: just transition it appropriately.
1639          */
1640         if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
1641                 (void) pthread_mutex_unlock(&cip->ci_lock);
1642                 fmd_event_transition(ep, state);
1643                 return (new);
1644         }
1645 
1646         cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
1647         fmd_event_hold(ep);
1648 
1649         if (nvlist_lookup_boolean_value(((fmd_event_impl_t *)ep)->ev_nvl,
1650             "__injected", &injected) == 0 && injected)
1651                 fmd_case_set_injected(cp);
1652 
1653         cit->cit_next = cip->ci_items;
1654         cit->cit_event = ep;
1655 
1656         cip->ci_items = cit;
1657         cip->ci_nitems++;
1658 
1659         cip->ci_flags |= FMD_CF_DIRTY;
1660         (void) pthread_mutex_unlock(&cip->ci_lock);
1661 
1662         fmd_module_setcdirty(cip->ci_mod);
1663         fmd_event_transition(ep, state);
1664 
1665         return (new);
1666 }
1667 
1668 void
1669 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
1670 {
1671         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1672         fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1673 
1674         (void) pthread_mutex_lock(&cip->ci_lock);
1675         ASSERT(cip->ci_state < FMD_CASE_CLOSE_WAIT);
1676         cip->ci_flags |= FMD_CF_DIRTY;
1677 
1678         cis->cis_next = cip->ci_suspects;
1679         cis->cis_nvl = nvl;
1680 
1681         cip->ci_suspects = cis;
1682         cip->ci_nsuspects++;
1683 
1684         (void) pthread_mutex_unlock(&cip->ci_lock);
1685         if (cip->ci_xprt == NULL)
1686                 fmd_module_setcdirty(cip->ci_mod);
1687 }
1688 
1689 void
1690 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
1691 {
1692         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1693         fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
1694         boolean_t b;
1695 
1696         (void) pthread_mutex_lock(&cip->ci_lock);
1697 
1698         cis->cis_next = cip->ci_suspects;
1699         cis->cis_nvl = nvl;
1700 
1701         if (nvlist_lookup_boolean_value(nvl,
1702             FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
1703                 cip->ci_flags |= FMD_CF_INVISIBLE;
1704 
1705         cip->ci_suspects = cis;
1706         cip->ci_nsuspects++;
1707 
1708         (void) pthread_mutex_unlock(&cip->ci_lock);
1709 }
1710 
1711 void
1712 fmd_case_reset_suspects(fmd_case_t *cp)
1713 {
1714         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1715 
1716         (void) pthread_mutex_lock(&cip->ci_lock);
1717         ASSERT(cip->ci_state < FMD_CASE_SOLVED);
1718 
1719         fmd_case_destroy_suspects(cip);
1720         cip->ci_flags |= FMD_CF_DIRTY;
1721 
1722         (void) pthread_mutex_unlock(&cip->ci_lock);
1723         fmd_module_setcdirty(cip->ci_mod);
1724 }
1725 
1726 /*ARGSUSED*/
1727 static void
1728 fmd_case_unusable(fmd_asru_link_t *alp, void *arg)
1729 {
1730         (void) fmd_asru_setflags(alp, FMD_ASRU_UNUSABLE);
1731 }
1732 
1733 /*
1734  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
1735  * whatever actions and emit whatever events are appropriate for the state.
1736  * Refer to the topmost block comment explaining the state machine for details.
1737  */
1738 void
1739 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
1740 {
1741         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1742         fmd_case_item_t *cit;
1743         fmd_event_t *e;
1744         int resolved = 0;
1745         int any_unusable_and_present = 0;
1746 
1747         ASSERT(state <= FMD_CASE_RESOLVED);
1748         (void) pthread_mutex_lock(&cip->ci_lock);
1749 
1750         if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
1751                 flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED | FMD_CF_RESOLVED);
1752 
1753         cip->ci_flags |= flags;
1754 
1755         if (cip->ci_state >= state) {
1756                 (void) pthread_mutex_unlock(&cip->ci_lock);
1757                 return; /* already in specified state */
1758         }
1759 
1760         TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1761             _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
1762 
1763         cip->ci_state = state;
1764         cip->ci_flags |= FMD_CF_DIRTY;
1765 
1766         if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
1767                 fmd_module_setcdirty(cip->ci_mod);
1768 
1769         switch (state) {
1770         case FMD_CASE_SOLVED:
1771                 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
1772                         fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
1773 
1774                 if (cip->ci_principal != NULL) {
1775                         fmd_event_transition(cip->ci_principal,
1776                             FMD_EVS_DIAGNOSED);
1777                 }
1778                 break;
1779 
1780         case FMD_CASE_CLOSE_WAIT:
1781                 /*
1782                  * If the case was never solved, do not change ASRUs.
1783                  * If the case was never fmd_case_closed, do not change ASRUs.
1784                  * If the case was repaired, do not change ASRUs.
1785                  */
1786                 if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
1787                     FMD_CF_REPAIRED)) == (FMD_CF_SOLVED | FMD_CF_ISOLATED))
1788                         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1789                             fmd_case_unusable, NULL);
1790 
1791                 /*
1792                  * If an orphaned case transitions to CLOSE_WAIT, the owning
1793                  * module is no longer loaded: continue on to CASE_CLOSED or
1794                  * CASE_REPAIRED as appropriate.
1795                  */
1796                 if (fmd_case_orphaned(cp)) {
1797                         if (cip->ci_flags & FMD_CF_REPAIRED) {
1798                                 state = cip->ci_state = FMD_CASE_REPAIRED;
1799                                 TRACE((FMD_DBG_CASE, "case %s %s->%s",
1800                                     cip->ci_uuid,
1801                                     _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1802                                     _fmd_case_snames[FMD_CASE_REPAIRED]));
1803                                 goto do_repair;
1804                         } else {
1805                                 state = cip->ci_state = FMD_CASE_CLOSED;
1806                                 TRACE((FMD_DBG_CASE, "case %s %s->%s",
1807                                     cip->ci_uuid,
1808                                     _fmd_case_snames[FMD_CASE_CLOSE_WAIT],
1809                                     _fmd_case_snames[FMD_CASE_CLOSED]));
1810                         }
1811                 }
1812                 break;
1813 
1814         case FMD_CASE_REPAIRED:
1815 do_repair:
1816                 ASSERT(cip->ci_xprt != NULL || fmd_case_orphaned(cp));
1817 
1818                 /*
1819                  * If we've been requested to transition straight on to the
1820                  * RESOLVED state (which can happen with fault proxying where a
1821                  * list.resolved or a uuresolved is received from the other
1822                  * side), or if all suspects are already either usable or not
1823                  * present then transition straight to RESOLVED state,
1824                  * publishing both the list.repaired and list.resolved. For a
1825                  * proxy, if we discover here that all suspects are already
1826                  * either usable or not present, notify the diag side instead
1827                  * using fmd_xprt_uuresolved().
1828                  */
1829                 if (flags & FMD_CF_RESOLVED) {
1830                         if (cip->ci_xprt != NULL)
1831                                 fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1832                 } else {
1833                         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1834                             fmd_case_unusable_and_present,
1835                             &any_unusable_and_present);
1836                         if (any_unusable_and_present)
1837                                 break;
1838                         if (cip->ci_xprt != NULL) {
1839                                 fmd_xprt_uuresolved(cip->ci_xprt, cip->ci_uuid);
1840                                 break;
1841                         }
1842                 }
1843 
1844                 cip->ci_state = FMD_CASE_RESOLVED;
1845                 (void) pthread_mutex_unlock(&cip->ci_lock);
1846                 fmd_case_publish(cp, state);
1847                 TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
1848                     _fmd_case_snames[FMD_CASE_REPAIRED],
1849                     _fmd_case_snames[FMD_CASE_RESOLVED]));
1850                 state = FMD_CASE_RESOLVED;
1851                 resolved = 1;
1852                 (void) pthread_mutex_lock(&cip->ci_lock);
1853                 break;
1854 
1855         case FMD_CASE_RESOLVED:
1856                 /*
1857                  * For a proxy, no need to check that all suspects are already
1858                  * either usable or not present - this request has come from
1859                  * the diagnosing side which makes the final decision on this.
1860                  */
1861                 if (cip->ci_xprt != NULL) {
1862                         fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1863                         resolved = 1;
1864                         break;
1865                 }
1866 
1867                 ASSERT(fmd_case_orphaned(cp));
1868 
1869                 /*
1870                  * If all suspects are already either usable or not present then
1871                  * carry on, publish list.resolved and discard the case.
1872                  */
1873                 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1874                     fmd_case_unusable_and_present, &any_unusable_and_present);
1875                 if (any_unusable_and_present) {
1876                         (void) pthread_mutex_unlock(&cip->ci_lock);
1877                         return;
1878                 }
1879 
1880                 resolved = 1;
1881                 break;
1882         }
1883 
1884         (void) pthread_mutex_unlock(&cip->ci_lock);
1885 
1886         /*
1887          * If the module has initialized, then publish the appropriate event
1888          * for the new case state.  If not, we are being called from the
1889          * checkpoint code during module load, in which case the module's
1890          * _fmd_init() routine hasn't finished yet, and our event dictionaries
1891          * may not be open yet, which will prevent us from computing the event
1892          * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
1893          * event in our queue: this won't be processed until _fmd_init is done.
1894          */
1895         if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
1896                 fmd_case_publish(cp, state);
1897         else {
1898                 fmd_case_hold(cp);
1899                 e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
1900                 fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
1901         }
1902 
1903         if (resolved) {
1904                 if (cip->ci_xprt != NULL) {
1905                         /*
1906                          * If we transitioned to RESOLVED, adjust the reference
1907                          * count to reflect our removal from
1908                          * fmd.d_rmod->mod_cases above.  If the caller has not
1909                          * placed an additional hold on the case, it will now
1910                          * be freed.
1911                          */
1912                         (void) pthread_mutex_lock(&cip->ci_lock);
1913                         fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1914                         (void) pthread_mutex_unlock(&cip->ci_lock);
1915                         fmd_case_rele(cp);
1916                 } else {
1917                         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1918                             fmd_asru_log_resolved, NULL);
1919                         (void) pthread_mutex_lock(&cip->ci_lock);
1920                         /* mark as "ready to be discarded */
1921                         cip->ci_flags |= FMD_CF_RES_CMPL;
1922                         (void) pthread_mutex_unlock(&cip->ci_lock);
1923                 }
1924         }
1925 }
1926 
1927 /*
1928  * Discard any case if it is in RESOLVED state (and if check_if_aged argument
1929  * is set if all suspects have passed the rsrc.aged time).
1930  */
1931 void
1932 fmd_case_discard_resolved(fmd_case_t *cp, void *arg)
1933 {
1934         int check_if_aged = *(int *)arg;
1935         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1936 
1937         /*
1938          * First check if case has completed transition to resolved.
1939          */
1940         (void) pthread_mutex_lock(&cip->ci_lock);
1941         if (!(cip->ci_flags & FMD_CF_RES_CMPL)) {
1942                 (void) pthread_mutex_unlock(&cip->ci_lock);
1943                 return;
1944         }
1945 
1946         /*
1947          * Now if check_is_aged is set, see if all suspects have aged.
1948          */
1949         if (check_if_aged) {
1950                 int aged = 1;
1951 
1952                 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
1953                     fmd_asru_check_if_aged, &aged);
1954                 if (!aged) {
1955                         (void) pthread_mutex_unlock(&cip->ci_lock);
1956                         return;
1957                 }
1958         }
1959 
1960         /*
1961          * Finally discard the case, clearing FMD_CF_RES_CMPL so we don't
1962          * do it twice.
1963          */
1964         fmd_module_lock(cip->ci_mod);
1965         fmd_list_delete(&cip->ci_mod->mod_cases, cip);
1966         fmd_module_unlock(cip->ci_mod);
1967         fmd_asru_hash_delete_case(fmd.d_asrus, cp);
1968         cip->ci_flags &= ~FMD_CF_RES_CMPL;
1969         (void) pthread_mutex_unlock(&cip->ci_lock);
1970         fmd_case_rele(cp);
1971 }
1972 
1973 /*
1974  * Transition the specified case to *at least* the specified state by first
1975  * re-validating the suspect list using the resource cache.  This function is
1976  * employed by the checkpoint code when restoring a saved, solved case to see
1977  * if the state of the case has effectively changed while fmd was not running
1978  * or the module was not loaded.
1979  */
1980 void
1981 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
1982 {
1983         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
1984 
1985         int usable = 0;         /* are any suspects usable? */
1986 
1987         ASSERT(state >= FMD_CASE_SOLVED);
1988         (void) pthread_mutex_lock(&cip->ci_lock);
1989 
1990         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_usable, &usable);
1991 
1992         (void) pthread_mutex_unlock(&cip->ci_lock);
1993 
1994         if (!usable) {
1995                 state = MAX(state, FMD_CASE_CLOSE_WAIT);
1996                 flags |= FMD_CF_ISOLATED;
1997         }
1998 
1999         fmd_case_transition(cp, state, flags);
2000 }
2001 
2002 void
2003 fmd_case_setdirty(fmd_case_t *cp)
2004 {
2005         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2006 
2007         (void) pthread_mutex_lock(&cip->ci_lock);
2008         cip->ci_flags |= FMD_CF_DIRTY;
2009         (void) pthread_mutex_unlock(&cip->ci_lock);
2010 
2011         fmd_module_setcdirty(cip->ci_mod);
2012 }
2013 
2014 void
2015 fmd_case_clrdirty(fmd_case_t *cp)
2016 {
2017         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2018 
2019         (void) pthread_mutex_lock(&cip->ci_lock);
2020         cip->ci_flags &= ~FMD_CF_DIRTY;
2021         (void) pthread_mutex_unlock(&cip->ci_lock);
2022 }
2023 
2024 void
2025 fmd_case_commit(fmd_case_t *cp)
2026 {
2027         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2028         fmd_case_item_t *cit;
2029 
2030         (void) pthread_mutex_lock(&cip->ci_lock);
2031 
2032         if (cip->ci_flags & FMD_CF_DIRTY) {
2033                 for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
2034                         fmd_event_commit(cit->cit_event);
2035 
2036                 if (cip->ci_principal != NULL)
2037                         fmd_event_commit(cip->ci_principal);
2038 
2039                 fmd_buf_hash_commit(&cip->ci_bufs);
2040                 cip->ci_flags &= ~FMD_CF_DIRTY;
2041         }
2042 
2043         (void) pthread_mutex_unlock(&cip->ci_lock);
2044 }
2045 
2046 /*
2047  * On proxy side, send back repair/acquit/etc request to diagnosing side
2048  */
2049 void
2050 fmd_case_xprt_updated(fmd_case_t *cp)
2051 {
2052         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2053         nvlist_t **nva;
2054         uint8_t *ba;
2055         int msg = B_TRUE;
2056         int count = 0;
2057         fmd_case_lst_t fcl;
2058 
2059         ASSERT(cip->ci_xprt != NULL);
2060         (void) pthread_mutex_lock(&cip->ci_lock);
2061         ba = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
2062         nva = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
2063         fcl.fcl_countp = &count;
2064         fcl.fcl_maxcount = cip->ci_nsuspects;
2065         fcl.fcl_msgp = &msg;
2066         fcl.fcl_ba = ba;
2067         fcl.fcl_nva = nva;
2068         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_set_lst, &fcl);
2069         (void) pthread_mutex_unlock(&cip->ci_lock);
2070         fmd_xprt_updated(cip->ci_xprt, cip->ci_uuid, ba, cip->ci_proxy_asru,
2071             count);
2072 }
2073 
2074 /*
2075  * fmd_case_update_status() can be called on either the proxy side when a
2076  * list.suspect is received, or on the diagnosing side when an update request
2077  * is received from the proxy. It updates the status in the resource cache.
2078  */
2079 void
2080 fmd_case_update_status(fmd_case_t *cp, uint8_t *statusp, uint8_t *proxy_asrup,
2081     uint8_t *diag_asrup)
2082 {
2083         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2084         int count = 0;
2085         fmd_asru_update_status_t faus;
2086 
2087         /*
2088          * update status of resource cache entries
2089          */
2090         faus.faus_countp = &count;
2091         faus.faus_maxcount = cip->ci_nsuspects;
2092         faus.faus_ba = statusp;
2093         faus.faus_proxy_asru = proxy_asrup;
2094         faus.faus_diag_asru = diag_asrup;
2095         faus.faus_is_proxy = (cip->ci_xprt != NULL);
2096         (void) pthread_mutex_lock(&cip->ci_lock);
2097         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_update_status,
2098             &faus);
2099         (void) pthread_mutex_unlock(&cip->ci_lock);
2100 }
2101 
2102 /*
2103  * Called on either the proxy side or the diag side when a repair has taken
2104  * place on the other side but this side may know the asru "contains"
2105  * relationships.
2106  */
2107 void
2108 fmd_case_update_containees(fmd_case_t *cp)
2109 {
2110         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2111 
2112         (void) pthread_mutex_lock(&cip->ci_lock);
2113         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2114             fmd_asru_update_containees, NULL);
2115         (void) pthread_mutex_unlock(&cip->ci_lock);
2116 }
2117 
2118 /*
2119  * fmd_case_close_status() is called on diagnosing side when proxy side
2120  * has had a uuclose. It updates the status in the resource cache.
2121  */
2122 void
2123 fmd_case_close_status(fmd_case_t *cp)
2124 {
2125         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2126         int count = 0;
2127         fmd_asru_close_status_t facs;
2128 
2129         /*
2130          * update status of resource cache entries
2131          */
2132         facs.facs_countp = &count;
2133         facs.facs_maxcount = cip->ci_nsuspects;
2134         (void) pthread_mutex_lock(&cip->ci_lock);
2135         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_close_status,
2136             &facs);
2137         (void) pthread_mutex_unlock(&cip->ci_lock);
2138 }
2139 
2140 /*
2141  * Indicate that the case may need to change state because one or more of the
2142  * ASRUs named as a suspect has changed state.  We examine all the suspects
2143  * and if none are still faulty, we initiate a case close transition.
2144  */
2145 void
2146 fmd_case_update(fmd_case_t *cp)
2147 {
2148         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2149         uint_t cstate;
2150         int faulty = 0;
2151 
2152         (void) pthread_mutex_lock(&cip->ci_lock);
2153         cstate = cip->ci_state;
2154 
2155         if (cip->ci_state < FMD_CASE_SOLVED) {
2156                 (void) pthread_mutex_unlock(&cip->ci_lock);
2157                 return; /* update is not appropriate */
2158         }
2159 
2160         if (cip->ci_flags & FMD_CF_REPAIRED) {
2161                 (void) pthread_mutex_unlock(&cip->ci_lock);
2162                 return; /* already repaired */
2163         }
2164 
2165         TRACE((FMD_DBG_CASE, "case update %s", cip->ci_uuid));
2166         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2167         (void) pthread_mutex_unlock(&cip->ci_lock);
2168 
2169         if (faulty) {
2170                 nvlist_t *nvl;
2171                 fmd_event_t *e;
2172                 char *class;
2173 
2174                 TRACE((FMD_DBG_CASE, "sending list.updated %s", cip->ci_uuid));
2175                 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2176                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2177                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2178                 (void) pthread_rwlock_rdlock(&fmd.d_log_lock);
2179                 fmd_log_append(fmd.d_fltlog, e, cp);
2180                 (void) pthread_rwlock_unlock(&fmd.d_log_lock);
2181                 fmd_dispq_dispatch(fmd.d_disp, e, class);
2182                 return; /* one or more suspects are still marked faulty */
2183         }
2184 
2185         if (cstate == FMD_CASE_CLOSED)
2186                 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2187         else
2188                 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2189 }
2190 
2191 /*
2192  * Delete a closed case from the module's case list once the fmdo_close() entry
2193  * point has run to completion.  If the case is owned by a transport module,
2194  * tell the transport to proxy a case close on the other end of the transport.
2195  * Transition to the appropriate next state based on ci_flags.  This
2196  * function represents the end of CLOSE_WAIT and transitions the case to either
2197  * CLOSED or REPAIRED or discards it entirely because it was never solved;
2198  * refer to the topmost block comment explaining the state machine for details.
2199  */
2200 void
2201 fmd_case_delete(fmd_case_t *cp)
2202 {
2203         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2204         fmd_modstat_t *msp;
2205         size_t buftotal;
2206 
2207         TRACE((FMD_DBG_CASE, "case delete %s", cip->ci_uuid));
2208         ASSERT(fmd_module_locked(cip->ci_mod));
2209         fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2210         buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
2211 
2212         (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2213         msp = cip->ci_mod->mod_stats;
2214 
2215         ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
2216         msp->ms_caseopen.fmds_value.ui64--;
2217 
2218         ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
2219         msp->ms_buftotal.fmds_value.ui64 -= buftotal;
2220 
2221         (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2222 
2223         if (cip->ci_xprt == NULL)
2224                 fmd_module_setcdirty(cip->ci_mod);
2225 
2226         fmd_module_rele(cip->ci_mod);
2227         cip->ci_mod = fmd.d_rmod;
2228         fmd_module_hold(cip->ci_mod);
2229 
2230         /*
2231          * If the case has been solved, then retain it
2232          * on the root module's case list at least until we're transitioned.
2233          * Otherwise free the case with our final fmd_case_rele() below.
2234          */
2235         if (cip->ci_flags & FMD_CF_SOLVED) {
2236                 fmd_module_lock(cip->ci_mod);
2237                 fmd_list_append(&cip->ci_mod->mod_cases, cip);
2238                 fmd_module_unlock(cip->ci_mod);
2239                 fmd_case_hold(cp);
2240         }
2241 
2242         /*
2243          * Transition onwards to REPAIRED or CLOSED as originally requested.
2244          * Note that for proxy case if we're transitioning to CLOSED it means
2245          * the case was isolated locally, so call fmd_xprt_uuclose() to notify
2246          * the diagnosing side. No need to notify the diagnosing side if we are
2247          * transitioning to REPAIRED as we only do this when requested to do
2248          * so by the diagnosing side anyway.
2249          */
2250         if (cip->ci_flags & FMD_CF_REPAIRED)
2251                 fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
2252         else if (cip->ci_flags & FMD_CF_ISOLATED) {
2253                 fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
2254                 if (cip->ci_xprt != NULL)
2255                         fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
2256         }
2257 
2258         fmd_case_rele(cp);
2259 }
2260 
2261 void
2262 fmd_case_discard(fmd_case_t *cp, boolean_t delete_from_asru_cache)
2263 {
2264         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2265 
2266         (void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
2267         cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
2268         (void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
2269 
2270         ASSERT(fmd_module_locked(cip->ci_mod));
2271         fmd_list_delete(&cip->ci_mod->mod_cases, cip);
2272         if (delete_from_asru_cache) {
2273                 (void) pthread_mutex_lock(&cip->ci_lock);
2274                 fmd_asru_hash_delete_case(fmd.d_asrus, cp);
2275                 (void) pthread_mutex_unlock(&cip->ci_lock);
2276         }
2277         fmd_case_rele(cp);
2278 }
2279 
2280 /*
2281  * Indicate that the problem corresponding to a case has been repaired by
2282  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
2283  * already been closed, this function initiates the transition to CLOSE_WAIT.
2284  * The caller must have the case held from fmd_case_hash_lookup(), so we can
2285  * grab and drop ci_lock without the case being able to be freed in between.
2286  */
2287 int
2288 fmd_case_repair(fmd_case_t *cp)
2289 {
2290         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2291         uint_t cstate;
2292         fmd_asru_rep_arg_t fara;
2293 
2294         (void) pthread_mutex_lock(&cip->ci_lock);
2295         cstate = cip->ci_state;
2296 
2297         if (cstate < FMD_CASE_SOLVED) {
2298                 (void) pthread_mutex_unlock(&cip->ci_lock);
2299                 return (fmd_set_errno(EFMD_CASE_STATE));
2300         }
2301 
2302         if (cip->ci_flags & FMD_CF_REPAIRED) {
2303                 (void) pthread_mutex_unlock(&cip->ci_lock);
2304                 return (0); /* already repaired */
2305         }
2306 
2307         TRACE((FMD_DBG_CASE, "case repair %s", cip->ci_uuid));
2308         fara.fara_reason = FMD_ASRU_REPAIRED;
2309         fara.fara_bywhat = FARA_BY_CASE;
2310         fara.fara_rval = NULL;
2311         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2312         (void) pthread_mutex_unlock(&cip->ci_lock);
2313 
2314         /*
2315          * if this is a proxied case, send the repair across the transport.
2316          * The remote side will then do the repair and send a list.repaired back
2317          * again such that we can finally repair the case on this side.
2318          */
2319         if (cip->ci_xprt != NULL) {
2320                 fmd_case_xprt_updated(cp);
2321                 return (0);
2322         }
2323 
2324         if (cstate == FMD_CASE_CLOSED)
2325                 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2326         else
2327                 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2328 
2329         return (0);
2330 }
2331 
2332 int
2333 fmd_case_acquit(fmd_case_t *cp)
2334 {
2335         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2336         uint_t cstate;
2337         fmd_asru_rep_arg_t fara;
2338 
2339         (void) pthread_mutex_lock(&cip->ci_lock);
2340         cstate = cip->ci_state;
2341 
2342         if (cstate < FMD_CASE_SOLVED) {
2343                 (void) pthread_mutex_unlock(&cip->ci_lock);
2344                 return (fmd_set_errno(EFMD_CASE_STATE));
2345         }
2346 
2347         if (cip->ci_flags & FMD_CF_REPAIRED) {
2348                 (void) pthread_mutex_unlock(&cip->ci_lock);
2349                 return (0); /* already repaired */
2350         }
2351 
2352         TRACE((FMD_DBG_CASE, "case acquit %s", cip->ci_uuid));
2353         fara.fara_reason = FMD_ASRU_ACQUITTED;
2354         fara.fara_bywhat = FARA_BY_CASE;
2355         fara.fara_rval = NULL;
2356         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_asru_repaired, &fara);
2357         (void) pthread_mutex_unlock(&cip->ci_lock);
2358 
2359         /*
2360          * if this is a proxied case, send the repair across the transport.
2361          * The remote side will then do the repair and send a list.repaired back
2362          * again such that we can finally repair the case on this side.
2363          */
2364         if (cip->ci_xprt != NULL) {
2365                 fmd_case_xprt_updated(cp);
2366                 return (0);
2367         }
2368 
2369         if (cstate == FMD_CASE_CLOSED)
2370                 fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
2371         else
2372                 fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
2373 
2374         return (0);
2375 }
2376 
2377 int
2378 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
2379 {
2380         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2381         fmd_case_item_t *cit;
2382         uint_t state;
2383         int rv = 0;
2384 
2385         (void) pthread_mutex_lock(&cip->ci_lock);
2386 
2387         if (cip->ci_state >= FMD_CASE_SOLVED)
2388                 state = FMD_EVS_DIAGNOSED;
2389         else
2390                 state = FMD_EVS_ACCEPTED;
2391 
2392         for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
2393                 if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
2394                         break;
2395         }
2396 
2397         if (rv == 0 && cip->ci_principal != NULL)
2398                 rv = fmd_event_equal(ep, cip->ci_principal);
2399 
2400         (void) pthread_mutex_unlock(&cip->ci_lock);
2401 
2402         if (rv != 0)
2403                 fmd_event_transition(ep, state);
2404 
2405         return (rv);
2406 }
2407 
2408 int
2409 fmd_case_orphaned(fmd_case_t *cp)
2410 {
2411         return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
2412 }
2413 
2414 void
2415 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
2416 {
2417         ((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
2418         ((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
2419         ((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
2420 }
2421 
2422 void
2423 fmd_case_set_injected(fmd_case_t *cp)
2424 {
2425         ((fmd_case_impl_t *)cp)->ci_injected = 1;
2426 }
2427 
2428 void
2429 fmd_case_set_de_fmri(fmd_case_t *cp, nvlist_t *nvl)
2430 {
2431         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2432 
2433         nvlist_free(cip->ci_diag_de);
2434         cip->ci_diag_de = nvl;
2435 }
2436 
2437 void
2438 fmd_case_setcode(fmd_case_t *cp, char *code)
2439 {
2440         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2441 
2442         cip->ci_code = fmd_strdup(code, FMD_SLEEP);
2443         cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
2444 }
2445 
2446 /*ARGSUSED*/
2447 static void
2448 fmd_case_repair_replay_case(fmd_case_t *cp, void *arg)
2449 {
2450         int not_faulty = 0;
2451         int faulty = 0;
2452         nvlist_t *nvl;
2453         fmd_event_t *e;
2454         char *class;
2455         int any_unusable_and_present = 0;
2456         fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
2457 
2458         if (cip->ci_state < FMD_CASE_SOLVED || cip->ci_xprt != NULL)
2459                 return;
2460 
2461         if (cip->ci_state == FMD_CASE_RESOLVED) {
2462                 cip->ci_flags |= FMD_CF_RES_CMPL;
2463                 return;
2464         }
2465 
2466         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_faulty, &faulty);
2467         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp, fmd_case_not_faulty,
2468             &not_faulty);
2469 
2470         if (cip->ci_state >= FMD_CASE_REPAIRED && !faulty) {
2471                 /*
2472                  * If none of the suspects is faulty, replay the list.repaired.
2473                  * If all suspects are already either usable or not present then
2474                  * also transition straight to RESOLVED state.
2475                  */
2476                 fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2477                     fmd_case_unusable_and_present, &any_unusable_and_present);
2478                 if (!any_unusable_and_present) {
2479                         cip->ci_state = FMD_CASE_RESOLVED;
2480 
2481                         TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2482                             cip->ci_uuid));
2483                         nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2484                         (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2485                         e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2486                             class);
2487                         fmd_dispq_dispatch(fmd.d_disp, e, class);
2488 
2489                         TRACE((FMD_DBG_CASE, "replay sending list.resolved %s",
2490                             cip->ci_uuid));
2491                         fmd_case_publish(cp, FMD_CASE_RESOLVED);
2492                         fmd_asru_hash_apply_by_case(fmd.d_asrus, cp,
2493                             fmd_asru_log_resolved, NULL);
2494                         cip->ci_flags |= FMD_CF_RES_CMPL;
2495                 } else {
2496                         TRACE((FMD_DBG_CASE, "replay sending list.repaired %s",
2497                             cip->ci_uuid));
2498                         nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
2499                         (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2500                         e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl,
2501                             class);
2502                         fmd_dispq_dispatch(fmd.d_disp, e, class);
2503                 }
2504         } else if (faulty && not_faulty) {
2505                 /*
2506                  * if some but not all of the suspects are not faulty, replay
2507                  * the list.updated.
2508                  */
2509                 TRACE((FMD_DBG_CASE, "replay sending list.updated %s",
2510                     cip->ci_uuid));
2511                 nvl = fmd_case_mkevent(cp, FM_LIST_UPDATED_CLASS);
2512                 (void) nvlist_lookup_string(nvl, FM_CLASS, &class);
2513                 e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
2514                 fmd_dispq_dispatch(fmd.d_disp, e, class);
2515         }
2516 }
2517 
2518 void
2519 fmd_case_repair_replay()
2520 {
2521         fmd_case_hash_apply(fmd.d_cases, fmd_case_repair_replay_case, NULL);
2522 }