1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 
  29 /*
  30  * rename or exchange identities of virtual device nodes
  31  */
  32 
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/debug.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/types.h>
  38 #include <sys/ddi.h>
  39 #include <sys/sunddi.h>
  40 
  41 #include <sys/lvm/mdvar.h>
  42 #include <sys/lvm/md_rename.h>
  43 
  44 #include <sys/sysevent/eventdefs.h>
  45 #include <sys/sysevent/svm.h>
  46 
  47 extern  major_t         md_major;
  48 extern  unit_t          md_nunits;
  49 extern  set_t           md_nsets;
  50 extern  md_set_t        md_set[];
  51 
  52 #define ROLE(r)                                         \
  53         ((r) == MDRR_PARENT?    "parent":               \
  54         (r) == MDRR_SELF?       "self":                 \
  55         (r) == MDRR_CHILD?      "child":                \
  56         (r) == MDRR_UNK?        "<unknown>": "<garbage>")
  57 
  58 #define OP_STR(op)                                                      \
  59                 (((op) == MDRNOP_UNK)?          "<unknown>"       :       \
  60                     ((op) == MDRNOP_RENAME)?    "rename"        :       \
  61                     ((op) == MDRNOP_EXCHANGE)?  "exchange"      :       \
  62                                                 "<garbage>")
  63 int md_rename_debug = 0;
  64 
  65 /* delta guard rails */
  66 const unsigned long long        DELTA_BEG       = (0xDad08888a110beefull);
  67 const unsigned long long        DELTA_END       = (0xa110Beef88880Dadull);
  68 
  69 const unsigned long long        DELTA_BEG_FREED = (0xBad0c0ed0fed0dadull);
  70 const unsigned long long        DELTA_END_FREED = (0x0Fed0dadbad0c0edull);
  71 
  72 /* transaction guard rails */
  73 const unsigned long long        TXN_BEG         = (0xDad01eadc0ed2badull);
  74 const unsigned long long        TXN_END         = (0xc0ed2badDad01eadull);
  75 
  76 const unsigned long long        TXNUN_BEG       = (0xcafe0fedbad0beefull);
  77 const unsigned long long        TXNUN_END       = (0xbad0beefcafe0fedull);
  78 
  79 const unsigned int              guard_shift     = (sizeof (u_longlong_t) - 3);
  80 const md_stackcap_t             MD_CAN_DO_ANYTHING      = (md_stackcap_t)0;
  81 
  82 typedef struct role_change_mapping_tab_t {
  83         const int                       ord;
  84         const md_renrole_t              old_role;
  85         const md_renrole_t              new_role;
  86         const char                      *svc_name;
  87         md_ren_roleswap_svc_t * const   default_svc;
  88 } role_change_tab_t;
  89 
  90 /*
  91  *  The actual table is at the end of the file, so we don't need
  92  *  many forward references
  93  */
  94 static  role_change_tab_t       role_swap_tab[];
  95 
  96 #define ILLEGAL_ROLESWAP_SVC    ((md_ren_roleswap_svc_t *)(0xA1100BAD))
  97 #define NO_DEFAULT_ROLESWAP_SVC ((md_ren_roleswap_svc_t *)(NULL))
  98 #define ILLEGAL_SVC_NAME        (NULL)
  99 
 100 /*
 101  *
 102  * Role swap rule table:
 103  *
 104  *                                New Role
 *      +---------------------------------------------------------------+
 106  *      |        |    Parent       |       Self     |      Child        |
 107  *      +--------+-----------------+----------------+-------------------+
 108  *      | Parent | no default      | ...no default  | illegal           |
 109  *      |        | 1 (update kids) | 2  (update to) | 3                 |
 110  * Old  +--------+-----------------+----------------+-------------------+
 111  * Role | Self   | ...self update  | ...rename self | no default (down  |
 *      |        | 4   update up   | 5              | 6    update from) |
 113  *      +--------+-----------------+----------------+-------------------+
 114  *      | Child  | illegal         | ...child       | ...update         |
 115  *      |        | 7               | 8   update to  | 9 parent          |
 116  *      +---------------------------------------------------------------+
 117  *
 118  * and notes:
 119  *
 120  * - Boxes 1, 4 and 6 are the most interesting. They are responsible
 121  *   for updating the from unit's data structures. These may involve
 122  *   finding (former or future) children, resetting name keys and the like.
 123  *
 124  * - The "rename" operation is boxes 1, 5 and 9. Most of the work
 125  *   is done in box 5, since that contains both the "from" and "to"
 126  *   unit struct for rename.
 127  *
 128  *  (There's got to be an eigen function for this; that diagonal
 129  *   axis is a role identity operation searching for an expression.)
 130  *
 * - Almost every transaction will call more than one of these.
 *   (Only a rename of a unit with no relatives calls just
 *   a single box.)
 134  *
 135  * - Box 4 "...update from" is the generic self->parent modifier.
 136  * - Box 8 "...update to" is the generic child->self modifier.
 137  *   These can be generic because all of the information which
 138  *   needs to be updated is in the common portion of the unit
 139  *   structure when changing from their respective roles.
 140  *
 141  * - Boxes 1, 2 and 6 ("no default") indicate that per-metadevice
 142  *   information must be updated. For example, in box 1, children
 143  *   identities must be updated. Since different metadevice types
 144  *   detect and manipulate their children differently, there can
 145  *   be no generic "md_rename" function in this box.
 146  *
 147  * In addition to the named services in the table above, there
 148  * are other named services used by rename/exchange.
 149  * MDRNM_LIST_URFOLKS, MDRNM_LIST_URSELF, MDRNM_LIST_URKIDS
 150  * list a device's parents, self and children, respectively.
 151  * In most cases the default functions can be used for parents
 * and self. Top-level devices are not required to have a
 * "list folks" named service. Likewise, devices which cannot
 * have metadevice children are not required to have the
 * "list kids" named service. The LIST_UR* functions call back into
 156  * the base driver (md_build_rendelta()) to package the changes to
 157  * a device for addition onto the tree. The LIST_UR* named service
 158  * then adds this "rename delta" onto the delta tree itself.
 159  * This keeps private knowledge appropriately encapsulated.
 160  * They return the number of devices which will need to be changed,
 161  * and hence the number of elements they've added to the delta list
 162  * or -1 for error.
 163  *
 164  * Other named services used by rename/exchange are:
 165  * "lock" (MDRNM_LOCK), "unlock" (MDRNM_UNLOCK) and "check" (MDRNM_CHECK).
 * "lock" and "unlock" write-lock (and release) all of the relevant
 * in-core structs, including the unit structs for the device, and
 * quiesce i/o as necessary.
 168  * The "check" named service verifies that this device
 169  * is in a state where rename could and may occur at this time.
 170  * Since the role_swap functions themselves cannot be undone
 171  * (at least in this implementation), it is check()'s job to
 172  * verify that the device is renamable (sic) or, if not, abort.
 173  * The check function for the device participating in the role
 174  * of "self" is usually where rename or exchange validity is verified.
 175  *
 176  * All of these functions take two arguments which may be thought
 177  * of as the collective state changes of the tree of devices
 178  * (md_rendelta_t *family) and the rename transaction state
 179  * (md_rentxn_t rtxn or rtxnp).
 180  *
 181  */
 182 
 183 
 184 /*
 185  * rename unit lock
 186  * (default name service routine MDRNM_LOCK)
 187  */
 188 static intptr_t
 189 md_rename_lock(md_rendelta_t *delta, md_rentxn_t *rtxnp)
 190 {
 191         minor_t          mnum;
 192         md_renop_t       op;
 193 
 194         ASSERT(delta);
 195         ASSERT(rtxnp);
 196 
 197         if (!delta || !rtxnp) {
 198                 (void) mdsyserror(&rtxnp->mde, EINVAL);
 199                 return (EINVAL);
 200         }
 201         mnum = md_getminor(delta->dev);
 202         op = rtxnp->op;
 203 
 204         /*
 205          * target doesn't exist if renaming (by definition),
 206          * so it need not be locked
 207          */
 208         if (op == MDRNOP_RENAME && mnum == rtxnp->to.mnum) {
 209                 return (0);
 210         }
 211 
 212         ASSERT(delta->uip);
 213         if (!delta->uip) {
 214                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
 215                 return (ENODEV);
 216         }
 217 
 218         ASSERT(delta->unp);
 219         if (!delta->unp) {
 220                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
 221                 return (ENODEV);
 222         }
 223 
 224         ASSERT(!UNIT_WRITER_HELD(delta->unp));
 225 
 226         (void) md_unit_writerlock(delta->uip);
 227 
 228         ASSERT(UNIT_WRITER_HELD(delta->unp));
 229 
 230         return (0);
 231 }
 232 
 233 /*
 234  * (default name service routine MDRNM_UNLOCK)
 235  */
 236 /* ARGSUSED */
 237 static void
 238 md_rename_unlock(
 239         md_rendelta_t   *delta,
 240         md_rentxn_t     *rtxnp)
 241 {
 242         ASSERT(delta);
 243         ASSERT(delta->uip);
 244         ASSERT(delta->unp);
 245 
 246         ASSERT(UNIT_WRITER_HELD(delta->unp));
 247 
 248         (void) md_unit_writerexit(delta->uip);
 249 
 250         ASSERT(!UNIT_WRITER_HELD(delta->unp));
 251 }
 252 
 253 /*
 254  * This is used by the various MDRNM_LIST* named services.
 255  */
 256 md_rendelta_t *
 257 md_build_rendelta(
 258         md_renrole_t     old_role,
 259         md_renrole_t     new_role,
 260         md_dev64_t       dev,
 261         md_rendelta_t   *prev,
 262         md_unit_t       *unp,
 263         mdi_unit_t      *uip,
 264         md_error_t      *ep)
 265 {
 266         int              err    = 0;
 267         md_rendelta_t   *new;
 268 
 269         new = (md_rendelta_t *)kmem_alloc(sizeof (md_rendelta_t), KM_SLEEP);
 270 
 271         new->beginning       = DELTA_BEG;
 272         new->dev     = dev;
 273         new->new_role        = new_role;
 274         new->old_role        = old_role;
 275         new->next    = NULL;
 276         new->prev    = prev;
 277         new->unp = unp;
 278         new->uip = uip;
 279         bzero((void *) &new->txn_stat, sizeof (md_rendstat_t));
 280 
 281         /*
 282          * For non-meta devices that are being renamed (in the future,
 283          * that is) we would need to pass in default functions to
 284          * accommodate them, provided the default function is
 285          * truly capable of performing the lock/check/unlock function
 286          * on opaque devices.
 287          */
 288 
 289         new->lock    = md_get_named_service(dev, /* modindex */ 0,
 290                                                 MDRNM_LOCK, md_rename_lock);
 291 
 292         new->unlock  = (md_ren_void_svc_t *)md_get_named_service(dev,
 293                                         /* modindex */ 0, MDRNM_UNLOCK,
 294                                         (intptr_t (*)()) md_rename_unlock);
 295 
 296         new->check   = md_get_named_service(dev, /* modindex */ 0,
 297                                             MDRNM_CHECK, /* Default */ NULL);
 298 
 299         new->role_swap       = NULL; /* set this when the roles are determined */
 300 
 301         if (!new->lock || !new->unlock || !new->check) {
 302                 (void) mdmderror(ep, MDE_RENAME_CONFIG_ERROR, md_getminor(dev));
 303                 err = EINVAL;
 304                 goto out;
 305         }
 306 
 307         new->end = DELTA_END;
 308 
 309 out:
 310         if (err != 0) {
 311                 if (new) {
 312                         new->beginning       = DELTA_BEG_FREED;
 313                         new->end     = DELTA_END_FREED;
 314 
 315                         kmem_free(new, sizeof (md_rendelta_t));
 316                         new = NULL;
 317                 }
 318         }
 319 
 320         if (prev) {
 321                 prev->next = new;
 322         }
 323 
 324         return (new);
 325 }
 326 
 327 /*
 328  * md_store_recid()
 329  * used by role swap functions
 330  */
 331 void
 332 md_store_recid(
 333         int             *prec_idx,
 334         mddb_recid_t    *recid_list,
 335         md_unit_t       *un)
 336 {
 337         mddb_recid_t    *rp;
 338         bool_t           add_recid;
 339 
 340         ASSERT(prec_idx);
 341         ASSERT(recid_list);
 342         ASSERT(recid_list[*prec_idx] == 0);
 343         ASSERT(*prec_idx >= 0);
 344 
 345         for (add_recid = TRUE, rp = recid_list; add_recid && rp && *rp; rp++) {
 346                 if (MD_RECID(un) == *rp) {
 347                         add_recid = FALSE;
 348                 }
 349         }
 350 
 351         if (add_recid) {
 352                 recid_list[(*prec_idx)++] = MD_RECID(un);
 353         }
 354 }
 355 
 356 /*
 357  * MDRNM_LIST_URFOLKS: generic named svc entry point
 358  * add all parents onto the list pointed to by dlpp
 359  * (only weird multi-parented devices need to have their
 360  * own named svc  to do this.)
 361  */
 362 static int
 363 md_rename_listfolks(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
 364 {
 365         md_rendelta_t   *new;
 366 
 367         ASSERT(rtxnp);
 368         ASSERT(dlpp);
 369         ASSERT(*dlpp == NULL);
 370         ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
 371         ASSERT(rtxnp->from.uip);
 372         ASSERT(rtxnp->from.unp);
 373 
 374         if ((!rtxnp->from.uip) || (!rtxnp->from.unp)) {
 375                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
 376                                                         rtxnp->from.mnum);
 377                 return (-1);
 378         }
 379 
 380         if (!MD_HAS_PARENT(MD_PARENT(rtxnp->from.unp))) {
 381                 return (0);
 382         }
 383 
 384         /*
 385          * If supporting log renaming (and other multiparented devices)
 386          * callout to each misc module to claim this waif and return the
 387          * md_dev64_t of its parents.
 388          */
 389         if (MD_PARENT(rtxnp->from.unp) == MD_MULTI_PARENT) {
 390                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD,
 391                                                         rtxnp->from.mnum);
 392                 return (2);
 393         }
 394 
 395         if ((rtxnp->op == MDRNOP_RENAME) ||
 396             (MD_PARENT(rtxnp->from.unp) != MD_SID(rtxnp->to.unp))) {
 397 
 398                 new = md_build_rendelta(
 399                             MDRR_PARENT,
 400                             MDRR_PARENT,
 401                             md_makedevice(md_major, MD_PARENT(rtxnp->from.unp)),
 402                             NULL,
 403                             MD_UNIT(MD_PARENT(rtxnp->from.unp)),
 404                             MDI_UNIT(MD_PARENT(rtxnp->from.unp)),
 405                             &rtxnp->mde);
 406         } else {
 407                 /* parent is swapping roles with self */
 408                 new = md_build_rendelta(
 409                             MDRR_PARENT,
 410                             MDRR_SELF,
 411                             md_makedevice(md_major, MD_SID(rtxnp->to.unp)),
 412                             NULL,
 413                             rtxnp->to.unp,
 414                             rtxnp->to.uip,
 415                             &rtxnp->mde);
 416         }
 417 
 418         if (!new) {
 419                 if (mdisok(&rtxnp->mde)) {
 420                         (void) mdsyserror(&rtxnp->mde, ENOMEM);
 421                 }
 422                 return (-1);
 423         }
 424 
 425         *dlpp = new;
 426 
 427         return (1);
 428 }
 429 
 430 /*
 431  * MDRNM_LIST_URSELF: named svc entry point
 432  * add all delta entries appropriate for ourselves onto the deltalist pointed
 433  * to by dlpp
 434  */
 435 static int
 436 md_rename_listself(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
 437 {
 438         md_rendelta_t   *new, *p;
 439         bool_t           exchange_up    = FALSE;
 440 
 441         ASSERT(rtxnp);
 442         ASSERT(dlpp);
 443         ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
 444         ASSERT(rtxnp->from.unp);
 445         ASSERT(rtxnp->from.uip);
 446 
 447         if ((!rtxnp->from.uip) || (!rtxnp->from.unp)) {
 448                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
 449                                                         rtxnp->from.mnum);
 450                 return (-1);
 451         }
 452 
 453         for (p = *dlpp; p && p->next != NULL; p = p->next) {
 454                 /* NULL */
 455         }
 456 
 457         /*
 458          * renaming or
 459          * from's parent is not to and to's parent is not from
 460          */
 461         if (rtxnp->op == MDRNOP_RENAME) {
 462                 new = md_build_rendelta(
 463                                 MDRR_SELF,
 464                                 MDRR_SELF,
 465                                 md_makedevice(md_major, rtxnp->from.mnum),
 466                                 p,
 467                                 rtxnp->from.unp,
 468                                 rtxnp->from.uip,
 469                                 &rtxnp->mde);
 470         } else {
 471 
 472                 if (MD_PARENT(rtxnp->from.unp) == MD_SID(rtxnp->to.unp)) {
 473                         exchange_up = TRUE;
 474                 }
 475 
 476                 /* self and parent are flipping */
 477                 new = md_build_rendelta(
 478                                 MDRR_SELF,
 479                                 exchange_up? MDRR_PARENT: MDRR_CHILD,
 480                                 md_makedevice(md_major, rtxnp->from.mnum),
 481                                 p,
 482                                 rtxnp->from.unp,
 483                                 rtxnp->from.uip,
 484                                 &rtxnp->mde);
 485         }
 486 
 487         if (!new) {
 488                 if (mdisok(&rtxnp->mde)) {
 489                         (void) mdsyserror(&rtxnp->mde, ENOMEM);
 490                 }
 491                 return (-1);
 492         }
 493 
 494         if (!*dlpp) {
 495                 *dlpp = new;
 496         }
 497 
 498         return (1);
 499 }
 500 
 501 /*
 502  * free the tree of all deltas to devices involved in the rename transaction
 503  */
 504 static void
 505 free_dtree(md_rendelta_t *family)
 506 {
 507         md_rendelta_t   *next           = NULL;
 508         int              i              = 0;
 509         md_rendelta_t   *r;
 510 
 511         for (r = family; (NULL != r); r = next, i++) {
 512 
 513                 next            = r->next;
 514 
 515                 /* shift << because it makes the resultant pattern readable */
 516                 r->beginning = DELTA_BEG_FREED ^ (i << guard_shift);
 517                 r->end               = DELTA_END_FREED ^ (i << guard_shift);
 518 
 519                 kmem_free(r, sizeof (md_rendelta_t));
 520         }
 521 }
 522 
 523 /*
 524  * walk down family tree, calling lock service function
 525  */
 526 static int
 527 lock_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
 528 {
 529         md_rendelta_t   *r;
 530         int              rc;
 531 
 532         ASSERT(family);
 533         ASSERT(rtxnp);
 534 
 535         if (!family || !rtxnp) {
 536                 return (EINVAL);
 537         }
 538 
 539         for (rc = 0, r = family; r; r = r->next) {
 540 
 541                 ASSERT(r->unp);
 542                 ASSERT(!UNIT_WRITER_HELD(r->unp));
 543                 ASSERT(r->lock);
 544 
 545                 if ((rc = (int)(*r->lock) (r, rtxnp)) != 0) {
 546                         return (rc);
 547                 }
 548                 r->txn_stat.locked = TRUE;
 549         }
 550 
 551         return (0);
 552 }
 553 
 554 /*
 555  * We rely on check() (MDRNM_CHECK) to make exhaustive checks,
 556  * since we don't attempt to undo role_swap() failures.
 557  *
 558  * To implement an undo() function would require each role_swap()
 559  * to store a log of previous state of the structures it changes,
 560  * presumably anchored by the rendelta.
 561  *
 562  */
 563 static int
 564 check_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
 565 {
 566         md_rendelta_t   *r;
 567         int              rc;
 568 
 569         ASSERT(family);
 570         ASSERT(rtxnp);
 571 
 572         if (!family || !rtxnp) {
 573                 /* no error packet to set? */
 574                 return (EINVAL);
 575         }
 576 
 577         for (r = family, rc = 0; r; r = r->next) {
 578 
 579                 ASSERT(UNIT_WRITER_HELD(r->unp));
 580                 ASSERT(r->txn_stat.locked);
 581 
 582                 /*
 583                  * <to> doesn't exist for rename
 584                  */
 585                 if (!(rtxnp->op == MDRNOP_RENAME &&
 586                     md_getminor(r->dev) == rtxnp->to.mnum)) {
 587                         ASSERT(r->uip);
 588                         r->txn_stat.is_open = md_unit_isopen(r->uip);
 589                 }
 590 
 591                 /*
 592                  * if only allowing offline rename/exchanges, check
 593                  * for top being trans because it opens its sub-devices
 594                  */
 595 
 596                 switch (rtxnp->revision) {
 597                 case MD_RENAME_VERSION_OFFLINE:
 598                         if ((r->txn_stat.is_open) &&
 599                                 (!rtxnp->stat.trans_in_stack)) {
 600                                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
 601                                                         md_getminor(r->dev));
 602                                 return (EBUSY);
 603                         }
 604                         break;
 605 
 606                 case MD_RENAME_VERSION_ONLINE:
 607                         break;
 608 
 609                 default:
 610                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
 611                                                 md_getminor(r->dev));
 612                         return (EINVAL);
 613                 }
 614 
 615                 /* MD_UN_MOD_INPROGRESS includes the MD_UN_RENAMING bit */
 616 
 617                 if (MD_STATUS(r->unp) & MD_UN_MOD_INPROGRESS) {
 618                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
 619                                                         md_getminor(r->dev));
 620                         return (EBUSY);
 621                 }
 622 
 623                 MD_STATUS(r->unp) |= MD_UN_RENAMING;
 624 
 625                 if ((rc = (int)(*r->check)(r, rtxnp)) != 0) {
 626                         return (rc);
 627                 }
 628 
 629                 /* and be sure we can proceed */
 630                 if (!(r->role_swap)) {
 631                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
 632                                                         md_getminor(r->dev));
 633                         return (EINVAL);
 634                 }
 635                 r->txn_stat.checked = TRUE;
 636         }
 637 
 638         return (0);
 639 }
 640 
 641 
 642 /*
 643  * rename role_swap() functions are responsible for updating their
 644  * own parent, self and children references in both on-disk
 645  * and in-core structures, as well as storing the changed
 646  * record ids into recids and incrementing rec_idx.
 647  */
 648 
 649 static void
 650 role_swap_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
 651 {
 652         md_rendelta_t   *r;
 653 
 654         ASSERT(family);
 655         ASSERT(rtxnp);
 656 
 657         for (r = family; r; r = r->next) {
 658                 ASSERT(r->role_swap);
 659                 ASSERT(r->txn_stat.locked);
 660                 ASSERT(r->txn_stat.checked);
 661 
 662                 (*r->role_swap)(r, rtxnp);
 663 
 664                 r->txn_stat.role_swapped = TRUE;
 665         }
 666 
 667         /*
 668          * there's some work to do, but not more than expected
 669          */
 670         ASSERT(rtxnp->rec_idx > 0);
 671         ASSERT(rtxnp->rec_idx < rtxnp->n_recids);
 672 
 673         if (rtxnp->rec_idx >= rtxnp->n_recids || rtxnp->rec_idx <= 0) {
 674                 /*
 675                  * There's no way to indicate error from here,
 676                  * and even if we could, there's no undo mechanism.
 677                  * We've already modified the in-core structs, so
 678                  * We can't continue w/o committing, but we
 679                  * don't appear to have anything to commit.
 680                  */
 681                 cmn_err(CE_PANIC,
 682                         "md_rename: role_swap_dtree(family:%p, rtxnp:%p)",
 683                                         (void *) family, (void *) rtxnp);
 684                 return;
 685         }
 686         rtxnp->recids[rtxnp->rec_idx] = 0;
 687 
 688         mddb_commitrecs_wrapper(rtxnp->recids);
 689 }
 690 
 691 /*
 692  * walk down delta tree, calling the unlock service for each device,
 693  * provided any of the devices appear to have been locked
 694  */
 695 static void
 696 unlock_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
 697 {
 698         md_rendelta_t   *r;
 699         uint_t           any_locked     = FALSE;
 700 
 701         ASSERT(family);
 702         ASSERT(rtxnp);
 703 
 704         for (r = family; r; r = r->next) {
 705 
 706                 ASSERT(!(r->txn_stat.unlocked)); /* "has been unlocked" */
 707                 any_locked |= r->txn_stat.locked;
 708         }
 709 
 710         if (any_locked) {
 711 
 712                 /* unwind in reverse order */
 713                 for (r = family; NULL != r->next; r = r->next) {
 714                         /* NULL */
 715                 }
 716 
 717                 for (; NULL != r; r = r->prev) {
 718                         MD_STATUS(r->unp) &= ~MD_UN_RENAMING;
 719                         ASSERT(r->unlock);
 720                         r->unlock(r, rtxnp);
 721                         r->txn_stat.unlocked = TRUE;
 722                 }
 723         }
 724 }
 725 
 726 /*
 727  * MDRNM_UPDATE_SELF
 728  * This role swap function is identical for all unit types,
 729  * so keep it here. It's also the best example because it
 730  * touches all the modified portions of the relevant
 731  * in-common structures.
 732  */
 733 static void
 734 md_rename_update_self(
 735         md_rendelta_t   *delta,
 736         md_rentxn_t     *rtxnp)
 737 {
 738         minor_t         from_min, to_min;
 739         sv_dev_t        sv;
 740         mddb_de_ic_t    *dep;
 741         mddb_rb32_t     *rbp;
 742 
 743         ASSERT(rtxnp);
 744         ASSERT(rtxnp->op == MDRNOP_RENAME);
 745         ASSERT(delta);
 746         ASSERT(delta->unp);
 747         ASSERT(delta->uip);
 748         ASSERT(rtxnp->rec_idx >= 0);
 749         ASSERT(rtxnp->recids);
 750         ASSERT(delta->old_role == MDRR_SELF);
 751         ASSERT(delta->new_role == MDRR_SELF);
 752         ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
 753 
 754         from_min = rtxnp->from.mnum;
 755         to_min = rtxnp->to.mnum;
 756 
 757         /*
 758          * self id changes in our own unit struct
 759          */
 760         MD_SID(delta->unp) = to_min;
 761 
 762         /*
 763          * make sure that dest always has correct un_revision
 764          * and rb_revision
 765          */
 766         delta->unp->c.un_revision |= MD_FN_META_DEV;
 767         dep = mddb_getrecdep(MD_RECID(delta->unp));
 768         ASSERT(dep);
 769         rbp = dep->de_rb;
 770         if (rbp->rb_revision & MDDB_REV_RB) {
 771                 rbp->rb_revision = MDDB_REV_RBFN;
 772         } else if (rbp->rb_revision & MDDB_REV_RB64) {
 773                 rbp->rb_revision = MDDB_REV_RB64FN;
 774         }
 775 
 776         /*
 777          * clear old array pointers to unit in-core and unit
 778          */
 779 
 780         MDI_VOIDUNIT(from_min) = NULL;
 781         MD_VOIDUNIT(from_min) = NULL;
 782 
 783         /*
 784          * and point the new slots at the unit in-core and unit structs
 785          */
 786 
 787         MDI_VOIDUNIT(to_min) = delta->uip;
 788         MD_VOIDUNIT(to_min) = delta->unp;
 789 
 790         /*
 791          * recreate kstats
 792          * - destroy the ones associated with our former identity
 793          * - reallocate and associate them with our new identity
 794          */
 795         md_kstat_destroy_ui(delta->uip);
 796         md_kstat_init_ui(to_min, delta->uip);
 797 
 798         /*
 799          * the unit in-core reference to the get next link's id changes
 800          */
 801 
 802         delta->uip->ui_link.ln_id = to_min;
 803 
 804         /*
 805          * name space addition of new key was done from user-level
 806          * remove the old name's key here
 807          */
 808 
 809         sv.setno = MD_MIN2SET(from_min);
 810         sv.key = rtxnp->from.key;
 811 
 812         md_rem_names(&sv, 1);
 813 
 814         /*
 815          * Remove associated device node as well
 816          */
 817         md_remove_minor_node(from_min);
 818 
 819         /*
 820          * and store the record id (from the unit struct) into recids
 821          * for later commitment by md_rename()
 822          */
 823         md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
 824 }
 825 
 826 /*
 827  * Either one of our siblings and/or our parent changed identities.
 828  */
 829 static void
 830 md_renexch_update_parent(
 831         md_rendelta_t   *delta,
 832         md_rentxn_t     *rtxnp)
 833 {
 834         ASSERT(rtxnp);
 835         ASSERT((MDRNOP_RENAME == rtxnp->op) || (rtxnp->op == MDRNOP_EXCHANGE));
 836         ASSERT(rtxnp->rec_idx >= 0);
 837         ASSERT(rtxnp->recids);
 838         ASSERT(delta);
 839         ASSERT(delta->unp);
 840         ASSERT(delta->old_role == MDRR_CHILD);
 841         ASSERT(delta->new_role == MDRR_CHILD);
 842         ASSERT((MD_PARENT(delta->unp) == rtxnp->from.mnum) ||
 843                 (MD_PARENT(delta->unp) == rtxnp->to.mnum));
 844 
 845         if (MD_PARENT(delta->unp) == rtxnp->from.mnum) {
 846                 MD_PARENT(delta->unp) = rtxnp->to.mnum;
 847         }
 848 
 849         md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
 850 }
 851 
 852 /*
 853  * exchange up (child->self)
 854  */
 855 static void
 856 md_exchange_child_update_to(
 857         md_rendelta_t   *delta,
 858         md_rentxn_t     *rtxnp)
 859 {
 860         minor_t from_min, to_min;
 861 
 862         ASSERT(rtxnp);
 863         ASSERT(rtxnp->op == MDRNOP_EXCHANGE);
 864         ASSERT(rtxnp->rec_idx >= 0);
 865         ASSERT(rtxnp->recids);
 866         ASSERT(delta);
 867         ASSERT(delta->unp);
 868         ASSERT(delta->uip);
 869         ASSERT(delta->old_role == MDRR_CHILD);
 870         ASSERT(delta->new_role == MDRR_SELF);
 871         ASSERT(md_getminor(delta->dev) == rtxnp->to.mnum);
 872 
 873         from_min = rtxnp->from.mnum;
 874         to_min = rtxnp->to.mnum;
 875 
 876         /*
 877          * self id changes in our own unit struct
 878          * Note:
 879          * - Since we're assuming the identity of "from" we use its mnum even
 880          *   though we're updating the "to" structures.
 881          */
 882 
 883         MD_SID(delta->unp) = from_min;
 884 
 885         /*
 886          * our parent identifier becomes the new self, who was "to"
 887          */
 888 
 889         MD_PARENT(delta->unp) = to_min;
 890 
 891         /*
 892          * point the set array pointers at the "new" unit and unit in-cores
 893          * Note:
 894          * - The other half of this transfer is done in the "update from"
 895          *   rename/exchange named service.
 896          */
 897 
 898         MD_VOIDUNIT(from_min) = delta->unp;
 899         MDI_VOIDUNIT(from_min) = delta->uip;
 900 
 901         /*
 902          * transfer kstats
 903          */
 904 
 905         delta->uip->ui_kstat = rtxnp->from.kstatp;
 906 
 907         /*
 908          * the unit in-core reference to the get next link's id changes
 909          */
 910 
 911         delta->uip->ui_link.ln_id = from_min;
 912 
 913         /*
 914          * name space additions, if necessary, were done from user-level.
 915          * name space deletions, if necessary, were done in "exchange_from"
 916          */
 917 
 918         /*
 919          * and store the record id (from the unit struct) into recids
 920          * for later comitment by md_rename()
 921          */
 922 
 923         md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
 924 }
 925 
 926 /*
 927  * exchange up (self->parent)
 928  */
 929 static void
 930 md_exchange_self_update_from_up(
 931         md_rendelta_t   *delta,
 932         md_rentxn_t     *rtxnp)
 933 {
 934         minor_t from_min, to_min;
 935 
 936         ASSERT(rtxnp);
 937         ASSERT(rtxnp->op == MDRNOP_EXCHANGE);
 938         ASSERT(rtxnp->rec_idx >= 0);
 939         ASSERT(rtxnp->recids);
 940         ASSERT(delta);
 941         ASSERT(delta->unp);
 942         ASSERT(delta->uip);
 943         ASSERT(delta->old_role == MDRR_SELF);
 944         ASSERT(delta->new_role == MDRR_PARENT);
 945         ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
 946 
 947         from_min = rtxnp->from.mnum;
 948         to_min = rtxnp->to.mnum;
 949 
 950         /*
 951          * self id changes in our own unit struct
 952          * Note:
 953          * - Since we're assuming the identity of "to" we use its mnum
 954          *   while we're updating the "to" structures.
 955          */
 956 
 957         MD_SID(delta->unp) = to_min;
 958 
 959         /*
 960          * our parent identifier becomes the new parent, who was "from"
 961          */
 962 
 963         MD_PARENT(delta->unp) = from_min;
 964 
 965         /*
 966          * point the set array pointers at the "new" unit and unit in-cores
 967          * Note:
 968          * - The other half of this transfer is done in the "update from"
 969          *   rename/exchange named service.
 970          */
 971 
 972         MD_VOIDUNIT(to_min) = delta->unp;
 973         MDI_VOIDUNIT(to_min) = delta->uip;
 974 
 975         /*
 976          * transfer kstats
 977          */
 978 
 979         delta->uip->ui_kstat = rtxnp->to.kstatp;
 980 
 981         /*
 982          * the unit in-core reference to the get next link's id changes
 983          */
 984 
 985         delta->uip->ui_link.ln_id = to_min;
 986 
 987         /*
 988          * name space additions, if necessary, were done from user-level.
 989          * name space deletions, if necessary, were done in "exchange_from"
 990          */
 991 
 992         /*
 993          * and store the record id (from the unit struct) into recids
 994          * for later comitment by md_rename()
 995          */
 996 
 997         md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
 998 }
 999 
1000 /*
1001  * The order of the called role swap functions is critical.
1002  * If they're not ordered as "all parents", then "all self"
1003  * then "all child" transitions, we will almost certainly
1004  * corrupt the data base and the in-core linkages. So,
1005  * verify that the list built by the individual drivers is
1006  * ok here.
1007  *
1008  * We could have done fancy bit encodings of the roles so
1009  * it all fit into a single word and we wouldn't need the
1010  * prev_ord field. But, since cpu power is cheaper than
1011  * than people power, they're all separate for easier
1012  * debugging and maintaining. (In the unlikely event that
1013  * rename/exchange ever becomes cpu-limited, and this
1014  * algorithm is the bottleneck, we should revisit this.)
1015  */
1016 
1017 static bool_t
1018 role_swap_is_valid(
1019         int              previous,
1020         int              current,
1021         md_rendelta_t   *delta,
1022         md_rentxn_t     *rtxnp)
1023 {
1024         bool_t  valid   = FALSE;
1025 
1026         /*
1027          * we've backed up in processing the role table
1028          */
1029         if ((previous > current) &&
1030             (delta->prev && (delta->old_role != delta->prev->old_role))) {
1031                 goto out;
1032         }
1033 
1034         /*
1035          * we're repeating the same role transition
1036          */
1037         if (previous == current) {
1038                 switch (delta->old_role) {
1039                 case MDRR_PARENT:
1040                         /*
1041                          * require at least one of the devices to
1042                          * be multiparented for us to allow another
1043                          * parent transition
1044                          */
1045                         if ((MD_MULTI_PARENT != MD_PARENT(rtxnp->from.unp)) &&
1046                             (MD_MULTI_PARENT != MD_PARENT(rtxnp->to.unp))) {
1047                                 goto out;
1048                         }
1049                         break;
1050 
1051                 case MDRR_CHILD:
1052                         /* it's ok to have multiple children */
1053                         break;
1054 
1055                 case MDRR_SELF:
1056                         /* it's never ok to have multiple self transitions */
1057                         /* FALLTHROUGH */
1058                 default:
1059                         goto out;
1060                 }
1061         }
1062 
1063         valid = TRUE;
1064 out:
1065         if (!valid) {
1066                 if (md_rename_debug != 0) {
1067                         cmn_err(CE_NOTE, "previous: %d, current: %d, role: %s",
1068                                         previous, current,
1069                                         ROLE(delta->old_role));
1070                         delay(drv_sectohz(3));
1071                         ASSERT(FALSE);
1072                 }
1073         }
1074 
1075         return (valid);
1076 }
1077 
1078 static role_change_tab_t *
1079 lookup_role(md_renrole_t old_role, md_renrole_t new_role)
1080 {
1081         role_change_tab_t       *rp;
1082         role_change_tab_t       *found = NULL;
1083 
1084         for (rp = role_swap_tab; !found && (rp->old_role != MDRR_UNK); rp++) {
1085 
1086                 if (rp->old_role == old_role && rp->new_role == new_role) {
1087                         found = rp;
1088                 }
1089         }
1090         /*
1091          * we require a named svc if we've got two devices
1092          * claiming to be changing roles in this manner
1093          */
1094         ASSERT(found);
1095         ASSERT(found->default_svc != ILLEGAL_ROLESWAP_SVC);
1096         ASSERT(found->svc_name != ILLEGAL_SVC_NAME);
1097 
1098         if (!found ||
1099             (found->default_svc == ILLEGAL_ROLESWAP_SVC) ||
1100             (found->svc_name == ILLEGAL_SVC_NAME)) {
1101                 return (NULL);
1102         }
1103 
1104         return (found);
1105 }
1106 
1107 /*
1108  * fill in the role swap named svc., now that we know each device
1109  * and its changing role
1110  */
1111 static int
1112 valid_roleswap_dtree(
1113         md_rendelta_t   *family,
1114         md_rentxn_t     *rtxnp
1115 )
1116 {
1117         md_rendelta_t           *r;
1118         role_change_tab_t       *rolep;
1119         minor_t                  from_min, to_min;
1120         int                      prev_ord       = -1;
1121         bool_t                  found_self      = FALSE;
1122         int                      err            = 0;
1123 
1124         ASSERT(family);
1125         ASSERT(rtxnp);
1126 
1127         from_min = rtxnp->from.mnum;
1128         to_min = rtxnp->to.mnum;
1129 
1130         for (r = family; r; r = r->next, prev_ord = rolep->ord) {
1131 
1132                 if (!(rolep = lookup_role(r->old_role, r->new_role))) {
1133                         (void) mdmderror(&rtxnp->mde,
1134                                         MDE_RENAME_CONFIG_ERROR, from_min);
1135                         err = EOPNOTSUPP;
1136                         goto out;
1137                 }
1138                 r->role_swap = (md_ren_roleswap_svc_t *)md_get_named_service(
1139                                         r->dev, /* modindex */ 0,
1140                                         (char *)rolep->svc_name,
1141                                         (intptr_t (*)()) rolep->default_svc);
1142 
1143                 /*
1144                  * someone probably called the ioctl directly and
1145                  * incorrectly, rather than via the libmeta wrappers
1146                  */
1147                 if (!(r->role_swap)) {
1148                         (void) mdmderror(&rtxnp->mde,
1149                                         MDE_RENAME_TARGET_UNRELATED, to_min);
1150                         err = EOPNOTSUPP;
1151                         goto out;
1152                 }
1153 
1154                 if (!role_swap_is_valid(prev_ord, rolep->ord, r, rtxnp)) {
1155                         (void) mdmderror(&rtxnp->mde,
1156                                         MDE_RENAME_CONFIG_ERROR, from_min);
1157                         err = EINVAL;
1158                         goto out;
1159                 }
1160 
1161                 if (rolep->old_role == MDRR_SELF) {
1162                         found_self = TRUE;
1163                 }
1164 
1165                 if (MD_PARENT(r->unp) == MD_MULTI_PARENT) {
1166                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_TARGET_BAD,
1167                                                         md_getminor(r->dev));
1168                         err = EINVAL;
1169                         goto out;
1170                 }
1171         }
1172 
1173         /*
1174          * must be at least one selfish device
1175          */
1176         ASSERT(found_self);
1177         if (!found_self) {
1178                 (void) mdmderror(&rtxnp->mde,
1179                                         MDE_RENAME_CONFIG_ERROR, from_min);
1180                 err = EINVAL;
1181                 goto out;
1182         }
1183 
1184 out:
1185         return (err);
1186 }
1187 
1188 /*
1189  * dump contents of rename transaction
1190  */
1191 static void
1192 dump_txn(md_rentxn_t *rtxnp) {
1193 
1194         if (md_rename_debug == 0) {
1195                 return;
1196         }
1197 
1198         cmn_err(CE_NOTE, "rtxnp: %p", (void *) rtxnp);
1199         if (rtxnp) {
1200                 cmn_err(CE_NOTE, "beginning: %llx, op: %s",
1201                         rtxnp->beginning, OP_STR(rtxnp->op));
1202 
1203                 cmn_err(CE_NOTE,
1204         "revision: %d, uflags: %d, rec_idx: %d, n_recids: %d, rec_ids: %p%s",
1205                         rtxnp->revision, rtxnp->uflags,
1206                         rtxnp->rec_idx, rtxnp->n_recids, (void *) rtxnp->recids,
1207                         rtxnp->stat.trans_in_stack? " (trans in stack)": "");
1208                 cmn_err(CE_NOTE, " from: beginning: %llx",
1209                                                         rtxnp->from.beginning);
1210                 cmn_err(CE_NOTE, "    minor: %lX, key: %lX",
1211                         (ulong_t)rtxnp->from.mnum, (ulong_t)rtxnp->from.key);
1212                 cmn_err(CE_NOTE, "    unp: %lX, uip: %lX",
1213                         (ulong_t)rtxnp->from.unp, (ulong_t)rtxnp->from.uip);
1214                 cmn_err(CE_NOTE, "    end: %llx", rtxnp->from.end);
1215                 cmn_err(CE_NOTE, "  to: beginning: %llx", rtxnp->to.beginning);
1216                 cmn_err(CE_NOTE, "    minor: %lX, key: %lX",
1217                         (ulong_t)rtxnp->to.mnum, (ulong_t)rtxnp->to.key);
1218                 cmn_err(CE_NOTE, "    unp: %lX, uip: %lX",
1219                         (ulong_t)rtxnp->to.unp, (ulong_t)rtxnp->to.uip);
1220                 cmn_err(CE_NOTE, "    end: %llx", rtxnp->to.end);
1221                 cmn_err(CE_NOTE, "end: %llx\n", rtxnp->end);
1222         }
1223         delay(drv_sectohz(1));
1224 }
1225 
1226 /*
1227  * dump contents of all deltas
1228  */
1229 static void
1230 dump_dtree(md_rendelta_t *family)
1231 {
1232         md_rendelta_t   *r;
1233         int             i;
1234 
1235         if (md_rename_debug == 0) {
1236                 return;
1237         }
1238 
1239         for (r = family, i = 0; r; r = r->next, i++) {
1240                 cmn_err(CE_NOTE, "%d.  beginning: %llx", i, r->beginning);
1241                 cmn_err(CE_NOTE, "  r: %lX, dev: %lX, next: %lx, prev: %lx",
1242                                         (ulong_t)r, (ulong_t)r->dev,
1243                                         (ulong_t)r->next, (ulong_t)r->prev);
1244 
1245                 cmn_err(CE_NOTE, "  role: %s -> %s, unp: %lx, uip: %lx",
1246                         ROLE(r->old_role), ROLE(r->new_role),
1247                         (ulong_t)r->unp, (ulong_t)r->uip);
1248                 cmn_err(CE_NOTE,
1249                 "  lock: %lx, unlock: %lx\n\t  check: %lx, role_swap: %lx",
1250                         (ulong_t)r->lock, (ulong_t)r->unlock,
1251                         (ulong_t)r->check, (ulong_t)r->role_swap);
1252                 if (*((uint_t *)(&r->txn_stat)) != 0) {
1253                         cmn_err(CE_NOTE, "status: (0x%x) %s%s%s%s%s",
1254                         *((uint_t *)(&r->txn_stat)),
1255                         r->txn_stat.is_open?         "is_open "      : "",
1256                         r->txn_stat.locked?          "locked "       : "",
1257                         r->txn_stat.checked?         "checked "      : "",
1258                         r->txn_stat.role_swapped?    "role_swapped " : "",
1259                         r->txn_stat.unlocked?                "unlocked"      : "");
1260                 }
1261                 cmn_err(CE_NOTE, "end: %llx\n", r->end);
1262         }
1263         delay(drv_sectohz(1));
1264 }
1265 
1266 /*
1267  * validate the rename request parameters
1268  */
1269 static int
1270 validate_txn_parms(md_rentxn_t *rtxnp)
1271 {
1272         minor_t to_min, from_min;
1273 
1274         ASSERT(rtxnp);
1275 
1276         from_min = rtxnp->from.mnum;
1277         to_min = rtxnp->to.mnum;
1278 
1279         switch (rtxnp->revision) {
1280         case MD_RENAME_VERSION_OFFLINE:
1281                 if (rtxnp->uflags != 0) {
1282                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1283                                                                 from_min);
1284                         return (ENOTSUP);
1285                 }
1286                 break;
1287 
1288         case MD_RENAME_VERSION_ONLINE:
1289                 /* not supported until 5.0 */
1290                 /* FALLTHROUGH */
1291 
1292         default:
1293                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1294                                                                 from_min);
1295                 return (EPROTONOSUPPORT);
1296         }
1297 
1298         if ((rtxnp->from.uip = MDI_UNIT(from_min)) == NULL) {
1299                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1300                 return (ENODEV);
1301         }
1302 
1303         if (!md_dev_exists(md_makedevice(md_major, from_min))) {
1304                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1305                 return (ENODEV);
1306         }
1307 
1308         if ((rtxnp->from.key == MD_KEYBAD) || (rtxnp->from.key == MD_KEYWILD)) {
1309                 (void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, from_min);
1310                 return (EINVAL);
1311         }
1312 
1313         rtxnp->from.kstatp = rtxnp->from.uip->ui_kstat;
1314         rtxnp->from.unp = MD_UNIT(from_min);
1315 
1316         if (MD_MIN2SET(to_min) != MD_MIN2SET(from_min)) {
1317                 (void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1318                 return (EINVAL);
1319         }
1320 
1321         switch (rtxnp->op) {
1322         case MDRNOP_EXCHANGE:
1323                 rtxnp->to.unp = MD_UNIT(to_min);
1324                 rtxnp->to.uip = MDI_UNIT(to_min);
1325 
1326                 /*
1327                  * exchange requires target to exist
1328                  */
1329 
1330                 if ((rtxnp->to.uip == NULL) ||
1331                     (md_dev_exists(md_makedevice(md_major, to_min)) == NULL)) {
1332                         (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
1333                                                                         to_min);
1334                         return (ENODEV);
1335                 }
1336 
1337                 if ((rtxnp->to.key == MD_KEYBAD) ||
1338                     (rtxnp->to.key == MD_KEYWILD)) {
1339                         (void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1340                         return (EINVAL);
1341                 }
1342 
1343                 /*
1344                  * <from> is not in the role of <self>,
1345                  * that is,
1346                  * <from> has a parent, which is <to> and <to> has a parent too
1347                  * or
1348                  * <to> has a parent, which is <from> and <to> can have a child
1349                  */
1350                 if ((MD_HAS_PARENT(MD_PARENT(rtxnp->from.unp))) &&
1351                     (MD_PARENT(rtxnp->from.unp) == to_min) &&
1352                     MD_HAS_PARENT(MD_PARENT(rtxnp->to.unp))) {
1353                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_ORDER,
1354                                                                 from_min);
1355                         return (EINVAL);
1356                 }
1357 
1358                 if ((MD_HAS_PARENT(MD_PARENT(rtxnp->to.unp))) &&
1359                     (MD_PARENT(rtxnp->to.unp) == from_min) &&
1360                     (MD_CAPAB(rtxnp->to.unp) & MD_CAN_META_CHILD)) {
1361                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_ORDER,
1362                                                                 from_min);
1363                         return (EINVAL);
1364                 }
1365 
1366                 rtxnp->to.kstatp = rtxnp->to.uip->ui_kstat;
1367                 break;
1368 
1369         case MDRNOP_RENAME:
1370 
1371                 /*
1372                  * rename requires <to> not to exist
1373                  */
1374 
1375                 if (MDI_UNIT(to_min) ||
1376                     md_dev_exists(md_makedevice(md_major, to_min))) {
1377 
1378                         (void) mdmderror(&rtxnp->mde, MDE_UNIT_ALREADY_SETUP,
1379                                                                         to_min);
1380                         return (EEXIST);
1381                 }
1382 
1383                 /*
1384                  * and to be within valid ranges for the current
1385                  * limits on number of sets and metadevices
1386                  */
1387                 if ((MD_MIN2SET(to_min) >= md_nsets) ||
1388                     (MD_MIN2UNIT(to_min) >= md_nunits)) {
1389                         (void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1390                         return (EINVAL);
1391                 }
1392 
1393                 rtxnp->to.unp = NULL;
1394                 rtxnp->to.uip = NULL;
1395                 rtxnp->to.kstatp = NULL;
1396                 break;
1397 
1398         default:
1399                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1400                                                                 from_min);
1401                 return (EINVAL);
1402         }
1403 
1404         /*
1405          * install guard rails
1406          */
1407         rtxnp->beginning = TXN_BEG;
1408 
1409         rtxnp->from.beginning        = TXNUN_BEG;
1410         rtxnp->from.end              = TXNUN_END;
1411 
1412         rtxnp->to.beginning  = TXNUN_BEG;
1413         rtxnp->to.end                = TXNUN_END;
1414 
1415         rtxnp->end = TXN_END;
1416 
1417         return (0);
1418 }
1419 
1420 /*
1421  * If the device being changed exhibits this capability, set the list
1422  * relatives function pointer to the named service that lists the
1423  * appropriate relatives for this capability.
1424  */
1425 static int
1426 set_list_rels_funcp(
1427         md_rentxn_t              *rtxnp,
1428         md_stackcap_t            capability,
1429         char                     *svc_name,
1430         md_ren_list_svc_t        default_svc_func,
1431         md_ren_list_svc_t        **list_relatives_funcp
1432 )
1433 {
1434         int              err;
1435         minor_t          from_min;
1436         md_dev64_t       from_dev;
1437         md_unit_t       *from_un;
1438         mdi_unit_t      *from_ui;
1439 
1440         ASSERT(rtxnp);
1441         ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
1442         ASSERT(list_relatives_funcp);
1443 
1444         from_min        = rtxnp->from.mnum;
1445         from_dev        = md_makedevice(md_major, from_min);
1446         from_un         = MD_UNIT(from_min);
1447         from_ui         = MDI_UNIT(from_min);
1448         err             = 0;
1449 
1450         if (!from_ui || !from_un) {
1451                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1452                 err = EINVAL;
1453                 goto out;
1454         }
1455 
1456         if ((capability == MD_CAN_DO_ANYTHING) ||
1457             ((MD_CAPAB(from_un) & capability) == capability)) {
1458 
1459                         *list_relatives_funcp = (md_ren_list_svc_t *)
1460                                         md_get_named_service(from_dev,
1461                                         /* modindex */ 0, svc_name,
1462                                         (intptr_t (*)()) default_svc_func);
1463 
1464                         ASSERT(*list_relatives_funcp);
1465                         if (!(*list_relatives_funcp)) {
1466                                 (void) mdmderror(&rtxnp->mde,
1467                                         MDE_RENAME_CONFIG_ERROR, from_min);
1468                                 err = EINVAL;
1469                                 goto out;
1470                         }
1471         } else {
1472                 *list_relatives_funcp = (md_ren_list_svc_t *)NULL;
1473         }
1474 
1475 out:
1476         return (err);
1477 }
1478 
1479 /*
1480  * call list relations function, bump recid counter
1481  * by number of members added to the delta list.
1482  * Validate that the number of members added is within bounds.
1483  */
1484 static int
1485 list_relations(
1486                 md_rendelta_t           **family,
1487                 md_rentxn_t              *rtxnp,
1488                 md_ren_list_svc_t        *add_relatives_funcp,
1489                 int                       valid_min,
1490                 int                       valid_max
1491 )
1492 {
1493         int     n_added;
1494         int     err = 0;
1495 
1496         ASSERT(family);
1497         ASSERT(rtxnp);
1498 
1499         if (!family || !rtxnp) {
1500                 err = EINVAL;
1501                 goto out;
1502         }
1503 
1504         n_added = 0;
1505 
1506         /* no relations of this type */
1507         if (!add_relatives_funcp) {
1508                 goto out;
1509         }
1510 
1511         n_added = (*add_relatives_funcp) (family, rtxnp);
1512 
1513         if ((n_added < valid_min) || (n_added > valid_max)) {
1514                 if (mdisok(&rtxnp->mde)) {
1515                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1516                                                         rtxnp->from.mnum);
1517                 }
1518                 err = EINVAL;
1519                 goto out;
1520         }
1521 
1522         rtxnp->n_recids += n_added;
1523 
1524 out:
1525         return (err);
1526 }
1527 
1528 /*
1529  * build recid array
1530  */
1531 static int
1532 alloc_recids(md_rendelta_t *family, md_rentxn_t *rtxnp)
1533 {
1534         int     err     = 0;
1535 
1536         if (!family || !rtxnp) {
1537                 err = ENOMEM;
1538                 goto out;
1539         }
1540 
1541         rtxnp->rec_idx = 0;
1542 
1543         if (rtxnp->n_recids == 0) {
1544                 err = EINVAL;
1545                 goto out;
1546         }
1547 
1548         rtxnp->n_recids += 1;        /* terminator */
1549 
1550         rtxnp->recids = kmem_alloc(sizeof (mddb_recid_t) * rtxnp->n_recids,
1551             KM_SLEEP);
1552         if (!(rtxnp->recids)) {
1553                 err = ENOMEM;
1554                 goto out;
1555         }
1556 
1557         bzero((void *) rtxnp->recids,
1558                                 (sizeof (mddb_recid_t) * rtxnp->n_recids));
1559 out:
1560         if (err != 0) {
1561                 (void) mdsyserror(&rtxnp->mde, err);
1562         }
1563 
1564         return (err);
1565 }
1566 
1567 /*
1568  * build family tree (parent(s), self, children)
1569  * The order of the resultant list is important, as it governs
1570  * the order of locking, checking and changing the unit structures.
1571  * Since we'll be changing them, we may not use the MD_UNIT, MDI_UNIT,
1572  * and other pointer which depend on the array being correct.
1573  * Use only the cached pointers (in rtxnp.)
1574  */
1575 static md_rendelta_t *
1576 build_dtree(md_rentxn_t *rtxnp)
1577 {
1578         md_ren_list_svc_t       *add_folks, *add_self, *add_kids;
1579         int                      err;
1580         md_rendelta_t           *family = NULL;
1581 
1582         ASSERT(rtxnp);
1583         ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
1584 
1585         err = set_list_rels_funcp(rtxnp, MD_CAN_PARENT, MDRNM_LIST_URFOLKS,
1586                                         md_rename_listfolks, &add_folks);
1587 
1588         if (err) {
1589                 goto out;
1590         }
1591 
1592         err = set_list_rels_funcp(rtxnp, MD_CAN_DO_ANYTHING, MDRNM_LIST_URSELF,
1593                                                 md_rename_listself, &add_self);
1594         if (err) {
1595                 goto out;
1596         }
1597 
1598         err = set_list_rels_funcp(rtxnp, MD_CAN_META_CHILD, MDRNM_LIST_URKIDS,
1599                                 /* no default list func */ ((int (*)()) NULL),
1600                                                                 &add_kids);
1601         if (err) {
1602                 goto out;
1603         }
1604 
1605         rtxnp->n_recids = 0; /* accumulated by list_relations() */
1606 
1607         if ((err = list_relations(&family, rtxnp, add_folks, 0, 1)) != 0) {
1608                 goto out;
1609         }
1610 
1611         if ((err = list_relations(&family, rtxnp, add_self, 1, 1)) != 0) {
1612                 goto out;
1613         }
1614 
1615         err = list_relations(&family, rtxnp, add_kids, 0, md_nunits);
1616         if (err != 0) {
1617                 goto out;
1618         }
1619 
1620         /*
1621          * delta tree is still empty?
1622          */
1623         if ((!family) || (rtxnp->n_recids == 0)) {
1624                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1625                                                         rtxnp->from.mnum);
1626                 err = EINVAL;
1627                 goto out;
1628         }
1629 
1630         /*
1631          * verify role change interactions
1632          */
1633         if ((err = valid_roleswap_dtree(family, rtxnp)) != 0) {
1634                 goto out;
1635         }
1636 
1637         if ((err = alloc_recids(family, rtxnp)) != 0) {
1638                 goto out;
1639         }
1640 
1641 out:
1642         if (err != 0) {
1643                 free_dtree(family);
1644                 dump_dtree(family);     /* yes, after freeing it */
1645                 family = NULL;
1646         }
1647 
1648         return (family);
1649 }
1650 
1651 
1652 /*
1653  * (MD_IOCRENAME) rename/exchange ioctl entry point
1654  * calls individual driver named service entry points
1655  * to build a list of devices which need state changed,
1656  * to verify that they're in a state where renames may occur,
1657  * and to modify themselves into their new identities
1658  */
1659 
1660 int
1661 md_rename(
1662         md_rename_t     *mrp,
1663         IOLOCK          *iolockp)
1664 {
1665         md_rendelta_t   *family         = NULL;
1666         md_rentxn_t     rtxn;
1667         int             err             = 0;
1668         set_t           setno;
1669         mdc_unit_t      *mdc;
1670 
1671         ASSERT(iolockp);
1672         if (mrp == NULL)
1673                 return (EINVAL);
1674 
1675         setno = MD_MIN2SET(mrp->from.mnum);
1676         if (setno >= md_nsets) {
1677                 return (EINVAL);
1678         }
1679 
1680         /*
1681          * Early exit if top is eof trans
1682          */
1683         mdc = (mdc_unit_t *)md_set[setno].s_un[MD_MIN2UNIT(mrp->from.mnum)];
1684         while (mdc != NULL) {
1685             if (!MD_HAS_PARENT(mdc->un_parent)) {
1686                 break;
1687             } else {
1688                 mdc = (mdc_unit_t *)md_set[setno].s_un[MD_MIN2UNIT
1689                     (mdc->un_parent)];
1690             }
1691         }
1692 
1693         if (mdc && mdc->un_type == MD_METATRANS) {
1694                 return (EINVAL);
1695         }
1696 
1697 
1698         mdclrerror(&mrp->mde);
1699 
1700         bzero((void *) &rtxn, sizeof (md_rentxn_t));
1701         mdclrerror(&rtxn.mde);
1702 
1703         /*
1704          * encapsulate user parameters
1705          */
1706         rtxn.from.key   = mrp->from.key;
1707         rtxn.to.key     = mrp->to.key;
1708         rtxn.from.mnum  = mrp->from.mnum;
1709         rtxn.to.mnum    = mrp->to.mnum;
1710         rtxn.op         = mrp->op;
1711         rtxn.uflags     = mrp->flags;
1712         rtxn.revision   = mrp->revision;
1713 
1714         if (MD_MIN2UNIT(mrp->to.mnum) >= md_nunits) {
1715                 err = EINVAL;
1716                 goto cleanup;
1717         }
1718 
1719         /*
1720          * catch this early, before taking any locks
1721          */
1722         if (md_get_setstatus(setno) & MD_SET_STALE) {
1723                 (void) (mdmddberror(&rtxn.mde, MDE_DB_STALE, rtxn.from.mnum,
1724                                                 MD_MIN2SET(rtxn.from.mnum)));
1725                 err = EROFS;
1726                 goto cleanup;
1727         }
1728 
1729         /*
1730          * Locking and re-validation (of the per-unit state) is
1731          * done by the rename lock/unlock service, for now only take
1732          * the array lock.
1733          */
1734         md_array_writer(iolockp);
1735 
1736         /*
1737          * validate the rename/exchange parameters
1738          * rtxn is filled in on successful completion of validate_txn_parms()
1739          */
1740         if ((err = validate_txn_parms(&rtxn)) != 0) {
1741                 goto cleanup;
1742         }
1743 
1744         /*
1745          * build list of work to do, the "delta tree" for related devices
1746          */
1747         if (!(family = build_dtree(&rtxn))) {
1748                 err = ENOMEM;
1749                 goto cleanup;
1750         }
1751         dump_txn(&rtxn);
1752         dump_dtree(family);
1753 
1754         if ((err = lock_dtree(family, &rtxn)) != 0) {
1755                 goto cleanup;
1756         }
1757 
1758         if ((err = check_dtree(family, &rtxn)) != 0) {
1759                 goto cleanup;
1760         }
1761         dump_txn(&rtxn);
1762 
1763         role_swap_dtree(family, &rtxn);     /* commits the recids */
1764 
1765         /*
1766          * let folks know
1767          */
1768         SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_RENAME_SRC, SVM_TAG_METADEVICE,
1769             MD_MIN2SET(rtxn.from.mnum), rtxn.from.mnum);
1770         SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_RENAME_DST, SVM_TAG_METADEVICE,
1771             MD_MIN2SET(rtxn.from.mnum), rtxn.from.mnum);
1772 
1773 cleanup:
1774 
1775         if (err != 0 && mdisok(&rtxn.mde)) {
1776                 (void) mdsyserror(&rtxn.mde, EINVAL);
1777         }
1778 
1779         if (family) {
1780                 unlock_dtree(family, &rtxn);
1781                 free_dtree(family);
1782                 dump_dtree(family);
1783                 family = NULL;
1784         }
1785 
1786         if (rtxn.recids && (rtxn.n_recids > 0)) {
1787                 kmem_free(rtxn.recids, sizeof (mddb_recid_t) * rtxn.n_recids);
1788         }
1789 
1790         if (!mdisok(&rtxn.mde)) {
1791                 (void) mdstealerror(&mrp->mde, &rtxn.mde);
1792         }
1793 
1794         return (0);     /* success/failure will be communicated via rtxn.mde */
1795 }
1796 
1797 static role_change_tab_t
1798 role_swap_tab[] =
1799 {
1800         {
1801                 1,                      /* ordinal */
1802                 MDRR_PARENT,            /* old role */
1803                 MDRR_PARENT,            /* new role */
1804                 MDRNM_UPDATE_KIDS,      /* named service */
1805                 NO_DEFAULT_ROLESWAP_SVC /* default role swap function */
1806         },
1807         {
1808                 2,
1809                 MDRR_PARENT,
1810                 MDRR_SELF,
1811                 MDRNM_PARENT_UPDATE_TO,
1812                 NO_DEFAULT_ROLESWAP_SVC
1813         },
1814         {
1815                 3,
1816                 MDRR_PARENT,
1817                 MDRR_CHILD,
1818                 ILLEGAL_SVC_NAME,
1819                 ILLEGAL_ROLESWAP_SVC
1820         },
1821         {
1822                 4,
1823                 MDRR_SELF,
1824                 MDRR_PARENT,
1825                 MDRNM_SELF_UPDATE_FROM_UP,
1826                 md_exchange_self_update_from_up
1827         },
1828         {
1829                 5,
1830                 MDRR_SELF,
1831                 MDRR_SELF,
1832                 MDRNM_UPDATE_SELF,
1833                 md_rename_update_self
1834         },
1835         {
1836                 6,
1837                 MDRR_SELF,
1838                 MDRR_CHILD,
1839                 MDRNM_SELF_UPDATE_FROM_DOWN,
1840                 NO_DEFAULT_ROLESWAP_SVC
1841         },
1842         {
1843                 7,
1844                 MDRR_CHILD,
1845                 MDRR_PARENT,
1846                 ILLEGAL_SVC_NAME,
1847                 ILLEGAL_ROLESWAP_SVC
1848         },
1849         {
1850                 8,
1851                 MDRR_CHILD,
1852                 MDRR_SELF,
1853                 MDRNM_CHILD_UPDATE_TO,
1854                 md_exchange_child_update_to
1855         },
1856         {
1857                 9,
1858                 MDRR_CHILD,
1859                 MDRR_CHILD,
1860                 MDRNM_UPDATE_FOLKS,
1861                 md_renexch_update_parent
1862         },
1863 
1864         /* terminator is old_role == MDRR_UNK */
1865         {
1866                 0,
1867                 MDRR_UNK,
1868                 MDRR_UNK,
1869                 ILLEGAL_SVC_NAME,
1870                 NO_DEFAULT_ROLESWAP_SVC
1871         }
1872 };