1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 /*
  30  * Multidata, as described in the following papers:
  31  *
  32  * Adi Masputra,
  33  * Multidata V.2: VA-Disjoint Packet Extents Framework Interface
  34  * Design Specification.  August 2004.
  35  * Available as http://sac.sfbay/PSARC/2004/594/materials/mmd2.pdf.
  36  *
  37  * Adi Masputra,
  38  * Multidata Interface Design Specification.  Sep 2002.
  39  * Available as http://sac.sfbay/PSARC/2002/276/materials/mmd.pdf.
  40  *
  41  * Adi Masputra, Frank DiMambro, Kacheong Poon,
  42  * An Efficient Networking Transmit Mechanism for Solaris:
  43  * Multidata Transmit (MDT).  May 2002.
  44  * Available as http://sac.sfbay/PSARC/2002/276/materials/mdt.pdf.
  45  */
  46 
  47 #include <sys/types.h>
  48 #include <sys/stream.h>
  49 #include <sys/dlpi.h>
  50 #include <sys/stropts.h>
  51 #include <sys/strsun.h>
  52 #include <sys/strlog.h>
  53 #include <sys/strsubr.h>
  54 #include <sys/sysmacros.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/debug.h>
  57 #include <sys/kmem.h>
  58 #include <sys/atomic.h>
  59 
  60 #include <sys/multidata.h>
  61 #include <sys/multidata_impl.h>
  62 
/*
 * Forward declarations for the file-local routines: the kmem cache
 * constructor/destructor pairs, the esballoc free callback, and the
 * attribute table copy helper.
 */
static int mmd_constructor(void *, void *, int);
static void mmd_destructor(void *, void *);
static int pdslab_constructor(void *, void *, int);
static void pdslab_destructor(void *, void *);
static int pattbl_constructor(void *, void *, int);
static void pattbl_destructor(void *, void *);
static void mmd_esballoc_free(caddr_t);
static int mmd_copy_pattbl(patbkt_t *, multidata_t *, pdesc_t *, int);

/*
 * Small internal helpers; each prototype is followed by a
 * "#pragma inline" naming it as an inlining candidate.
 */
static boolean_t pbuf_ref_valid(multidata_t *, pdescinfo_t *);
#pragma inline(pbuf_ref_valid)

static boolean_t pdi_in_range(pdescinfo_t *, pdescinfo_t *);
#pragma inline(pdi_in_range)

static pdesc_t *mmd_addpdesc_int(multidata_t *, pdescinfo_t *, int *, int);
#pragma inline(mmd_addpdesc_int)

static void mmd_destroy_pattbl(patbkt_t **);
#pragma inline(mmd_destroy_pattbl)

static pattr_t *mmd_find_pattr(patbkt_t *, uint_t);
#pragma inline(mmd_find_pattr)

static pdesc_t *mmd_destroy_pdesc(multidata_t *, pdesc_t *);
#pragma inline(mmd_destroy_pdesc)

static pdesc_t *mmd_getpdesc(multidata_t *, pdesc_t *, pdescinfo_t *, uint_t,
    boolean_t);
#pragma inline(mmd_getpdesc)
  93 
/*
 * kmem cache handles, created once by mmd_init():
 *   mmd_cache      - Multidata metadata buffers (MMD_CACHE_SIZE bytes each)
 *   pd_slab_cache  - packet descriptor slabs
 *   pattbl_cache   - attribute hash tables
 */
static struct kmem_cache *mmd_cache;
static struct kmem_cache *pd_slab_cache;
static struct kmem_cache *pattbl_cache;
  97 
  98 int mmd_debug = 1;
  99 #define MMD_DEBUG(s)    if (mmd_debug > 0) cmn_err s
 100 
/*
 * Set this to true to bypass pdesc bounds checking in mmd_addpdesc().
 */
boolean_t mmd_speed_over_safety = B_FALSE;

/*
 * Patchable kmem_cache flags, passed to kmem_cache_create() by mmd_init()
 * for the metadata, descriptor slab and attribute table caches.
 */
int mmd_kmem_flags = 0;
int pdslab_kmem_flags = 0;
int pattbl_kmem_flags = 0;

/*
 * Alignment (in bytes) of our kmem caches.
 */
#define MULTIDATA_CACHE_ALIGN   64

/*
 * Default number of packet descriptors per descriptor slab.  Making
 * this too small will trigger more descriptor slab allocations; making
 * it too large will create too many unclaimed descriptors.  mmd_init()
 * raises a value of zero to 1.
 */
#define PDSLAB_SZ       15
uint_t pdslab_sz = PDSLAB_SZ;

/*
 * Default attribute hash table size.  It's okay to set this to a small
 * value (even to 1) because there aren't that many attributes currently
 * defined, and because we assume there won't be many attributes associated
 * with a Multidata at a given time.  Increasing the size will reduce
 * attribute search time (given a large number of attributes in a Multidata),
 * and decreasing it will reduce the memory footprint and the overhead
 * associated with managing the table.  mmd_init() raises a value of zero
 * to 1.
 */
#define PATTBL_SZ       1
uint_t pattbl_sz = PATTBL_SZ;

/*
 * Attribute hash key: reduces x to a bucket index in the range [0, sz).
 */
#define PATTBL_HASH(x, sz)      ((x) % (sz))
 142 
/*
 * Structure that precedes each Multidata metadata.  Every buffer from
 * mmd_cache starts with one of these, immediately followed by the
 * multidata_t itself.
 */
struct mmd_buf_info {
        frtn_t  frp;            /* free routine */
        uint_t  buf_len;        /* length of kmem buffer */
};

/*
 * The size of each metadata buffer: the mmd_buf_info header plus the
 * multidata_t that follows it.
 */
#define MMD_CACHE_SIZE  \
        (sizeof (struct mmd_buf_info) + sizeof (multidata_t))
 156 
 157 /*
 158  * Called during startup in order to create the Multidata kmem caches.
 159  */
 160 void
 161 mmd_init(void)
 162 {
 163         pdslab_sz = MAX(1, pdslab_sz);  /* at least 1 descriptor */
 164         pattbl_sz = MAX(1, pattbl_sz);  /* at least 1 bucket */
 165 
 166         mmd_cache = kmem_cache_create("multidata", MMD_CACHE_SIZE,
 167             MULTIDATA_CACHE_ALIGN, mmd_constructor, mmd_destructor,
 168             NULL, NULL, NULL, mmd_kmem_flags);
 169 
 170         pd_slab_cache = kmem_cache_create("multidata_pdslab",
 171             PDESC_SLAB_SIZE(pdslab_sz), MULTIDATA_CACHE_ALIGN,
 172             pdslab_constructor, pdslab_destructor, NULL,
 173             (void *)(uintptr_t)pdslab_sz, NULL, pdslab_kmem_flags);
 174 
 175         pattbl_cache = kmem_cache_create("multidata_pattbl",
 176             sizeof (patbkt_t) * pattbl_sz, MULTIDATA_CACHE_ALIGN,
 177             pattbl_constructor, pattbl_destructor, NULL,
 178             (void *)(uintptr_t)pattbl_sz, NULL, pattbl_kmem_flags);
 179 }
 180 
 181 /*
 182  * Create a Multidata message block.
 183  */
 184 multidata_t *
 185 mmd_alloc(mblk_t *hdr_mp, mblk_t **mmd_mp, int kmflags)
 186 {
 187         uchar_t *buf;
 188         multidata_t *mmd;
 189         uint_t mmd_mplen;
 190         struct mmd_buf_info *buf_info;
 191 
 192         ASSERT(hdr_mp != NULL);
 193         ASSERT(mmd_mp != NULL);
 194 
 195         /*
 196          * Caller should never pass in a chain of mblks since we
 197          * only care about the first one, hence the assertions.
 198          */
 199         ASSERT(hdr_mp->b_cont == NULL);
 200 
 201         if ((buf = kmem_cache_alloc(mmd_cache, kmflags)) == NULL)
 202                 return (NULL);
 203 
 204         buf_info = (struct mmd_buf_info *)buf;
 205         buf_info->frp.free_arg = (caddr_t)buf;
 206 
 207         mmd = (multidata_t *)(buf_info + 1);
 208         mmd_mplen = sizeof (*mmd);
 209 
 210         if ((*mmd_mp = desballoc((uchar_t *)mmd, mmd_mplen, BPRI_HI,
 211             &(buf_info->frp))) == NULL) {
 212                 kmem_cache_free(mmd_cache, buf);
 213                 return (NULL);
 214         }
 215 
 216         DB_TYPE(*mmd_mp) = M_MULTIDATA;
 217         (*mmd_mp)->b_wptr += mmd_mplen;
 218         mmd->mmd_dp = (*mmd_mp)->b_datap;
 219         mmd->mmd_hbuf = hdr_mp;
 220 
 221         return (mmd);
 222 }
 223 
 224 /*
 225  * Associate additional payload buffer to the Multidata.
 226  */
 227 int
 228 mmd_addpldbuf(multidata_t *mmd, mblk_t *pld_mp)
 229 {
 230         int i;
 231 
 232         ASSERT(mmd != NULL);
 233         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 234         ASSERT(pld_mp != NULL);
 235 
 236         mutex_enter(&mmd->mmd_pd_slab_lock);
 237         for (i = 0; i < MULTIDATA_MAX_PBUFS &&
 238             mmd->mmd_pbuf_cnt < MULTIDATA_MAX_PBUFS; i++) {
 239                 if (mmd->mmd_pbuf[i] == pld_mp) {
 240                         /* duplicate entry */
 241                         MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding "
 242                             "pld 0x%p to mmd 0x%p since it has been "
 243                             "previously added into slot %d (total %d)\n",
 244                             (void *)pld_mp, (void *)mmd, i, mmd->mmd_pbuf_cnt));
 245                         mutex_exit(&mmd->mmd_pd_slab_lock);
 246                         return (-1);
 247                 } else if (mmd->mmd_pbuf[i] == NULL) {
 248                         mmd->mmd_pbuf[i] = pld_mp;
 249                         mmd->mmd_pbuf_cnt++;
 250                         mutex_exit(&mmd->mmd_pd_slab_lock);
 251                         return (i);
 252                 }
 253         }
 254 
 255         /* all slots are taken */
 256         MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding pld 0x%p to mmd 0x%p "
 257             "since no slot space is left (total %d max %d)\n", (void *)pld_mp,
 258             (void *)mmd, mmd->mmd_pbuf_cnt, MULTIDATA_MAX_PBUFS));
 259         mutex_exit(&mmd->mmd_pd_slab_lock);
 260 
 261         return (-1);
 262 }
 263 
 264 /*
 265  * Multidata metadata kmem cache constructor routine.
 266  */
 267 /* ARGSUSED */
 268 static int
 269 mmd_constructor(void *buf, void *cdrarg, int kmflags)
 270 {
 271         struct mmd_buf_info *buf_info;
 272         multidata_t *mmd;
 273 
 274         bzero((void *)buf, MMD_CACHE_SIZE);
 275 
 276         buf_info = (struct mmd_buf_info *)buf;
 277         buf_info->frp.free_func = mmd_esballoc_free;
 278         buf_info->buf_len = MMD_CACHE_SIZE;
 279 
 280         mmd = (multidata_t *)(buf_info + 1);
 281         mmd->mmd_magic = MULTIDATA_MAGIC;
 282 
 283         mutex_init(&(mmd->mmd_pd_slab_lock), NULL, MUTEX_DRIVER, NULL);
 284         QL_INIT(&(mmd->mmd_pd_slab_q));
 285         QL_INIT(&(mmd->mmd_pd_q));
 286 
 287         return (0);
 288 }
 289 
 290 /*
 291  * Multidata metadata kmem cache destructor routine.
 292  */
 293 /* ARGSUSED */
 294 static void
 295 mmd_destructor(void *buf, void *cdrarg)
 296 {
 297         multidata_t *mmd;
 298 #ifdef DEBUG
 299         int i;
 300 #endif
 301 
 302         mmd = (multidata_t *)((uchar_t *)buf + sizeof (struct mmd_buf_info));
 303 
 304         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 305         ASSERT(mmd->mmd_dp == NULL);
 306         ASSERT(mmd->mmd_hbuf == NULL);
 307         ASSERT(mmd->mmd_pbuf_cnt == 0);
 308 #ifdef DEBUG
 309         for (i = 0; i < MULTIDATA_MAX_PBUFS; i++)
 310                 ASSERT(mmd->mmd_pbuf[i] == NULL);
 311 #endif
 312         ASSERT(mmd->mmd_pattbl == NULL);
 313 
 314         mutex_destroy(&(mmd->mmd_pd_slab_lock));
 315         ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
 316         ASSERT(mmd->mmd_slab_cnt == 0);
 317         ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
 318         ASSERT(mmd->mmd_pd_cnt == 0);
 319         ASSERT(mmd->mmd_hbuf_ref == 0);
 320         ASSERT(mmd->mmd_pbuf_ref == 0);
 321 }
 322 
 323 /*
 324  * Multidata message block free callback routine.
 325  */
 326 static void
 327 mmd_esballoc_free(caddr_t buf)
 328 {
 329         multidata_t *mmd;
 330         pdesc_t *pd;
 331         pdesc_slab_t *slab;
 332         int i;
 333 
 334         ASSERT(buf != NULL);
 335         ASSERT(((struct mmd_buf_info *)buf)->buf_len == MMD_CACHE_SIZE);
 336 
 337         mmd = (multidata_t *)(buf + sizeof (struct mmd_buf_info));
 338         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 339 
 340         ASSERT(mmd->mmd_dp != NULL);
 341         ASSERT(mmd->mmd_dp->db_ref == 1);
 342 
 343         /* remove all packet descriptors and private attributes */
 344         pd = Q2PD(mmd->mmd_pd_q.ql_next);
 345         while (pd != Q2PD(&(mmd->mmd_pd_q)))
 346                 pd = mmd_destroy_pdesc(mmd, pd);
 347 
 348         ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
 349         ASSERT(mmd->mmd_pd_cnt == 0);
 350         ASSERT(mmd->mmd_hbuf_ref == 0);
 351         ASSERT(mmd->mmd_pbuf_ref == 0);
 352 
 353         /* remove all global attributes */
 354         if (mmd->mmd_pattbl != NULL)
 355                 mmd_destroy_pattbl(&(mmd->mmd_pattbl));
 356 
 357         /* remove all descriptor slabs */
 358         slab = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_next);
 359         while (slab != Q2PDSLAB(&(mmd->mmd_pd_slab_q))) {
 360                 pdesc_slab_t *slab_next = Q2PDSLAB(slab->pds_next);
 361 
 362                 remque(&(slab->pds_next));
 363                 slab->pds_next = NULL;
 364                 slab->pds_prev = NULL;
 365                 slab->pds_mmd = NULL;
 366                 slab->pds_used = 0;
 367                 kmem_cache_free(pd_slab_cache, slab);
 368 
 369                 ASSERT(mmd->mmd_slab_cnt > 0);
 370                 mmd->mmd_slab_cnt--;
 371                 slab = slab_next;
 372         }
 373         ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
 374         ASSERT(mmd->mmd_slab_cnt == 0);
 375 
 376         mmd->mmd_dp = NULL;
 377 
 378         /* finally, free all associated message blocks */
 379         if (mmd->mmd_hbuf != NULL) {
 380                 freeb(mmd->mmd_hbuf);
 381                 mmd->mmd_hbuf = NULL;
 382         }
 383 
 384         for (i = 0; i < MULTIDATA_MAX_PBUFS; i++) {
 385                 if (mmd->mmd_pbuf[i] != NULL) {
 386                         freeb(mmd->mmd_pbuf[i]);
 387                         mmd->mmd_pbuf[i] = NULL;
 388                         ASSERT(mmd->mmd_pbuf_cnt > 0);
 389                         mmd->mmd_pbuf_cnt--;
 390                 }
 391         }
 392 
 393         ASSERT(mmd->mmd_pbuf_cnt == 0);
 394         ASSERT(MUTEX_NOT_HELD(&(mmd->mmd_pd_slab_lock)));
 395         kmem_cache_free(mmd_cache, buf);
 396 }
 397 
 398 /*
 399  * Multidata message block copy routine, called by copyb() when it
 400  * encounters a M_MULTIDATA data block type.  This routine should
 401  * not be called by anyone other than copyb(), since it may go away
 402  * (read: become static to this module) once some sort of copy callback
 403  * routine is made available.
 404  */
 405 mblk_t *
 406 mmd_copy(mblk_t *bp, int kmflags)
 407 {
 408         multidata_t *mmd, *n_mmd;
 409         mblk_t *n_hbuf = NULL, *n_pbuf[MULTIDATA_MAX_PBUFS];
 410         mblk_t **pmp_last = &n_pbuf[MULTIDATA_MAX_PBUFS - 1];
 411         mblk_t **pmp;
 412         mblk_t *n_bp = NULL;
 413         pdesc_t *pd;
 414         uint_t n_pbuf_cnt = 0;
 415         int idx, i;
 416 
 417 #define FREE_PBUFS() {                                  \
 418         for (pmp = &n_pbuf[0]; pmp <= pmp_last; pmp++)   \
 419                 if (*pmp != NULL) freeb(*pmp);          \
 420 }
 421 
 422 #define REL_OFF(p, base, n_base)                        \
 423         ((uchar_t *)(n_base) + ((uchar_t *)(p) - (uchar_t *)base))
 424 
 425         ASSERT(bp != NULL && DB_TYPE(bp) == M_MULTIDATA);
 426         mmd = mmd_getmultidata(bp);
 427 
 428         /* copy the header buffer */
 429         if (mmd->mmd_hbuf != NULL && (n_hbuf = copyb(mmd->mmd_hbuf)) == NULL)
 430                 return (NULL);
 431 
 432         /* copy the payload buffer(s) */
 433         mutex_enter(&mmd->mmd_pd_slab_lock);
 434         bzero((void *)&n_pbuf[0], sizeof (mblk_t *) * MULTIDATA_MAX_PBUFS);
 435         n_pbuf_cnt = mmd->mmd_pbuf_cnt;
 436         for (i = 0; i < n_pbuf_cnt; i++) {
 437                 ASSERT(mmd->mmd_pbuf[i] != NULL);
 438                 n_pbuf[i] = copyb(mmd->mmd_pbuf[i]);
 439                 if (n_pbuf[i] == NULL) {
 440                         FREE_PBUFS();
 441                         mutex_exit(&mmd->mmd_pd_slab_lock);
 442                         return (NULL);
 443                 }
 444         }
 445 
 446         /* allocate new Multidata */
 447         n_mmd = mmd_alloc(n_hbuf, &n_bp, kmflags);
 448         if (n_mmd == NULL) {
 449                 if (n_hbuf != NULL)
 450                         freeb(n_hbuf);
 451                 if (n_pbuf_cnt != 0)
 452                         FREE_PBUFS();
 453                 mutex_exit(&mmd->mmd_pd_slab_lock);
 454                 return (NULL);
 455         }
 456 
 457         /*
 458          * Add payload buffer(s); upon success, leave n_pbuf array
 459          * alone, as the newly-created Multidata had already contained
 460          * the mblk pointers stored in the array.  These will be freed
 461          * along with the Multidata itself.
 462          */
 463         for (i = 0, pmp = &n_pbuf[0]; i < n_pbuf_cnt; i++, pmp++) {
 464                 idx = mmd_addpldbuf(n_mmd, *pmp);
 465                 if (idx < 0) {
 466                         FREE_PBUFS();
 467                         freeb(n_bp);
 468                         mutex_exit(&mmd->mmd_pd_slab_lock);
 469                         return (NULL);
 470                 }
 471         }
 472 
 473         /* copy over global attributes */
 474         if (mmd->mmd_pattbl != NULL &&
 475             mmd_copy_pattbl(mmd->mmd_pattbl, n_mmd, NULL, kmflags) < 0) {
 476                 freeb(n_bp);
 477                 mutex_exit(&mmd->mmd_pd_slab_lock);
 478                 return (NULL);
 479         }
 480 
 481         /* copy over packet descriptors and their atttributes */
 482         pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);  /* first pdesc */
 483         while (pd != NULL) {
 484                 pdesc_t *n_pd;
 485                 pdescinfo_t *pdi, n_pdi;
 486                 uchar_t *n_base, *base;
 487                 pdesc_t *pd_next;
 488 
 489                 /* next pdesc */
 490                 pd_next = mmd_getpdesc(pd->pd_slab->pds_mmd, pd, NULL,
 491                     1, B_TRUE);
 492 
 493                 /* skip if already removed */
 494                 if (pd->pd_flags & PDESC_REM_DEFER) {
 495                         pd = pd_next;
 496                         continue;
 497                 }
 498 
 499                 pdi = &(pd->pd_pdi);
 500                 bzero(&n_pdi, sizeof (n_pdi));
 501 
 502                 /*
 503                  * Calculate new descriptor values based on the offset of
 504                  * each pointer relative to the associated buffer(s).
 505                  */
 506                 ASSERT(pdi->flags & PDESC_HAS_REF);
 507                 if (pdi->flags & PDESC_HBUF_REF) {
 508                         n_base = n_mmd->mmd_hbuf->b_rptr;
 509                         base = mmd->mmd_hbuf->b_rptr;
 510 
 511                         n_pdi.flags |= PDESC_HBUF_REF;
 512                         n_pdi.hdr_base = REL_OFF(pdi->hdr_base, base, n_base);
 513                         n_pdi.hdr_rptr = REL_OFF(pdi->hdr_rptr, base, n_base);
 514                         n_pdi.hdr_wptr = REL_OFF(pdi->hdr_wptr, base, n_base);
 515                         n_pdi.hdr_lim = REL_OFF(pdi->hdr_lim, base, n_base);
 516                 }
 517 
 518                 if (pdi->flags & PDESC_PBUF_REF) {
 519                         n_pdi.flags |= PDESC_PBUF_REF;
 520                         n_pdi.pld_cnt = pdi->pld_cnt;
 521 
 522                         for (i = 0; i < pdi->pld_cnt; i++) {
 523                                 idx = pdi->pld_ary[i].pld_pbuf_idx;
 524                                 ASSERT(idx < MULTIDATA_MAX_PBUFS);
 525                                 ASSERT(n_mmd->mmd_pbuf[idx] != NULL);
 526                                 ASSERT(mmd->mmd_pbuf[idx] != NULL);
 527 
 528                                 n_base = n_mmd->mmd_pbuf[idx]->b_rptr;
 529                                 base = mmd->mmd_pbuf[idx]->b_rptr;
 530 
 531                                 n_pdi.pld_ary[i].pld_pbuf_idx = idx;
 532 
 533                                 /*
 534                                  * We can't copy the pointers just like that,
 535                                  * so calculate the relative offset.
 536                                  */
 537                                 n_pdi.pld_ary[i].pld_rptr =
 538                                     REL_OFF(pdi->pld_ary[i].pld_rptr,
 539                                         base, n_base);
 540                                 n_pdi.pld_ary[i].pld_wptr =
 541                                     REL_OFF(pdi->pld_ary[i].pld_wptr,
 542                                         base, n_base);
 543                         }
 544                 }
 545 
 546                 /* add the new descriptor to the new Multidata */
 547                 n_pd = mmd_addpdesc_int(n_mmd, &n_pdi, NULL, kmflags);
 548 
 549                 if (n_pd == NULL || (pd->pd_pattbl != NULL &&
 550                     mmd_copy_pattbl(pd->pd_pattbl, n_mmd, n_pd, kmflags) < 0)) {
 551                         freeb(n_bp);
 552                         mutex_exit(&mmd->mmd_pd_slab_lock);
 553                         return (NULL);
 554                 }
 555 
 556                 pd = pd_next;
 557         }
 558 #undef REL_OFF
 559 #undef FREE_PBUFS
 560 
 561         mutex_exit(&mmd->mmd_pd_slab_lock);
 562         return (n_bp);
 563 }
 564 
 565 /*
 566  * Given a Multidata message block, return the Multidata metadata handle.
 567  */
 568 multidata_t *
 569 mmd_getmultidata(mblk_t *mp)
 570 {
 571         multidata_t *mmd;
 572 
 573         ASSERT(mp != NULL);
 574 
 575         if (DB_TYPE(mp) != M_MULTIDATA)
 576                 return (NULL);
 577 
 578         mmd = (multidata_t *)mp->b_rptr;
 579         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 580 
 581         return (mmd);
 582 }
 583 
 584 /*
 585  * Return the start and end addresses of the associated buffer(s).
 586  */
 587 void
 588 mmd_getregions(multidata_t *mmd, mbufinfo_t *mbi)
 589 {
 590         int i;
 591 
 592         ASSERT(mmd != NULL);
 593         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 594         ASSERT(mbi != NULL);
 595 
 596         bzero((void *)mbi, sizeof (mbufinfo_t));
 597 
 598         if (mmd->mmd_hbuf != NULL) {
 599                 mbi->hbuf_rptr = mmd->mmd_hbuf->b_rptr;
 600                 mbi->hbuf_wptr = mmd->mmd_hbuf->b_wptr;
 601         }
 602 
 603         mutex_enter(&mmd->mmd_pd_slab_lock);
 604         for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
 605                 ASSERT(mmd->mmd_pbuf[i] != NULL);
 606                 mbi->pbuf_ary[i].pbuf_rptr = mmd->mmd_pbuf[i]->b_rptr;
 607                 mbi->pbuf_ary[i].pbuf_wptr = mmd->mmd_pbuf[i]->b_wptr;
 608 
 609         }
 610         mbi->pbuf_cnt = mmd->mmd_pbuf_cnt;
 611         mutex_exit(&mmd->mmd_pd_slab_lock);
 612 }
 613 
 614 /*
 615  * Return the Multidata statistics.
 616  */
 617 uint_t
 618 mmd_getcnt(multidata_t *mmd, uint_t *hbuf_ref, uint_t *pbuf_ref)
 619 {
 620         uint_t pd_cnt;
 621 
 622         ASSERT(mmd != NULL);
 623         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 624 
 625         mutex_enter(&(mmd->mmd_pd_slab_lock));
 626         if (hbuf_ref != NULL)
 627                 *hbuf_ref = mmd->mmd_hbuf_ref;
 628         if (pbuf_ref != NULL)
 629                 *pbuf_ref = mmd->mmd_pbuf_ref;
 630         pd_cnt = mmd->mmd_pd_cnt;
 631         mutex_exit(&(mmd->mmd_pd_slab_lock));
 632 
 633         return (pd_cnt);
 634 }
 635 
 636 #define HBUF_REF_VALID(mmd, pdi)                                        \
 637         ((mmd)->mmd_hbuf != NULL && (pdi)->hdr_rptr != NULL &&            \
 638         (pdi)->hdr_wptr != NULL && (pdi)->hdr_base != NULL &&             \
 639         (pdi)->hdr_lim != NULL && (pdi)->hdr_lim >= (pdi)->hdr_base &&      \
 640         (pdi)->hdr_wptr >= (pdi)->hdr_rptr &&                          \
 641         (pdi)->hdr_base <= (pdi)->hdr_rptr &&                          \
 642         (pdi)->hdr_lim >= (pdi)->hdr_wptr &&                           \
 643         (pdi)->hdr_base >= (mmd)->mmd_hbuf->b_rptr &&                       \
 644         MBLKIN((mmd)->mmd_hbuf,                                              \
 645         (pdi->hdr_base - (mmd)->mmd_hbuf->b_rptr),                     \
 646         PDESC_HDRSIZE(pdi)))
 647 
 648 /*
 649  * Bounds check payload area(s).
 650  */
 651 static boolean_t
 652 pbuf_ref_valid(multidata_t *mmd, pdescinfo_t *pdi)
 653 {
 654         int i = 0, idx;
 655         boolean_t valid = B_TRUE;
 656         struct pld_ary_s *pa;
 657 
 658         mutex_enter(&mmd->mmd_pd_slab_lock);
 659         if (pdi->pld_cnt == 0 || pdi->pld_cnt > mmd->mmd_pbuf_cnt) {
 660                 mutex_exit(&mmd->mmd_pd_slab_lock);
 661                 return (B_FALSE);
 662         }
 663 
 664         pa = &pdi->pld_ary[0];
 665         while (valid && i < pdi->pld_cnt) {
 666                 valid = (((idx = pa->pld_pbuf_idx) < mmd->mmd_pbuf_cnt) &&
 667                     pa->pld_rptr != NULL && pa->pld_wptr != NULL &&
 668                     pa->pld_wptr >= pa->pld_rptr &&
 669                     pa->pld_rptr >= mmd->mmd_pbuf[idx]->b_rptr &&
 670                     MBLKIN(mmd->mmd_pbuf[idx], (pa->pld_rptr -
 671                         mmd->mmd_pbuf[idx]->b_rptr),
 672                         PDESC_PLD_SPAN_SIZE(pdi, i)));
 673 
 674                 if (!valid) {
 675                         MMD_DEBUG((CE_WARN,
 676                             "pbuf_ref_valid: pdi 0x%p pld out of bound; "
 677                             "index %d has pld_cnt %d pbuf_idx %d "
 678                             "(mmd_pbuf_cnt %d), "
 679                             "pld_rptr 0x%p pld_wptr 0x%p len %d "
 680                             "(valid 0x%p-0x%p len %d)\n", (void *)pdi,
 681                             i, pdi->pld_cnt, idx, mmd->mmd_pbuf_cnt,
 682                             (void *)pa->pld_rptr,
 683                             (void *)pa->pld_wptr,
 684                             (int)PDESC_PLD_SPAN_SIZE(pdi, i),
 685                             (void *)mmd->mmd_pbuf[idx]->b_rptr,
 686                             (void *)mmd->mmd_pbuf[idx]->b_wptr,
 687                             (int)MBLKL(mmd->mmd_pbuf[idx])));
 688                 }
 689 
 690                 /* advance to next entry */
 691                 i++;
 692                 pa++;
 693         }
 694 
 695         mutex_exit(&mmd->mmd_pd_slab_lock);
 696         return (valid);
 697 }
 698 
 699 /*
 700  * Add a packet descriptor to the Multidata.
 701  */
 702 pdesc_t *
 703 mmd_addpdesc(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
 704 {
 705         ASSERT(mmd != NULL);
 706         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 707         ASSERT(pdi != NULL);
 708         ASSERT(pdi->flags & PDESC_HAS_REF);
 709 
 710         /* do the references refer to invalid memory regions? */
 711         if (!mmd_speed_over_safety &&
 712             (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
 713             ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi)))) {
 714                 if (err != NULL)
 715                         *err = EINVAL;
 716                 return (NULL);
 717         }
 718 
 719         return (mmd_addpdesc_int(mmd, pdi, err, kmflags));
 720 }
 721 
 722 /*
 723  * Internal routine to add a packet descriptor, called when mmd_addpdesc
 724  * or mmd_copy tries to allocate and add a descriptor to a Multidata.
 725  */
 726 static pdesc_t *
 727 mmd_addpdesc_int(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
 728 {
 729         pdesc_slab_t *slab, *slab_last;
 730         pdesc_t *pd;
 731 
 732         ASSERT(pdi->flags & PDESC_HAS_REF);
 733         ASSERT(!(pdi->flags & PDESC_HBUF_REF) || HBUF_REF_VALID(mmd, pdi));
 734         ASSERT(!(pdi->flags & PDESC_PBUF_REF) || pbuf_ref_valid(mmd, pdi));
 735 
 736         if (err != NULL)
 737                 *err = 0;
 738 
 739         mutex_enter(&(mmd->mmd_pd_slab_lock));
 740         /*
 741          * Is slab list empty or the last-added slab is full?  If so,
 742          * allocate new slab for the descriptor; otherwise, use the
 743          * last-added slab instead.
 744          */
 745         slab_last = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_prev);
 746         if (mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q) ||
 747             slab_last->pds_used == slab_last->pds_sz) {
 748                 slab = kmem_cache_alloc(pd_slab_cache, kmflags);
 749                 if (slab == NULL) {
 750                         if (err != NULL)
 751                                 *err = ENOMEM;
 752                         mutex_exit(&(mmd->mmd_pd_slab_lock));
 753                         return (NULL);
 754                 }
 755                 slab->pds_mmd = mmd;
 756 
 757                 ASSERT(slab->pds_used == 0);
 758                 ASSERT(slab->pds_next == NULL && slab->pds_prev == NULL);
 759 
 760                 /* insert slab at end of list */
 761                 insque(&(slab->pds_next), mmd->mmd_pd_slab_q.ql_prev);
 762                 mmd->mmd_slab_cnt++;
 763         } else {
 764                 slab = slab_last;
 765         }
 766         ASSERT(slab->pds_used < slab->pds_sz);
 767         pd = &(slab->pds_free_desc[slab->pds_used++]);
 768         ASSERT(pd->pd_magic == PDESC_MAGIC);
 769         pd->pd_next = NULL;
 770         pd->pd_prev = NULL;
 771         pd->pd_slab = slab;
 772         pd->pd_pattbl = NULL;
 773 
 774         /* copy over the descriptor info from caller */
 775         PDI_COPY(pdi, &(pd->pd_pdi));
 776 
 777         if (pd->pd_flags & PDESC_HBUF_REF)
 778                 mmd->mmd_hbuf_ref++;
 779         if (pd->pd_flags & PDESC_PBUF_REF)
 780                 mmd->mmd_pbuf_ref += pd->pd_pdi.pld_cnt;
 781         mmd->mmd_pd_cnt++;
 782 
 783         /* insert descriptor at end of list */
 784         insque(&(pd->pd_next), mmd->mmd_pd_q.ql_prev);
 785         mutex_exit(&(mmd->mmd_pd_slab_lock));
 786 
 787         return (pd);
 788 }
 789 
 790 /*
 791  * Packet descriptor slab kmem cache constructor routine.
 792  */
 793 /* ARGSUSED */
 794 static int
 795 pdslab_constructor(void *buf, void *cdrarg, int kmflags)
 796 {
 797         pdesc_slab_t *slab;
 798         uint_t cnt = (uint_t)(uintptr_t)cdrarg;
 799         int i;
 800 
 801         ASSERT(cnt > 0);     /* slab size can't be zero */
 802 
 803         slab = (pdesc_slab_t *)buf;
 804         slab->pds_next = NULL;
 805         slab->pds_prev = NULL;
 806         slab->pds_mmd = NULL;
 807         slab->pds_used = 0;
 808         slab->pds_sz = cnt;
 809 
 810         for (i = 0; i < cnt; i++) {
 811                 pdesc_t *pd = &(slab->pds_free_desc[i]);
 812                 pd->pd_magic = PDESC_MAGIC;
 813         }
 814         return (0);
 815 }
 816 
 817 /*
 818  * Packet descriptor slab kmem cache destructor routine.
 819  */
 820 /* ARGSUSED */
 821 static void
 822 pdslab_destructor(void *buf, void *cdrarg)
 823 {
 824         pdesc_slab_t *slab;
 825 
 826         slab = (pdesc_slab_t *)buf;
 827         ASSERT(slab->pds_next == NULL);
 828         ASSERT(slab->pds_prev == NULL);
 829         ASSERT(slab->pds_mmd == NULL);
 830         ASSERT(slab->pds_used == 0);
 831         ASSERT(slab->pds_sz > 0);
 832 }
 833 
 834 /*
 835  * Remove a packet descriptor from the in-use descriptor list,
 836  * called by mmd_rempdesc or during free.
 837  */
 838 static pdesc_t *
 839 mmd_destroy_pdesc(multidata_t *mmd, pdesc_t *pd)
 840 {
 841         pdesc_t *pd_next;
 842 
 843         pd_next = Q2PD(pd->pd_next);
 844         remque(&(pd->pd_next));
 845 
 846         /* remove all local attributes */
 847         if (pd->pd_pattbl != NULL)
 848                 mmd_destroy_pattbl(&(pd->pd_pattbl));
 849 
 850         /* don't decrease counts for a removed descriptor */
 851         if (!(pd->pd_flags & PDESC_REM_DEFER)) {
 852                 if (pd->pd_flags & PDESC_HBUF_REF) {
 853                         ASSERT(mmd->mmd_hbuf_ref > 0);
 854                         mmd->mmd_hbuf_ref--;
 855                 }
 856                 if (pd->pd_flags & PDESC_PBUF_REF) {
 857                         ASSERT(mmd->mmd_pbuf_ref > 0);
 858                         mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
 859                 }
 860                 ASSERT(mmd->mmd_pd_cnt > 0);
 861                 mmd->mmd_pd_cnt--;
 862         }
 863         return (pd_next);
 864 }
 865 
 866 /*
 867  * Remove a packet descriptor from the Multidata.
 868  */
 869 void
 870 mmd_rempdesc(pdesc_t *pd)
 871 {
 872         multidata_t *mmd;
 873 
 874         ASSERT(pd->pd_magic == PDESC_MAGIC);
 875         ASSERT(pd->pd_slab != NULL);
 876 
 877         mmd = pd->pd_slab->pds_mmd;
 878         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 879 
 880         mutex_enter(&(mmd->mmd_pd_slab_lock));
 881         /*
 882          * We can't deallocate the associated resources if the Multidata
 883          * is shared with other threads, because it's possible that the
 884          * descriptor handle value is held by those threads.  That's why
 885          * we simply mark the entry as "removed" and decrement the counts.
 886          * If there are no other threads, then we free the descriptor.
 887          */
 888         if (mmd->mmd_dp->db_ref > 1) {
 889                 pd->pd_flags |= PDESC_REM_DEFER;
 890                 if (pd->pd_flags & PDESC_HBUF_REF) {
 891                         ASSERT(mmd->mmd_hbuf_ref > 0);
 892                         mmd->mmd_hbuf_ref--;
 893                 }
 894                 if (pd->pd_flags & PDESC_PBUF_REF) {
 895                         ASSERT(mmd->mmd_pbuf_ref > 0);
 896                         mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
 897                 }
 898                 ASSERT(mmd->mmd_pd_cnt > 0);
 899                 mmd->mmd_pd_cnt--;
 900         } else {
 901                 (void) mmd_destroy_pdesc(mmd, pd);
 902         }
 903         mutex_exit(&(mmd->mmd_pd_slab_lock));
 904 }
 905 
 906 /*
 907  * A generic routine to traverse the packet descriptor in-use list.
 908  */
 909 static pdesc_t *
 910 mmd_getpdesc(multidata_t *mmd, pdesc_t *pd, pdescinfo_t *pdi, uint_t forw,
 911     boolean_t mutex_held)
 912 {
 913         pdesc_t *pd_head;
 914 
 915         ASSERT(pd == NULL || pd->pd_slab->pds_mmd == mmd);
 916         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 917         ASSERT(!mutex_held || MUTEX_HELD(&(mmd->mmd_pd_slab_lock)));
 918 
 919         if (!mutex_held)
 920                 mutex_enter(&(mmd->mmd_pd_slab_lock));
 921         pd_head = Q2PD(&(mmd->mmd_pd_q));
 922 
 923         if (pd == NULL) {
 924                 /*
 925                  * We're called by mmd_get{first,last}pdesc, and so
 926                  * return either the first or last list element.
 927                  */
 928                 pd = forw ? Q2PD(mmd->mmd_pd_q.ql_next) :
 929                     Q2PD(mmd->mmd_pd_q.ql_prev);
 930         } else {
 931                 /*
 932                  * We're called by mmd_get{next,prev}pdesc, and so
 933                  * return either the next or previous list element.
 934                  */
 935                 pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
 936         }
 937 
 938         while (pd != pd_head) {
 939                 /* skip element if it has been removed */
 940                 if (!(pd->pd_flags & PDESC_REM_DEFER))
 941                         break;
 942                 pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
 943         }
 944         if (!mutex_held)
 945                 mutex_exit(&(mmd->mmd_pd_slab_lock));
 946 
 947         /* return NULL if we're back at the beginning */
 948         if (pd == pd_head)
 949                 pd = NULL;
 950 
 951         /* got an entry; copy descriptor info to caller */
 952         if (pd != NULL && pdi != NULL)
 953                 PDI_COPY(&(pd->pd_pdi), pdi);
 954 
 955         ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
 956         return (pd);
 957 
 958 }
 959 
 960 /*
 961  * Return the first packet descriptor in the in-use list.
 962  */
 963 pdesc_t *
 964 mmd_getfirstpdesc(multidata_t *mmd, pdescinfo_t *pdi)
 965 {
 966         return (mmd_getpdesc(mmd, NULL, pdi, 1, B_FALSE));
 967 }
 968 
 969 /*
 970  * Return the last packet descriptor in the in-use list.
 971  */
 972 pdesc_t *
 973 mmd_getlastpdesc(multidata_t *mmd, pdescinfo_t *pdi)
 974 {
 975         return (mmd_getpdesc(mmd, NULL, pdi, 0, B_FALSE));
 976 }
 977 
 978 /*
 979  * Return the next packet descriptor in the in-use list.
 980  */
 981 pdesc_t *
 982 mmd_getnextpdesc(pdesc_t *pd, pdescinfo_t *pdi)
 983 {
 984         return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 1, B_FALSE));
 985 }
 986 
 987 /*
 988  * Return the previous packet descriptor in the in-use list.
 989  */
 990 pdesc_t *
 991 mmd_getprevpdesc(pdesc_t *pd, pdescinfo_t *pdi)
 992 {
 993         return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 0, B_FALSE));
 994 }
 995 
 996 /*
 997  * Check to see if pdi stretches over c_pdi; used to ensure that a packet
 998  * descriptor's header and payload span may not be extended beyond the
 999  * current boundaries.
1000  */
1001 static boolean_t
1002 pdi_in_range(pdescinfo_t *pdi, pdescinfo_t *c_pdi)
1003 {
1004         int i;
1005         struct pld_ary_s *pa = &pdi->pld_ary[0];
1006         struct pld_ary_s *c_pa = &c_pdi->pld_ary[0];
1007 
1008         if (pdi->hdr_base < c_pdi->hdr_base || pdi->hdr_lim > c_pdi->hdr_lim)
1009                 return (B_FALSE);
1010 
1011         /*
1012          * We don't allow the number of span to be reduced, for the sake
1013          * of simplicity.  Instead, we provide PDESC_PLD_SPAN_CLEAR() to
1014          * clear a packet descriptor.  Note that we allow the span count to
1015          * be increased, and the bounds check for the new one happens
1016          * in pbuf_ref_valid.
1017          */
1018         if (pdi->pld_cnt < c_pdi->pld_cnt)
1019                 return (B_FALSE);
1020 
1021         /* compare only those which are currently defined */
1022         for (i = 0; i < c_pdi->pld_cnt; i++, pa++, c_pa++) {
1023                 if (pa->pld_pbuf_idx != c_pa->pld_pbuf_idx ||
1024                     pa->pld_rptr < c_pa->pld_rptr ||
1025                     pa->pld_wptr > c_pa->pld_wptr)
1026                         return (B_FALSE);
1027         }
1028         return (B_TRUE);
1029 }
1030 
1031 /*
1032  * Modify the layout of a packet descriptor.
1033  */
1034 pdesc_t *
1035 mmd_adjpdesc(pdesc_t *pd, pdescinfo_t *pdi)
1036 {
1037         multidata_t *mmd;
1038         pdescinfo_t *c_pdi;
1039 
1040         ASSERT(pd != NULL);
1041         ASSERT(pdi != NULL);
1042         ASSERT(pd->pd_magic == PDESC_MAGIC);
1043 
1044         mmd = pd->pd_slab->pds_mmd;
1045         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1046 
1047         /* entry has been removed */
1048         if (pd->pd_flags & PDESC_REM_DEFER)
1049                 return (NULL);
1050 
1051         /* caller doesn't intend to specify any buffer reference? */
1052         if (!(pdi->flags & PDESC_HAS_REF))
1053                 return (NULL);
1054 
1055         /* do the references refer to invalid memory regions? */
1056         if (!mmd_speed_over_safety &&
1057             (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
1058             ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi))))
1059                 return (NULL);
1060 
1061         /* they're not subsets of current references? */
1062         c_pdi = &(pd->pd_pdi);
1063         if (!pdi_in_range(pdi, c_pdi))
1064                 return (NULL);
1065 
1066         /* copy over the descriptor info from caller */
1067         PDI_COPY(pdi, c_pdi);
1068 
1069         return (pd);
1070 }
1071 
1072 /*
1073  * Copy the contents of a packet descriptor into a new buffer.  If the
1074  * descriptor points to more than one buffer fragments, the contents
1075  * of both fragments will be joined, with the header buffer fragment
1076  * preceding the payload buffer fragment(s).
1077  */
1078 mblk_t *
1079 mmd_transform(pdesc_t *pd)
1080 {
1081         multidata_t *mmd;
1082         pdescinfo_t *pdi;
1083         mblk_t *mp;
1084         int h_size = 0, p_size = 0;
1085         int i, len;
1086 
1087         ASSERT(pd != NULL);
1088         ASSERT(pd->pd_magic == PDESC_MAGIC);
1089 
1090         mmd = pd->pd_slab->pds_mmd;
1091         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1092 
1093         /* entry has been removed */
1094         if (pd->pd_flags & PDESC_REM_DEFER)
1095                 return (NULL);
1096 
1097         mutex_enter(&mmd->mmd_pd_slab_lock);
1098         pdi = &(pd->pd_pdi);
1099         if (pdi->flags & PDESC_HBUF_REF)
1100                 h_size = PDESC_HDRL(pdi);
1101         if (pdi->flags & PDESC_PBUF_REF) {
1102                 for (i = 0; i < pdi->pld_cnt; i++)
1103                         p_size += PDESC_PLD_SPAN_SIZE(pdi, i);
1104         }
1105 
1106         /* allocate space large enough to hold the fragment(s) */
1107         ASSERT(h_size + p_size >= 0);
1108         if ((mp = allocb(h_size + p_size, BPRI_HI)) == NULL) {
1109                 mutex_exit(&mmd->mmd_pd_slab_lock);
1110                 return (NULL);
1111         }
1112 
1113         /* copy over the header fragment */
1114         if ((pdi->flags & PDESC_HBUF_REF) && h_size > 0) {
1115                 bcopy(pdi->hdr_rptr, mp->b_wptr, h_size);
1116                 mp->b_wptr += h_size;
1117         }
1118 
1119         /* copy over the payload fragment */
1120         if ((pdi->flags & PDESC_PBUF_REF) && p_size > 0) {
1121                 for (i = 0; i < pdi->pld_cnt; i++) {
1122                         len = PDESC_PLD_SPAN_SIZE(pdi, i);
1123                         if (len > 0) {
1124                                 bcopy(pdi->pld_ary[i].pld_rptr,
1125                                     mp->b_wptr, len);
1126                                 mp->b_wptr += len;
1127                         }
1128                 }
1129         }
1130 
1131         mutex_exit(&mmd->mmd_pd_slab_lock);
1132         return (mp);
1133 }
1134 
1135 /*
1136  * Return a chain of mblks representing the Multidata packet.
1137  */
1138 mblk_t *
1139 mmd_transform_link(pdesc_t *pd)
1140 {
1141         multidata_t *mmd;
1142         pdescinfo_t *pdi;
1143         mblk_t *nmp = NULL;
1144 
1145         ASSERT(pd != NULL);
1146         ASSERT(pd->pd_magic == PDESC_MAGIC);
1147 
1148         mmd = pd->pd_slab->pds_mmd;
1149         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1150 
1151         /* entry has been removed */
1152         if (pd->pd_flags & PDESC_REM_DEFER)
1153                 return (NULL);
1154 
1155         pdi = &(pd->pd_pdi);
1156 
1157         /* duplicate header buffer */
1158         if ((pdi->flags & PDESC_HBUF_REF)) {
1159                 if ((nmp = dupb(mmd->mmd_hbuf)) == NULL)
1160                         return (NULL);
1161                 nmp->b_rptr = pdi->hdr_rptr;
1162                 nmp->b_wptr = pdi->hdr_wptr;
1163         }
1164 
1165         /* duplicate payload buffer(s) */
1166         if (pdi->flags & PDESC_PBUF_REF) {
1167                 int i;
1168                 mblk_t *mp;
1169                 struct pld_ary_s *pa = &pdi->pld_ary[0];
1170 
1171                 mutex_enter(&mmd->mmd_pd_slab_lock);
1172                 for (i = 0; i < pdi->pld_cnt; i++, pa++) {
1173                         ASSERT(mmd->mmd_pbuf[pa->pld_pbuf_idx] != NULL);
1174 
1175                         /* skip empty ones */
1176                         if (PDESC_PLD_SPAN_SIZE(pdi, i) == 0)
1177                                 continue;
1178 
1179                         mp = dupb(mmd->mmd_pbuf[pa->pld_pbuf_idx]);
1180                         if (mp == NULL) {
1181                                 if (nmp != NULL)
1182                                         freemsg(nmp);
1183                                 mutex_exit(&mmd->mmd_pd_slab_lock);
1184                                 return (NULL);
1185                         }
1186                         mp->b_rptr = pa->pld_rptr;
1187                         mp->b_wptr = pa->pld_wptr;
1188                         if (nmp == NULL)
1189                                 nmp = mp;
1190                         else
1191                                 linkb(nmp, mp);
1192                 }
1193                 mutex_exit(&mmd->mmd_pd_slab_lock);
1194         }
1195 
1196         return (nmp);
1197 }
1198 
1199 /*
1200  * Return duplicate message block(s) of the associated buffer(s).
1201  */
1202 int
1203 mmd_dupbufs(multidata_t *mmd, mblk_t **hmp, mblk_t **pmp)
1204 {
1205         ASSERT(mmd != NULL);
1206         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1207 
1208         if (hmp != NULL) {
1209                 *hmp = NULL;
1210                 if (mmd->mmd_hbuf != NULL &&
1211                     (*hmp = dupb(mmd->mmd_hbuf)) == NULL)
1212                         return (-1);
1213         }
1214 
1215         if (pmp != NULL) {
1216                 int i;
1217                 mblk_t *mp;
1218 
1219                 mutex_enter(&mmd->mmd_pd_slab_lock);
1220                 *pmp = NULL;
1221                 for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1222                         ASSERT(mmd->mmd_pbuf[i] != NULL);
1223                         mp = dupb(mmd->mmd_pbuf[i]);
1224                         if (mp == NULL) {
1225                                 if (hmp != NULL && *hmp != NULL)
1226                                         freeb(*hmp);
1227                                 if (*pmp != NULL)
1228                                         freemsg(*pmp);
1229                                 mutex_exit(&mmd->mmd_pd_slab_lock);
1230                                 return (-1);
1231                         }
1232                         if (*pmp == NULL)
1233                                 *pmp = mp;
1234                         else
1235                                 linkb(*pmp, mp);
1236                 }
1237                 mutex_exit(&mmd->mmd_pd_slab_lock);
1238         }
1239 
1240         return (0);
1241 }
1242 
1243 /*
1244  * Return the layout of a packet descriptor.
1245  */
1246 int
1247 mmd_getpdescinfo(pdesc_t *pd, pdescinfo_t *pdi)
1248 {
1249         ASSERT(pd != NULL);
1250         ASSERT(pd->pd_magic == PDESC_MAGIC);
1251         ASSERT(pd->pd_slab != NULL);
1252         ASSERT(pd->pd_slab->pds_mmd->mmd_magic == MULTIDATA_MAGIC);
1253         ASSERT(pdi != NULL);
1254 
1255         /* entry has been removed */
1256         if (pd->pd_flags & PDESC_REM_DEFER)
1257                 return (-1);
1258 
1259         /* copy descriptor info to caller */
1260         PDI_COPY(&(pd->pd_pdi), pdi);
1261 
1262         return (0);
1263 }
1264 
1265 /*
1266  * Add a global or local attribute to a Multidata.  Global attribute
1267  * association is specified by a NULL packet descriptor.
1268  */
1269 pattr_t *
1270 mmd_addpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai,
1271     boolean_t persistent, int kmflags)
1272 {
1273         patbkt_t **tbl_p;
1274         patbkt_t *tbl, *o_tbl;
1275         patbkt_t *bkt;
1276         pattr_t *pa;
1277         uint_t size;
1278 
1279         ASSERT(mmd != NULL);
1280         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1281         ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
1282         ASSERT(pai != NULL);
1283 
1284         /* pointer to the attribute hash table (local or global) */
1285         tbl_p = pd != NULL ? &(pd->pd_pattbl) : &(mmd->mmd_pattbl);
1286 
1287         /*
1288          * See if the hash table has not yet been created; if so,
1289          * we create the table and store its address atomically.
1290          */
1291         if ((tbl = *tbl_p) == NULL) {
1292                 tbl = kmem_cache_alloc(pattbl_cache, kmflags);
1293                 if (tbl == NULL)
1294                         return (NULL);
1295 
1296                 /* if someone got there first, use his table instead */
1297                 if ((o_tbl = casptr(tbl_p, NULL, tbl)) != NULL) {
1298                         kmem_cache_free(pattbl_cache, tbl);
1299                         tbl = o_tbl;
1300                 }
1301         }
1302 
1303         ASSERT(tbl->pbkt_tbl_sz > 0);
1304         bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1305 
1306         /* attribute of the same type already exists? */
1307         if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL)
1308                 return (NULL);
1309 
1310         size = sizeof (*pa) + pai->len;
1311         if ((pa = kmem_zalloc(size, kmflags)) == NULL)
1312                 return (NULL);
1313 
1314         pa->pat_magic = PATTR_MAGIC;
1315         pa->pat_lock = &(bkt->pbkt_lock);
1316         pa->pat_mmd = mmd;
1317         pa->pat_buflen = size;
1318         pa->pat_type = pai->type;
1319         pai->buf = pai->len > 0 ? ((uchar_t *)(pa + 1)) : NULL;
1320 
1321         if (persistent)
1322                 pa->pat_flags = PATTR_PERSIST;
1323 
1324         /* insert attribute at end of hash chain */
1325         mutex_enter(&(bkt->pbkt_lock));
1326         insque(&(pa->pat_next), bkt->pbkt_pattr_q.ql_prev);
1327         mutex_exit(&(bkt->pbkt_lock));
1328 
1329         return (pa);
1330 }
1331 
1332 /*
1333  * Attribute hash table kmem cache constructor routine.
1334  */
1335 /* ARGSUSED */
1336 static int
1337 pattbl_constructor(void *buf, void *cdrarg, int kmflags)
1338 {
1339         patbkt_t *bkt;
1340         uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1341         uint_t i;
1342 
1343         ASSERT(tbl_sz > 0);  /* table size can't be zero */
1344 
1345         for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1346                 mutex_init(&(bkt->pbkt_lock), NULL, MUTEX_DRIVER, NULL);
1347                 QL_INIT(&(bkt->pbkt_pattr_q));
1348 
1349                 /* first bucket contains the table size */
1350                 bkt->pbkt_tbl_sz = i == 0 ? tbl_sz : 0;
1351         }
1352         return (0);
1353 }
1354 
1355 /*
1356  * Attribute hash table kmem cache destructor routine.
1357  */
1358 /* ARGSUSED */
1359 static void
1360 pattbl_destructor(void *buf, void *cdrarg)
1361 {
1362         patbkt_t *bkt;
1363         uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1364         uint_t i;
1365 
1366         ASSERT(tbl_sz > 0);  /* table size can't be zero */
1367 
1368         for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1369                 mutex_destroy(&(bkt->pbkt_lock));
1370                 ASSERT(bkt->pbkt_pattr_q.ql_next == &(bkt->pbkt_pattr_q));
1371                 ASSERT(i > 0 || bkt->pbkt_tbl_sz == tbl_sz);
1372         }
1373 }
1374 
1375 /*
1376  * Destroy an attribute hash table, called by mmd_rempdesc or during free.
1377  */
1378 static void
1379 mmd_destroy_pattbl(patbkt_t **tbl)
1380 {
1381         patbkt_t *bkt;
1382         pattr_t *pa, *pa_next;
1383         uint_t i, tbl_sz;
1384 
1385         ASSERT(tbl != NULL);
1386         bkt = *tbl;
1387         tbl_sz = bkt->pbkt_tbl_sz;
1388 
1389         /* make sure caller passes in the first bucket */
1390         ASSERT(tbl_sz > 0);
1391 
1392         /* destroy the contents of each bucket */
1393         for (i = 0; i < tbl_sz; i++, bkt++) {
1394                 /* we ought to be exclusive at this point */
1395                 ASSERT(MUTEX_NOT_HELD(&(bkt->pbkt_lock)));
1396 
1397                 pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1398                 while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1399                         ASSERT(pa->pat_magic == PATTR_MAGIC);
1400                         pa_next = Q2PATTR(pa->pat_next);
1401                         remque(&(pa->pat_next));
1402                         kmem_free(pa, pa->pat_buflen);
1403                         pa = pa_next;
1404                 }
1405         }
1406 
1407         kmem_cache_free(pattbl_cache, *tbl);
1408         *tbl = NULL;
1409 
1410         /* commit all previous stores */
1411         membar_producer();
1412 }
1413 
1414 /*
1415  * Copy the contents of an attribute hash table, called by mmd_copy.
1416  */
1417 static int
1418 mmd_copy_pattbl(patbkt_t *src_tbl, multidata_t *n_mmd, pdesc_t *n_pd,
1419     int kmflags)
1420 {
1421         patbkt_t *bkt;
1422         pattr_t *pa;
1423         pattrinfo_t pai;
1424         uint_t i, tbl_sz;
1425 
1426         ASSERT(src_tbl != NULL);
1427         bkt = src_tbl;
1428         tbl_sz = bkt->pbkt_tbl_sz;
1429 
1430         /* make sure caller passes in the first bucket */
1431         ASSERT(tbl_sz > 0);
1432 
1433         for (i = 0; i < tbl_sz; i++, bkt++) {
1434                 mutex_enter(&(bkt->pbkt_lock));
1435                 pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1436                 while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1437                         pattr_t *pa_next = Q2PATTR(pa->pat_next);
1438 
1439                         /* skip if it's removed */
1440                         if (pa->pat_flags & PATTR_REM_DEFER) {
1441                                 pa = pa_next;
1442                                 continue;
1443                         }
1444 
1445                         pai.type = pa->pat_type;
1446                         pai.len = pa->pat_buflen - sizeof (*pa);
1447                         if (mmd_addpattr(n_mmd, n_pd, &pai, (pa->pat_flags &
1448                             PATTR_PERSIST) != 0, kmflags) == NULL) {
1449                                 mutex_exit(&(bkt->pbkt_lock));
1450                                 return (-1);
1451                         }
1452 
1453                         /* copy over the contents */
1454                         if (pai.buf != NULL)
1455                                 bcopy(pa + 1, pai.buf, pai.len);
1456 
1457                         pa = pa_next;
1458                 }
1459                 mutex_exit(&(bkt->pbkt_lock));
1460         }
1461 
1462         return (0);
1463 }
1464 
1465 /*
1466  * Search for an attribute type within an attribute hash bucket.
1467  */
1468 static pattr_t *
1469 mmd_find_pattr(patbkt_t *bkt, uint_t type)
1470 {
1471         pattr_t *pa_head, *pa;
1472 
1473         mutex_enter(&(bkt->pbkt_lock));
1474         pa_head = Q2PATTR(&(bkt->pbkt_pattr_q));
1475         pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1476 
1477         while (pa != pa_head) {
1478                 ASSERT(pa->pat_magic == PATTR_MAGIC);
1479 
1480                 /* return a match; we treat removed entry as non-existent */
1481                 if (pa->pat_type == type && !(pa->pat_flags & PATTR_REM_DEFER))
1482                         break;
1483                 pa = Q2PATTR(pa->pat_next);
1484         }
1485         mutex_exit(&(bkt->pbkt_lock));
1486 
1487         return (pa == pa_head ? NULL : pa);
1488 }
1489 
1490 /*
1491  * Remove an attribute from a Multidata.
1492  */
1493 void
1494 mmd_rempattr(pattr_t *pa)
1495 {
1496         kmutex_t *pat_lock = pa->pat_lock;
1497 
1498         ASSERT(pa->pat_magic == PATTR_MAGIC);
1499 
1500         /* ignore if attribute was marked as persistent */
1501         if ((pa->pat_flags & PATTR_PERSIST) != 0)
1502                 return;
1503 
1504         mutex_enter(pat_lock);
1505         /*
1506          * We can't deallocate the associated resources if the Multidata
1507          * is shared with other threads, because it's possible that the
1508          * attribute handle value is held by those threads.  That's why
1509          * we simply mark the entry as "removed".  If there are no other
1510          * threads, then we free the attribute.
1511          */
1512         if (pa->pat_mmd->mmd_dp->db_ref > 1) {
1513                 pa->pat_flags |= PATTR_REM_DEFER;
1514         } else {
1515                 remque(&(pa->pat_next));
1516                 kmem_free(pa, pa->pat_buflen);
1517         }
1518         mutex_exit(pat_lock);
1519 }
1520 
1521 /*
1522  * Find an attribute (according to its type) and return its handle.
1523  */
1524 pattr_t *
1525 mmd_getpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai)
1526 {
1527         patbkt_t *tbl, *bkt;
1528         pattr_t *pa;
1529 
1530         ASSERT(mmd != NULL);
1531         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1532         ASSERT(pai != NULL);
1533 
1534         /* get the right attribute hash table (local or global) */
1535         tbl = pd != NULL ? pd->pd_pattbl : mmd->mmd_pattbl;
1536 
1537         /* attribute hash table doesn't exist? */
1538         if (tbl == NULL)
1539                 return (NULL);
1540 
1541         ASSERT(tbl->pbkt_tbl_sz > 0);
1542         bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1543 
1544         if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL) {
1545                 ASSERT(pa->pat_buflen >= sizeof (*pa));
1546                 pai->len = pa->pat_buflen - sizeof (*pa);
1547                 pai->buf = pai->len > 0 ?
1548                     (uchar_t *)pa + sizeof (pattr_t) : NULL;
1549         }
1550         ASSERT(pa == NULL || pa->pat_magic == PATTR_MAGIC);
1551         return (pa);
1552 }
1553 
1554 /*
1555  * Return total size of buffers and total size of areas referenced
1556  * by all in-use (unremoved) packet descriptors.
1557  */
1558 void
1559 mmd_getsize(multidata_t *mmd, uint_t *ptotal, uint_t *pinuse)
1560 {
1561         pdesc_t *pd;
1562         pdescinfo_t *pdi;
1563         int i;
1564 
1565         ASSERT(mmd != NULL);
1566         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1567 
1568         mutex_enter(&mmd->mmd_pd_slab_lock);
1569         if (ptotal != NULL) {
1570                 *ptotal = 0;
1571 
1572                 if (mmd->mmd_hbuf != NULL)
1573                         *ptotal += MBLKL(mmd->mmd_hbuf);
1574 
1575                 for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1576                         ASSERT(mmd->mmd_pbuf[i] != NULL);
1577                         *ptotal += MBLKL(mmd->mmd_pbuf[i]);
1578                 }
1579         }
1580         if (pinuse != NULL) {
1581                 *pinuse = 0;
1582 
1583                 /* first pdesc */
1584                 pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);
1585                 while (pd != NULL) {
1586                         pdi = &pd->pd_pdi;
1587 
1588                         /* next pdesc */
1589                         pd = mmd_getpdesc(mmd, pd, NULL, 1, B_TRUE);
1590 
1591                         /* skip over removed descriptor */
1592                         if (pdi->flags & PDESC_REM_DEFER)
1593                                 continue;
1594 
1595                         if (pdi->flags & PDESC_HBUF_REF)
1596                                 *pinuse += PDESC_HDRL(pdi);
1597 
1598                         if (pdi->flags & PDESC_PBUF_REF) {
1599                                 for (i = 0; i < pdi->pld_cnt; i++)
1600                                         *pinuse += PDESC_PLDL(pdi, i);
1601                         }
1602                 }
1603         }
1604         mutex_exit(&mmd->mmd_pd_slab_lock);
1605 }