1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Multidata, as described in the following papers:
  29  *
  30  * Adi Masputra,
  31  * Multidata V.2: VA-Disjoint Packet Extents Framework Interface
  32  * Design Specification.  August 2004.
  33  * Available as http://sac.sfbay/PSARC/2004/594/materials/mmd2.pdf.
  34  *
  35  * Adi Masputra,
  36  * Multidata Interface Design Specification.  Sep 2002.
  37  * Available as http://sac.sfbay/PSARC/2002/276/materials/mmd.pdf.
  38  *
  39  * Adi Masputra, Frank DiMambro, Kacheong Poon,
  40  * An Efficient Networking Transmit Mechanism for Solaris:
  41  * Multidata Transmit (MDT).  May 2002.
  42  * Available as http://sac.sfbay/PSARC/2002/276/materials/mdt.pdf.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/stream.h>
  47 #include <sys/dlpi.h>
  48 #include <sys/stropts.h>
  49 #include <sys/strsun.h>
  50 #include <sys/strlog.h>
  51 #include <sys/strsubr.h>
  52 #include <sys/sysmacros.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/debug.h>
  55 #include <sys/kmem.h>
  56 #include <sys/atomic.h>
  57 
  58 #include <sys/multidata.h>
  59 #include <sys/multidata_impl.h>
  60 
/*
 * kmem cache constructor/destructor callbacks; these are registered
 * with kmem_cache_create() in mmd_init().
 */
static int mmd_constructor(void *, void *, int);
static void mmd_destructor(void *, void *);
static int pdslab_constructor(void *, void *, int);
static void pdslab_destructor(void *, void *);
static int pattbl_constructor(void *, void *, int);
static void pattbl_destructor(void *, void *);
/* free callback installed into each metadata buffer by mmd_constructor() */
static void mmd_esballoc_free(caddr_t);
static int mmd_copy_pattbl(patbkt_t *, multidata_t *, pdesc_t *, int);

/*
 * File-local helpers; each one is additionally named in a
 * "#pragma inline" directive.
 */
static boolean_t pbuf_ref_valid(multidata_t *, pdescinfo_t *);
#pragma inline(pbuf_ref_valid)

static boolean_t pdi_in_range(pdescinfo_t *, pdescinfo_t *);
#pragma inline(pdi_in_range)

static pdesc_t *mmd_addpdesc_int(multidata_t *, pdescinfo_t *, int *, int);
#pragma inline(mmd_addpdesc_int)

static void mmd_destroy_pattbl(patbkt_t **);
#pragma inline(mmd_destroy_pattbl)

static pattr_t *mmd_find_pattr(patbkt_t *, uint_t);
#pragma inline(mmd_find_pattr)

static pdesc_t *mmd_destroy_pdesc(multidata_t *, pdesc_t *);
#pragma inline(mmd_destroy_pdesc)

static pdesc_t *mmd_getpdesc(multidata_t *, pdesc_t *, pdescinfo_t *, uint_t,
    boolean_t);
#pragma inline(mmd_getpdesc)

/*
 * The three kmem caches created by mmd_init():
 *   mmd_cache      metadata buffers of MMD_CACHE_SIZE bytes
 *   pd_slab_cache  descriptor slabs of PDESC_SLAB_SIZE(pdslab_sz) bytes
 *   pattbl_cache   attribute tables of pattbl_sz patbkt_t buckets
 */
static struct kmem_cache *mmd_cache;
static struct kmem_cache *pd_slab_cache;
static struct kmem_cache *pattbl_cache;
  95 
  96 int mmd_debug = 1;
  97 #define MMD_DEBUG(s)    if (mmd_debug > 0) cmn_err s
  98 
/*
 * Set this to true to bypass the pdesc bounds checking that
 * mmd_addpdesc() performs before adding a descriptor.
 */
boolean_t mmd_speed_over_safety = B_FALSE;

/*
 * Patchable kmem_cache flags; each one is passed as the last argument
 * of the corresponding kmem_cache_create() call in mmd_init().
 */
int mmd_kmem_flags = 0;
int pdslab_kmem_flags = 0;
int pattbl_kmem_flags = 0;

/*
 * Alignment (in bytes) of our kmem caches.
 */
#define MULTIDATA_CACHE_ALIGN   64

/*
 * Default number of packet descriptors per descriptor slab.  Making
 * this too small will trigger more descriptor slab allocation; making
 * it too large will create too many unclaimed descriptors.
 * mmd_init() raises a value of 0 to 1.
 */
#define PDSLAB_SZ       15
uint_t pdslab_sz = PDSLAB_SZ;

/*
 * Default attribute hash table size.  It's okay to set this to a small
 * value (even to 1) because there aren't that many attributes currently
 * defined, and because we assume there won't be many attributes associated
 * with a Multidata at a given time.  Increasing the size will reduce
 * attribute search time (given a large number of attributes in a Multidata),
 * and decreasing it will reduce the memory footprints and the overhead
 * associated with managing the table.
 * mmd_init() raises a value of 0 to 1.
 */
#define PATTBL_SZ       1
uint_t pattbl_sz = PATTBL_SZ;

/*
 * Attribute hash key: reduces x modulo the table size sz.  sz must be
 * non-zero.
 */
#define PATTBL_HASH(x, sz)      ((x) % (sz))
 140 
/*
 * Structure that precedes each Multidata metadata.  A metadata buffer
 * is laid out as a struct mmd_buf_info immediately followed by the
 * multidata_t itself; mmd_alloc() hands the address of frp to
 * desballoc().
 */
struct mmd_buf_info {
        frtn_t  frp;            /* free routine */
        uint_t  buf_len;        /* length of kmem buffer */
};

/*
 * The size of each metadata buffer: the buffer info header plus the
 * Multidata that follows it.
 */
#define MMD_CACHE_SIZE  \
        (sizeof (struct mmd_buf_info) + sizeof (multidata_t))
 154 
 155 /*
 156  * Called during startup in order to create the Multidata kmem caches.
 157  */
 158 void
 159 mmd_init(void)
 160 {
 161         pdslab_sz = MAX(1, pdslab_sz);  /* at least 1 descriptor */
 162         pattbl_sz = MAX(1, pattbl_sz);  /* at least 1 bucket */
 163 
 164         mmd_cache = kmem_cache_create("multidata", MMD_CACHE_SIZE,
 165             MULTIDATA_CACHE_ALIGN, mmd_constructor, mmd_destructor,
 166             NULL, NULL, NULL, mmd_kmem_flags);
 167 
 168         pd_slab_cache = kmem_cache_create("multidata_pdslab",
 169             PDESC_SLAB_SIZE(pdslab_sz), MULTIDATA_CACHE_ALIGN,
 170             pdslab_constructor, pdslab_destructor, NULL,
 171             (void *)(uintptr_t)pdslab_sz, NULL, pdslab_kmem_flags);
 172 
 173         pattbl_cache = kmem_cache_create("multidata_pattbl",
 174             sizeof (patbkt_t) * pattbl_sz, MULTIDATA_CACHE_ALIGN,
 175             pattbl_constructor, pattbl_destructor, NULL,
 176             (void *)(uintptr_t)pattbl_sz, NULL, pattbl_kmem_flags);
 177 }
 178 
 179 /*
 180  * Create a Multidata message block.
 181  */
 182 multidata_t *
 183 mmd_alloc(mblk_t *hdr_mp, mblk_t **mmd_mp, int kmflags)
 184 {
 185         uchar_t *buf;
 186         multidata_t *mmd;
 187         uint_t mmd_mplen;
 188         struct mmd_buf_info *buf_info;
 189 
 190         ASSERT(hdr_mp != NULL);
 191         ASSERT(mmd_mp != NULL);
 192 
 193         /*
 194          * Caller should never pass in a chain of mblks since we
 195          * only care about the first one, hence the assertions.
 196          */
 197         ASSERT(hdr_mp->b_cont == NULL);
 198 
 199         if ((buf = kmem_cache_alloc(mmd_cache, kmflags)) == NULL)
 200                 return (NULL);
 201 
 202         buf_info = (struct mmd_buf_info *)buf;
 203         buf_info->frp.free_arg = (caddr_t)buf;
 204 
 205         mmd = (multidata_t *)(buf_info + 1);
 206         mmd_mplen = sizeof (*mmd);
 207 
 208         if ((*mmd_mp = desballoc((uchar_t *)mmd, mmd_mplen, BPRI_HI,
 209             &(buf_info->frp))) == NULL) {
 210                 kmem_cache_free(mmd_cache, buf);
 211                 return (NULL);
 212         }
 213 
 214         DB_TYPE(*mmd_mp) = M_MULTIDATA;
 215         (*mmd_mp)->b_wptr += mmd_mplen;
 216         mmd->mmd_dp = (*mmd_mp)->b_datap;
 217         mmd->mmd_hbuf = hdr_mp;
 218 
 219         return (mmd);
 220 }
 221 
 222 /*
 223  * Associate additional payload buffer to the Multidata.
 224  */
 225 int
 226 mmd_addpldbuf(multidata_t *mmd, mblk_t *pld_mp)
 227 {
 228         int i;
 229 
 230         ASSERT(mmd != NULL);
 231         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 232         ASSERT(pld_mp != NULL);
 233 
 234         mutex_enter(&mmd->mmd_pd_slab_lock);
 235         for (i = 0; i < MULTIDATA_MAX_PBUFS &&
 236             mmd->mmd_pbuf_cnt < MULTIDATA_MAX_PBUFS; i++) {
 237                 if (mmd->mmd_pbuf[i] == pld_mp) {
 238                         /* duplicate entry */
 239                         MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding "
 240                             "pld 0x%p to mmd 0x%p since it has been "
 241                             "previously added into slot %d (total %d)\n",
 242                             (void *)pld_mp, (void *)mmd, i, mmd->mmd_pbuf_cnt));
 243                         mutex_exit(&mmd->mmd_pd_slab_lock);
 244                         return (-1);
 245                 } else if (mmd->mmd_pbuf[i] == NULL) {
 246                         mmd->mmd_pbuf[i] = pld_mp;
 247                         mmd->mmd_pbuf_cnt++;
 248                         mutex_exit(&mmd->mmd_pd_slab_lock);
 249                         return (i);
 250                 }
 251         }
 252 
 253         /* all slots are taken */
 254         MMD_DEBUG((CE_WARN, "mmd_addpldbuf: error adding pld 0x%p to mmd 0x%p "
 255             "since no slot space is left (total %d max %d)\n", (void *)pld_mp,
 256             (void *)mmd, mmd->mmd_pbuf_cnt, MULTIDATA_MAX_PBUFS));
 257         mutex_exit(&mmd->mmd_pd_slab_lock);
 258 
 259         return (-1);
 260 }
 261 
 262 /*
 263  * Multidata metadata kmem cache constructor routine.
 264  */
 265 /* ARGSUSED */
 266 static int
 267 mmd_constructor(void *buf, void *cdrarg, int kmflags)
 268 {
 269         struct mmd_buf_info *buf_info;
 270         multidata_t *mmd;
 271 
 272         bzero((void *)buf, MMD_CACHE_SIZE);
 273 
 274         buf_info = (struct mmd_buf_info *)buf;
 275         buf_info->frp.free_func = mmd_esballoc_free;
 276         buf_info->buf_len = MMD_CACHE_SIZE;
 277 
 278         mmd = (multidata_t *)(buf_info + 1);
 279         mmd->mmd_magic = MULTIDATA_MAGIC;
 280 
 281         mutex_init(&(mmd->mmd_pd_slab_lock), NULL, MUTEX_DRIVER, NULL);
 282         QL_INIT(&(mmd->mmd_pd_slab_q));
 283         QL_INIT(&(mmd->mmd_pd_q));
 284 
 285         return (0);
 286 }
 287 
 288 /*
 289  * Multidata metadata kmem cache destructor routine.
 290  */
 291 /* ARGSUSED */
 292 static void
 293 mmd_destructor(void *buf, void *cdrarg)
 294 {
 295         multidata_t *mmd;
 296 #ifdef DEBUG
 297         int i;
 298 #endif
 299 
 300         mmd = (multidata_t *)((uchar_t *)buf + sizeof (struct mmd_buf_info));
 301 
 302         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 303         ASSERT(mmd->mmd_dp == NULL);
 304         ASSERT(mmd->mmd_hbuf == NULL);
 305         ASSERT(mmd->mmd_pbuf_cnt == 0);
 306 #ifdef DEBUG
 307         for (i = 0; i < MULTIDATA_MAX_PBUFS; i++)
 308                 ASSERT(mmd->mmd_pbuf[i] == NULL);
 309 #endif
 310         ASSERT(mmd->mmd_pattbl == NULL);
 311 
 312         mutex_destroy(&(mmd->mmd_pd_slab_lock));
 313         ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
 314         ASSERT(mmd->mmd_slab_cnt == 0);
 315         ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
 316         ASSERT(mmd->mmd_pd_cnt == 0);
 317         ASSERT(mmd->mmd_hbuf_ref == 0);
 318         ASSERT(mmd->mmd_pbuf_ref == 0);
 319 }
 320 
 321 /*
 322  * Multidata message block free callback routine.
 323  */
 324 static void
 325 mmd_esballoc_free(caddr_t buf)
 326 {
 327         multidata_t *mmd;
 328         pdesc_t *pd;
 329         pdesc_slab_t *slab;
 330         int i;
 331 
 332         ASSERT(buf != NULL);
 333         ASSERT(((struct mmd_buf_info *)buf)->buf_len == MMD_CACHE_SIZE);
 334 
 335         mmd = (multidata_t *)(buf + sizeof (struct mmd_buf_info));
 336         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 337 
 338         ASSERT(mmd->mmd_dp != NULL);
 339         ASSERT(mmd->mmd_dp->db_ref == 1);
 340 
 341         /* remove all packet descriptors and private attributes */
 342         pd = Q2PD(mmd->mmd_pd_q.ql_next);
 343         while (pd != Q2PD(&(mmd->mmd_pd_q)))
 344                 pd = mmd_destroy_pdesc(mmd, pd);
 345 
 346         ASSERT(mmd->mmd_pd_q.ql_next == &(mmd->mmd_pd_q));
 347         ASSERT(mmd->mmd_pd_cnt == 0);
 348         ASSERT(mmd->mmd_hbuf_ref == 0);
 349         ASSERT(mmd->mmd_pbuf_ref == 0);
 350 
 351         /* remove all global attributes */
 352         if (mmd->mmd_pattbl != NULL)
 353                 mmd_destroy_pattbl(&(mmd->mmd_pattbl));
 354 
 355         /* remove all descriptor slabs */
 356         slab = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_next);
 357         while (slab != Q2PDSLAB(&(mmd->mmd_pd_slab_q))) {
 358                 pdesc_slab_t *slab_next = Q2PDSLAB(slab->pds_next);
 359 
 360                 remque(&(slab->pds_next));
 361                 slab->pds_next = NULL;
 362                 slab->pds_prev = NULL;
 363                 slab->pds_mmd = NULL;
 364                 slab->pds_used = 0;
 365                 kmem_cache_free(pd_slab_cache, slab);
 366 
 367                 ASSERT(mmd->mmd_slab_cnt > 0);
 368                 mmd->mmd_slab_cnt--;
 369                 slab = slab_next;
 370         }
 371         ASSERT(mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q));
 372         ASSERT(mmd->mmd_slab_cnt == 0);
 373 
 374         mmd->mmd_dp = NULL;
 375 
 376         /* finally, free all associated message blocks */
 377         if (mmd->mmd_hbuf != NULL) {
 378                 freeb(mmd->mmd_hbuf);
 379                 mmd->mmd_hbuf = NULL;
 380         }
 381 
 382         for (i = 0; i < MULTIDATA_MAX_PBUFS; i++) {
 383                 if (mmd->mmd_pbuf[i] != NULL) {
 384                         freeb(mmd->mmd_pbuf[i]);
 385                         mmd->mmd_pbuf[i] = NULL;
 386                         ASSERT(mmd->mmd_pbuf_cnt > 0);
 387                         mmd->mmd_pbuf_cnt--;
 388                 }
 389         }
 390 
 391         ASSERT(mmd->mmd_pbuf_cnt == 0);
 392         ASSERT(MUTEX_NOT_HELD(&(mmd->mmd_pd_slab_lock)));
 393         kmem_cache_free(mmd_cache, buf);
 394 }
 395 
 396 /*
 397  * Multidata message block copy routine, called by copyb() when it
 398  * encounters a M_MULTIDATA data block type.  This routine should
 399  * not be called by anyone other than copyb(), since it may go away
 400  * (read: become static to this module) once some sort of copy callback
 401  * routine is made available.
 402  */
 403 mblk_t *
 404 mmd_copy(mblk_t *bp, int kmflags)
 405 {
 406         multidata_t *mmd, *n_mmd;
 407         mblk_t *n_hbuf = NULL, *n_pbuf[MULTIDATA_MAX_PBUFS];
 408         mblk_t **pmp_last = &n_pbuf[MULTIDATA_MAX_PBUFS - 1];
 409         mblk_t **pmp;
 410         mblk_t *n_bp = NULL;
 411         pdesc_t *pd;
 412         uint_t n_pbuf_cnt = 0;
 413         int idx, i;
 414 
 415 #define FREE_PBUFS() {                                  \
 416         for (pmp = &n_pbuf[0]; pmp <= pmp_last; pmp++)   \
 417                 if (*pmp != NULL) freeb(*pmp);          \
 418 }
 419 
 420 #define REL_OFF(p, base, n_base)                        \
 421         ((uchar_t *)(n_base) + ((uchar_t *)(p) - (uchar_t *)base))
 422 
 423         ASSERT(bp != NULL && DB_TYPE(bp) == M_MULTIDATA);
 424         mmd = mmd_getmultidata(bp);
 425 
 426         /* copy the header buffer */
 427         if (mmd->mmd_hbuf != NULL && (n_hbuf = copyb(mmd->mmd_hbuf)) == NULL)
 428                 return (NULL);
 429 
 430         /* copy the payload buffer(s) */
 431         mutex_enter(&mmd->mmd_pd_slab_lock);
 432         bzero((void *)&n_pbuf[0], sizeof (mblk_t *) * MULTIDATA_MAX_PBUFS);
 433         n_pbuf_cnt = mmd->mmd_pbuf_cnt;
 434         for (i = 0; i < n_pbuf_cnt; i++) {
 435                 ASSERT(mmd->mmd_pbuf[i] != NULL);
 436                 n_pbuf[i] = copyb(mmd->mmd_pbuf[i]);
 437                 if (n_pbuf[i] == NULL) {
 438                         FREE_PBUFS();
 439                         mutex_exit(&mmd->mmd_pd_slab_lock);
 440                         return (NULL);
 441                 }
 442         }
 443 
 444         /* allocate new Multidata */
 445         n_mmd = mmd_alloc(n_hbuf, &n_bp, kmflags);
 446         if (n_mmd == NULL) {
 447                 if (n_hbuf != NULL)
 448                         freeb(n_hbuf);
 449                 if (n_pbuf_cnt != 0)
 450                         FREE_PBUFS();
 451                 mutex_exit(&mmd->mmd_pd_slab_lock);
 452                 return (NULL);
 453         }
 454 
 455         /*
 456          * Add payload buffer(s); upon success, leave n_pbuf array
 457          * alone, as the newly-created Multidata had already contained
 458          * the mblk pointers stored in the array.  These will be freed
 459          * along with the Multidata itself.
 460          */
 461         for (i = 0, pmp = &n_pbuf[0]; i < n_pbuf_cnt; i++, pmp++) {
 462                 idx = mmd_addpldbuf(n_mmd, *pmp);
 463                 if (idx < 0) {
 464                         FREE_PBUFS();
 465                         freeb(n_bp);
 466                         mutex_exit(&mmd->mmd_pd_slab_lock);
 467                         return (NULL);
 468                 }
 469         }
 470 
 471         /* copy over global attributes */
 472         if (mmd->mmd_pattbl != NULL &&
 473             mmd_copy_pattbl(mmd->mmd_pattbl, n_mmd, NULL, kmflags) < 0) {
 474                 freeb(n_bp);
 475                 mutex_exit(&mmd->mmd_pd_slab_lock);
 476                 return (NULL);
 477         }
 478 
 479         /* copy over packet descriptors and their atttributes */
 480         pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);  /* first pdesc */
 481         while (pd != NULL) {
 482                 pdesc_t *n_pd;
 483                 pdescinfo_t *pdi, n_pdi;
 484                 uchar_t *n_base, *base;
 485                 pdesc_t *pd_next;
 486 
 487                 /* next pdesc */
 488                 pd_next = mmd_getpdesc(pd->pd_slab->pds_mmd, pd, NULL,
 489                     1, B_TRUE);
 490 
 491                 /* skip if already removed */
 492                 if (pd->pd_flags & PDESC_REM_DEFER) {
 493                         pd = pd_next;
 494                         continue;
 495                 }
 496 
 497                 pdi = &(pd->pd_pdi);
 498                 bzero(&n_pdi, sizeof (n_pdi));
 499 
 500                 /*
 501                  * Calculate new descriptor values based on the offset of
 502                  * each pointer relative to the associated buffer(s).
 503                  */
 504                 ASSERT(pdi->flags & PDESC_HAS_REF);
 505                 if (pdi->flags & PDESC_HBUF_REF) {
 506                         n_base = n_mmd->mmd_hbuf->b_rptr;
 507                         base = mmd->mmd_hbuf->b_rptr;
 508 
 509                         n_pdi.flags |= PDESC_HBUF_REF;
 510                         n_pdi.hdr_base = REL_OFF(pdi->hdr_base, base, n_base);
 511                         n_pdi.hdr_rptr = REL_OFF(pdi->hdr_rptr, base, n_base);
 512                         n_pdi.hdr_wptr = REL_OFF(pdi->hdr_wptr, base, n_base);
 513                         n_pdi.hdr_lim = REL_OFF(pdi->hdr_lim, base, n_base);
 514                 }
 515 
 516                 if (pdi->flags & PDESC_PBUF_REF) {
 517                         n_pdi.flags |= PDESC_PBUF_REF;
 518                         n_pdi.pld_cnt = pdi->pld_cnt;
 519 
 520                         for (i = 0; i < pdi->pld_cnt; i++) {
 521                                 idx = pdi->pld_ary[i].pld_pbuf_idx;
 522                                 ASSERT(idx < MULTIDATA_MAX_PBUFS);
 523                                 ASSERT(n_mmd->mmd_pbuf[idx] != NULL);
 524                                 ASSERT(mmd->mmd_pbuf[idx] != NULL);
 525 
 526                                 n_base = n_mmd->mmd_pbuf[idx]->b_rptr;
 527                                 base = mmd->mmd_pbuf[idx]->b_rptr;
 528 
 529                                 n_pdi.pld_ary[i].pld_pbuf_idx = idx;
 530 
 531                                 /*
 532                                  * We can't copy the pointers just like that,
 533                                  * so calculate the relative offset.
 534                                  */
 535                                 n_pdi.pld_ary[i].pld_rptr =
 536                                     REL_OFF(pdi->pld_ary[i].pld_rptr,
 537                                         base, n_base);
 538                                 n_pdi.pld_ary[i].pld_wptr =
 539                                     REL_OFF(pdi->pld_ary[i].pld_wptr,
 540                                         base, n_base);
 541                         }
 542                 }
 543 
 544                 /* add the new descriptor to the new Multidata */
 545                 n_pd = mmd_addpdesc_int(n_mmd, &n_pdi, NULL, kmflags);
 546 
 547                 if (n_pd == NULL || (pd->pd_pattbl != NULL &&
 548                     mmd_copy_pattbl(pd->pd_pattbl, n_mmd, n_pd, kmflags) < 0)) {
 549                         freeb(n_bp);
 550                         mutex_exit(&mmd->mmd_pd_slab_lock);
 551                         return (NULL);
 552                 }
 553 
 554                 pd = pd_next;
 555         }
 556 #undef REL_OFF
 557 #undef FREE_PBUFS
 558 
 559         mutex_exit(&mmd->mmd_pd_slab_lock);
 560         return (n_bp);
 561 }
 562 
 563 /*
 564  * Given a Multidata message block, return the Multidata metadata handle.
 565  */
 566 multidata_t *
 567 mmd_getmultidata(mblk_t *mp)
 568 {
 569         multidata_t *mmd;
 570 
 571         ASSERT(mp != NULL);
 572 
 573         if (DB_TYPE(mp) != M_MULTIDATA)
 574                 return (NULL);
 575 
 576         mmd = (multidata_t *)mp->b_rptr;
 577         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 578 
 579         return (mmd);
 580 }
 581 
 582 /*
 583  * Return the start and end addresses of the associated buffer(s).
 584  */
 585 void
 586 mmd_getregions(multidata_t *mmd, mbufinfo_t *mbi)
 587 {
 588         int i;
 589 
 590         ASSERT(mmd != NULL);
 591         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 592         ASSERT(mbi != NULL);
 593 
 594         bzero((void *)mbi, sizeof (mbufinfo_t));
 595 
 596         if (mmd->mmd_hbuf != NULL) {
 597                 mbi->hbuf_rptr = mmd->mmd_hbuf->b_rptr;
 598                 mbi->hbuf_wptr = mmd->mmd_hbuf->b_wptr;
 599         }
 600 
 601         mutex_enter(&mmd->mmd_pd_slab_lock);
 602         for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
 603                 ASSERT(mmd->mmd_pbuf[i] != NULL);
 604                 mbi->pbuf_ary[i].pbuf_rptr = mmd->mmd_pbuf[i]->b_rptr;
 605                 mbi->pbuf_ary[i].pbuf_wptr = mmd->mmd_pbuf[i]->b_wptr;
 606 
 607         }
 608         mbi->pbuf_cnt = mmd->mmd_pbuf_cnt;
 609         mutex_exit(&mmd->mmd_pd_slab_lock);
 610 }
 611 
 612 /*
 613  * Return the Multidata statistics.
 614  */
 615 uint_t
 616 mmd_getcnt(multidata_t *mmd, uint_t *hbuf_ref, uint_t *pbuf_ref)
 617 {
 618         uint_t pd_cnt;
 619 
 620         ASSERT(mmd != NULL);
 621         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 622 
 623         mutex_enter(&(mmd->mmd_pd_slab_lock));
 624         if (hbuf_ref != NULL)
 625                 *hbuf_ref = mmd->mmd_hbuf_ref;
 626         if (pbuf_ref != NULL)
 627                 *pbuf_ref = mmd->mmd_pbuf_ref;
 628         pd_cnt = mmd->mmd_pd_cnt;
 629         mutex_exit(&(mmd->mmd_pd_slab_lock));
 630 
 631         return (pd_cnt);
 632 }
 633 
 634 #define HBUF_REF_VALID(mmd, pdi)                                        \
 635         ((mmd)->mmd_hbuf != NULL && (pdi)->hdr_rptr != NULL &&            \
 636         (pdi)->hdr_wptr != NULL && (pdi)->hdr_base != NULL &&             \
 637         (pdi)->hdr_lim != NULL && (pdi)->hdr_lim >= (pdi)->hdr_base &&      \
 638         (pdi)->hdr_wptr >= (pdi)->hdr_rptr &&                          \
 639         (pdi)->hdr_base <= (pdi)->hdr_rptr &&                          \
 640         (pdi)->hdr_lim >= (pdi)->hdr_wptr &&                           \
 641         (pdi)->hdr_base >= (mmd)->mmd_hbuf->b_rptr &&                       \
 642         MBLKIN((mmd)->mmd_hbuf,                                              \
 643         (pdi->hdr_base - (mmd)->mmd_hbuf->b_rptr),                     \
 644         PDESC_HDRSIZE(pdi)))
 645 
 646 /*
 647  * Bounds check payload area(s).
 648  */
 649 static boolean_t
 650 pbuf_ref_valid(multidata_t *mmd, pdescinfo_t *pdi)
 651 {
 652         int i = 0, idx;
 653         boolean_t valid = B_TRUE;
 654         struct pld_ary_s *pa;
 655 
 656         mutex_enter(&mmd->mmd_pd_slab_lock);
 657         if (pdi->pld_cnt == 0 || pdi->pld_cnt > mmd->mmd_pbuf_cnt) {
 658                 mutex_exit(&mmd->mmd_pd_slab_lock);
 659                 return (B_FALSE);
 660         }
 661 
 662         pa = &pdi->pld_ary[0];
 663         while (valid && i < pdi->pld_cnt) {
 664                 valid = (((idx = pa->pld_pbuf_idx) < mmd->mmd_pbuf_cnt) &&
 665                     pa->pld_rptr != NULL && pa->pld_wptr != NULL &&
 666                     pa->pld_wptr >= pa->pld_rptr &&
 667                     pa->pld_rptr >= mmd->mmd_pbuf[idx]->b_rptr &&
 668                     MBLKIN(mmd->mmd_pbuf[idx], (pa->pld_rptr -
 669                         mmd->mmd_pbuf[idx]->b_rptr),
 670                         PDESC_PLD_SPAN_SIZE(pdi, i)));
 671 
 672                 if (!valid) {
 673                         MMD_DEBUG((CE_WARN,
 674                             "pbuf_ref_valid: pdi 0x%p pld out of bound; "
 675                             "index %d has pld_cnt %d pbuf_idx %d "
 676                             "(mmd_pbuf_cnt %d), "
 677                             "pld_rptr 0x%p pld_wptr 0x%p len %d "
 678                             "(valid 0x%p-0x%p len %d)\n", (void *)pdi,
 679                             i, pdi->pld_cnt, idx, mmd->mmd_pbuf_cnt,
 680                             (void *)pa->pld_rptr,
 681                             (void *)pa->pld_wptr,
 682                             (int)PDESC_PLD_SPAN_SIZE(pdi, i),
 683                             (void *)mmd->mmd_pbuf[idx]->b_rptr,
 684                             (void *)mmd->mmd_pbuf[idx]->b_wptr,
 685                             (int)MBLKL(mmd->mmd_pbuf[idx])));
 686                 }
 687 
 688                 /* advance to next entry */
 689                 i++;
 690                 pa++;
 691         }
 692 
 693         mutex_exit(&mmd->mmd_pd_slab_lock);
 694         return (valid);
 695 }
 696 
 697 /*
 698  * Add a packet descriptor to the Multidata.
 699  */
 700 pdesc_t *
 701 mmd_addpdesc(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
 702 {
 703         ASSERT(mmd != NULL);
 704         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 705         ASSERT(pdi != NULL);
 706         ASSERT(pdi->flags & PDESC_HAS_REF);
 707 
 708         /* do the references refer to invalid memory regions? */
 709         if (!mmd_speed_over_safety &&
 710             (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
 711             ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi)))) {
 712                 if (err != NULL)
 713                         *err = EINVAL;
 714                 return (NULL);
 715         }
 716 
 717         return (mmd_addpdesc_int(mmd, pdi, err, kmflags));
 718 }
 719 
 720 /*
 721  * Internal routine to add a packet descriptor, called when mmd_addpdesc
 722  * or mmd_copy tries to allocate and add a descriptor to a Multidata.
 723  */
 724 static pdesc_t *
 725 mmd_addpdesc_int(multidata_t *mmd, pdescinfo_t *pdi, int *err, int kmflags)
 726 {
 727         pdesc_slab_t *slab, *slab_last;
 728         pdesc_t *pd;
 729 
 730         ASSERT(pdi->flags & PDESC_HAS_REF);
 731         ASSERT(!(pdi->flags & PDESC_HBUF_REF) || HBUF_REF_VALID(mmd, pdi));
 732         ASSERT(!(pdi->flags & PDESC_PBUF_REF) || pbuf_ref_valid(mmd, pdi));
 733 
 734         if (err != NULL)
 735                 *err = 0;
 736 
 737         mutex_enter(&(mmd->mmd_pd_slab_lock));
 738         /*
 739          * Is slab list empty or the last-added slab is full?  If so,
 740          * allocate new slab for the descriptor; otherwise, use the
 741          * last-added slab instead.
 742          */
 743         slab_last = Q2PDSLAB(mmd->mmd_pd_slab_q.ql_prev);
 744         if (mmd->mmd_pd_slab_q.ql_next == &(mmd->mmd_pd_slab_q) ||
 745             slab_last->pds_used == slab_last->pds_sz) {
 746                 slab = kmem_cache_alloc(pd_slab_cache, kmflags);
 747                 if (slab == NULL) {
 748                         if (err != NULL)
 749                                 *err = ENOMEM;
 750                         mutex_exit(&(mmd->mmd_pd_slab_lock));
 751                         return (NULL);
 752                 }
 753                 slab->pds_mmd = mmd;
 754 
 755                 ASSERT(slab->pds_used == 0);
 756                 ASSERT(slab->pds_next == NULL && slab->pds_prev == NULL);
 757 
 758                 /* insert slab at end of list */
 759                 insque(&(slab->pds_next), mmd->mmd_pd_slab_q.ql_prev);
 760                 mmd->mmd_slab_cnt++;
 761         } else {
 762                 slab = slab_last;
 763         }
 764         ASSERT(slab->pds_used < slab->pds_sz);
 765         pd = &(slab->pds_free_desc[slab->pds_used++]);
 766         ASSERT(pd->pd_magic == PDESC_MAGIC);
 767         pd->pd_next = NULL;
 768         pd->pd_prev = NULL;
 769         pd->pd_slab = slab;
 770         pd->pd_pattbl = NULL;
 771 
 772         /* copy over the descriptor info from caller */
 773         PDI_COPY(pdi, &(pd->pd_pdi));
 774 
 775         if (pd->pd_flags & PDESC_HBUF_REF)
 776                 mmd->mmd_hbuf_ref++;
 777         if (pd->pd_flags & PDESC_PBUF_REF)
 778                 mmd->mmd_pbuf_ref += pd->pd_pdi.pld_cnt;
 779         mmd->mmd_pd_cnt++;
 780 
 781         /* insert descriptor at end of list */
 782         insque(&(pd->pd_next), mmd->mmd_pd_q.ql_prev);
 783         mutex_exit(&(mmd->mmd_pd_slab_lock));
 784 
 785         return (pd);
 786 }
 787 
 788 /*
 789  * Packet descriptor slab kmem cache constructor routine.
 790  */
 791 /* ARGSUSED */
 792 static int
 793 pdslab_constructor(void *buf, void *cdrarg, int kmflags)
 794 {
 795         pdesc_slab_t *slab;
 796         uint_t cnt = (uint_t)(uintptr_t)cdrarg;
 797         int i;
 798 
 799         ASSERT(cnt > 0);     /* slab size can't be zero */
 800 
 801         slab = (pdesc_slab_t *)buf;
 802         slab->pds_next = NULL;
 803         slab->pds_prev = NULL;
 804         slab->pds_mmd = NULL;
 805         slab->pds_used = 0;
 806         slab->pds_sz = cnt;
 807 
 808         for (i = 0; i < cnt; i++) {
 809                 pdesc_t *pd = &(slab->pds_free_desc[i]);
 810                 pd->pd_magic = PDESC_MAGIC;
 811         }
 812         return (0);
 813 }
 814 
 815 /*
 816  * Packet descriptor slab kmem cache destructor routine.
 817  */
 818 /* ARGSUSED */
 819 static void
 820 pdslab_destructor(void *buf, void *cdrarg)
 821 {
 822         pdesc_slab_t *slab;
 823 
 824         slab = (pdesc_slab_t *)buf;
 825         ASSERT(slab->pds_next == NULL);
 826         ASSERT(slab->pds_prev == NULL);
 827         ASSERT(slab->pds_mmd == NULL);
 828         ASSERT(slab->pds_used == 0);
 829         ASSERT(slab->pds_sz > 0);
 830 }
 831 
 832 /*
 833  * Remove a packet descriptor from the in-use descriptor list,
 834  * called by mmd_rempdesc or during free.
 835  */
 836 static pdesc_t *
 837 mmd_destroy_pdesc(multidata_t *mmd, pdesc_t *pd)
 838 {
 839         pdesc_t *pd_next;
 840 
 841         pd_next = Q2PD(pd->pd_next);
 842         remque(&(pd->pd_next));
 843 
 844         /* remove all local attributes */
 845         if (pd->pd_pattbl != NULL)
 846                 mmd_destroy_pattbl(&(pd->pd_pattbl));
 847 
 848         /* don't decrease counts for a removed descriptor */
 849         if (!(pd->pd_flags & PDESC_REM_DEFER)) {
 850                 if (pd->pd_flags & PDESC_HBUF_REF) {
 851                         ASSERT(mmd->mmd_hbuf_ref > 0);
 852                         mmd->mmd_hbuf_ref--;
 853                 }
 854                 if (pd->pd_flags & PDESC_PBUF_REF) {
 855                         ASSERT(mmd->mmd_pbuf_ref > 0);
 856                         mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
 857                 }
 858                 ASSERT(mmd->mmd_pd_cnt > 0);
 859                 mmd->mmd_pd_cnt--;
 860         }
 861         return (pd_next);
 862 }
 863 
 864 /*
 865  * Remove a packet descriptor from the Multidata.
 866  */
 867 void
 868 mmd_rempdesc(pdesc_t *pd)
 869 {
 870         multidata_t *mmd;
 871 
 872         ASSERT(pd->pd_magic == PDESC_MAGIC);
 873         ASSERT(pd->pd_slab != NULL);
 874 
 875         mmd = pd->pd_slab->pds_mmd;
 876         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 877 
 878         mutex_enter(&(mmd->mmd_pd_slab_lock));
 879         /*
 880          * We can't deallocate the associated resources if the Multidata
 881          * is shared with other threads, because it's possible that the
 882          * descriptor handle value is held by those threads.  That's why
 883          * we simply mark the entry as "removed" and decrement the counts.
 884          * If there are no other threads, then we free the descriptor.
 885          */
 886         if (mmd->mmd_dp->db_ref > 1) {
 887                 pd->pd_flags |= PDESC_REM_DEFER;
 888                 if (pd->pd_flags & PDESC_HBUF_REF) {
 889                         ASSERT(mmd->mmd_hbuf_ref > 0);
 890                         mmd->mmd_hbuf_ref--;
 891                 }
 892                 if (pd->pd_flags & PDESC_PBUF_REF) {
 893                         ASSERT(mmd->mmd_pbuf_ref > 0);
 894                         mmd->mmd_pbuf_ref -= pd->pd_pdi.pld_cnt;
 895                 }
 896                 ASSERT(mmd->mmd_pd_cnt > 0);
 897                 mmd->mmd_pd_cnt--;
 898         } else {
 899                 (void) mmd_destroy_pdesc(mmd, pd);
 900         }
 901         mutex_exit(&(mmd->mmd_pd_slab_lock));
 902 }
 903 
 904 /*
 905  * A generic routine to traverse the packet descriptor in-use list.
 906  */
 907 static pdesc_t *
 908 mmd_getpdesc(multidata_t *mmd, pdesc_t *pd, pdescinfo_t *pdi, uint_t forw,
 909     boolean_t mutex_held)
 910 {
 911         pdesc_t *pd_head;
 912 
 913         ASSERT(pd == NULL || pd->pd_slab->pds_mmd == mmd);
 914         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
 915         ASSERT(!mutex_held || MUTEX_HELD(&(mmd->mmd_pd_slab_lock)));
 916 
 917         if (!mutex_held)
 918                 mutex_enter(&(mmd->mmd_pd_slab_lock));
 919         pd_head = Q2PD(&(mmd->mmd_pd_q));
 920 
 921         if (pd == NULL) {
 922                 /*
 923                  * We're called by mmd_get{first,last}pdesc, and so
 924                  * return either the first or last list element.
 925                  */
 926                 pd = forw ? Q2PD(mmd->mmd_pd_q.ql_next) :
 927                     Q2PD(mmd->mmd_pd_q.ql_prev);
 928         } else {
 929                 /*
 930                  * We're called by mmd_get{next,prev}pdesc, and so
 931                  * return either the next or previous list element.
 932                  */
 933                 pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
 934         }
 935 
 936         while (pd != pd_head) {
 937                 /* skip element if it has been removed */
 938                 if (!(pd->pd_flags & PDESC_REM_DEFER))
 939                         break;
 940                 pd = forw ? Q2PD(pd->pd_next) : Q2PD(pd->pd_prev);
 941         }
 942         if (!mutex_held)
 943                 mutex_exit(&(mmd->mmd_pd_slab_lock));
 944 
 945         /* return NULL if we're back at the beginning */
 946         if (pd == pd_head)
 947                 pd = NULL;
 948 
 949         /* got an entry; copy descriptor info to caller */
 950         if (pd != NULL && pdi != NULL)
 951                 PDI_COPY(&(pd->pd_pdi), pdi);
 952 
 953         ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
 954         return (pd);
 955 
 956 }
 957 
 958 /*
 959  * Return the first packet descriptor in the in-use list.
 960  */
 961 pdesc_t *
 962 mmd_getfirstpdesc(multidata_t *mmd, pdescinfo_t *pdi)
 963 {
 964         return (mmd_getpdesc(mmd, NULL, pdi, 1, B_FALSE));
 965 }
 966 
 967 /*
 968  * Return the last packet descriptor in the in-use list.
 969  */
 970 pdesc_t *
 971 mmd_getlastpdesc(multidata_t *mmd, pdescinfo_t *pdi)
 972 {
 973         return (mmd_getpdesc(mmd, NULL, pdi, 0, B_FALSE));
 974 }
 975 
 976 /*
 977  * Return the next packet descriptor in the in-use list.
 978  */
 979 pdesc_t *
 980 mmd_getnextpdesc(pdesc_t *pd, pdescinfo_t *pdi)
 981 {
 982         return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 1, B_FALSE));
 983 }
 984 
 985 /*
 986  * Return the previous packet descriptor in the in-use list.
 987  */
 988 pdesc_t *
 989 mmd_getprevpdesc(pdesc_t *pd, pdescinfo_t *pdi)
 990 {
 991         return (mmd_getpdesc(pd->pd_slab->pds_mmd, pd, pdi, 0, B_FALSE));
 992 }
 993 
 994 /*
 995  * Check to see if pdi stretches over c_pdi; used to ensure that a packet
 996  * descriptor's header and payload span may not be extended beyond the
 997  * current boundaries.
 998  */
 999 static boolean_t
1000 pdi_in_range(pdescinfo_t *pdi, pdescinfo_t *c_pdi)
1001 {
1002         int i;
1003         struct pld_ary_s *pa = &pdi->pld_ary[0];
1004         struct pld_ary_s *c_pa = &c_pdi->pld_ary[0];
1005 
1006         if (pdi->hdr_base < c_pdi->hdr_base || pdi->hdr_lim > c_pdi->hdr_lim)
1007                 return (B_FALSE);
1008 
1009         /*
1010          * We don't allow the number of span to be reduced, for the sake
1011          * of simplicity.  Instead, we provide PDESC_PLD_SPAN_CLEAR() to
1012          * clear a packet descriptor.  Note that we allow the span count to
1013          * be increased, and the bounds check for the new one happens
1014          * in pbuf_ref_valid.
1015          */
1016         if (pdi->pld_cnt < c_pdi->pld_cnt)
1017                 return (B_FALSE);
1018 
1019         /* compare only those which are currently defined */
1020         for (i = 0; i < c_pdi->pld_cnt; i++, pa++, c_pa++) {
1021                 if (pa->pld_pbuf_idx != c_pa->pld_pbuf_idx ||
1022                     pa->pld_rptr < c_pa->pld_rptr ||
1023                     pa->pld_wptr > c_pa->pld_wptr)
1024                         return (B_FALSE);
1025         }
1026         return (B_TRUE);
1027 }
1028 
1029 /*
1030  * Modify the layout of a packet descriptor.
1031  */
1032 pdesc_t *
1033 mmd_adjpdesc(pdesc_t *pd, pdescinfo_t *pdi)
1034 {
1035         multidata_t *mmd;
1036         pdescinfo_t *c_pdi;
1037 
1038         ASSERT(pd != NULL);
1039         ASSERT(pdi != NULL);
1040         ASSERT(pd->pd_magic == PDESC_MAGIC);
1041 
1042         mmd = pd->pd_slab->pds_mmd;
1043         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1044 
1045         /* entry has been removed */
1046         if (pd->pd_flags & PDESC_REM_DEFER)
1047                 return (NULL);
1048 
1049         /* caller doesn't intend to specify any buffer reference? */
1050         if (!(pdi->flags & PDESC_HAS_REF))
1051                 return (NULL);
1052 
1053         /* do the references refer to invalid memory regions? */
1054         if (!mmd_speed_over_safety &&
1055             (((pdi->flags & PDESC_HBUF_REF) && !HBUF_REF_VALID(mmd, pdi)) ||
1056             ((pdi->flags & PDESC_PBUF_REF) && !pbuf_ref_valid(mmd, pdi))))
1057                 return (NULL);
1058 
1059         /* they're not subsets of current references? */
1060         c_pdi = &(pd->pd_pdi);
1061         if (!pdi_in_range(pdi, c_pdi))
1062                 return (NULL);
1063 
1064         /* copy over the descriptor info from caller */
1065         PDI_COPY(pdi, c_pdi);
1066 
1067         return (pd);
1068 }
1069 
1070 /*
1071  * Copy the contents of a packet descriptor into a new buffer.  If the
1072  * descriptor points to more than one buffer fragments, the contents
1073  * of both fragments will be joined, with the header buffer fragment
1074  * preceding the payload buffer fragment(s).
1075  */
1076 mblk_t *
1077 mmd_transform(pdesc_t *pd)
1078 {
1079         multidata_t *mmd;
1080         pdescinfo_t *pdi;
1081         mblk_t *mp;
1082         int h_size = 0, p_size = 0;
1083         int i, len;
1084 
1085         ASSERT(pd != NULL);
1086         ASSERT(pd->pd_magic == PDESC_MAGIC);
1087 
1088         mmd = pd->pd_slab->pds_mmd;
1089         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1090 
1091         /* entry has been removed */
1092         if (pd->pd_flags & PDESC_REM_DEFER)
1093                 return (NULL);
1094 
1095         mutex_enter(&mmd->mmd_pd_slab_lock);
1096         pdi = &(pd->pd_pdi);
1097         if (pdi->flags & PDESC_HBUF_REF)
1098                 h_size = PDESC_HDRL(pdi);
1099         if (pdi->flags & PDESC_PBUF_REF) {
1100                 for (i = 0; i < pdi->pld_cnt; i++)
1101                         p_size += PDESC_PLD_SPAN_SIZE(pdi, i);
1102         }
1103 
1104         /* allocate space large enough to hold the fragment(s) */
1105         ASSERT(h_size + p_size >= 0);
1106         if ((mp = allocb(h_size + p_size, BPRI_HI)) == NULL) {
1107                 mutex_exit(&mmd->mmd_pd_slab_lock);
1108                 return (NULL);
1109         }
1110 
1111         /* copy over the header fragment */
1112         if ((pdi->flags & PDESC_HBUF_REF) && h_size > 0) {
1113                 bcopy(pdi->hdr_rptr, mp->b_wptr, h_size);
1114                 mp->b_wptr += h_size;
1115         }
1116 
1117         /* copy over the payload fragment */
1118         if ((pdi->flags & PDESC_PBUF_REF) && p_size > 0) {
1119                 for (i = 0; i < pdi->pld_cnt; i++) {
1120                         len = PDESC_PLD_SPAN_SIZE(pdi, i);
1121                         if (len > 0) {
1122                                 bcopy(pdi->pld_ary[i].pld_rptr,
1123                                     mp->b_wptr, len);
1124                                 mp->b_wptr += len;
1125                         }
1126                 }
1127         }
1128 
1129         mutex_exit(&mmd->mmd_pd_slab_lock);
1130         return (mp);
1131 }
1132 
1133 /*
1134  * Return a chain of mblks representing the Multidata packet.
1135  */
1136 mblk_t *
1137 mmd_transform_link(pdesc_t *pd)
1138 {
1139         multidata_t *mmd;
1140         pdescinfo_t *pdi;
1141         mblk_t *nmp = NULL;
1142 
1143         ASSERT(pd != NULL);
1144         ASSERT(pd->pd_magic == PDESC_MAGIC);
1145 
1146         mmd = pd->pd_slab->pds_mmd;
1147         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1148 
1149         /* entry has been removed */
1150         if (pd->pd_flags & PDESC_REM_DEFER)
1151                 return (NULL);
1152 
1153         pdi = &(pd->pd_pdi);
1154 
1155         /* duplicate header buffer */
1156         if ((pdi->flags & PDESC_HBUF_REF)) {
1157                 if ((nmp = dupb(mmd->mmd_hbuf)) == NULL)
1158                         return (NULL);
1159                 nmp->b_rptr = pdi->hdr_rptr;
1160                 nmp->b_wptr = pdi->hdr_wptr;
1161         }
1162 
1163         /* duplicate payload buffer(s) */
1164         if (pdi->flags & PDESC_PBUF_REF) {
1165                 int i;
1166                 mblk_t *mp;
1167                 struct pld_ary_s *pa = &pdi->pld_ary[0];
1168 
1169                 mutex_enter(&mmd->mmd_pd_slab_lock);
1170                 for (i = 0; i < pdi->pld_cnt; i++, pa++) {
1171                         ASSERT(mmd->mmd_pbuf[pa->pld_pbuf_idx] != NULL);
1172 
1173                         /* skip empty ones */
1174                         if (PDESC_PLD_SPAN_SIZE(pdi, i) == 0)
1175                                 continue;
1176 
1177                         mp = dupb(mmd->mmd_pbuf[pa->pld_pbuf_idx]);
1178                         if (mp == NULL) {
1179                                 if (nmp != NULL)
1180                                         freemsg(nmp);
1181                                 mutex_exit(&mmd->mmd_pd_slab_lock);
1182                                 return (NULL);
1183                         }
1184                         mp->b_rptr = pa->pld_rptr;
1185                         mp->b_wptr = pa->pld_wptr;
1186                         if (nmp == NULL)
1187                                 nmp = mp;
1188                         else
1189                                 linkb(nmp, mp);
1190                 }
1191                 mutex_exit(&mmd->mmd_pd_slab_lock);
1192         }
1193 
1194         return (nmp);
1195 }
1196 
1197 /*
1198  * Return duplicate message block(s) of the associated buffer(s).
1199  */
1200 int
1201 mmd_dupbufs(multidata_t *mmd, mblk_t **hmp, mblk_t **pmp)
1202 {
1203         ASSERT(mmd != NULL);
1204         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1205 
1206         if (hmp != NULL) {
1207                 *hmp = NULL;
1208                 if (mmd->mmd_hbuf != NULL &&
1209                     (*hmp = dupb(mmd->mmd_hbuf)) == NULL)
1210                         return (-1);
1211         }
1212 
1213         if (pmp != NULL) {
1214                 int i;
1215                 mblk_t *mp;
1216 
1217                 mutex_enter(&mmd->mmd_pd_slab_lock);
1218                 *pmp = NULL;
1219                 for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1220                         ASSERT(mmd->mmd_pbuf[i] != NULL);
1221                         mp = dupb(mmd->mmd_pbuf[i]);
1222                         if (mp == NULL) {
1223                                 if (hmp != NULL && *hmp != NULL)
1224                                         freeb(*hmp);
1225                                 if (*pmp != NULL)
1226                                         freemsg(*pmp);
1227                                 mutex_exit(&mmd->mmd_pd_slab_lock);
1228                                 return (-1);
1229                         }
1230                         if (*pmp == NULL)
1231                                 *pmp = mp;
1232                         else
1233                                 linkb(*pmp, mp);
1234                 }
1235                 mutex_exit(&mmd->mmd_pd_slab_lock);
1236         }
1237 
1238         return (0);
1239 }
1240 
1241 /*
1242  * Return the layout of a packet descriptor.
1243  */
1244 int
1245 mmd_getpdescinfo(pdesc_t *pd, pdescinfo_t *pdi)
1246 {
1247         ASSERT(pd != NULL);
1248         ASSERT(pd->pd_magic == PDESC_MAGIC);
1249         ASSERT(pd->pd_slab != NULL);
1250         ASSERT(pd->pd_slab->pds_mmd->mmd_magic == MULTIDATA_MAGIC);
1251         ASSERT(pdi != NULL);
1252 
1253         /* entry has been removed */
1254         if (pd->pd_flags & PDESC_REM_DEFER)
1255                 return (-1);
1256 
1257         /* copy descriptor info to caller */
1258         PDI_COPY(&(pd->pd_pdi), pdi);
1259 
1260         return (0);
1261 }
1262 
1263 /*
1264  * Add a global or local attribute to a Multidata.  Global attribute
1265  * association is specified by a NULL packet descriptor.
1266  */
1267 pattr_t *
1268 mmd_addpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai,
1269     boolean_t persistent, int kmflags)
1270 {
1271         patbkt_t **tbl_p;
1272         patbkt_t *tbl, *o_tbl;
1273         patbkt_t *bkt;
1274         pattr_t *pa;
1275         uint_t size;
1276 
1277         ASSERT(mmd != NULL);
1278         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1279         ASSERT(pd == NULL || pd->pd_magic == PDESC_MAGIC);
1280         ASSERT(pai != NULL);
1281 
1282         /* pointer to the attribute hash table (local or global) */
1283         tbl_p = pd != NULL ? &(pd->pd_pattbl) : &(mmd->mmd_pattbl);
1284 
1285         /*
1286          * See if the hash table has not yet been created; if so,
1287          * we create the table and store its address atomically.
1288          */
1289         if ((tbl = *tbl_p) == NULL) {
1290                 tbl = kmem_cache_alloc(pattbl_cache, kmflags);
1291                 if (tbl == NULL)
1292                         return (NULL);
1293 
1294                 /* if someone got there first, use his table instead */
1295                 if ((o_tbl = atomic_cas_ptr(tbl_p, NULL, tbl)) != NULL) {
1296                         kmem_cache_free(pattbl_cache, tbl);
1297                         tbl = o_tbl;
1298                 }
1299         }
1300 
1301         ASSERT(tbl->pbkt_tbl_sz > 0);
1302         bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1303 
1304         /* attribute of the same type already exists? */
1305         if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL)
1306                 return (NULL);
1307 
1308         size = sizeof (*pa) + pai->len;
1309         if ((pa = kmem_zalloc(size, kmflags)) == NULL)
1310                 return (NULL);
1311 
1312         pa->pat_magic = PATTR_MAGIC;
1313         pa->pat_lock = &(bkt->pbkt_lock);
1314         pa->pat_mmd = mmd;
1315         pa->pat_buflen = size;
1316         pa->pat_type = pai->type;
1317         pai->buf = pai->len > 0 ? ((uchar_t *)(pa + 1)) : NULL;
1318 
1319         if (persistent)
1320                 pa->pat_flags = PATTR_PERSIST;
1321 
1322         /* insert attribute at end of hash chain */
1323         mutex_enter(&(bkt->pbkt_lock));
1324         insque(&(pa->pat_next), bkt->pbkt_pattr_q.ql_prev);
1325         mutex_exit(&(bkt->pbkt_lock));
1326 
1327         return (pa);
1328 }
1329 
1330 /*
1331  * Attribute hash table kmem cache constructor routine.
1332  */
1333 /* ARGSUSED */
1334 static int
1335 pattbl_constructor(void *buf, void *cdrarg, int kmflags)
1336 {
1337         patbkt_t *bkt;
1338         uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1339         uint_t i;
1340 
1341         ASSERT(tbl_sz > 0);  /* table size can't be zero */
1342 
1343         for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1344                 mutex_init(&(bkt->pbkt_lock), NULL, MUTEX_DRIVER, NULL);
1345                 QL_INIT(&(bkt->pbkt_pattr_q));
1346 
1347                 /* first bucket contains the table size */
1348                 bkt->pbkt_tbl_sz = i == 0 ? tbl_sz : 0;
1349         }
1350         return (0);
1351 }
1352 
1353 /*
1354  * Attribute hash table kmem cache destructor routine.
1355  */
1356 /* ARGSUSED */
1357 static void
1358 pattbl_destructor(void *buf, void *cdrarg)
1359 {
1360         patbkt_t *bkt;
1361         uint_t tbl_sz = (uint_t)(uintptr_t)cdrarg;
1362         uint_t i;
1363 
1364         ASSERT(tbl_sz > 0);  /* table size can't be zero */
1365 
1366         for (i = 0, bkt = (patbkt_t *)buf; i < tbl_sz; i++, bkt++) {
1367                 mutex_destroy(&(bkt->pbkt_lock));
1368                 ASSERT(bkt->pbkt_pattr_q.ql_next == &(bkt->pbkt_pattr_q));
1369                 ASSERT(i > 0 || bkt->pbkt_tbl_sz == tbl_sz);
1370         }
1371 }
1372 
1373 /*
1374  * Destroy an attribute hash table, called by mmd_rempdesc or during free.
1375  */
1376 static void
1377 mmd_destroy_pattbl(patbkt_t **tbl)
1378 {
1379         patbkt_t *bkt;
1380         pattr_t *pa, *pa_next;
1381         uint_t i, tbl_sz;
1382 
1383         ASSERT(tbl != NULL);
1384         bkt = *tbl;
1385         tbl_sz = bkt->pbkt_tbl_sz;
1386 
1387         /* make sure caller passes in the first bucket */
1388         ASSERT(tbl_sz > 0);
1389 
1390         /* destroy the contents of each bucket */
1391         for (i = 0; i < tbl_sz; i++, bkt++) {
1392                 /* we ought to be exclusive at this point */
1393                 ASSERT(MUTEX_NOT_HELD(&(bkt->pbkt_lock)));
1394 
1395                 pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1396                 while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1397                         ASSERT(pa->pat_magic == PATTR_MAGIC);
1398                         pa_next = Q2PATTR(pa->pat_next);
1399                         remque(&(pa->pat_next));
1400                         kmem_free(pa, pa->pat_buflen);
1401                         pa = pa_next;
1402                 }
1403         }
1404 
1405         kmem_cache_free(pattbl_cache, *tbl);
1406         *tbl = NULL;
1407 
1408         /* commit all previous stores */
1409         membar_producer();
1410 }
1411 
1412 /*
1413  * Copy the contents of an attribute hash table, called by mmd_copy.
1414  */
1415 static int
1416 mmd_copy_pattbl(patbkt_t *src_tbl, multidata_t *n_mmd, pdesc_t *n_pd,
1417     int kmflags)
1418 {
1419         patbkt_t *bkt;
1420         pattr_t *pa;
1421         pattrinfo_t pai;
1422         uint_t i, tbl_sz;
1423 
1424         ASSERT(src_tbl != NULL);
1425         bkt = src_tbl;
1426         tbl_sz = bkt->pbkt_tbl_sz;
1427 
1428         /* make sure caller passes in the first bucket */
1429         ASSERT(tbl_sz > 0);
1430 
1431         for (i = 0; i < tbl_sz; i++, bkt++) {
1432                 mutex_enter(&(bkt->pbkt_lock));
1433                 pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1434                 while (pa != Q2PATTR(&(bkt->pbkt_pattr_q))) {
1435                         pattr_t *pa_next = Q2PATTR(pa->pat_next);
1436 
1437                         /* skip if it's removed */
1438                         if (pa->pat_flags & PATTR_REM_DEFER) {
1439                                 pa = pa_next;
1440                                 continue;
1441                         }
1442 
1443                         pai.type = pa->pat_type;
1444                         pai.len = pa->pat_buflen - sizeof (*pa);
1445                         if (mmd_addpattr(n_mmd, n_pd, &pai, (pa->pat_flags &
1446                             PATTR_PERSIST) != 0, kmflags) == NULL) {
1447                                 mutex_exit(&(bkt->pbkt_lock));
1448                                 return (-1);
1449                         }
1450 
1451                         /* copy over the contents */
1452                         if (pai.buf != NULL)
1453                                 bcopy(pa + 1, pai.buf, pai.len);
1454 
1455                         pa = pa_next;
1456                 }
1457                 mutex_exit(&(bkt->pbkt_lock));
1458         }
1459 
1460         return (0);
1461 }
1462 
1463 /*
1464  * Search for an attribute type within an attribute hash bucket.
1465  */
1466 static pattr_t *
1467 mmd_find_pattr(patbkt_t *bkt, uint_t type)
1468 {
1469         pattr_t *pa_head, *pa;
1470 
1471         mutex_enter(&(bkt->pbkt_lock));
1472         pa_head = Q2PATTR(&(bkt->pbkt_pattr_q));
1473         pa = Q2PATTR(bkt->pbkt_pattr_q.ql_next);
1474 
1475         while (pa != pa_head) {
1476                 ASSERT(pa->pat_magic == PATTR_MAGIC);
1477 
1478                 /* return a match; we treat removed entry as non-existent */
1479                 if (pa->pat_type == type && !(pa->pat_flags & PATTR_REM_DEFER))
1480                         break;
1481                 pa = Q2PATTR(pa->pat_next);
1482         }
1483         mutex_exit(&(bkt->pbkt_lock));
1484 
1485         return (pa == pa_head ? NULL : pa);
1486 }
1487 
1488 /*
1489  * Remove an attribute from a Multidata.
1490  */
1491 void
1492 mmd_rempattr(pattr_t *pa)
1493 {
1494         kmutex_t *pat_lock = pa->pat_lock;
1495 
1496         ASSERT(pa->pat_magic == PATTR_MAGIC);
1497 
1498         /* ignore if attribute was marked as persistent */
1499         if ((pa->pat_flags & PATTR_PERSIST) != 0)
1500                 return;
1501 
1502         mutex_enter(pat_lock);
1503         /*
1504          * We can't deallocate the associated resources if the Multidata
1505          * is shared with other threads, because it's possible that the
1506          * attribute handle value is held by those threads.  That's why
1507          * we simply mark the entry as "removed".  If there are no other
1508          * threads, then we free the attribute.
1509          */
1510         if (pa->pat_mmd->mmd_dp->db_ref > 1) {
1511                 pa->pat_flags |= PATTR_REM_DEFER;
1512         } else {
1513                 remque(&(pa->pat_next));
1514                 kmem_free(pa, pa->pat_buflen);
1515         }
1516         mutex_exit(pat_lock);
1517 }
1518 
1519 /*
1520  * Find an attribute (according to its type) and return its handle.
1521  */
1522 pattr_t *
1523 mmd_getpattr(multidata_t *mmd, pdesc_t *pd, pattrinfo_t *pai)
1524 {
1525         patbkt_t *tbl, *bkt;
1526         pattr_t *pa;
1527 
1528         ASSERT(mmd != NULL);
1529         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1530         ASSERT(pai != NULL);
1531 
1532         /* get the right attribute hash table (local or global) */
1533         tbl = pd != NULL ? pd->pd_pattbl : mmd->mmd_pattbl;
1534 
1535         /* attribute hash table doesn't exist? */
1536         if (tbl == NULL)
1537                 return (NULL);
1538 
1539         ASSERT(tbl->pbkt_tbl_sz > 0);
1540         bkt = &(tbl[PATTBL_HASH(pai->type, tbl->pbkt_tbl_sz)]);
1541 
1542         if ((pa = mmd_find_pattr(bkt, pai->type)) != NULL) {
1543                 ASSERT(pa->pat_buflen >= sizeof (*pa));
1544                 pai->len = pa->pat_buflen - sizeof (*pa);
1545                 pai->buf = pai->len > 0 ?
1546                     (uchar_t *)pa + sizeof (pattr_t) : NULL;
1547         }
1548         ASSERT(pa == NULL || pa->pat_magic == PATTR_MAGIC);
1549         return (pa);
1550 }
1551 
1552 /*
1553  * Return total size of buffers and total size of areas referenced
1554  * by all in-use (unremoved) packet descriptors.
1555  */
1556 void
1557 mmd_getsize(multidata_t *mmd, uint_t *ptotal, uint_t *pinuse)
1558 {
1559         pdesc_t *pd;
1560         pdescinfo_t *pdi;
1561         int i;
1562 
1563         ASSERT(mmd != NULL);
1564         ASSERT(mmd->mmd_magic == MULTIDATA_MAGIC);
1565 
1566         mutex_enter(&mmd->mmd_pd_slab_lock);
1567         if (ptotal != NULL) {
1568                 *ptotal = 0;
1569 
1570                 if (mmd->mmd_hbuf != NULL)
1571                         *ptotal += MBLKL(mmd->mmd_hbuf);
1572 
1573                 for (i = 0; i < mmd->mmd_pbuf_cnt; i++) {
1574                         ASSERT(mmd->mmd_pbuf[i] != NULL);
1575                         *ptotal += MBLKL(mmd->mmd_pbuf[i]);
1576                 }
1577         }
1578         if (pinuse != NULL) {
1579                 *pinuse = 0;
1580 
1581                 /* first pdesc */
1582                 pd = mmd_getpdesc(mmd, NULL, NULL, 1, B_TRUE);
1583                 while (pd != NULL) {
1584                         pdi = &pd->pd_pdi;
1585 
1586                         /* next pdesc */
1587                         pd = mmd_getpdesc(mmd, pd, NULL, 1, B_TRUE);
1588 
1589                         /* skip over removed descriptor */
1590                         if (pdi->flags & PDESC_REM_DEFER)
1591                                 continue;
1592 
1593                         if (pdi->flags & PDESC_HBUF_REF)
1594                                 *pinuse += PDESC_HDRL(pdi);
1595 
1596                         if (pdi->flags & PDESC_PBUF_REF) {
1597                                 for (i = 0; i < pdi->pld_cnt; i++)
1598                                         *pinuse += PDESC_PLDL(pdi, i);
1599                         }
1600                 }
1601         }
1602         mutex_exit(&mmd->mmd_pd_slab_lock);
1603 }