1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * This module implements a STREAMS driver that provides layer-two (Ethernet)
  29  * bridging functionality.  The STREAMS interface is used to provide
  30  * observability (snoop/wireshark) and control, but not for interface plumbing.
  31  */
  32 
  33 #include <sys/types.h>
  34 #include <sys/bitmap.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/conf.h>
  37 #include <sys/ddi.h>
  38 #include <sys/errno.h>
  39 #include <sys/kstat.h>
  40 #include <sys/modctl.h>
  41 #include <sys/note.h>
  42 #include <sys/param.h>
  43 #include <sys/policy.h>
  44 #include <sys/sdt.h>
  45 #include <sys/stat.h>
  46 #include <sys/stream.h>
  47 #include <sys/stropts.h>
  48 #include <sys/strsun.h>
  49 #include <sys/sunddi.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/systm.h>
  52 #include <sys/time.h>
  53 #include <sys/dlpi.h>
  54 #include <sys/dls.h>
  55 #include <sys/mac_ether.h>
  56 #include <sys/mac_provider.h>
  57 #include <sys/mac_client_priv.h>
  58 #include <sys/mac_impl.h>
  59 #include <sys/vlan.h>
  60 #include <net/bridge.h>
  61 #include <net/bridge_impl.h>
  62 #include <net/trill.h>
  63 #include <sys/dld_ioc.h>
  64 
  65 /*
  66  * Locks and reference counts: object lifetime and design.
  67  *
  68  * bridge_mac_t
  69  *   Bridge mac (snoop) instances are in bmac_list, which is protected by
  70  *   bmac_rwlock.  They're allocated by bmac_alloc and freed by bridge_timer().
  71  *   Every bridge_inst_t has a single bridge_mac_t, but when bridge_inst_t goes
  72  *   away, the bridge_mac_t remains until either all of the users go away
  73  *   (detected by a timer) or until the instance is picked up again by the same
  74  *   bridge starting back up.
  75  *
  76  * bridge_inst_t
  77  *   Bridge instances are in inst_list, which is protected by inst_lock.
  78  *   They're allocated by inst_alloc() and freed by inst_free().  After
  79  *   allocation, an instance is placed in inst_list, and the reference count is
  80  *   incremented to represent this.  That reference is decremented when the
  81  *   BIF_SHUTDOWN flag is set, and no new increments may occur.  When the last
  82  *   reference is freed, the instance is removed from the list.
  83  *
  84  *   Bridge instances have lists of links and an AVL tree of forwarding
  85  *   entries.  Each of these structures holds one reference on the bridge
  86  *   instance.  These lists and tree are protected by bi_rwlock.
  87  *
  88  * bridge_stream_t
  89  *   Bridge streams are allocated by stream_alloc() and freed by stream_free().
  90  *   These streams are created when "bridged" opens /dev/bridgectl, and are
  91  *   used to create new bridge instances (via BRIOC_NEWBRIDGE) and control the
  92  *   links on the bridge.  When a stream closes, the bridge instance created is
  93  *   destroyed.  There's at most one bridge instance for a given control
  94  *   stream.
  95  *
  96  * bridge_link_t
  97  *   Links are allocated by bridge_add_link() and freed by link_free().  The
  98  *   bi_links list holds a reference to the link.  When the BLF_DELETED flag is
  99  *   set, that reference is dropped.  The link isn't removed from the list
 100  *   until the last reference drops.  Each forwarding entry that uses a given
 101  *   link holds a reference, as does each thread transmitting a packet via the
 102  *   link.  The MAC layer calls in via bridge_ref_cb() to hold a reference on
 103  *   a link when transmitting.
 104  *
 105  *   It's important that once BLF_DELETED is set, there's no way for the
 106  *   reference count to increase again.  If it can, then the link may be
 107  *   double-freed.  The BLF_FREED flag is intended for use with assertions to
 108  *   guard against this in testing.
 109  *
 110  * bridge_fwd_t
 111  *   Bridge forwarding entries are allocated by bridge_recv_cb() and freed by
 112  *   fwd_free().  The bi_fwd AVL tree holds one reference to the entry.  Unlike
 113  *   other data structures, the reference is dropped when the entry is removed
 114  *   from the tree by fwd_delete(), and the BFF_INTREE flag is removed.  Each
 115  *   thread that's forwarding a packet to a known destination holds a reference
 116  *   to a forwarding entry.
 117  *
 118  * TRILL notes:
 119  *
 120  *   The TRILL module does all of its I/O through bridging.  It uses references
 121  *   on the bridge_inst_t and bridge_link_t structures, and has seven entry
 122  *   points and four callbacks.  One entry point is for setting the callbacks
 123  *   (bridge_trill_register_cb).  There are four entry points for taking bridge
 124  *   and link references (bridge_trill_{br,ln}{ref,unref}).  The final two
 125  *   entry points are for decapsulated packets from TRILL (bridge_trill_decaps)
 126  *   that need to be bridged locally, and for TRILL-encapsulated output packets
 127  *   (bridge_trill_output).
 128  *
 129  *   The four callbacks comprise two notification functions for bridges and
 130  *   links being deleted, one function for raw received TRILL packets, and one
 131  *   for bridge output to non-local TRILL destinations (tunnel entry).
 132  */
 133 
/*
 * Ethernet reserved multicast addresses for TRILL; used also in TRILL module.
 * The initializer macros are defined outside this file.  Two of the arrays
 * are deliberately non-static so that they have external linkage.
 */
const uint8_t all_isis_rbridges[] = ALL_ISIS_RBRIDGES;
static const uint8_t all_esadi_rbridges[] = ALL_ESADI_RBRIDGES;
const uint8_t bridge_group_address[] = BRIDGE_GROUP_ADDRESS;

/*
 * Name tables for the named kstats.  inst_kstats_list is handed to
 * kstat_setup() by bridge_create(); link_kstats_list is presumably the
 * per-link equivalent (its use is not visible in this part of the file).
 */
static const char *inst_kstats_list[] = { KSINST_NAMES };
static const char *link_kstats_list[] = { KSLINK_NAMES };
 143 
 144 #define KREF(p, m, vn)  p->m.vn.value.ui64
 145 #define KINCR(p, m, vn) ++KREF(p, m, vn)
 146 #define KDECR(p, m, vn) --KREF(p, m, vn)
 147 
 148 #define KIPINCR(p, vn)  KINCR(p, bi_kstats, vn)
 149 #define KIPDECR(p, vn)  KDECR(p, bi_kstats, vn)
 150 #define KLPINCR(p, vn)  KINCR(p, bl_kstats, vn)
 151 
 152 #define KIINCR(vn)      KIPINCR(bip, vn)
 153 #define KIDECR(vn)      KIPDECR(bip, vn)
 154 #define KLINCR(vn)      KLPINCR(blp, vn)
 155 
 156 #define Dim(x)          (sizeof (x) / sizeof (*(x)))
 157 
 158 /* Amount of overhead added when encapsulating with VLAN headers */
 159 #define VLAN_INCR       (sizeof (struct ether_vlan_header) -    \
 160                         sizeof (struct ether_header))
 161 
/*
 * Driver-wide handles.  bridge_dev_info is supplied as m_dip when registering
 * bridge macs, and bridge_major is combined with a mac minor number to build
 * bi_dev.  Where these and bridge_taskq are initialized is not visible in this
 * part of the file.
 */
static dev_info_t *bridge_dev_info;
static major_t bridge_major;
static ddi_taskq_t *bridge_taskq;

/*
 * These are the bridge instance management data structures.  The mutex lock
 * protects the list of bridge instances.  A reference count is then used on
 * each instance to determine when to free it.  We use mac_minor_hold() to
 * allocate minor_t values, which are used both for self-cloning /dev/net/
 * device nodes as well as client streams.  Minor node 0 is reserved for the
 * allocation control node.
 */
static list_t inst_list;
static kcondvar_t inst_cv;              /* Allows us to wait for shutdown */
static kmutex_t inst_lock;

/* List of bridge_mac_t structures and the rwlock that protects it */
static krwlock_t bmac_rwlock;
static list_t bmac_list;

/* Wait for taskq entries that use STREAMS */
static kcondvar_t stream_ref_cv;
static kmutex_t stream_ref_lock;

/*
 * Timer state.  bridge_timerid is set by bmac_alloc() when the first mac is
 * added to bmac_list, using bridge_scan_interval as the delay.  The use of
 * bridge_fwd_age is not visible in this part of the file.
 */
static timeout_id_t bridge_timerid;
static clock_t bridge_scan_interval;
static clock_t bridge_fwd_age;

/* Forward declarations for functions used before their definitions */
static bridge_inst_t *bridge_find_name(const char *);
static void bridge_timer(void *);
static void bridge_unref(bridge_inst_t *);

/* All-zeros Ethernet address; used as the source address of bridge macs */
static const uint8_t zero_addr[ETHERADDRL] = { 0 };

/* Global TRILL linkage */
static trill_recv_pkt_t trill_recv_fn;
static trill_encap_pkt_t trill_encap_fn;
static trill_br_dstr_t trill_brdstr_fn;
static trill_ln_dstr_t trill_lndstr_fn;
 200 
/*
 * special settings to accommodate DLD flow control; see dld_str.c
 *
 * This module_info is shared by both the read-side and write-side qinit
 * structures of this driver.
 */
static struct module_info bridge_dld_modinfo = {
        0,                      /* mi_idnum */
        BRIDGE_DEV_NAME,        /* mi_idname */
        0,                      /* mi_minpsz */
        INFPSZ,                 /* mi_maxpsz */
        1,                      /* mi_hiwat */
        0                       /* mi_lowat */
};
 210 
/*
 * Read-side queue initialization: no put or service procedure; open and
 * close are delegated to dld_open() and dld_close().
 */
static struct qinit bridge_dld_rinit = {
        NULL,                   /* qi_putp */
        NULL,                   /* qi_srvp */
        dld_open,               /* qi_qopen */
        dld_close,              /* qi_qclose */
        NULL,                   /* qi_qadmin */
        &bridge_dld_modinfo,    /* qi_minfo */
        NULL                    /* qi_mstat */
};
 220 
/*
 * Write-side queue initialization: put and service processing are delegated
 * to dld_wput() and dld_wsrv().  The casts adapt those functions to the
 * function-pointer type of the qinit slots.
 */
static struct qinit bridge_dld_winit = {
        (int (*)())dld_wput,    /* qi_putp */
        (int (*)())dld_wsrv,    /* qi_srvp */
        NULL,                   /* qi_qopen */
        NULL,                   /* qi_qclose */
        NULL,                   /* qi_qadmin */
        &bridge_dld_modinfo,    /* qi_minfo */
        NULL                    /* qi_mstat */
};
 230 
/* Forward declaration; the handler is defined after the MAC callbacks */
static int bridge_ioc_listfwd(void *, intptr_t, int, cred_t *, int *);

/*
 * GLDv3 control ioctls used by Bridging.  The single entry routes
 * BRIDGE_IOC_LISTFWD, whose argument is a bridge_listfwd_t copied both in and
 * out (DLDCOPYINOUT), to bridge_ioc_listfwd().  The trailing NULL slot's
 * meaning is defined by dld_ioc_info_t, which is declared outside this file.
 */
static dld_ioc_info_t bridge_ioc_list[] = {
        {BRIDGE_IOC_LISTFWD, DLDCOPYINOUT, sizeof (bridge_listfwd_t),
            bridge_ioc_listfwd, NULL},
};
 238 
 239 /*
 240  * Given a bridge mac pointer, get a ref-held pointer to the corresponding
 241  * bridge instance, if any.  We must hold the global bmac_rwlock so that
 242  * bm_inst doesn't slide out from under us.
 243  */
 244 static bridge_inst_t *
 245 mac_to_inst(const bridge_mac_t *bmp)
 246 {
 247         bridge_inst_t *bip;
 248 
 249         rw_enter(&bmac_rwlock, RW_READER);
 250         if ((bip = bmp->bm_inst) != NULL)
 251                 atomic_inc_uint(&bip->bi_refs);
 252         rw_exit(&bmac_rwlock);
 253         return (bip);
 254 }
 255 
 256 static void
 257 link_sdu_fail(bridge_link_t *blp, boolean_t failed, mblk_t **mlist)
 258 {
 259         mblk_t *mp;
 260         bridge_ctl_t *bcp;
 261         bridge_link_t *blcmp;
 262         bridge_inst_t *bip;
 263         bridge_mac_t *bmp;
 264 
 265         if (failed) {
 266                 if (blp->bl_flags & BLF_SDUFAIL)
 267                         return;
 268                 blp->bl_flags |= BLF_SDUFAIL;
 269         } else {
 270                 if (!(blp->bl_flags & BLF_SDUFAIL))
 271                         return;
 272                 blp->bl_flags &= ~BLF_SDUFAIL;
 273         }
 274 
 275         /*
 276          * If this link is otherwise up, then check if there are any other
 277          * non-failed non-down links.  If not, then we control the state of the
 278          * whole bridge.
 279          */
 280         bip = blp->bl_inst;
 281         bmp = bip->bi_mac;
 282         if (blp->bl_linkstate != LINK_STATE_DOWN) {
 283                 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
 284                     blcmp = list_next(&bip->bi_links, blcmp)) {
 285                         if (blp != blcmp &&
 286                             !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
 287                             blcmp->bl_linkstate != LINK_STATE_DOWN)
 288                                 break;
 289                 }
 290                 if (blcmp == NULL) {
 291                         bmp->bm_linkstate = failed ? LINK_STATE_DOWN :
 292                             LINK_STATE_UP;
 293                         mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
 294                 }
 295         }
 296 
 297         /*
 298          * If we're becoming failed, then the link's current true state needs
 299          * to be reflected upwards to this link's clients.  If we're becoming
 300          * unfailed, then we get the state of the bridge instead on all
 301          * clients.
 302          */
 303         if (failed) {
 304                 if (bmp->bm_linkstate != blp->bl_linkstate)
 305                         mac_link_redo(blp->bl_mh, blp->bl_linkstate);
 306         } else {
 307                 mac_link_redo(blp->bl_mh, bmp->bm_linkstate);
 308         }
 309 
 310         /* get the current mblk we're going to send up */
 311         if ((mp = blp->bl_lfailmp) == NULL &&
 312             (mp = allocb(sizeof (bridge_ctl_t), BPRI_MED)) == NULL)
 313                 return;
 314 
 315         /* get a new one for next time */
 316         blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
 317 
 318         /* if none for next time, then report only failures */
 319         if (blp->bl_lfailmp == NULL && !failed) {
 320                 blp->bl_lfailmp = mp;
 321                 return;
 322         }
 323 
 324         /* LINTED: alignment */
 325         bcp = (bridge_ctl_t *)mp->b_rptr;
 326         bcp->bc_linkid = blp->bl_linkid;
 327         bcp->bc_failed = failed;
 328         mp->b_wptr = (uchar_t *)(bcp + 1);
 329         mp->b_next = *mlist;
 330         *mlist = mp;
 331 }
 332 
 333 /*
 334  * Send control messages (link SDU changes) using the stream to the
 335  * bridge instance daemon.
 336  */
 337 static void
 338 send_up_messages(bridge_inst_t *bip, mblk_t *mp)
 339 {
 340         mblk_t *mnext;
 341         queue_t *rq;
 342 
 343         rq = bip->bi_control->bs_wq;
 344         rq = OTHERQ(rq);
 345         while (mp != NULL) {
 346                 mnext = mp->b_next;
 347                 mp->b_next = NULL;
 348                 putnext(rq, mp);
 349                 mp = mnext;
 350         }
 351 }
 352 
 353 /* ARGSUSED */
 354 static int
 355 bridge_m_getstat(void *arg, uint_t stat, uint64_t *val)
 356 {
 357         return (ENOTSUP);
 358 }
 359 
 360 static int
 361 bridge_m_start(void *arg)
 362 {
 363         bridge_mac_t *bmp = arg;
 364 
 365         bmp->bm_flags |= BMF_STARTED;
 366         return (0);
 367 }
 368 
 369 static void
 370 bridge_m_stop(void *arg)
 371 {
 372         bridge_mac_t *bmp = arg;
 373 
 374         bmp->bm_flags &= ~BMF_STARTED;
 375 }
 376 
 377 /* ARGSUSED */
 378 static int
 379 bridge_m_setpromisc(void *arg, boolean_t on)
 380 {
 381         return (0);
 382 }
 383 
 384 /* ARGSUSED */
 385 static int
 386 bridge_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
 387 {
 388         return (0);
 389 }
 390 
 391 /* ARGSUSED */
 392 static int
 393 bridge_m_unicst(void *arg, const uint8_t *macaddr)
 394 {
 395         return (ENOTSUP);
 396 }
 397 
 398 static mblk_t *
 399 bridge_m_tx(void *arg, mblk_t *mp)
 400 {
 401         _NOTE(ARGUNUSED(arg));
 402         freemsgchain(mp);
 403         return (NULL);
 404 }
 405 
 406 /* ARGSUSED */
 407 static int
 408 bridge_ioc_listfwd(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
 409 {
 410         bridge_listfwd_t *blf = karg;
 411         bridge_inst_t *bip;
 412         bridge_fwd_t *bfp, match;
 413         avl_index_t where;
 414 
 415         bip = bridge_find_name(blf->blf_name);
 416         if (bip == NULL)
 417                 return (ENOENT);
 418 
 419         bcopy(blf->blf_dest, match.bf_dest, ETHERADDRL);
 420         match.bf_flags |= BFF_VLANLOCAL;
 421         rw_enter(&bip->bi_rwlock, RW_READER);
 422         if ((bfp = avl_find(&bip->bi_fwd, &match, &where)) == NULL)
 423                 bfp = avl_nearest(&bip->bi_fwd, where, AVL_AFTER);
 424         else
 425                 bfp = AVL_NEXT(&bip->bi_fwd, bfp);
 426         if (bfp == NULL) {
 427                 bzero(blf, sizeof (*blf));
 428         } else {
 429                 bcopy(bfp->bf_dest, blf->blf_dest, ETHERADDRL);
 430                 blf->blf_trill_nick = bfp->bf_trill_nick;
 431                 blf->blf_ms_age =
 432                     drv_hztousec(ddi_get_lbolt() - bfp->bf_lastheard) / 1000;
 433                 blf->blf_is_local =
 434                     (bfp->bf_flags & BFF_LOCALADDR) != 0;
 435                 blf->blf_linkid = bfp->bf_links[0]->bl_linkid;
 436         }
 437         rw_exit(&bip->bi_rwlock);
 438         bridge_unref(bip);
 439         return (0);
 440 }
 441 
 442 static int
 443 bridge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 444     uint_t pr_valsize, const void *pr_val)
 445 {
 446         bridge_mac_t *bmp = arg;
 447         bridge_inst_t *bip;
 448         bridge_link_t *blp;
 449         int err;
 450         uint_t maxsdu;
 451         mblk_t *mlist;
 452 
 453         _NOTE(ARGUNUSED(pr_name));
 454         switch (pr_num) {
 455         case MAC_PROP_MTU:
 456                 if (pr_valsize < sizeof (bmp->bm_maxsdu)) {
 457                         err = EINVAL;
 458                         break;
 459                 }
 460                 (void) bcopy(pr_val, &maxsdu, sizeof (maxsdu));
 461                 if (maxsdu == bmp->bm_maxsdu) {
 462                         err = 0;
 463                 } else if ((bip = mac_to_inst(bmp)) == NULL) {
 464                         err = ENXIO;
 465                 } else {
 466                         rw_enter(&bip->bi_rwlock, RW_WRITER);
 467                         mlist = NULL;
 468                         for (blp = list_head(&bip->bi_links); blp != NULL;
 469                             blp = list_next(&bip->bi_links, blp)) {
 470                                 if (blp->bl_flags & BLF_DELETED)
 471                                         continue;
 472                                 if (blp->bl_maxsdu == maxsdu)
 473                                         link_sdu_fail(blp, B_FALSE, &mlist);
 474                                 else if (blp->bl_maxsdu == bmp->bm_maxsdu)
 475                                         link_sdu_fail(blp, B_TRUE, &mlist);
 476                         }
 477                         rw_exit(&bip->bi_rwlock);
 478                         bmp->bm_maxsdu = maxsdu;
 479                         (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
 480                         send_up_messages(bip, mlist);
 481                         bridge_unref(bip);
 482                         err = 0;
 483                 }
 484                 break;
 485 
 486         default:
 487                 err = ENOTSUP;
 488                 break;
 489         }
 490         return (err);
 491 }
 492 
 493 static int
 494 bridge_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 495     uint_t pr_valsize, void *pr_val)
 496 {
 497         bridge_mac_t *bmp = arg;
 498         int err = 0;
 499 
 500         _NOTE(ARGUNUSED(pr_name));
 501         switch (pr_num) {
 502         case MAC_PROP_STATUS:
 503                 ASSERT(pr_valsize >= sizeof (bmp->bm_linkstate));
 504                 bcopy(&bmp->bm_linkstate, pr_val, sizeof (&bmp->bm_linkstate));
 505                 break;
 506 
 507         default:
 508                 err = ENOTSUP;
 509                 break;
 510         }
 511         return (err);
 512 }
 513 
 514 static void
 515 bridge_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
 516     mac_prop_info_handle_t prh)
 517 {
 518         bridge_mac_t *bmp = arg;
 519 
 520         _NOTE(ARGUNUSED(pr_name));
 521 
 522         switch (pr_num) {
 523         case MAC_PROP_MTU:
 524                 mac_prop_info_set_range_uint32(prh, bmp->bm_maxsdu,
 525                     bmp->bm_maxsdu);
 526                 break;
 527         case MAC_PROP_STATUS:
 528                 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
 529                 break;
 530         }
 531 }
 532 
/*
 * MAC callback vector for the bridge observability node; bmac_alloc() hands
 * it to the MAC layer through m_callbacks.  The first member is a flag word;
 * the three flags set here line up with the three property callbacks at the
 * end (presumably announcing which optional entries are present -- confirm
 * against the mac_callbacks_t definition).
 */
static mac_callbacks_t bridge_m_callbacks = {
        MC_SETPROP | MC_GETPROP | MC_PROPINFO,
        bridge_m_getstat,
        bridge_m_start,
        bridge_m_stop,
        bridge_m_setpromisc,
        bridge_m_multicst,
        bridge_m_unicst,
        bridge_m_tx,
        NULL,   /* reserved */
        NULL,   /* ioctl */
        NULL,   /* getcapab */
        NULL,   /* open */
        NULL,   /* close */
        bridge_m_setprop,
        bridge_m_getprop,
        bridge_m_propinfo
};
 551 
 552 /*
 553  * Create kstats from a list.
 554  */
 555 static kstat_t *
 556 kstat_setup(kstat_named_t *knt, const char **names, int nstat,
 557     const char *unitname)
 558 {
 559         kstat_t *ksp;
 560         int i;
 561 
 562         for (i = 0; i < nstat; i++)
 563                 kstat_named_init(&knt[i], names[i], KSTAT_DATA_UINT64);
 564 
 565         ksp = kstat_create_zone(BRIDGE_DEV_NAME, 0, unitname, "net",
 566             KSTAT_TYPE_NAMED, nstat, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
 567         if (ksp != NULL) {
 568                 ksp->ks_data = knt;
 569                 kstat_install(ksp);
 570         }
 571         return (ksp);
 572 }
 573 
 574 /*
 575  * Find an existing bridge_mac_t structure or allocate a new one for the given
 576  * bridge instance.  This creates the mac driver instance that snoop can use.
 577  */
 578 static int
 579 bmac_alloc(bridge_inst_t *bip, bridge_mac_t **bmacp)
 580 {
 581         bridge_mac_t *bmp, *bnew;
 582         mac_register_t *mac;
 583         int err;
 584 
 585         *bmacp = NULL;
 586         if ((mac = mac_alloc(MAC_VERSION)) == NULL)
 587                 return (EINVAL);
 588 
 589         bnew = kmem_zalloc(sizeof (*bnew), KM_SLEEP);
 590 
 591         rw_enter(&bmac_rwlock, RW_WRITER);
 592         for (bmp = list_head(&bmac_list); bmp != NULL;
 593             bmp = list_next(&bmac_list, bmp)) {
 594                 if (strcmp(bip->bi_name, bmp->bm_name) == 0) {
 595                         ASSERT(bmp->bm_inst == NULL);
 596                         bmp->bm_inst = bip;
 597                         rw_exit(&bmac_rwlock);
 598                         kmem_free(bnew, sizeof (*bnew));
 599                         mac_free(mac);
 600                         *bmacp = bmp;
 601                         return (0);
 602                 }
 603         }
 604 
 605         mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
 606         mac->m_driver = bnew;
 607         mac->m_dip = bridge_dev_info;
 608         mac->m_instance = (uint_t)-1;
 609         mac->m_src_addr = (uint8_t *)zero_addr;
 610         mac->m_callbacks = &bridge_m_callbacks;
 611 
 612         /*
 613          * Note that the SDU limits are irrelevant, as nobody transmits on the
 614          * bridge node itself.  It's mainly for monitoring but we allow
 615          * setting the bridge MTU for quick transition of all links part of the
 616          * bridge to a new MTU.
 617          */
 618         mac->m_min_sdu = 1;
 619         mac->m_max_sdu = 1500;
 620         err = mac_register(mac, &bnew->bm_mh);
 621         mac_free(mac);
 622         if (err != 0) {
 623                 rw_exit(&bmac_rwlock);
 624                 kmem_free(bnew, sizeof (*bnew));
 625                 return (err);
 626         }
 627 
 628         bnew->bm_inst = bip;
 629         (void) strcpy(bnew->bm_name, bip->bi_name);
 630         if (list_is_empty(&bmac_list)) {
 631                 bridge_timerid = timeout(bridge_timer, NULL,
 632                     bridge_scan_interval);
 633         }
 634         list_insert_tail(&bmac_list, bnew);
 635         rw_exit(&bmac_rwlock);
 636 
 637         /*
 638          * Mark the MAC as unable to go "active" so that only passive clients
 639          * (such as snoop) can bind to it.
 640          */
 641         mac_no_active(bnew->bm_mh);
 642         *bmacp = bnew;
 643         return (0);
 644 }
 645 
 646 /*
 647  * Disconnect the given bridge_mac_t from its bridge instance.  The bridge
 648  * instance is going away.  The mac instance can't go away until the clients
 649  * are gone (see bridge_timer).
 650  */
 651 static void
 652 bmac_disconnect(bridge_mac_t *bmp)
 653 {
 654         bridge_inst_t *bip;
 655 
 656         bmp->bm_linkstate = LINK_STATE_DOWN;
 657         mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
 658 
 659         rw_enter(&bmac_rwlock, RW_READER);
 660         bip = bmp->bm_inst;
 661         bip->bi_mac = NULL;
 662         bmp->bm_inst = NULL;
 663         rw_exit(&bmac_rwlock);
 664 }
 665 
 666 /* This is used by the avl trees to sort forwarding table entries */
 667 static int
 668 fwd_compare(const void *addr1, const void *addr2)
 669 {
 670         const bridge_fwd_t *fwd1 = addr1;
 671         const bridge_fwd_t *fwd2 = addr2;
 672         int diff = memcmp(fwd1->bf_dest, fwd2->bf_dest, ETHERADDRL);
 673 
 674         if (diff != 0)
 675                 return (diff > 0 ? 1 : -1);
 676 
 677         if ((fwd1->bf_flags ^ fwd2->bf_flags) & BFF_VLANLOCAL) {
 678                 if (fwd1->bf_vlanid > fwd2->bf_vlanid)
 679                         return (1);
 680                 else if (fwd1->bf_vlanid < fwd2->bf_vlanid)
 681                         return (-1);
 682         }
 683         return (0);
 684 }
 685 
 686 static void
 687 inst_free(bridge_inst_t *bip)
 688 {
 689         ASSERT(bip->bi_mac == NULL);
 690         rw_destroy(&bip->bi_rwlock);
 691         list_destroy(&bip->bi_links);
 692         cv_destroy(&bip->bi_linkwait);
 693         avl_destroy(&bip->bi_fwd);
 694         if (bip->bi_ksp != NULL)
 695                 kstat_delete(bip->bi_ksp);
 696         kmem_free(bip, sizeof (*bip));
 697 }
 698 
 699 static bridge_inst_t *
 700 inst_alloc(const char *bridge)
 701 {
 702         bridge_inst_t *bip;
 703 
 704         bip = kmem_zalloc(sizeof (*bip), KM_SLEEP);
 705         bip->bi_refs = 1;
 706         (void) strcpy(bip->bi_name, bridge);
 707         rw_init(&bip->bi_rwlock, NULL, RW_DRIVER, NULL);
 708         list_create(&bip->bi_links, sizeof (bridge_link_t),
 709             offsetof(bridge_link_t, bl_node));
 710         cv_init(&bip->bi_linkwait, NULL, CV_DRIVER, NULL);
 711         avl_create(&bip->bi_fwd, fwd_compare, sizeof (bridge_fwd_t),
 712             offsetof(bridge_fwd_t, bf_node));
 713         return (bip);
 714 }
 715 
 716 static bridge_inst_t *
 717 bridge_find_name(const char *bridge)
 718 {
 719         bridge_inst_t *bip;
 720 
 721         mutex_enter(&inst_lock);
 722         for (bip = list_head(&inst_list); bip != NULL;
 723             bip = list_next(&inst_list, bip)) {
 724                 if (!(bip->bi_flags & BIF_SHUTDOWN) &&
 725                     strcmp(bridge, bip->bi_name) == 0) {
 726                         atomic_inc_uint(&bip->bi_refs);
 727                         break;
 728                 }
 729         }
 730         mutex_exit(&inst_lock);
 731 
 732         return (bip);
 733 }
 734 
 735 static int
 736 bridge_create(datalink_id_t linkid, const char *bridge, bridge_inst_t **bipc,
 737     cred_t *cred)
 738 {
 739         bridge_inst_t *bip, *bipnew;
 740         bridge_mac_t *bmp = NULL;
 741         int err;
 742 
 743         *bipc = NULL;
 744         bipnew = inst_alloc(bridge);
 745 
 746         mutex_enter(&inst_lock);
 747 lookup_retry:
 748         for (bip = list_head(&inst_list); bip != NULL;
 749             bip = list_next(&inst_list, bip)) {
 750                 if (strcmp(bridge, bip->bi_name) == 0)
 751                         break;
 752         }
 753 
 754         /* This should not take long; if it does, we've got a design problem */
 755         if (bip != NULL && (bip->bi_flags & BIF_SHUTDOWN)) {
 756                 cv_wait(&inst_cv, &inst_lock);
 757                 goto lookup_retry;
 758         }
 759 
 760         if (bip == NULL) {
 761                 bip = bipnew;
 762                 bipnew = NULL;
 763                 list_insert_tail(&inst_list, bip);
 764         }
 765 
 766         mutex_exit(&inst_lock);
 767         if (bipnew != NULL) {
 768                 inst_free(bipnew);
 769                 return (EEXIST);
 770         }
 771 
 772         bip->bi_ksp = kstat_setup((kstat_named_t *)&bip->bi_kstats,
 773             inst_kstats_list, Dim(inst_kstats_list), bip->bi_name);
 774 
 775         err = bmac_alloc(bip, &bmp);
 776         if ((bip->bi_mac = bmp) == NULL)
 777                 goto fail_create;
 778 
 779         /*
 780          * bm_inst is set, so the timer cannot yank the DLS rug from under us.
 781          * No extra locking is needed here.
 782          */
 783         if (!(bmp->bm_flags & BMF_DLS)) {
 784                 err = dls_devnet_create(bmp->bm_mh, linkid, crgetzoneid(cred));
 785                 if (err != 0)
 786                         goto fail_create;
 787                 bmp->bm_flags |= BMF_DLS;
 788         }
 789 
 790         bip->bi_dev = makedevice(bridge_major, mac_minor(bmp->bm_mh));
 791         *bipc = bip;
 792         return (0);
 793 
 794 fail_create:
 795         ASSERT(bip->bi_trilldata == NULL);
 796         bip->bi_flags |= BIF_SHUTDOWN;
 797         bridge_unref(bip);
 798         return (err);
 799 }
 800 
 801 static void
 802 bridge_unref(bridge_inst_t *bip)
 803 {
 804         if (atomic_dec_uint_nv(&bip->bi_refs) == 0) {
 805                 ASSERT(bip->bi_flags & BIF_SHUTDOWN);
 806                 /* free up mac for reuse before leaving global list */
 807                 if (bip->bi_mac != NULL)
 808                         bmac_disconnect(bip->bi_mac);
 809                 mutex_enter(&inst_lock);
 810                 list_remove(&inst_list, bip);
 811                 cv_broadcast(&inst_cv);
 812                 mutex_exit(&inst_lock);
 813                 inst_free(bip);
 814         }
 815 }
 816 
 817 /*
 818  * Stream instances are used only for allocating bridges and serving as a
 819  * control node.  They serve no data-handling function.
 820  */
 821 static bridge_stream_t *
 822 stream_alloc(void)
 823 {
 824         bridge_stream_t *bsp;
 825         minor_t mn;
 826 
 827         if ((mn = mac_minor_hold(B_FALSE)) == 0)
 828                 return (NULL);
 829         bsp = kmem_zalloc(sizeof (*bsp), KM_SLEEP);
 830         bsp->bs_minor = mn;
 831         return (bsp);
 832 }
 833 
 834 static void
 835 stream_free(bridge_stream_t *bsp)
 836 {
 837         mac_minor_rele(bsp->bs_minor);
 838         kmem_free(bsp, sizeof (*bsp));
 839 }
 840 
 841 /* Reference hold/release functions for STREAMS-related taskq */
 842 static void
 843 stream_ref(bridge_stream_t *bsp)
 844 {
 845         mutex_enter(&stream_ref_lock);
 846         bsp->bs_taskq_cnt++;
 847         mutex_exit(&stream_ref_lock);
 848 }
 849 
 850 static void
 851 stream_unref(bridge_stream_t *bsp)
 852 {
 853         mutex_enter(&stream_ref_lock);
 854         if (--bsp->bs_taskq_cnt == 0)
 855                 cv_broadcast(&stream_ref_cv);
 856         mutex_exit(&stream_ref_lock);
 857 }
 858 
 859 static void
 860 link_free(bridge_link_t *blp)
 861 {
 862         bridge_inst_t *bip = blp->bl_inst;
 863 
 864         ASSERT(!(blp->bl_flags & BLF_FREED));
 865         blp->bl_flags |= BLF_FREED;
 866         if (blp->bl_ksp != NULL)
 867                 kstat_delete(blp->bl_ksp);
 868         if (blp->bl_lfailmp != NULL)
 869                 freeb(blp->bl_lfailmp);
 870         cv_destroy(&blp->bl_trillwait);
 871         mutex_destroy(&blp->bl_trilllock);
 872         kmem_free(blp, sizeof (*blp));
 873         /* Don't unreference the bridge until the MAC is closed */
 874         bridge_unref(bip);
 875 }
 876 
 877 static void
 878 link_unref(bridge_link_t *blp)
 879 {
 880         if (atomic_dec_uint_nv(&blp->bl_refs) == 0) {
 881                 bridge_inst_t *bip = blp->bl_inst;
 882 
 883                 ASSERT(blp->bl_flags & BLF_DELETED);
 884                 rw_enter(&bip->bi_rwlock, RW_WRITER);
 885                 if (blp->bl_flags & BLF_LINK_ADDED)
 886                         list_remove(&bip->bi_links, blp);
 887                 rw_exit(&bip->bi_rwlock);
 888                 if (bip->bi_trilldata != NULL && list_is_empty(&bip->bi_links))
 889                         cv_broadcast(&bip->bi_linkwait);
 890                 link_free(blp);
 891         }
 892 }
 893 
 894 static bridge_fwd_t *
 895 fwd_alloc(const uint8_t *addr, uint_t nlinks, uint16_t nick)
 896 {
 897         bridge_fwd_t *bfp;
 898 
 899         bfp = kmem_zalloc(sizeof (*bfp) + (nlinks * sizeof (bridge_link_t *)),
 900             KM_NOSLEEP);
 901         if (bfp != NULL) {
 902                 bcopy(addr, bfp->bf_dest, ETHERADDRL);
 903                 bfp->bf_lastheard = ddi_get_lbolt();
 904                 bfp->bf_maxlinks = nlinks;
 905                 bfp->bf_links = (bridge_link_t **)(bfp + 1);
 906                 bfp->bf_trill_nick = nick;
 907         }
 908         return (bfp);
 909 }
 910 
 911 static bridge_fwd_t *
 912 fwd_find(bridge_inst_t *bip, const uint8_t *addr, uint16_t vlanid)
 913 {
 914         bridge_fwd_t *bfp, *vbfp;
 915         bridge_fwd_t match;
 916 
 917         bcopy(addr, match.bf_dest, ETHERADDRL);
 918         match.bf_flags = 0;
 919         rw_enter(&bip->bi_rwlock, RW_READER);
 920         if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
 921                 if (bfp->bf_vlanid != vlanid && bfp->bf_vcnt > 0) {
 922                         match.bf_vlanid = vlanid;
 923                         match.bf_flags = BFF_VLANLOCAL;
 924                         vbfp = avl_find(&bip->bi_fwd, &match, NULL);
 925                         if (vbfp != NULL)
 926                                 bfp = vbfp;
 927                 }
 928                 atomic_inc_uint(&bfp->bf_refs);
 929         }
 930         rw_exit(&bip->bi_rwlock);
 931         return (bfp);
 932 }
 933 
 934 static void
 935 fwd_free(bridge_fwd_t *bfp)
 936 {
 937         uint_t i;
 938         bridge_inst_t *bip = bfp->bf_links[0]->bl_inst;
 939 
 940         KIDECR(bki_count);
 941         for (i = 0; i < bfp->bf_nlinks; i++)
 942                 link_unref(bfp->bf_links[i]);
 943         kmem_free(bfp,
 944             sizeof (*bfp) + bfp->bf_maxlinks * sizeof (bridge_link_t *));
 945 }
 946 
 947 static void
 948 fwd_unref(bridge_fwd_t *bfp)
 949 {
 950         if (atomic_dec_uint_nv(&bfp->bf_refs) == 0) {
 951                 ASSERT(!(bfp->bf_flags & BFF_INTREE));
 952                 fwd_free(bfp);
 953         }
 954 }
 955 
 956 static void
 957 fwd_delete(bridge_fwd_t *bfp)
 958 {
 959         bridge_inst_t *bip;
 960         bridge_fwd_t *bfpzero;
 961 
 962         if (bfp->bf_flags & BFF_INTREE) {
 963                 ASSERT(bfp->bf_nlinks > 0);
 964                 bip = bfp->bf_links[0]->bl_inst;
 965                 rw_enter(&bip->bi_rwlock, RW_WRITER);
 966                 /* Another thread could beat us to this */
 967                 if (bfp->bf_flags & BFF_INTREE) {
 968                         avl_remove(&bip->bi_fwd, bfp);
 969                         bfp->bf_flags &= ~BFF_INTREE;
 970                         if (bfp->bf_flags & BFF_VLANLOCAL) {
 971                                 bfp->bf_flags &= ~BFF_VLANLOCAL;
 972                                 bfpzero = avl_find(&bip->bi_fwd, bfp, NULL);
 973                                 if (bfpzero != NULL && bfpzero->bf_vcnt > 0)
 974                                         bfpzero->bf_vcnt--;
 975                         }
 976                         rw_exit(&bip->bi_rwlock);
 977                         fwd_unref(bfp);         /* no longer in avl tree */
 978                 } else {
 979                         rw_exit(&bip->bi_rwlock);
 980                 }
 981         }
 982 }
 983 
 984 static boolean_t
 985 fwd_insert(bridge_inst_t *bip, bridge_fwd_t *bfp)
 986 {
 987         avl_index_t idx;
 988         boolean_t retv;
 989 
 990         rw_enter(&bip->bi_rwlock, RW_WRITER);
 991         if (!(bip->bi_flags & BIF_SHUTDOWN) &&
 992             avl_numnodes(&bip->bi_fwd) < bip->bi_tablemax &&
 993             avl_find(&bip->bi_fwd, bfp, &idx) == NULL) {
 994                 avl_insert(&bip->bi_fwd, bfp, idx);
 995                 bfp->bf_flags |= BFF_INTREE;
 996                 atomic_inc_uint(&bfp->bf_refs);  /* avl entry */
 997                 retv = B_TRUE;
 998         } else {
 999                 retv = B_FALSE;
1000         }
1001         rw_exit(&bip->bi_rwlock);
1002         return (retv);
1003 }
1004 
1005 static void
1006 fwd_update_local(bridge_link_t *blp, const uint8_t *oldaddr,
1007     const uint8_t *newaddr)
1008 {
1009         bridge_inst_t *bip = blp->bl_inst;
1010         bridge_fwd_t *bfp, *bfnew;
1011         bridge_fwd_t match;
1012         avl_index_t idx;
1013         boolean_t drop_ref = B_FALSE;
1014 
1015         if (bcmp(oldaddr, newaddr, ETHERADDRL) == 0)
1016                 return;
1017 
1018         if (bcmp(oldaddr, zero_addr, ETHERADDRL) == 0)
1019                 goto no_old_addr;
1020 
1021         /*
1022          * Find the previous entry, and remove our link from it.
1023          */
1024         bcopy(oldaddr, match.bf_dest, ETHERADDRL);
1025         rw_enter(&bip->bi_rwlock, RW_WRITER);
1026         if ((bfp = avl_find(&bip->bi_fwd, &match, NULL)) != NULL) {
1027                 int i;
1028 
1029                 /*
1030                  * See if we're in the list, and remove if so.
1031                  */
1032                 for (i = 0; i < bfp->bf_nlinks; i++) {
1033                         if (bfp->bf_links[i] == blp) {
1034                                 /*
1035                                  * We assume writes are atomic, so no special
1036                                  * MT handling is needed.  The list length is
1037                                  * decremented first, and then we remove
1038                                  * entries.
1039                                  */
1040                                 bfp->bf_nlinks--;
1041                                 for (; i < bfp->bf_nlinks; i++)
1042                                         bfp->bf_links[i] = bfp->bf_links[i + 1];
1043                                 drop_ref = B_TRUE;
1044                                 break;
1045                         }
1046                 }
1047                 /* If no more links, then remove and free up */
1048                 if (bfp->bf_nlinks == 0) {
1049                         avl_remove(&bip->bi_fwd, bfp);
1050                         bfp->bf_flags &= ~BFF_INTREE;
1051                 } else {
1052                         bfp = NULL;
1053                 }
1054         }
1055         rw_exit(&bip->bi_rwlock);
1056         if (bfp != NULL)
1057                 fwd_unref(bfp);         /* no longer in avl tree */
1058 
1059         /*
1060          * Now get the new link address and add this link to the list.  The
1061          * list should be of length 1 unless the user has configured multiple
1062          * NICs with the same address.  (That's an incorrect configuration, but
1063          * we support it anyway.)
1064          */
1065 no_old_addr:
1066         bfp = NULL;
1067         if ((bip->bi_flags & BIF_SHUTDOWN) ||
1068             bcmp(newaddr, zero_addr, ETHERADDRL) == 0)
1069                 goto no_new_addr;
1070 
1071         bcopy(newaddr, match.bf_dest, ETHERADDRL);
1072         rw_enter(&bip->bi_rwlock, RW_WRITER);
1073         if ((bfp = avl_find(&bip->bi_fwd, &match, &idx)) == NULL) {
1074                 bfnew = fwd_alloc(newaddr, 1, RBRIDGE_NICKNAME_NONE);
1075                 if (bfnew != NULL)
1076                         KIINCR(bki_count);
1077         } else if (bfp->bf_nlinks < bfp->bf_maxlinks) {
1078                 /* special case: link fits in existing entry */
1079                 bfnew = bfp;
1080         } else {
1081                 bfnew = fwd_alloc(newaddr, bfp->bf_nlinks + 1,
1082                     RBRIDGE_NICKNAME_NONE);
1083                 if (bfnew != NULL) {
1084                         KIINCR(bki_count);
1085                         avl_remove(&bip->bi_fwd, bfp);
1086                         bfp->bf_flags &= ~BFF_INTREE;
1087                         bfnew->bf_nlinks = bfp->bf_nlinks;
1088                         bcopy(bfp->bf_links, bfnew->bf_links,
1089                             bfp->bf_nlinks * sizeof (bfp));
1090                         /* reset the idx value due to removal above */
1091                         (void) avl_find(&bip->bi_fwd, &match, &idx);
1092                 }
1093         }
1094 
1095         if (bfnew != NULL) {
1096                 bfnew->bf_links[bfnew->bf_nlinks++] = blp;
1097                 if (drop_ref)
1098                         drop_ref = B_FALSE;
1099                 else
1100                         atomic_inc_uint(&blp->bl_refs);  /* bf_links entry */
1101 
1102                 if (bfnew != bfp) {
1103                         /* local addresses are not subject to table limits */
1104                         avl_insert(&bip->bi_fwd, bfnew, idx);
1105                         bfnew->bf_flags |= (BFF_INTREE | BFF_LOCALADDR);
1106                         atomic_inc_uint(&bfnew->bf_refs);        /* avl entry */
1107                 }
1108         }
1109         rw_exit(&bip->bi_rwlock);
1110 
1111 no_new_addr:
1112         /*
1113          * If we found an existing entry and we replaced it with a new one,
1114          * then drop the table reference from the old one.  We removed it from
1115          * the AVL tree above.
1116          */
1117         if (bfnew != NULL && bfp != NULL && bfnew != bfp)
1118                 fwd_unref(bfp);
1119 
1120         /* Account for removed entry. */
1121         if (drop_ref)
1122                 link_unref(blp);
1123 }
1124 
1125 static void
1126 bridge_new_unicst(bridge_link_t *blp)
1127 {
1128         uint8_t new_mac[ETHERADDRL];
1129 
1130         mac_unicast_primary_get(blp->bl_mh, new_mac);
1131         fwd_update_local(blp, blp->bl_local_mac, new_mac);
1132         bcopy(new_mac, blp->bl_local_mac, ETHERADDRL);
1133 }
1134 
1135 /*
1136  * We must shut down a link prior to freeing it, and doing that requires
1137  * blocking to wait for running MAC threads while holding a reference.  This is
1138  * run from a taskq to accomplish proper link shutdown followed by reference
1139  * drop.
1140  */
1141 static void
1142 link_shutdown(void *arg)
1143 {
1144         bridge_link_t *blp = arg;
1145         mac_handle_t mh = blp->bl_mh;
1146         bridge_inst_t *bip;
1147         bridge_fwd_t *bfp, *bfnext;
1148         avl_tree_t fwd_scavenge;
1149         int i;
1150 
1151         /*
1152          * This link is being destroyed.  Notify TRILL now that it's no longer
1153          * possible to send packets.  Data packets may still arrive until TRILL
1154          * calls bridge_trill_lnunref.
1155          */
1156         if (blp->bl_trilldata != NULL)
1157                 trill_lndstr_fn(blp->bl_trilldata, blp);
1158 
1159         if (blp->bl_flags & BLF_PROM_ADDED)
1160                 (void) mac_promisc_remove(blp->bl_mphp);
1161 
1162         if (blp->bl_flags & BLF_SET_BRIDGE)
1163                 mac_bridge_clear(mh, (mac_handle_t)blp);
1164 
1165         if (blp->bl_flags & BLF_MARGIN_ADDED) {
1166                 (void) mac_notify_remove(blp->bl_mnh, B_TRUE);
1167                 (void) mac_margin_remove(mh, blp->bl_margin);
1168         }
1169 
1170         /* Tell the clients the real link state when we leave */
1171         mac_link_redo(blp->bl_mh,
1172             mac_stat_get(blp->bl_mh, MAC_STAT_LOWLINK_STATE));
1173 
1174         /* Destroy all of the forwarding entries related to this link */
1175         avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1176             offsetof(bridge_fwd_t, bf_node));
1177         bip = blp->bl_inst;
1178         rw_enter(&bip->bi_rwlock, RW_WRITER);
1179         bfnext = avl_first(&bip->bi_fwd);
1180         while ((bfp = bfnext) != NULL) {
1181                 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1182                 for (i = 0; i < bfp->bf_nlinks; i++) {
1183                         if (bfp->bf_links[i] == blp)
1184                                 break;
1185                 }
1186                 if (i >= bfp->bf_nlinks)
1187                         continue;
1188                 if (bfp->bf_nlinks > 1) {
1189                         /* note that this can't be the last reference */
1190                         link_unref(blp);
1191                         bfp->bf_nlinks--;
1192                         for (; i < bfp->bf_nlinks; i++)
1193                                 bfp->bf_links[i] = bfp->bf_links[i + 1];
1194                 } else {
1195                         ASSERT(bfp->bf_flags & BFF_INTREE);
1196                         avl_remove(&bip->bi_fwd, bfp);
1197                         bfp->bf_flags &= ~BFF_INTREE;
1198                         avl_add(&fwd_scavenge, bfp);
1199                 }
1200         }
1201         rw_exit(&bip->bi_rwlock);
1202         bfnext = avl_first(&fwd_scavenge);
1203         while ((bfp = bfnext) != NULL) {
1204                 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1205                 avl_remove(&fwd_scavenge, bfp);
1206                 fwd_unref(bfp);
1207         }
1208         avl_destroy(&fwd_scavenge);
1209 
1210         if (blp->bl_flags & BLF_CLIENT_OPEN)
1211                 mac_client_close(blp->bl_mch, 0);
1212 
1213         mac_close(mh);
1214 
1215         /*
1216          * We are now completely removed from the active list, so drop the
1217          * reference (see bridge_add_link).
1218          */
1219         link_unref(blp);
1220 }
1221 
1222 static void
1223 shutdown_inst(bridge_inst_t *bip)
1224 {
1225         bridge_link_t *blp, *blnext;
1226         bridge_fwd_t *bfp;
1227 
1228         mutex_enter(&inst_lock);
1229         if (bip->bi_flags & BIF_SHUTDOWN) {
1230                 mutex_exit(&inst_lock);
1231                 return;
1232         }
1233 
1234         /*
1235          * Once on the inst_list, the bridge instance must not leave that list
1236          * without having the shutdown flag set first.  When the shutdown flag
1237          * is set, we own the list reference, so we must drop it before
1238          * returning.
1239          */
1240         bip->bi_flags |= BIF_SHUTDOWN;
1241         mutex_exit(&inst_lock);
1242 
1243         bip->bi_control = NULL;
1244 
1245         rw_enter(&bip->bi_rwlock, RW_READER);
1246         blnext = list_head(&bip->bi_links);
1247         while ((blp = blnext) != NULL) {
1248                 blnext = list_next(&bip->bi_links, blp);
1249                 if (!(blp->bl_flags & BLF_DELETED)) {
1250                         blp->bl_flags |= BLF_DELETED;
1251                         (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
1252                             blp, DDI_SLEEP);
1253                 }
1254         }
1255         while ((bfp = avl_first(&bip->bi_fwd)) != NULL) {
1256                 atomic_inc_uint(&bfp->bf_refs);
1257                 rw_exit(&bip->bi_rwlock);
1258                 fwd_delete(bfp);
1259                 fwd_unref(bfp);
1260                 rw_enter(&bip->bi_rwlock, RW_READER);
1261         }
1262         rw_exit(&bip->bi_rwlock);
1263 
1264         /*
1265          * This bridge is being destroyed.  Notify TRILL once all of the
1266          * links are all gone.
1267          */
1268         mutex_enter(&inst_lock);
1269         while (bip->bi_trilldata != NULL && !list_is_empty(&bip->bi_links))
1270                 cv_wait(&bip->bi_linkwait, &inst_lock);
1271         mutex_exit(&inst_lock);
1272         if (bip->bi_trilldata != NULL)
1273                 trill_brdstr_fn(bip->bi_trilldata, bip);
1274 
1275         bridge_unref(bip);
1276 }
1277 
1278 /*
1279  * This is called once by the TRILL module when it starts up.  It just sets the
1280  * global TRILL callback function pointers -- data transmit/receive and bridge
1281  * and link destroy notification.  There's only one TRILL module, so only one
1282  * registration is needed.
1283  *
1284  * TRILL should call this function with NULL pointers before unloading.  It
1285  * must not do so before dropping all references to bridges and links.  We
1286  * assert that this is true on debug builds.
1287  */
1288 void
1289 bridge_trill_register_cb(trill_recv_pkt_t recv_fn, trill_encap_pkt_t encap_fn,
1290     trill_br_dstr_t brdstr_fn, trill_ln_dstr_t lndstr_fn)
1291 {
1292 #ifdef DEBUG
1293         if (recv_fn == NULL && trill_recv_fn != NULL) {
1294                 bridge_inst_t *bip;
1295                 bridge_link_t *blp;
1296 
1297                 mutex_enter(&inst_lock);
1298                 for (bip = list_head(&inst_list); bip != NULL;
1299                     bip = list_next(&inst_list, bip)) {
1300                         ASSERT(bip->bi_trilldata == NULL);
1301                         rw_enter(&bip->bi_rwlock, RW_READER);
1302                         for (blp = list_head(&bip->bi_links); blp != NULL;
1303                             blp = list_next(&bip->bi_links, blp)) {
1304                                 ASSERT(blp->bl_trilldata == NULL);
1305                         }
1306                         rw_exit(&bip->bi_rwlock);
1307                 }
1308                 mutex_exit(&inst_lock);
1309         }
1310 #endif
1311         trill_recv_fn = recv_fn;
1312         trill_encap_fn = encap_fn;
1313         trill_brdstr_fn = brdstr_fn;
1314         trill_lndstr_fn = lndstr_fn;
1315 }
1316 
1317 /*
1318  * This registers the TRILL instance pointer with a bridge.  Before this
1319  * pointer is set, the forwarding, TRILL receive, and bridge destructor
1320  * functions won't be called.
1321  *
1322  * TRILL holds a reference on a bridge with this call.  It must free the
1323  * reference by calling the unregister function below.
1324  */
1325 bridge_inst_t *
1326 bridge_trill_brref(const char *bname, void *ptr)
1327 {
1328         char bridge[MAXLINKNAMELEN];
1329         bridge_inst_t *bip;
1330 
1331         (void) snprintf(bridge, MAXLINKNAMELEN, "%s0", bname);
1332         bip = bridge_find_name(bridge);
1333         if (bip != NULL) {
1334                 ASSERT(bip->bi_trilldata == NULL && ptr != NULL);
1335                 bip->bi_trilldata = ptr;
1336         }
1337         return (bip);
1338 }
1339 
1340 void
1341 bridge_trill_brunref(bridge_inst_t *bip)
1342 {
1343         ASSERT(bip->bi_trilldata != NULL);
1344         bip->bi_trilldata = NULL;
1345         bridge_unref(bip);
1346 }
1347 
1348 /*
1349  * TRILL calls this function when referencing a particular link on a bridge.
1350  *
1351  * It holds a reference on the link, so TRILL must clear out the reference when
1352  * it's done with the link (on unbinding).
1353  */
1354 bridge_link_t *
1355 bridge_trill_lnref(bridge_inst_t *bip, datalink_id_t linkid, void *ptr)
1356 {
1357         bridge_link_t *blp;
1358 
1359         ASSERT(ptr != NULL);
1360         rw_enter(&bip->bi_rwlock, RW_READER);
1361         for (blp = list_head(&bip->bi_links); blp != NULL;
1362             blp = list_next(&bip->bi_links, blp)) {
1363                 if (!(blp->bl_flags & BLF_DELETED) &&
1364                     blp->bl_linkid == linkid && blp->bl_trilldata == NULL) {
1365                         blp->bl_trilldata = ptr;
1366                         blp->bl_flags &= ~BLF_TRILLACTIVE;
1367                         (void) memset(blp->bl_afs, 0, sizeof (blp->bl_afs));
1368                         atomic_inc_uint(&blp->bl_refs);
1369                         break;
1370                 }
1371         }
1372         rw_exit(&bip->bi_rwlock);
1373         return (blp);
1374 }
1375 
1376 void
1377 bridge_trill_lnunref(bridge_link_t *blp)
1378 {
1379         mutex_enter(&blp->bl_trilllock);
1380         ASSERT(blp->bl_trilldata != NULL);
1381         blp->bl_trilldata = NULL;
1382         blp->bl_flags &= ~BLF_TRILLACTIVE;
1383         while (blp->bl_trillthreads > 0)
1384                 cv_wait(&blp->bl_trillwait, &blp->bl_trilllock);
1385         mutex_exit(&blp->bl_trilllock);
1386         (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
1387         link_unref(blp);
1388 }
1389 
1390 /*
1391  * This periodic timer performs three functions:
1392  *  1. It scans the list of learned forwarding entries, and removes ones that
1393  *     haven't been heard from in a while.  The time limit is backed down if
1394  *     we're above the configured table limit.
1395  *  2. It walks the links and decays away the bl_learns counter.
1396  *  3. It scans the observability node entries looking for ones that can be
1397  *     freed up.
1398  */
1399 /* ARGSUSED */
1400 static void
1401 bridge_timer(void *arg)
1402 {
1403         bridge_inst_t *bip;
1404         bridge_fwd_t *bfp, *bfnext;
1405         bridge_mac_t *bmp, *bmnext;
1406         bridge_link_t *blp;
1407         int err;
1408         datalink_id_t tmpid;
1409         avl_tree_t fwd_scavenge;
1410         clock_t age_limit;
1411         uint32_t ldecay;
1412 
1413         avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
1414             offsetof(bridge_fwd_t, bf_node));
1415         mutex_enter(&inst_lock);
1416         for (bip = list_head(&inst_list); bip != NULL;
1417             bip = list_next(&inst_list, bip)) {
1418                 if (bip->bi_flags & BIF_SHUTDOWN)
1419                         continue;
1420                 rw_enter(&bip->bi_rwlock, RW_WRITER);
1421                 /* compute scaled maximum age based on table limit */
1422                 if (avl_numnodes(&bip->bi_fwd) > bip->bi_tablemax)
1423                         bip->bi_tshift++;
1424                 else
1425                         bip->bi_tshift = 0;
1426                 if ((age_limit = bridge_fwd_age >> bip->bi_tshift) == 0) {
1427                         if (bip->bi_tshift != 0)
1428                                 bip->bi_tshift--;
1429                         age_limit = 1;
1430                 }
1431                 bfnext = avl_first(&bip->bi_fwd);
1432                 while ((bfp = bfnext) != NULL) {
1433                         bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
1434                         if (!(bfp->bf_flags & BFF_LOCALADDR) &&
1435                             (ddi_get_lbolt() - bfp->bf_lastheard) > age_limit) {
1436                                 ASSERT(bfp->bf_flags & BFF_INTREE);
1437                                 avl_remove(&bip->bi_fwd, bfp);
1438                                 bfp->bf_flags &= ~BFF_INTREE;
1439                                 avl_add(&fwd_scavenge, bfp);
1440                         }
1441                 }
1442                 for (blp = list_head(&bip->bi_links); blp != NULL;
1443                     blp = list_next(&bip->bi_links, blp)) {
1444                         ldecay = mac_get_ldecay(blp->bl_mh);
1445                         if (ldecay >= blp->bl_learns)
1446                                 blp->bl_learns = 0;
1447                         else
1448                                 atomic_add_int(&blp->bl_learns, -(int)ldecay);
1449                 }
1450                 rw_exit(&bip->bi_rwlock);
1451                 bfnext = avl_first(&fwd_scavenge);
1452                 while ((bfp = bfnext) != NULL) {
1453                         bfnext = AVL_NEXT(&fwd_scavenge, bfp);
1454                         avl_remove(&fwd_scavenge, bfp);
1455                         KIINCR(bki_expire);
1456                         fwd_unref(bfp); /* drop tree reference */
1457                 }
1458         }
1459         mutex_exit(&inst_lock);
1460         avl_destroy(&fwd_scavenge);
1461 
1462         /*
1463          * Scan the bridge_mac_t entries and try to free up the ones that are
1464          * no longer active.  This must be done by polling, as neither DLS nor
1465          * MAC provides a driver any sort of positive control over clients.
1466          */
1467         rw_enter(&bmac_rwlock, RW_WRITER);
1468         bmnext = list_head(&bmac_list);
1469         while ((bmp = bmnext) != NULL) {
1470                 bmnext = list_next(&bmac_list, bmp);
1471 
1472                 /* ignore active bridges */
1473                 if (bmp->bm_inst != NULL)
1474                         continue;
1475 
1476                 if (bmp->bm_flags & BMF_DLS) {
1477                         err = dls_devnet_destroy(bmp->bm_mh, &tmpid, B_FALSE);
1478                         ASSERT(err == 0 || err == EBUSY);
1479                         if (err == 0)
1480                                 bmp->bm_flags &= ~BMF_DLS;
1481                 }
1482 
1483                 if (!(bmp->bm_flags & BMF_DLS)) {
1484                         err = mac_unregister(bmp->bm_mh);
1485                         ASSERT(err == 0 || err == EBUSY);
1486                         if (err == 0) {
1487                                 list_remove(&bmac_list, bmp);
1488                                 kmem_free(bmp, sizeof (*bmp));
1489                         }
1490                 }
1491         }
1492         if (list_is_empty(&bmac_list)) {
1493                 bridge_timerid = 0;
1494         } else {
1495                 bridge_timerid = timeout(bridge_timer, NULL,
1496                     bridge_scan_interval);
1497         }
1498         rw_exit(&bmac_rwlock);
1499 }
1500 
1501 static int
1502 bridge_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp)
1503 {
1504         bridge_stream_t *bsp;
1505 
1506         if (rq->q_ptr != NULL)
1507                 return (0);
1508 
1509         if (sflag & MODOPEN)
1510                 return (EINVAL);
1511 
1512         /*
1513          * Check the minor node number being opened.  This tells us which
1514          * bridge instance the user wants.
1515          */
1516         if (getminor(*devp) != 0) {
1517                 /*
1518                  * This is a regular DLPI stream for snoop or the like.
1519                  * Redirect it through DLD.
1520                  */
1521                 rq->q_qinfo = &bridge_dld_rinit;
1522                 OTHERQ(rq)->q_qinfo = &bridge_dld_winit;
1523                 return (dld_open(rq, devp, oflag, sflag, credp));
1524         } else {
1525                 /*
1526                  * Allocate the bridge control stream structure.
1527                  */
1528                 if ((bsp = stream_alloc()) == NULL)
1529                         return (ENOSR);
1530                 rq->q_ptr = WR(rq)->q_ptr = (caddr_t)bsp;
1531                 bsp->bs_wq = WR(rq);
1532                 *devp = makedevice(getmajor(*devp), bsp->bs_minor);
1533                 qprocson(rq);
1534                 return (0);
1535         }
1536 }
1537 
1538 /*
1539  * This is used only for bridge control streams.  DLPI goes through dld
1540  * instead.
1541  */
1542 static int
1543 bridge_close(queue_t *rq)
1544 {
1545         bridge_stream_t *bsp = rq->q_ptr;
1546         bridge_inst_t *bip;
1547 
1548         /*
1549          * Wait for any stray taskq (add/delete link) entries related to this
1550          * stream to leave the system.
1551          */
1552         mutex_enter(&stream_ref_lock);
1553         while (bsp->bs_taskq_cnt != 0)
1554                 cv_wait(&stream_ref_cv, &stream_ref_lock);
1555         mutex_exit(&stream_ref_lock);
1556 
1557         qprocsoff(rq);
1558         if ((bip = bsp->bs_inst) != NULL)
1559                 shutdown_inst(bip);
1560         rq->q_ptr = WR(rq)->q_ptr = NULL;
1561         stream_free(bsp);
1562         if (bip != NULL)
1563                 bridge_unref(bip);
1564 
1565         return (0);
1566 }
1567 
1568 static void
1569 bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick,
1570     uint16_t vlanid)
1571 {
1572         bridge_inst_t *bip = blp->bl_inst;
1573         bridge_fwd_t *bfp, *bfpnew;
1574         int i;
1575         boolean_t replaced = B_FALSE;
1576 
1577         /* Ignore multi-destination address used as source; it's nonsense. */
1578         if (*saddr & 1)
1579                 return;
1580 
1581         /*
1582          * If the source is known, then check whether it belongs on this link.
1583          * If not, and this isn't a fixed local address, then we've detected a
1584          * move.  If it's not known, learn it.
1585          */
1586         if ((bfp = fwd_find(bip, saddr, vlanid)) != NULL) {
1587                 /*
1588                  * If the packet has a fixed local source address, then there's
1589                  * nothing we can learn.  We must quit.  If this was a received
1590                  * packet, then the sender has stolen our address, but there's
1591                  * nothing we can do.  If it's a transmitted packet, then
1592                  * that's the normal case.
1593                  */
1594                 if (bfp->bf_flags & BFF_LOCALADDR) {
1595                         fwd_unref(bfp);
1596                         return;
1597                 }
1598 
1599                 /*
1600                  * Check if the link (and TRILL sender, if any) being used is
1601                  * among the ones registered for this address.  If so, then
1602                  * this is information that we already know.
1603                  */
1604                 if (bfp->bf_trill_nick == ingress_nick) {
1605                         for (i = 0; i < bfp->bf_nlinks; i++) {
1606                                 if (bfp->bf_links[i] == blp) {
1607                                         bfp->bf_lastheard = ddi_get_lbolt();
1608                                         fwd_unref(bfp);
1609                                         return;
1610                                 }
1611                         }
1612                 }
1613         }
1614 
1615         /*
1616          * Note that we intentionally "unlearn" things that appear to be under
1617          * attack on this link.  The forwarding cache is a negative thing for
1618          * security -- it disables reachability as a performance optimization
1619          * -- so leaving out entries optimizes for success and defends against
1620          * the attack.  Thus, the bare increment without a check in the delete
1621          * code above is right.  (And it's ok if we skid over the limit a
1622          * little, so there's no syncronization needed on the test.)
1623          */
1624         if (blp->bl_learns >= mac_get_llimit(blp->bl_mh)) {
1625                 if (bfp != NULL) {
1626                         if (bfp->bf_vcnt == 0)
1627                                 fwd_delete(bfp);
1628                         fwd_unref(bfp);
1629                 }
1630                 return;
1631         }
1632 
1633         atomic_inc_uint(&blp->bl_learns);
1634 
1635         if ((bfpnew = fwd_alloc(saddr, 1, ingress_nick)) == NULL) {
1636                 if (bfp != NULL)
1637                         fwd_unref(bfp);
1638                 return;
1639         }
1640         KIINCR(bki_count);
1641 
1642         if (bfp != NULL) {
1643                 /*
1644                  * If this is a new destination for the same VLAN, then delete
1645                  * so that we can update.  If it's a different VLAN, then we're
1646                  * not going to delete the original.  Split off instead into an
1647                  * IVL entry.
1648                  */
1649                 if (bfp->bf_vlanid == vlanid) {
1650                         /* save the count of IVL duplicates */
1651                         bfpnew->bf_vcnt = bfp->bf_vcnt;
1652 
1653                         /* entry deletes count as learning events */
1654                         atomic_inc_uint(&blp->bl_learns);
1655 
1656                         /* destroy and create anew; node moved */
1657                         fwd_delete(bfp);
1658                         replaced = B_TRUE;
1659                         KIINCR(bki_moved);
1660                 } else {
1661                         bfp->bf_vcnt++;
1662                         bfpnew->bf_flags |= BFF_VLANLOCAL;
1663                 }
1664                 fwd_unref(bfp);
1665         }
1666         bfpnew->bf_links[0] = blp;
1667         bfpnew->bf_nlinks = 1;
1668         atomic_inc_uint(&blp->bl_refs);  /* bf_links entry */
1669         if (!fwd_insert(bip, bfpnew))
1670                 fwd_free(bfpnew);
1671         else if (!replaced)
1672                 KIINCR(bki_source);
1673 }
1674 
1675 /*
1676  * Process the VLAN headers for output on a given link.  There are several
1677  * cases (noting that we don't map VLANs):
1678  *   1. The input packet is good as it is; either
1679  *      a. It has no tag, and output has same PVID
1680  *      b. It has a non-zero priority-only tag for PVID, and b_band is same
1681  *      c. It has a tag with VLAN different from PVID, and b_band is same
1682  *   2. The tag must change: non-zero b_band is different from tag priority
1683  *   3. The packet has a tag and should not (VLAN same as PVID, b_band zero)
1684  *   4. The packet has no tag and needs one:
1685  *      a. VLAN ID same as PVID, but b_band is non-zero
1686  *      b. VLAN ID different from PVID
1687  * We exclude case 1 first, then modify the packet.  Note that output packets
1688  * get a priority set by the mblk, not by the header, because QoS in bridging
1689  * requires priority recalculation at each node.
1690  *
1691  * The passed-in tci is the "impossible" value 0xFFFF when no tag is present.
1692  */
1693 static mblk_t *
1694 reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
1695 {
1696         boolean_t source_has_tag = (tci != 0xFFFF);
1697         mblk_t *mpcopy;
1698         size_t mlen, minlen;
1699         struct ether_vlan_header *evh;
1700         int pri;
1701 
1702         /* This helps centralize error handling in the caller. */
1703         if (mp == NULL)
1704                 return (mp);
1705 
1706         /* No forwarded packet can have hardware checksum enabled */
1707         DB_CKSUMFLAGS(mp) = 0;
1708 
1709         /* Get the no-modification cases out of the way first */
1710         if (!source_has_tag && vlanid == pvid)          /* 1a */
1711                 return (mp);
1712 
1713         pri = VLAN_PRI(tci);
1714         if (source_has_tag && mp->b_band == pri) {
1715                 if (vlanid != pvid)                     /* 1c */
1716                         return (mp);
1717                 if (pri != 0 && VLAN_ID(tci) == 0)      /* 1b */
1718                         return (mp);
1719         }
1720 
1721         /*
1722          * We now know that we must modify the packet.  Prepare for that.  Note
1723          * that if a tag is present, the caller has already done a pullup for
1724          * the VLAN header, so we're good to go.
1725          */
1726         if (MBLKL(mp) < sizeof (struct ether_header)) {
1727                 mpcopy = msgpullup(mp, sizeof (struct ether_header));
1728                 if (mpcopy == NULL) {
1729                         freemsg(mp);
1730                         return (NULL);
1731                 }
1732                 mp = mpcopy;
1733         }
1734         if (DB_REF(mp) > 1 || !IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)) ||
1735             (!source_has_tag && MBLKTAIL(mp) < VLAN_INCR)) {
1736                 minlen = mlen = MBLKL(mp);
1737                 if (!source_has_tag)
1738                         minlen += VLAN_INCR;
1739                 ASSERT(minlen >= sizeof (struct ether_vlan_header));
1740                 /*
1741                  * We're willing to copy some data to avoid fragmentation, but
1742                  * not a lot.
1743                  */
1744                 if (minlen > 256)
1745                         minlen = sizeof (struct ether_vlan_header);
1746                 mpcopy = allocb(minlen, BPRI_MED);
1747                 if (mpcopy == NULL) {
1748                         freemsg(mp);
1749                         return (NULL);
1750                 }
1751                 if (mlen <= minlen) {
1752                         /* We toss the first mblk when we can. */
1753                         bcopy(mp->b_rptr, mpcopy->b_rptr, mlen);
1754                         mpcopy->b_wptr += mlen;
1755                         mpcopy->b_cont = mp->b_cont;
1756                         freeb(mp);
1757                 } else {
1758                         /* If not, then just copy what we need */
1759                         if (!source_has_tag)
1760                                 minlen = sizeof (struct ether_header);
1761                         bcopy(mp->b_rptr, mpcopy->b_rptr, minlen);
1762                         mpcopy->b_wptr += minlen;
1763                         mpcopy->b_cont = mp;
1764                         mp->b_rptr += minlen;
1765                 }
1766                 mp = mpcopy;
1767         }
1768 
1769         /* LINTED: pointer alignment */
1770         evh = (struct ether_vlan_header *)mp->b_rptr;
1771         if (source_has_tag) {
1772                 if (mp->b_band == 0 && vlanid == pvid) {     /* 3 */
1773                         evh->ether_tpid = evh->ether_type;
1774                         mlen = MBLKL(mp);
1775                         if (mlen > sizeof (struct ether_vlan_header))
1776                                 ovbcopy(mp->b_rptr +
1777                                     sizeof (struct ether_vlan_header),
1778                                     mp->b_rptr + sizeof (struct ether_header),
1779                                     mlen - sizeof (struct ether_vlan_header));
1780                         mp->b_wptr -= VLAN_INCR;
1781                 } else {                                        /* 2 */
1782                         if (vlanid == pvid)
1783                                 vlanid = VLAN_ID_NONE;
1784                         tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1785                         evh->ether_tci = htons(tci);
1786                 }
1787         } else {
1788                 /* case 4: no header present, but one is needed */
1789                 mlen = MBLKL(mp);
1790                 if (mlen > sizeof (struct ether_header))
1791                         ovbcopy(mp->b_rptr + sizeof (struct ether_header),
1792                             mp->b_rptr + sizeof (struct ether_vlan_header),
1793                             mlen - sizeof (struct ether_header));
1794                 mp->b_wptr += VLAN_INCR;
1795                 ASSERT(mp->b_wptr <= DB_LIM(mp));
1796                 if (vlanid == pvid)
1797                         vlanid = VLAN_ID_NONE;
1798                 tci = VLAN_TCI(mp->b_band, ETHER_CFI, vlanid);
1799                 evh->ether_type = evh->ether_tpid;
1800                 evh->ether_tpid = htons(ETHERTYPE_VLAN);
1801                 evh->ether_tci = htons(tci);
1802         }
1803         return (mp);
1804 }
1805 
1806 /* Record VLAN information and strip header if requested . */
1807 static void
1808 update_header(mblk_t *mp, mac_header_info_t *hdr_info, boolean_t striphdr)
1809 {
1810         if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
1811                 struct ether_vlan_header *evhp;
1812                 uint16_t ether_type;
1813 
1814                 /* LINTED: alignment */
1815                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1816                 hdr_info->mhi_istagged = B_TRUE;
1817                 hdr_info->mhi_tci = ntohs(evhp->ether_tci);
1818                 if (striphdr) {
1819                         /*
1820                          * For VLAN tagged frames update the ether_type
1821                          * in hdr_info before stripping the header.
1822                          */
1823                         ether_type = ntohs(evhp->ether_type);
1824                         hdr_info->mhi_origsap = ether_type;
1825                         hdr_info->mhi_bindsap = (ether_type > ETHERMTU) ?
1826                             ether_type : DLS_SAP_LLC;
1827                         mp->b_rptr = (uchar_t *)(evhp + 1);
1828                 }
1829         } else {
1830                 hdr_info->mhi_istagged = B_FALSE;
1831                 hdr_info->mhi_tci = VLAN_ID_NONE;
1832                 if (striphdr)
1833                         mp->b_rptr += sizeof (struct ether_header);
1834         }
1835 }
1836 
1837 /*
1838  * Return B_TRUE if we're allowed to send on this link with the given VLAN ID.
1839  */
1840 static boolean_t
1841 bridge_can_send(bridge_link_t *blp, uint16_t vlanid)
1842 {
1843         ASSERT(vlanid != VLAN_ID_NONE);
1844         if (blp->bl_flags & BLF_DELETED)
1845                 return (B_FALSE);
1846         if (blp->bl_trilldata == NULL && blp->bl_state != BLS_FORWARDING)
1847                 return (B_FALSE);
1848         return (BRIDGE_VLAN_ISSET(blp, vlanid) && BRIDGE_AF_ISSET(blp, vlanid));
1849 }
1850 
1851 /*
1852  * This function scans the bridge forwarding tables in order to forward a given
1853  * packet.  If the packet either doesn't need forwarding (the current link is
1854  * correct) or the current link needs a copy as well, then the packet is
1855  * returned to the caller.
1856  *
1857  * If a packet has been decapsulated from TRILL, then it must *NOT* reenter a
1858  * TRILL tunnel.  If the destination points there, then drop instead.
1859  */
1860 static mblk_t *
1861 bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
1862     uint16_t vlanid, uint16_t tci, boolean_t from_trill, boolean_t is_xmit)
1863 {
1864         mblk_t *mpsend, *mpcopy;
1865         bridge_inst_t *bip = blp->bl_inst;
1866         bridge_link_t *blpsend, *blpnext;
1867         bridge_fwd_t *bfp;
1868         uint_t i;
1869         boolean_t selfseen = B_FALSE;
1870         void *tdp;
1871         const uint8_t *daddr = hdr_info->mhi_daddr;
1872 
1873         /*
1874          * Check for the IEEE "reserved" multicast addresses.  Messages sent to
1875          * these addresses are used for link-local control (STP and pause), and
1876          * are never forwarded or redirected.
1877          */
1878         if (daddr[0] == 1 && daddr[1] == 0x80 && daddr[2] == 0xc2 &&
1879             daddr[3] == 0 && daddr[4] == 0 && (daddr[5] & 0xf0) == 0) {
1880                 if (from_trill) {
1881                         freemsg(mp);
1882                         mp = NULL;
1883                 }
1884                 return (mp);
1885         }
1886 
1887         if ((bfp = fwd_find(bip, daddr, vlanid)) != NULL) {
1888 
1889                 /*
1890                  * If trill indicates a destination for this node, then it's
1891                  * clearly not intended for local delivery.  We must tell TRILL
1892                  * to encapsulate, as long as we didn't just decapsulate it.
1893                  */
1894                 if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE) {
1895                         /*
1896                          * Error case: can't reencapsulate if the protocols are
1897                          * working correctly.
1898                          */
1899                         if (from_trill) {
1900                                 freemsg(mp);
1901                                 return (NULL);
1902                         }
1903                         mutex_enter(&blp->bl_trilllock);
1904                         if ((tdp = blp->bl_trilldata) != NULL) {
1905                                 blp->bl_trillthreads++;
1906                                 mutex_exit(&blp->bl_trilllock);
1907                                 update_header(mp, hdr_info, B_FALSE);
1908                                 if (is_xmit)
1909                                         mp = mac_fix_cksum(mp);
1910                                 /* all trill data frames have Inner.VLAN */
1911                                 mp = reform_vlan_header(mp, vlanid, tci, 0);
1912                                 if (mp == NULL) {
1913                                         KIINCR(bki_drops);
1914                                         fwd_unref(bfp);
1915                                         return (NULL);
1916                                 }
1917                                 trill_encap_fn(tdp, blp, hdr_info, mp,
1918                                     bfp->bf_trill_nick);
1919                                 mutex_enter(&blp->bl_trilllock);
1920                                 if (--blp->bl_trillthreads == 0 &&
1921                                     blp->bl_trilldata == NULL)
1922                                         cv_broadcast(&blp->bl_trillwait);
1923                         }
1924                         mutex_exit(&blp->bl_trilllock);
1925 
1926                         /* if TRILL has been disabled, then kill this stray */
1927                         if (tdp == NULL) {
1928                                 freemsg(mp);
1929                                 fwd_delete(bfp);
1930                         }
1931                         fwd_unref(bfp);
1932                         return (NULL);
1933                 }
1934 
1935                 /* find first link we can send on */
1936                 for (i = 0; i < bfp->bf_nlinks; i++) {
1937                         blpsend = bfp->bf_links[i];
1938                         if (blpsend == blp)
1939                                 selfseen = B_TRUE;
1940                         else if (bridge_can_send(blpsend, vlanid))
1941                                 break;
1942                 }
1943 
1944                 while (i < bfp->bf_nlinks) {
1945                         blpsend = bfp->bf_links[i];
1946                         for (i++; i < bfp->bf_nlinks; i++) {
1947                                 blpnext = bfp->bf_links[i];
1948                                 if (blpnext == blp)
1949                                         selfseen = B_TRUE;
1950                                 else if (bridge_can_send(blpnext, vlanid))
1951                                         break;
1952                         }
1953                         if (i == bfp->bf_nlinks && !selfseen) {
1954                                 mpsend = mp;
1955                                 mp = NULL;
1956                         } else {
1957                                 mpsend = copymsg(mp);
1958                         }
1959 
1960                         if (!from_trill && is_xmit)
1961                                 mpsend = mac_fix_cksum(mpsend);
1962 
1963                         mpsend = reform_vlan_header(mpsend, vlanid, tci,
1964                             blpsend->bl_pvid);
1965                         if (mpsend == NULL) {
1966                                 KIINCR(bki_drops);
1967                                 continue;
1968                         }
1969 
1970                         KIINCR(bki_forwards);
1971                         /*
1972                          * No need to bump up the link reference count, as
1973                          * the forwarding entry itself holds a reference to
1974                          * the link.
1975                          */
1976                         if (bfp->bf_flags & BFF_LOCALADDR) {
1977                                 mac_rx_common(blpsend->bl_mh, NULL, mpsend);
1978                         } else {
1979                                 KLPINCR(blpsend, bkl_xmit);
1980                                 MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
1981                                     mpsend);
1982                                 freemsg(mpsend);
1983                         }
1984                 }
1985                 /*
1986                  * Handle a special case: if we're transmitting to the original
1987                  * link, then check whether the localaddr flag is set.  If it
1988                  * is, then receive instead.  This doesn't happen with ordinary
1989                  * bridging, but does happen often with TRILL decapsulation.
1990                  */
1991                 if (mp != NULL && is_xmit && (bfp->bf_flags & BFF_LOCALADDR)) {
1992                         mac_rx_common(blp->bl_mh, NULL, mp);
1993                         mp = NULL;
1994                 }
1995                 fwd_unref(bfp);
1996         } else {
1997                 /*
1998                  * TRILL has two cases to handle.  If the packet is off the
1999                  * wire (not from TRILL), then we need to send up into the
2000                  * TRILL module to have the distribution tree computed.  If the
2001                  * packet is from TRILL (decapsulated), then we're part of the
2002                  * distribution tree, and we need to copy the packet on member
2003                  * interfaces.
2004                  *
2005                  * Thus, the from TRILL case is identical to the STP case.
2006                  */
2007                 if (!from_trill && blp->bl_trilldata != NULL) {
2008                         mutex_enter(&blp->bl_trilllock);
2009                         if ((tdp = blp->bl_trilldata) != NULL) {
2010                                 blp->bl_trillthreads++;
2011                                 mutex_exit(&blp->bl_trilllock);
2012                                 if ((mpsend = copymsg(mp)) != NULL) {
2013                                         update_header(mpsend,
2014                                             hdr_info, B_FALSE);
2015                                         /*
2016                                          * all trill data frames have
2017                                          * Inner.VLAN
2018                                          */
2019                                         mpsend = reform_vlan_header(mpsend,
2020                                             vlanid, tci, 0);
2021                                         if (mpsend == NULL) {
2022                                                 KIINCR(bki_drops);
2023                                         } else {
2024                                                 trill_encap_fn(tdp, blp,
2025                                                     hdr_info, mpsend,
2026                                                     RBRIDGE_NICKNAME_NONE);
2027                                         }
2028                                 }
2029                                 mutex_enter(&blp->bl_trilllock);
2030                                 if (--blp->bl_trillthreads == 0 &&
2031                                     blp->bl_trilldata == NULL)
2032                                         cv_broadcast(&blp->bl_trillwait);
2033                         }
2034                         mutex_exit(&blp->bl_trilllock);
2035                 }
2036 
2037                 /*
2038                  * This is an unknown destination, so flood.
2039                  */
2040                 rw_enter(&bip->bi_rwlock, RW_READER);
2041                 for (blpnext = list_head(&bip->bi_links); blpnext != NULL;
2042                     blpnext = list_next(&bip->bi_links, blpnext)) {
2043                         if (blpnext == blp)
2044                                 selfseen = B_TRUE;
2045                         else if (bridge_can_send(blpnext, vlanid))
2046                                 break;
2047                 }
2048                 if (blpnext != NULL)
2049                         atomic_inc_uint(&blpnext->bl_refs);
2050                 rw_exit(&bip->bi_rwlock);
2051                 while ((blpsend = blpnext) != NULL) {
2052                         rw_enter(&bip->bi_rwlock, RW_READER);
2053                         for (blpnext = list_next(&bip->bi_links, blpsend);
2054                             blpnext != NULL;
2055                             blpnext = list_next(&bip->bi_links, blpnext)) {
2056                                 if (blpnext == blp)
2057                                         selfseen = B_TRUE;
2058                                 else if (bridge_can_send(blpnext, vlanid))
2059                                         break;
2060                         }
2061                         if (blpnext != NULL)
2062                                 atomic_inc_uint(&blpnext->bl_refs);
2063                         rw_exit(&bip->bi_rwlock);
2064                         if (blpnext == NULL && !selfseen) {
2065                                 mpsend = mp;
2066                                 mp = NULL;
2067                         } else {
2068                                 mpsend = copymsg(mp);
2069                         }
2070 
2071                         if (!from_trill && is_xmit)
2072                                 mpsend = mac_fix_cksum(mpsend);
2073 
2074                         mpsend = reform_vlan_header(mpsend, vlanid, tci,
2075                             blpsend->bl_pvid);
2076                         if (mpsend == NULL) {
2077                                 KIINCR(bki_drops);
2078                                 continue;
2079                         }
2080 
2081                         if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST)
2082                                 KIINCR(bki_unknown);
2083                         else
2084                                 KIINCR(bki_mbcast);
2085                         KLPINCR(blpsend, bkl_xmit);
2086                         if ((mpcopy = copymsg(mpsend)) != NULL)
2087                                 mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
2088                         MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
2089                         freemsg(mpsend);
2090                         link_unref(blpsend);
2091                 }
2092         }
2093 
2094         /*
2095          * At this point, if np is non-NULL, it means that the caller needs to
2096          * continue on the selected link.
2097          */
2098         return (mp);
2099 }
2100 
2101 /*
2102  * Extract and validate the VLAN information for a given packet.  This checks
2103  * conformance with the rules for use of the PVID on the link, and for the
2104  * allowed (configured) VLAN set.
2105  *
2106  * Returns B_TRUE if the packet passes, B_FALSE if it fails.
2107  */
2108 static boolean_t
2109 bridge_get_vlan(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
2110     uint16_t *vlanidp, uint16_t *tcip)
2111 {
2112         uint16_t tci, vlanid;
2113 
2114         if (hdr_info->mhi_bindsap == ETHERTYPE_VLAN) {
2115                 ptrdiff_t tpos = offsetof(struct ether_vlan_header, ether_tci);
2116                 ptrdiff_t mlen;
2117 
2118                 /*
2119                  * Extract the VLAN ID information, regardless of alignment,
2120                  * and without a pullup.  This isn't attractive, but we do this
2121                  * to avoid having to deal with the pointers stashed in
2122                  * hdr_info moving around or having the caller deal with a new
2123                  * mblk_t pointer.
2124                  */
2125                 while (mp != NULL) {
2126                         mlen = MBLKL(mp);
2127                         if (mlen > tpos && mlen > 0)
2128                                 break;
2129                         tpos -= mlen;
2130                         mp = mp->b_cont;
2131                 }
2132                 if (mp == NULL)
2133                         return (B_FALSE);
2134                 tci = mp->b_rptr[tpos] << 8;
2135                 if (++tpos >= mlen) {
2136                         do {
2137                                 mp = mp->b_cont;
2138                         } while (mp != NULL && MBLKL(mp) == 0);
2139                         if (mp == NULL)
2140                                 return (B_FALSE);
2141                         tpos = 0;
2142                 }
2143                 tci |= mp->b_rptr[tpos];
2144 
2145                 vlanid = VLAN_ID(tci);
2146                 if (VLAN_CFI(tci) != ETHER_CFI || vlanid > VLAN_ID_MAX)
2147                         return (B_FALSE);
2148                 if (vlanid == VLAN_ID_NONE || vlanid == blp->bl_pvid)
2149                         goto input_no_vlan;
2150                 if (!BRIDGE_VLAN_ISSET(blp, vlanid))
2151                         return (B_FALSE);
2152         } else {
2153                 tci = 0xFFFF;
2154 input_no_vlan:
2155                 /*
2156                  * If PVID is set to zero, then untagged traffic is not
2157                  * supported here.  Do not learn or forward.
2158                  */
2159                 if ((vlanid = blp->bl_pvid) == VLAN_ID_NONE)
2160                         return (B_FALSE);
2161         }
2162 
2163         *tcip = tci;
2164         *vlanidp = vlanid;
2165         return (B_TRUE);
2166 }
2167 
2168 /*
2169  * Handle MAC notifications.
2170  */
2171 static void
2172 bridge_notify_cb(void *arg, mac_notify_type_t note_type)
2173 {
2174         bridge_link_t *blp = arg;
2175 
2176         switch (note_type) {
2177         case MAC_NOTE_UNICST:
2178                 bridge_new_unicst(blp);
2179                 break;
2180 
2181         case MAC_NOTE_SDU_SIZE: {
2182                 uint_t maxsdu;
2183                 bridge_inst_t *bip = blp->bl_inst;
2184                 bridge_mac_t *bmp = bip->bi_mac;
2185                 boolean_t notify = B_FALSE;
2186                 mblk_t *mlist = NULL;
2187 
2188                 mac_sdu_get(blp->bl_mh, NULL, &maxsdu);
2189                 rw_enter(&bip->bi_rwlock, RW_READER);
2190                 if (list_prev(&bip->bi_links, blp) == NULL &&
2191                     list_next(&bip->bi_links, blp) == NULL) {
2192                         notify = (maxsdu != bmp->bm_maxsdu);
2193                         bmp->bm_maxsdu = maxsdu;
2194                 }
2195                 blp->bl_maxsdu = maxsdu;
2196                 if (maxsdu != bmp->bm_maxsdu)
2197                         link_sdu_fail(blp, B_TRUE, &mlist);
2198                 else if (notify)
2199                         (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2200                 rw_exit(&bip->bi_rwlock);
2201                 send_up_messages(bip, mlist);
2202                 break;
2203         }
2204         }
2205 }
2206 
2207 /*
2208  * This is called by the MAC layer.  As with the transmit side, we're right in
2209  * the data path for all I/O on this port, so if we don't need to forward this
2210  * packet anywhere, we have to send it upwards via mac_rx_common.
2211  */
2212 static void
2213 bridge_recv_cb(mac_handle_t mh, mac_resource_handle_t rsrc, mblk_t *mpnext)
2214 {
2215         mblk_t *mp, *mpcopy;
2216         bridge_link_t *blp = (bridge_link_t *)mh;
2217         bridge_inst_t *bip = blp->bl_inst;
2218         bridge_mac_t *bmp = bip->bi_mac;
2219         mac_header_info_t hdr_info;
2220         uint16_t vlanid, tci;
2221         boolean_t trillmode = B_FALSE;
2222 
2223         KIINCR(bki_recv);
2224         KLINCR(bkl_recv);
2225 
2226         /*
2227          * Regardless of state, check for inbound TRILL packets when TRILL is
2228          * active.  These are pulled out of band and sent for TRILL handling.
2229          */
2230         if (blp->bl_trilldata != NULL) {
2231                 void *tdp;
2232                 mblk_t *newhead;
2233                 mblk_t *tail = NULL;
2234 
2235                 mutex_enter(&blp->bl_trilllock);
2236                 if ((tdp = blp->bl_trilldata) != NULL) {
2237                         blp->bl_trillthreads++;
2238                         mutex_exit(&blp->bl_trilllock);
2239                         trillmode = B_TRUE;
2240                         newhead = mpnext;
2241                         while ((mp = mpnext) != NULL) {
2242                                 boolean_t raw_isis, bridge_group;
2243 
2244                                 mpnext = mp->b_next;
2245 
2246                                 /*
2247                                  * If the header isn't readable, then leave on
2248                                  * the list and continue.
2249                                  */
2250                                 if (mac_header_info(blp->bl_mh, mp,
2251                                     &hdr_info) != 0) {
2252                                         tail = mp;
2253                                         continue;
2254                                 }
2255 
2256                                 /*
2257                                  * The TRILL document specifies that, on
2258                                  * Ethernet alone, IS-IS packets arrive with
2259                                  * LLC rather than Ethertype, and using a
2260                                  * specific destination address.  We must check
2261                                  * for that here.  Also, we need to give BPDUs
2262                                  * to TRILL for processing.
2263                                  */
2264                                 raw_isis = bridge_group = B_FALSE;
2265                                 if (hdr_info.mhi_dsttype ==
2266                                     MAC_ADDRTYPE_MULTICAST) {
2267                                         if (memcmp(hdr_info.mhi_daddr,
2268                                             all_isis_rbridges, ETHERADDRL) == 0)
2269                                                 raw_isis = B_TRUE;
2270                                         else if (memcmp(hdr_info.mhi_daddr,
2271                                             bridge_group_address, ETHERADDRL) ==
2272                                             0)
2273                                                 bridge_group = B_TRUE;
2274                                 }
2275                                 if (!raw_isis && !bridge_group &&
2276                                     hdr_info.mhi_bindsap != ETHERTYPE_TRILL &&
2277                                     (hdr_info.mhi_bindsap != ETHERTYPE_VLAN ||
2278                                     /* LINTED: alignment */
2279                                     ((struct ether_vlan_header *)mp->b_rptr)->
2280                                     ether_type != htons(ETHERTYPE_TRILL))) {
2281                                         tail = mp;
2282                                         continue;
2283                                 }
2284 
2285                                 /*
2286                                  * We've got TRILL input.  Remove from the list
2287                                  * and send up through the TRILL module.  (Send
2288                                  * a copy through promiscuous receive just to
2289                                  * support snooping on TRILL.  Order isn't
2290                                  * preserved strictly, but that doesn't matter
2291                                  * here.)
2292                                  */
2293                                 if (tail != NULL)
2294                                         tail->b_next = mpnext;
2295                                 mp->b_next = NULL;
2296                                 if (mp == newhead)
2297                                         newhead = mpnext;
2298                                 mac_trill_snoop(blp->bl_mh, mp);
2299                                 update_header(mp, &hdr_info, B_TRUE);
2300                                 /*
2301                                  * On raw IS-IS and BPDU frames, we have to
2302                                  * make sure that the length is trimmed
2303                                  * properly.  We use origsap in order to cope
2304                                  * with jumbograms for IS-IS.  (Regular mac
2305                                  * can't.)
2306                                  */
2307                                 if (raw_isis || bridge_group) {
2308                                         size_t msglen = msgdsize(mp);
2309 
2310                                         if (msglen > hdr_info.mhi_origsap) {
2311                                                 (void) adjmsg(mp,
2312                                                     hdr_info.mhi_origsap -
2313                                                     msglen);
2314                                         } else if (msglen <
2315                                             hdr_info.mhi_origsap) {
2316                                                 freemsg(mp);
2317                                                 continue;
2318                                         }
2319                                 }
2320                                 trill_recv_fn(tdp, blp, rsrc, mp, &hdr_info);
2321                         }
2322                         mpnext = newhead;
2323                         mutex_enter(&blp->bl_trilllock);
2324                         if (--blp->bl_trillthreads == 0 &&
2325                             blp->bl_trilldata == NULL)
2326                                 cv_broadcast(&blp->bl_trillwait);
2327                 }
2328                 mutex_exit(&blp->bl_trilllock);
2329                 if (mpnext == NULL)
2330                         return;
2331         }
2332 
2333         /*
2334          * If this is a TRILL RBridge, then just check whether this link is
2335          * used at all for forwarding.  If not, then we're done.
2336          */
2337         if (trillmode) {
2338                 if (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2339                     (blp->bl_flags & BLF_SDUFAIL)) {
2340                         mac_rx_common(blp->bl_mh, rsrc, mpnext);
2341                         return;
2342                 }
2343         } else {
2344                 /*
2345                  * For regular (STP) bridges, if we're in blocking or listening
2346                  * state, then do nothing.  We don't learn or forward until
2347                  * told to do so.
2348                  */
2349                 if (blp->bl_state == BLS_BLOCKLISTEN) {
2350                         mac_rx_common(blp->bl_mh, rsrc, mpnext);
2351                         return;
2352                 }
2353         }
2354 
2355         /*
2356          * Send a copy of the message chain up to the observability node users.
2357          * For TRILL, we must obey the VLAN AF rules, so we go packet-by-
2358          * packet.
2359          */
2360         if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2361             (bmp->bm_flags & BMF_STARTED) &&
2362             (mp = copymsgchain(mpnext)) != NULL) {
2363                 mac_rx(bmp->bm_mh, NULL, mp);
2364         }
2365 
2366         /*
2367          * We must be in learning or forwarding state, or using TRILL on a link
2368          * with one or more VLANs active.  For each packet in the list, process
2369          * the source address, and then attempt to forward.
2370          */
2371         while ((mp = mpnext) != NULL) {
2372                 mpnext = mp->b_next;
2373                 mp->b_next = NULL;
2374 
2375                 /*
2376                  * If we can't decode the header or if the header specifies a
2377                  * multicast source address (impossible!), then don't bother
2378                  * learning or forwarding, but go ahead and forward up the
2379                  * stack for subsequent processing.
2380                  */
2381                 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0 ||
2382                     (hdr_info.mhi_saddr[0] & 1) != 0) {
2383                         KIINCR(bki_drops);
2384                         KLINCR(bkl_drops);
2385                         mac_rx_common(blp->bl_mh, rsrc, mp);
2386                         continue;
2387                 }
2388 
2389                 /*
2390                  * Extract and validate the VLAN ID for this packet.
2391                  */
2392                 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2393                     !BRIDGE_AF_ISSET(blp, vlanid)) {
2394                         mac_rx_common(blp->bl_mh, rsrc, mp);
2395                         continue;
2396                 }
2397 
2398                 if (trillmode) {
2399                         /*
2400                          * Special test required by TRILL document: must
2401                          * discard frames with outer address set to ESADI.
2402                          */
2403                         if (memcmp(hdr_info.mhi_daddr, all_esadi_rbridges,
2404                             ETHERADDRL) == 0) {
2405                                 mac_rx_common(blp->bl_mh, rsrc, mp);
2406                                 continue;
2407                         }
2408 
2409                         /*
2410                          * If we're in TRILL mode, then the call above to get
2411                          * the VLAN ID has also checked that we're the
2412                          * appointed forwarder, so report that we're handling
2413                          * this packet to any observability node users.
2414                          */
2415                         if ((bmp->bm_flags & BMF_STARTED) &&
2416                             (mpcopy = copymsg(mp)) != NULL)
2417                                 mac_rx(bmp->bm_mh, NULL, mpcopy);
2418                 }
2419 
2420                 /*
2421                  * First process the source address and learn from it.  For
2422                  * TRILL, we learn only if we're the appointed forwarder.
2423                  */
2424                 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2425                     vlanid);
2426 
2427                 /*
2428                  * Now check whether we're forwarding and look up the
2429                  * destination.  If we can forward, do so.
2430                  */
2431                 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2432                         mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2433                             B_FALSE, B_FALSE);
2434                 }
2435                 if (mp != NULL)
2436                         mac_rx_common(blp->bl_mh, rsrc, mp);
2437         }
2438 }
2439 
2440 
2441 /* ARGSUSED */
2442 static mblk_t *
2443 bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
2444 {
2445         bridge_link_t *blp = (bridge_link_t *)mh;
2446         bridge_inst_t *bip = blp->bl_inst;
2447         bridge_mac_t *bmp = bip->bi_mac;
2448         mac_header_info_t hdr_info;
2449         uint16_t vlanid, tci;
2450         mblk_t *mp, *mpcopy;
2451         boolean_t trillmode;
2452 
2453         trillmode = blp->bl_trilldata != NULL;
2454 
2455         /*
2456          * If we're using STP and we're in blocking or listening state, or if
2457          * we're using TRILL and no VLANs are active, then behave as though the
2458          * bridge isn't here at all, and send on the local link alone.
2459          */
2460         if ((!trillmode && blp->bl_state == BLS_BLOCKLISTEN) ||
2461             (trillmode &&
2462             (!(blp->bl_flags & BLF_TRILLACTIVE) ||
2463             (blp->bl_flags & BLF_SDUFAIL)))) {
2464                 KIINCR(bki_sent);
2465                 KLINCR(bkl_xmit);
2466                 MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
2467                 return (mp);
2468         }
2469 
2470         /*
2471          * Send a copy of the message up to the observability node users.
2472          * TRILL needs to check on a packet-by-packet basis.
2473          */
2474         if (!trillmode && blp->bl_state == BLS_FORWARDING &&
2475             (bmp->bm_flags & BMF_STARTED) &&
2476             (mp = copymsgchain(mpnext)) != NULL) {
2477                 mac_rx(bmp->bm_mh, NULL, mp);
2478         }
2479 
2480         while ((mp = mpnext) != NULL) {
2481                 mpnext = mp->b_next;
2482                 mp->b_next = NULL;
2483 
2484                 if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2485                         freemsg(mp);
2486                         continue;
2487                 }
2488 
2489                 /*
2490                  * Extract and validate the VLAN ID for this packet.
2491                  */
2492                 if (!bridge_get_vlan(blp, &hdr_info, mp, &vlanid, &tci) ||
2493                     !BRIDGE_AF_ISSET(blp, vlanid)) {
2494                         freemsg(mp);
2495                         continue;
2496                 }
2497 
2498                 /*
2499                  * If we're using TRILL, then we've now validated that we're
2500                  * the forwarder for this VLAN, so go ahead and let
2501                  * observability node users know about the packet.
2502                  */
2503                 if (trillmode && (bmp->bm_flags & BMF_STARTED) &&
2504                     (mpcopy = copymsg(mp)) != NULL) {
2505                         mac_rx(bmp->bm_mh, NULL, mpcopy);
2506                 }
2507 
2508                 /*
2509                  * We have to learn from our own transmitted packets, because
2510                  * there may be a Solaris DLPI raw sender (who can specify his
2511                  * own source address) using promiscuous mode for receive.  The
2512                  * mac layer information won't (and can't) tell us everything
2513                  * we need to know.
2514                  */
2515                 bridge_learn(blp, hdr_info.mhi_saddr, RBRIDGE_NICKNAME_NONE,
2516                     vlanid);
2517 
2518                 /* attempt forwarding */
2519                 if (trillmode || blp->bl_state == BLS_FORWARDING) {
2520                         mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci,
2521                             B_FALSE, B_TRUE);
2522                 }
2523                 if (mp != NULL) {
2524                         MAC_RING_TX(blp->bl_mh, rh, mp, mp);
2525                         if (mp == NULL) {
2526                                 KIINCR(bki_sent);
2527                                 KLINCR(bkl_xmit);
2528                         }
2529                 }
2530                 /*
2531                  * If we get stuck, then stop.  Don't let the user's output
2532                  * packets get out of order.  (More importantly: don't try to
2533                  * bridge the same packet multiple times if flow control is
2534                  * asserted.)
2535                  */
2536                 if (mp != NULL) {
2537                         mp->b_next = mpnext;
2538                         break;
2539                 }
2540         }
2541         return (mp);
2542 }
2543 
2544 /*
2545  * This is called by TRILL when it decapsulates an packet, and we must forward
2546  * locally.  On failure, we just drop.
2547  *
2548  * Note that the ingress_nick reported by TRILL must not represent this local
2549  * node.
2550  */
2551 void
2552 bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
2553 {
2554         mac_header_info_t hdr_info;
2555         uint16_t vlanid, tci;
2556         bridge_inst_t *bip = blp->bl_inst;   /* used by macros */
2557         mblk_t *mpcopy;
2558 
2559         if (mac_header_info(blp->bl_mh, mp, &hdr_info) != 0) {
2560                 freemsg(mp);
2561                 return;
2562         }
2563 
2564         /* Extract VLAN ID for this packet. */
2565         if (hdr_info.mhi_bindsap == ETHERTYPE_VLAN) {
2566                 struct ether_vlan_header *evhp;
2567 
2568                 /* LINTED: alignment */
2569                 evhp = (struct ether_vlan_header *)mp->b_rptr;
2570                 tci = ntohs(evhp->ether_tci);
2571                 vlanid = VLAN_ID(tci);
2572         } else {
2573                 /* Inner VLAN headers are required in TRILL data packets */
2574                 DTRACE_PROBE3(bridge__trill__decaps__novlan, bridge_link_t *,
2575                     blp, mblk_t *, mp, uint16_t, ingress_nick);
2576                 freemsg(mp);
2577                 return;
2578         }
2579 
2580         /* Learn the location of this sender in the RBridge network */
2581         bridge_learn(blp, hdr_info.mhi_saddr, ingress_nick, vlanid);
2582 
2583         /* attempt forwarding */
2584         mp = bridge_forward(blp, &hdr_info, mp, vlanid, tci, B_TRUE, B_TRUE);
2585         if (mp != NULL) {
2586                 if (bridge_can_send(blp, vlanid)) {
2587                         /* Deliver a copy locally as well */
2588                         if ((mpcopy = copymsg(mp)) != NULL)
2589                                 mac_rx_common(blp->bl_mh, NULL, mpcopy);
2590                         MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2591                 }
2592                 if (mp == NULL) {
2593                         KIINCR(bki_sent);
2594                         KLINCR(bkl_xmit);
2595                 } else {
2596                         freemsg(mp);
2597                 }
2598         }
2599 }
2600 
2601 /*
2602  * This function is used by TRILL _only_ to transmit TRILL-encapsulated
2603  * packets.  It sends on a single underlying link and does not bridge.
2604  */
2605 mblk_t *
2606 bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
2607 {
2608         bridge_inst_t *bip = blp->bl_inst;   /* used by macros */
2609 
2610         mac_trill_snoop(blp->bl_mh, mp);
2611         MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
2612         if (mp == NULL) {
2613                 KIINCR(bki_sent);
2614                 KLINCR(bkl_xmit);
2615         }
2616         return (mp);
2617 }
2618 
2619 /*
2620  * Set the "appointed forwarder" flag array for this link.  TRILL controls
2621  * forwarding on a VLAN basis.  The "trillactive" flag is an optimization for
2622  * the forwarder.
2623  */
2624 void
2625 bridge_trill_setvlans(bridge_link_t *blp, const uint8_t *arr)
2626 {
2627         int i;
2628         uint_t newflags = 0;
2629 
2630         for (i = 0; i < BRIDGE_VLAN_ARR_SIZE; i++) {
2631                 if ((blp->bl_afs[i] = arr[i]) != 0)
2632                         newflags = BLF_TRILLACTIVE;
2633         }
2634         blp->bl_flags = (blp->bl_flags & ~BLF_TRILLACTIVE) | newflags;
2635 }
2636 
2637 void
2638 bridge_trill_flush(bridge_link_t *blp, uint16_t vlan, boolean_t dotrill)
2639 {
2640         bridge_inst_t *bip = blp->bl_inst;
2641         bridge_fwd_t *bfp, *bfnext;
2642         avl_tree_t fwd_scavenge;
2643         int i;
2644 
2645         _NOTE(ARGUNUSED(vlan));
2646 
2647         avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
2648             offsetof(bridge_fwd_t, bf_node));
2649         rw_enter(&bip->bi_rwlock, RW_WRITER);
2650         bfnext = avl_first(&bip->bi_fwd);
2651         while ((bfp = bfnext) != NULL) {
2652                 bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
2653                 if (bfp->bf_flags & BFF_LOCALADDR)
2654                         continue;
2655                 if (dotrill) {
2656                         /* port doesn't matter if we're flushing TRILL */
2657                         if (bfp->bf_trill_nick == RBRIDGE_NICKNAME_NONE)
2658                                 continue;
2659                 } else {
2660                         if (bfp->bf_trill_nick != RBRIDGE_NICKNAME_NONE)
2661                                 continue;
2662                         for (i = 0; i < bfp->bf_nlinks; i++) {
2663                                 if (bfp->bf_links[i] == blp)
2664                                         break;
2665                         }
2666                         if (i >= bfp->bf_nlinks)
2667                                 continue;
2668                 }
2669                 ASSERT(bfp->bf_flags & BFF_INTREE);
2670                 avl_remove(&bip->bi_fwd, bfp);
2671                 bfp->bf_flags &= ~BFF_INTREE;
2672                 avl_add(&fwd_scavenge, bfp);
2673         }
2674         rw_exit(&bip->bi_rwlock);
2675         bfnext = avl_first(&fwd_scavenge);
2676         while ((bfp = bfnext) != NULL) {
2677                 bfnext = AVL_NEXT(&fwd_scavenge, bfp);
2678                 avl_remove(&fwd_scavenge, bfp);
2679                 fwd_unref(bfp);
2680         }
2681         avl_destroy(&fwd_scavenge);
2682 }
2683 
2684 /*
2685  * Let the mac module take or drop a reference to a bridge link.  When this is
2686  * called, the mac module is holding the mi_bridge_lock, so the link cannot be
2687  * in the process of entering or leaving a bridge.
2688  */
2689 static void
2690 bridge_ref_cb(mac_handle_t mh, boolean_t hold)
2691 {
2692         bridge_link_t *blp = (bridge_link_t *)mh;
2693 
2694         if (hold)
2695                 atomic_inc_uint(&blp->bl_refs);
2696         else
2697                 link_unref(blp);
2698 }
2699 
2700 /*
2701  * Handle link state changes reported by the mac layer.  This acts as a filter
2702  * for link state changes: if a link is reporting down, but there are other
2703  * links still up on the bridge, then the state is changed to "up."  When the
2704  * last link goes down, all are marked down, and when the first link goes up,
2705  * all are marked up.  (Recursion is avoided by the use of the "redo" function.)
2706  *
2707  * We treat unknown as equivalent to "up."
2708  */
2709 static link_state_t
2710 bridge_ls_cb(mac_handle_t mh, link_state_t newls)
2711 {
2712         bridge_link_t *blp = (bridge_link_t *)mh;
2713         bridge_link_t *blcmp;
2714         bridge_inst_t *bip;
2715         bridge_mac_t *bmp;
2716 
2717         if (newls != LINK_STATE_DOWN && blp->bl_linkstate != LINK_STATE_DOWN ||
2718             (blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL))) {
2719                 blp->bl_linkstate = newls;
2720                 return (newls);
2721         }
2722 
2723         /*
2724          * Scan first to see if there are any other non-down links.  If there
2725          * are, then we're done.  Otherwise, if all others are down, then the
2726          * state of this link is the state of the bridge.
2727          */
2728         bip = blp->bl_inst;
2729         rw_enter(&bip->bi_rwlock, RW_WRITER);
2730         for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2731             blcmp = list_next(&bip->bi_links, blcmp)) {
2732                 if (blcmp != blp &&
2733                     !(blcmp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)) &&
2734                     blcmp->bl_linkstate != LINK_STATE_DOWN)
2735                         break;
2736         }
2737 
2738         if (blcmp != NULL) {
2739                 /*
2740                  * If there are other links that are considered up, then tell
2741                  * the caller that the link is actually still up, regardless of
2742                  * this link's underlying state.
2743                  */
2744                 blp->bl_linkstate = newls;
2745                 newls = LINK_STATE_UP;
2746         } else if (blp->bl_linkstate != newls) {
2747                 /*
2748                  * If we've found no other 'up' links, and this link has
2749                  * changed state, then report the new state of the bridge to
2750                  * all other clients.
2751                  */
2752                 blp->bl_linkstate = newls;
2753                 for (blcmp = list_head(&bip->bi_links); blcmp != NULL;
2754                     blcmp = list_next(&bip->bi_links, blcmp)) {
2755                         if (blcmp != blp && !(blcmp->bl_flags & BLF_DELETED))
2756                                 mac_link_redo(blcmp->bl_mh, newls);
2757                 }
2758                 bmp = bip->bi_mac;
2759                 if ((bmp->bm_linkstate = newls) != LINK_STATE_DOWN)
2760                         bmp->bm_linkstate = LINK_STATE_UP;
2761                 mac_link_redo(bmp->bm_mh, bmp->bm_linkstate);
2762         }
2763         rw_exit(&bip->bi_rwlock);
2764         return (newls);
2765 }
2766 
2767 static void
2768 bridge_add_link(void *arg)
2769 {
2770         mblk_t *mp = arg;
2771         bridge_stream_t *bsp;
2772         bridge_inst_t *bip, *bipt;
2773         bridge_mac_t *bmp;
2774         datalink_id_t linkid;
2775         int err;
2776         mac_handle_t mh;
2777         uint_t maxsdu;
2778         bridge_link_t *blp = NULL, *blpt;
2779         const mac_info_t *mip;
2780         boolean_t macopen = B_FALSE;
2781         char linkname[MAXLINKNAMELEN];
2782         char kstatname[KSTAT_STRLEN];
2783         int i;
2784         link_state_t linkstate;
2785         mblk_t *mlist;
2786 
2787         bsp = (bridge_stream_t *)mp->b_next;
2788         mp->b_next = NULL;
2789         bip = bsp->bs_inst;
2790         /* LINTED: alignment */
2791         linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2792 
2793         /*
2794          * First make sure that there is no other bridge that has this link.
2795          * We don't want to overlap operations from two bridges; the MAC layer
2796          * supports only one bridge on a given MAC at a time.
2797          *
2798          * We rely on the fact that there's just one taskq thread for the
2799          * bridging module: once we've checked for a duplicate, we can drop the
2800          * lock, because no other thread could possibly be adding another link
2801          * until we're done.
2802          */
2803         mutex_enter(&inst_lock);
2804         for (bipt = list_head(&inst_list); bipt != NULL;
2805             bipt = list_next(&inst_list, bipt)) {
2806                 rw_enter(&bipt->bi_rwlock, RW_READER);
2807                 for (blpt = list_head(&bipt->bi_links); blpt != NULL;
2808                     blpt = list_next(&bipt->bi_links, blpt)) {
2809                         if (linkid == blpt->bl_linkid)
2810                                 break;
2811                 }
2812                 rw_exit(&bipt->bi_rwlock);
2813                 if (blpt != NULL)
2814                         break;
2815         }
2816         mutex_exit(&inst_lock);
2817         if (bipt != NULL) {
2818                 err = EBUSY;
2819                 goto fail;
2820         }
2821 
2822         if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
2823                 goto fail;
2824         macopen = B_TRUE;
2825 
2826         /* we bridge only Ethernet */
2827         mip = mac_info(mh);
2828         if (mip->mi_media != DL_ETHER) {
2829                 err = ENOTSUP;
2830                 goto fail;
2831         }
2832 
2833         /*
2834          * Get the current maximum SDU on this interface.  If there are other
2835          * links on the bridge, then this one must match, or it errors out.
2836          * Otherwise, the first link becomes the standard for the new bridge.
2837          */
2838         mac_sdu_get(mh, NULL, &maxsdu);
2839         bmp = bip->bi_mac;
2840         if (list_is_empty(&bip->bi_links)) {
2841                 bmp->bm_maxsdu = maxsdu;
2842                 (void) mac_maxsdu_update(bmp->bm_mh, maxsdu);
2843         }
2844 
2845         /* figure the kstat name; also used as the mac client name */
2846         i = MBLKL(mp->b_cont) - sizeof (datalink_id_t);
2847         if (i < 0 || i >= MAXLINKNAMELEN)
2848                 i = MAXLINKNAMELEN - 1;
2849         bcopy(mp->b_cont->b_rptr + sizeof (datalink_id_t), linkname, i);
2850         linkname[i] = '\0';
2851         (void) snprintf(kstatname, sizeof (kstatname), "%s-%s", bip->bi_name,
2852             linkname);
2853 
2854         if ((blp = kmem_zalloc(sizeof (*blp), KM_NOSLEEP)) == NULL) {
2855                 err = ENOMEM;
2856                 goto fail;
2857         }
2858         blp->bl_lfailmp = allocb(sizeof (bridge_ctl_t), BPRI_MED);
2859         if (blp->bl_lfailmp == NULL) {
2860                 kmem_free(blp, sizeof (*blp));
2861                 blp = NULL;
2862                 err = ENOMEM;
2863                 goto fail;
2864         }
2865 
2866         blp->bl_refs = 1;
2867         atomic_inc_uint(&bip->bi_refs);
2868         blp->bl_inst = bip;
2869         blp->bl_mh = mh;
2870         blp->bl_linkid = linkid;
2871         blp->bl_maxsdu = maxsdu;
2872         cv_init(&blp->bl_trillwait, NULL, CV_DRIVER, NULL);
2873         mutex_init(&blp->bl_trilllock, NULL, MUTEX_DRIVER, NULL);
2874         (void) memset(blp->bl_afs, 0xff, sizeof (blp->bl_afs));
2875 
2876         err = mac_client_open(mh, &blp->bl_mch, kstatname, 0);
2877         if (err != 0)
2878                 goto fail;
2879         blp->bl_flags |= BLF_CLIENT_OPEN;
2880 
2881         err = mac_margin_add(mh, &blp->bl_margin, B_TRUE);
2882         if (err != 0)
2883                 goto fail;
2884         blp->bl_flags |= BLF_MARGIN_ADDED;
2885 
2886         blp->bl_mnh = mac_notify_add(mh, bridge_notify_cb, blp);
2887 
2888         /* Enable Bridging on the link */
2889         err = mac_bridge_set(mh, (mac_handle_t)blp);
2890         if (err != 0)
2891                 goto fail;
2892         blp->bl_flags |= BLF_SET_BRIDGE;
2893 
2894         err = mac_promisc_add(blp->bl_mch, MAC_CLIENT_PROMISC_ALL, NULL,
2895             blp, &blp->bl_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
2896         if (err != 0)
2897                 goto fail;
2898         blp->bl_flags |= BLF_PROM_ADDED;
2899 
2900         bridge_new_unicst(blp);
2901 
2902         blp->bl_ksp = kstat_setup((kstat_named_t *)&blp->bl_kstats,
2903             link_kstats_list, Dim(link_kstats_list), kstatname);
2904 
2905         /*
2906          * The link holds a reference to the bridge instance, so that the
2907          * instance can't go away before the link is freed.  The insertion into
2908          * bi_links holds a reference on the link (reference set to 1 above).
2909          * When marking as removed from bi_links (BLF_DELETED), drop the
2910          * reference on the link. When freeing the link, drop the reference on
2911          * the instance. BLF_LINK_ADDED tracks link insertion in bi_links list.
2912          */
2913         rw_enter(&bip->bi_rwlock, RW_WRITER);
2914         list_insert_tail(&bip->bi_links, blp);
2915         blp->bl_flags |= BLF_LINK_ADDED;
2916 
2917         /*
2918          * If the new link is no good on this bridge, then let the daemon know
2919          * about the problem.
2920          */
2921         mlist = NULL;
2922         if (maxsdu != bmp->bm_maxsdu)
2923                 link_sdu_fail(blp, B_TRUE, &mlist);
2924         rw_exit(&bip->bi_rwlock);
2925         send_up_messages(bip, mlist);
2926 
2927         /*
2928          * Trigger a link state update so that if this link is the first one
2929          * "up" in the bridge, then we notify everyone.  This triggers a trip
2930          * through bridge_ls_cb.
2931          */
2932         linkstate = mac_stat_get(mh, MAC_STAT_LOWLINK_STATE);
2933         blp->bl_linkstate = LINK_STATE_DOWN;
2934         mac_link_update(mh, linkstate);
2935 
2936         /*
2937          * We now need to report back to the stream that invoked us, and then
2938          * drop the reference on the stream that we're holding.
2939          */
2940         miocack(bsp->bs_wq, mp, 0, 0);
2941         stream_unref(bsp);
2942         return;
2943 
2944 fail:
2945         if (blp == NULL) {
2946                 if (macopen)
2947                         mac_close(mh);
2948         } else {
2949                 link_shutdown(blp);
2950         }
2951         miocnak(bsp->bs_wq, mp, 0, err);
2952         stream_unref(bsp);
2953 }
2954 
2955 static void
2956 bridge_rem_link(void *arg)
2957 {
2958         mblk_t *mp = arg;
2959         bridge_stream_t *bsp;
2960         bridge_inst_t *bip;
2961         bridge_mac_t *bmp;
2962         datalink_id_t linkid;
2963         bridge_link_t *blp, *blsave;
2964         boolean_t found;
2965         mblk_t *mlist;
2966 
2967         bsp = (bridge_stream_t *)mp->b_next;
2968         mp->b_next = NULL;
2969         bip = bsp->bs_inst;
2970         /* LINTED: alignment */
2971         linkid = *(datalink_id_t *)mp->b_cont->b_rptr;
2972 
2973         /*
2974          * We become reader here so that we can loop over the other links and
2975          * deliver link up/down notification.
2976          */
2977         rw_enter(&bip->bi_rwlock, RW_READER);
2978         found = B_FALSE;
2979         for (blp = list_head(&bip->bi_links); blp != NULL;
2980             blp = list_next(&bip->bi_links, blp)) {
2981                 if (blp->bl_linkid == linkid &&
2982                     !(blp->bl_flags & BLF_DELETED)) {
2983                         blp->bl_flags |= BLF_DELETED;
2984                         (void) ddi_taskq_dispatch(bridge_taskq, link_shutdown,
2985                             blp, DDI_SLEEP);
2986                         found = B_TRUE;
2987                         break;
2988                 }
2989         }
2990 
2991         /*
2992          * Check if this link is up and the remainder of the links are all
2993          * down.
2994          */
2995         if (blp != NULL && blp->bl_linkstate != LINK_STATE_DOWN) {
2996                 for (blp = list_head(&bip->bi_links); blp != NULL;
2997                     blp = list_next(&bip->bi_links, blp)) {
2998                         if (blp->bl_linkstate != LINK_STATE_DOWN &&
2999                             !(blp->bl_flags & (BLF_DELETED|BLF_SDUFAIL)))
3000                                 break;
3001                 }
3002                 if (blp == NULL) {
3003                         for (blp = list_head(&bip->bi_links); blp != NULL;
3004                             blp = list_next(&bip->bi_links, blp)) {
3005                                 if (!(blp->bl_flags & BLF_DELETED))
3006                                         mac_link_redo(blp->bl_mh,
3007                                             LINK_STATE_DOWN);
3008                         }
3009                         bmp = bip->bi_mac;
3010                         bmp->bm_linkstate = LINK_STATE_DOWN;
3011                         mac_link_redo(bmp->bm_mh, LINK_STATE_DOWN);
3012                 }
3013         }
3014 
3015         /*
3016          * Check if there's just one working link left on the bridge.  If so,
3017          * then that link is now authoritative for bridge MTU.
3018          */
3019         blsave = NULL;
3020         for (blp = list_head(&bip->bi_links); blp != NULL;
3021             blp = list_next(&bip->bi_links, blp)) {
3022                 if (!(blp->bl_flags & BLF_DELETED)) {
3023                         if (blsave == NULL)
3024                                 blsave = blp;
3025                         else
3026                                 break;
3027                 }
3028         }
3029         mlist = NULL;
3030         bmp = bip->bi_mac;
3031         if (blsave != NULL && blp == NULL &&
3032             blsave->bl_maxsdu != bmp->bm_maxsdu) {
3033                 bmp->bm_maxsdu = blsave->bl_maxsdu;
3034                 (void) mac_maxsdu_update(bmp->bm_mh, blsave->bl_maxsdu);
3035                 link_sdu_fail(blsave, B_FALSE, &mlist);
3036         }
3037         rw_exit(&bip->bi_rwlock);
3038         send_up_messages(bip, mlist);
3039 
3040         if (found)
3041                 miocack(bsp->bs_wq, mp, 0, 0);
3042         else
3043                 miocnak(bsp->bs_wq, mp, 0, ENOENT);
3044         stream_unref(bsp);
3045 }
3046 
3047 /*
3048  * This function intentionally returns with bi_rwlock held; it is intended for
3049  * quick checks and updates.
3050  */
3051 static bridge_link_t *
3052 enter_link(bridge_inst_t *bip, datalink_id_t linkid)
3053 {
3054         bridge_link_t *blp;
3055 
3056         rw_enter(&bip->bi_rwlock, RW_READER);
3057         for (blp = list_head(&bip->bi_links); blp != NULL;
3058             blp = list_next(&bip->bi_links, blp)) {
3059                 if (blp->bl_linkid == linkid && !(blp->bl_flags & BLF_DELETED))
3060                         break;
3061         }
3062         return (blp);
3063 }
3064 
3065 static void
3066 bridge_ioctl(queue_t *wq, mblk_t *mp)
3067 {
3068         bridge_stream_t *bsp = wq->q_ptr;
3069         bridge_inst_t *bip;
3070         struct iocblk *iop;
3071         int rc = EINVAL;
3072         int len = 0;
3073         bridge_link_t *blp;
3074         cred_t *cr;
3075 
3076         /* LINTED: alignment */
3077         iop = (struct iocblk *)mp->b_rptr;
3078 
3079         /*
3080          * For now, all of the bridge ioctls are privileged.
3081          */
3082         if ((cr = msg_getcred(mp, NULL)) == NULL)
3083                 cr = iop->ioc_cr;
3084         if (cr != NULL && secpolicy_net_config(cr, B_FALSE) != 0) {
3085                 miocnak(wq, mp, 0, EPERM);
3086                 return;
3087         }
3088 
3089         switch (iop->ioc_cmd) {
3090         case BRIOC_NEWBRIDGE: {
3091                 bridge_newbridge_t *bnb;
3092 
3093                 if (bsp->bs_inst != NULL ||
3094                     (rc = miocpullup(mp, sizeof (bridge_newbridge_t))) != 0)
3095                         break;
3096                 /* LINTED: alignment */
3097                 bnb = (bridge_newbridge_t *)mp->b_cont->b_rptr;
3098                 bnb->bnb_name[MAXNAMELEN-1] = '\0';
3099                 rc = bridge_create(bnb->bnb_linkid, bnb->bnb_name, &bip, cr);
3100                 if (rc != 0)
3101                         break;
3102 
3103                 rw_enter(&bip->bi_rwlock, RW_WRITER);
3104                 if (bip->bi_control != NULL) {
3105                         rw_exit(&bip->bi_rwlock);
3106                         bridge_unref(bip);
3107                         rc = EBUSY;
3108                 } else {
3109                         atomic_inc_uint(&bip->bi_refs);
3110                         bsp->bs_inst = bip;  /* stream holds reference */
3111                         bip->bi_control = bsp;
3112                         rw_exit(&bip->bi_rwlock);
3113                         rc = 0;
3114                 }
3115                 break;
3116         }
3117 
3118         case BRIOC_ADDLINK:
3119                 if ((bip = bsp->bs_inst) == NULL ||
3120                     (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3121                         break;
3122                 /*
3123                  * We cannot perform the action in this thread, because we're
3124                  * not in process context, and we may already be holding
3125                  * MAC-related locks.  Place the request on taskq.
3126                  */
3127                 mp->b_next = (mblk_t *)bsp;
3128                 stream_ref(bsp);
3129                 (void) ddi_taskq_dispatch(bridge_taskq, bridge_add_link, mp,
3130                     DDI_SLEEP);
3131                 return;
3132 
3133         case BRIOC_REMLINK:
3134                 if ((bip = bsp->bs_inst) == NULL ||
3135                     (rc = miocpullup(mp, sizeof (datalink_id_t))) != 0)
3136                         break;
3137                 /*
3138                  * We cannot perform the action in this thread, because we're
3139                  * not in process context, and we may already be holding
3140                  * MAC-related locks.  Place the request on taskq.
3141                  */
3142                 mp->b_next = (mblk_t *)bsp;
3143                 stream_ref(bsp);
3144                 (void) ddi_taskq_dispatch(bridge_taskq, bridge_rem_link, mp,
3145                     DDI_SLEEP);
3146                 return;
3147 
3148         case BRIOC_SETSTATE: {
3149                 bridge_setstate_t *bss;
3150 
3151                 if ((bip = bsp->bs_inst) == NULL ||
3152                     (rc = miocpullup(mp, sizeof (*bss))) != 0)
3153                         break;
3154                 /* LINTED: alignment */
3155                 bss = (bridge_setstate_t *)mp->b_cont->b_rptr;
3156                 if ((blp = enter_link(bip, bss->bss_linkid)) == NULL) {
3157                         rc = ENOENT;
3158                 } else {
3159                         rc = 0;
3160                         blp->bl_state = bss->bss_state;
3161                 }
3162                 rw_exit(&bip->bi_rwlock);
3163                 break;
3164         }
3165 
3166         case BRIOC_SETPVID: {
3167                 bridge_setpvid_t *bsv;
3168 
3169                 if ((bip = bsp->bs_inst) == NULL ||
3170                     (rc = miocpullup(mp, sizeof (*bsv))) != 0)
3171                         break;
3172                 /* LINTED: alignment */
3173                 bsv = (bridge_setpvid_t *)mp->b_cont->b_rptr;
3174                 if (bsv->bsv_vlan > VLAN_ID_MAX)
3175                         break;
3176                 if ((blp = enter_link(bip, bsv->bsv_linkid)) == NULL) {
3177                         rc = ENOENT;
3178                 } else if (blp->bl_pvid == bsv->bsv_vlan) {
3179                         rc = 0;
3180                 } else {
3181                         rc = 0;
3182                         BRIDGE_VLAN_CLR(blp, blp->bl_pvid);
3183                         blp->bl_pvid = bsv->bsv_vlan;
3184                         if (blp->bl_pvid != 0)
3185                                 BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3186                 }
3187                 rw_exit(&bip->bi_rwlock);
3188                 break;
3189         }
3190 
3191         case BRIOC_VLANENAB: {
3192                 bridge_vlanenab_t *bve;
3193 
3194                 if ((bip = bsp->bs_inst) == NULL ||
3195                     (rc = miocpullup(mp, sizeof (*bve))) != 0)
3196                         break;
3197                 /* LINTED: alignment */
3198                 bve = (bridge_vlanenab_t *)mp->b_cont->b_rptr;
3199                 if (bve->bve_vlan > VLAN_ID_MAX)
3200                         break;
3201                 if ((blp = enter_link(bip, bve->bve_linkid)) == NULL) {
3202                         rc = ENOENT;
3203                 } else {
3204                         rc = 0;
3205                         /* special case: vlan 0 means "all" */
3206                         if (bve->bve_vlan == 0) {
3207                                 (void) memset(blp->bl_vlans,
3208                                     bve->bve_onoff ? ~0 : 0,
3209                                     sizeof (blp->bl_vlans));
3210                                 BRIDGE_VLAN_CLR(blp, 0);
3211                                 if (blp->bl_pvid != 0)
3212                                         BRIDGE_VLAN_SET(blp, blp->bl_pvid);
3213                         } else if (bve->bve_vlan == blp->bl_pvid) {
3214                                 rc = EINVAL;
3215                         } else if (bve->bve_onoff) {
3216                                 BRIDGE_VLAN_SET(blp, bve->bve_vlan);
3217                         } else {
3218                                 BRIDGE_VLAN_CLR(blp, bve->bve_vlan);
3219                         }
3220                 }
3221                 rw_exit(&bip->bi_rwlock);
3222                 break;
3223         }
3224 
3225         case BRIOC_FLUSHFWD: {
3226                 bridge_flushfwd_t *bff;
3227                 bridge_fwd_t *bfp, *bfnext;
3228                 avl_tree_t fwd_scavenge;
3229                 int i;
3230 
3231                 if ((bip = bsp->bs_inst) == NULL ||
3232                     (rc = miocpullup(mp, sizeof (*bff))) != 0)
3233                         break;
3234                 /* LINTED: alignment */
3235                 bff = (bridge_flushfwd_t *)mp->b_cont->b_rptr;
3236                 rw_enter(&bip->bi_rwlock, RW_WRITER);
3237                 /* This case means "all" */
3238                 if (bff->bff_linkid == DATALINK_INVALID_LINKID) {
3239                         blp = NULL;
3240                 } else {
3241                         for (blp = list_head(&bip->bi_links); blp != NULL;
3242                             blp = list_next(&bip->bi_links, blp)) {
3243                                 if (blp->bl_linkid == bff->bff_linkid &&
3244                                     !(blp->bl_flags & BLF_DELETED))
3245                                         break;
3246                         }
3247                         if (blp == NULL) {
3248                                 rc = ENOENT;
3249                                 rw_exit(&bip->bi_rwlock);
3250                                 break;
3251                         }
3252                 }
3253                 avl_create(&fwd_scavenge, fwd_compare, sizeof (bridge_fwd_t),
3254                     offsetof(bridge_fwd_t, bf_node));
3255                 bfnext = avl_first(&bip->bi_fwd);
3256                 while ((bfp = bfnext) != NULL) {
3257                         bfnext = AVL_NEXT(&bip->bi_fwd, bfp);
3258                         if (bfp->bf_flags & BFF_LOCALADDR)
3259                                 continue;
3260                         if (blp != NULL) {
3261                                 for (i = 0; i < bfp->bf_maxlinks; i++) {
3262                                         if (bfp->bf_links[i] == blp)
3263                                                 break;
3264                                 }
3265                                 /*
3266                                  * If the link is there and we're excluding,
3267                                  * then skip.  If the link is not there and
3268                                  * we're doing only that link, then skip.
3269                                  */
3270                                 if ((i < bfp->bf_maxlinks) == bff->bff_exclude)
3271                                         continue;
3272                         }
3273                         ASSERT(bfp->bf_flags & BFF_INTREE);
3274                         avl_remove(&bip->bi_fwd, bfp);
3275                         bfp->bf_flags &= ~BFF_INTREE;
3276                         avl_add(&fwd_scavenge, bfp);
3277                 }
3278                 rw_exit(&bip->bi_rwlock);
3279                 bfnext = avl_first(&fwd_scavenge);
3280                 while ((bfp = bfnext) != NULL) {
3281                         bfnext = AVL_NEXT(&fwd_scavenge, bfp);
3282                         avl_remove(&fwd_scavenge, bfp);
3283                         fwd_unref(bfp); /* drop tree reference */
3284                 }
3285                 avl_destroy(&fwd_scavenge);
3286                 break;
3287         }
3288 
3289         case BRIOC_TABLEMAX:
3290                 if ((bip = bsp->bs_inst) == NULL ||
3291                     (rc = miocpullup(mp, sizeof (uint32_t))) != 0)
3292                         break;
3293                 /* LINTED: alignment */
3294                 bip->bi_tablemax = *(uint32_t *)mp->b_cont->b_rptr;
3295                 break;
3296         }
3297 
3298         if (rc == 0)
3299                 miocack(wq, mp, len, 0);
3300         else
3301                 miocnak(wq, mp, 0, rc);
3302 }
3303 
3304 static void
3305 bridge_wput(queue_t *wq, mblk_t *mp)
3306 {
3307         switch (DB_TYPE(mp)) {
3308         case M_IOCTL:
3309                 bridge_ioctl(wq, mp);
3310                 break;
3311         case M_FLUSH:
3312                 if (*mp->b_rptr & FLUSHW)
3313                         *mp->b_rptr &= ~FLUSHW;
3314                 if (*mp->b_rptr & FLUSHR)
3315                         qreply(wq, mp);
3316                 else
3317                         freemsg(mp);
3318                 break;
3319         default:
3320                 freemsg(mp);
3321                 break;
3322         }
3323 }
3324 
3325 /*
3326  * This function allocates the main data structures for the bridge driver and
3327  * connects us into devfs.
3328  */
3329 static void
3330 bridge_inst_init(void)
3331 {
3332         bridge_scan_interval = drv_sectohz(5);
3333         bridge_fwd_age = drv_sectohz(25);
3334 
3335         rw_init(&bmac_rwlock, NULL, RW_DRIVER, NULL);
3336         list_create(&bmac_list, sizeof (bridge_mac_t),
3337             offsetof(bridge_mac_t, bm_node));
3338         list_create(&inst_list, sizeof (bridge_inst_t),
3339             offsetof(bridge_inst_t, bi_node));
3340         cv_init(&inst_cv, NULL, CV_DRIVER, NULL);
3341         mutex_init(&inst_lock, NULL, MUTEX_DRIVER, NULL);
3342         cv_init(&stream_ref_cv, NULL, CV_DRIVER, NULL);
3343         mutex_init(&stream_ref_lock, NULL, MUTEX_DRIVER, NULL);
3344 
3345         mac_bridge_vectors(bridge_xmit_cb, bridge_recv_cb, bridge_ref_cb,
3346             bridge_ls_cb);
3347 }
3348 
3349 /*
3350  * This function disconnects from devfs and destroys all data structures in
3351  * preparation for unload.  It's assumed that there are no active bridge
3352  * references left at this point.
3353  */
3354 static void
3355 bridge_inst_fini(void)
3356 {
3357         mac_bridge_vectors(NULL, NULL, NULL, NULL);
3358         if (bridge_timerid != 0)
3359                 (void) untimeout(bridge_timerid);
3360         rw_destroy(&bmac_rwlock);
3361         list_destroy(&bmac_list);
3362         list_destroy(&inst_list);
3363         cv_destroy(&inst_cv);
3364         mutex_destroy(&inst_lock);
3365         cv_destroy(&stream_ref_cv);
3366         mutex_destroy(&stream_ref_lock);
3367 }
3368 
3369 /*
3370  * bridge_attach()
3371  *
3372  * Description:
3373  *    Attach bridge driver to the system.
3374  */
3375 static int
3376 bridge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3377 {
3378         if (cmd != DDI_ATTACH)
3379                 return (DDI_FAILURE);
3380 
3381         if (ddi_create_minor_node(dip, BRIDGE_CTL, S_IFCHR, 0, DDI_PSEUDO,
3382             CLONE_DEV) == DDI_FAILURE) {
3383                 return (DDI_FAILURE);
3384         }
3385 
3386         if (dld_ioc_register(BRIDGE_IOC, bridge_ioc_list,
3387             DLDIOCCNT(bridge_ioc_list)) != 0) {
3388                 ddi_remove_minor_node(dip, BRIDGE_CTL);
3389                 return (DDI_FAILURE);
3390         }
3391 
3392         bridge_dev_info = dip;
3393         bridge_major = ddi_driver_major(dip);
3394         bridge_taskq = ddi_taskq_create(dip, BRIDGE_DEV_NAME, 1,
3395             TASKQ_DEFAULTPRI, 0);
3396         return (DDI_SUCCESS);
3397 }
3398 
3399 /*
3400  * bridge_detach()
3401  *
3402  * Description:
3403  *    Detach an interface to the system.
3404  */
3405 static int
3406 bridge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3407 {
3408         if (cmd != DDI_DETACH)
3409                 return (DDI_FAILURE);
3410 
3411         ddi_remove_minor_node(dip, NULL);
3412         ddi_taskq_destroy(bridge_taskq);
3413         bridge_dev_info = NULL;
3414         return (DDI_SUCCESS);
3415 }
3416 
3417 /*
3418  * bridge_info()
3419  *
3420  * Description:
3421  *    Translate "dev_t" to a pointer to the associated "dev_info_t".
3422  */
3423 /* ARGSUSED */
3424 static int
3425 bridge_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
3426         void **result)
3427 {
3428         int     rc;
3429 
3430         switch (infocmd) {
3431         case DDI_INFO_DEVT2DEVINFO:
3432                 if (bridge_dev_info == NULL) {
3433                         rc = DDI_FAILURE;
3434                 } else {
3435                         *result = (void *)bridge_dev_info;
3436                         rc = DDI_SUCCESS;
3437                 }
3438                 break;
3439         case DDI_INFO_DEVT2INSTANCE:
3440                 *result = NULL;
3441                 rc = DDI_SUCCESS;
3442                 break;
3443         default:
3444                 rc = DDI_FAILURE;
3445                 break;
3446         }
3447         return (rc);
3448 }
3449 
3450 static struct module_info bridge_modinfo = {
3451         2105,                   /* mi_idnum */
3452         BRIDGE_DEV_NAME,        /* mi_idname */
3453         0,                      /* mi_minpsz */
3454         16384,                  /* mi_maxpsz */
3455         65536,                  /* mi_hiwat */
3456         128                     /* mi_lowat */
3457 };
3458 
3459 static struct qinit bridge_rinit = {
3460         NULL,                   /* qi_putp */
3461         NULL,                   /* qi_srvp */
3462         bridge_open,            /* qi_qopen */
3463         bridge_close,           /* qi_qclose */
3464         NULL,                   /* qi_qadmin */
3465         &bridge_modinfo,    /* qi_minfo */
3466         NULL                    /* qi_mstat */
3467 };
3468 
3469 static struct qinit bridge_winit = {
3470         (int (*)())bridge_wput, /* qi_putp */
3471         NULL,                   /* qi_srvp */
3472         NULL,                   /* qi_qopen */
3473         NULL,                   /* qi_qclose */
3474         NULL,                   /* qi_qadmin */
3475         &bridge_modinfo,    /* qi_minfo */
3476         NULL                    /* qi_mstat */
3477 };
3478 
3479 static struct streamtab bridge_tab = {
3480         &bridge_rinit,      /* st_rdinit */
3481         &bridge_winit       /* st_wrinit */
3482 };
3483 
3484 /* No STREAMS perimeters; we do all our own locking */
3485 DDI_DEFINE_STREAM_OPS(bridge_ops, nulldev, nulldev, bridge_attach,
3486     bridge_detach, nodev, bridge_info, D_NEW | D_MP, &bridge_tab,
3487     ddi_quiesce_not_supported);
3488 
3489 static struct modldrv modldrv = {
3490         &mod_driverops,
3491         "bridging driver",
3492         &bridge_ops
3493 };
3494 
3495 static struct modlinkage modlinkage = {
3496         MODREV_1,
3497         (void *)&modldrv,
3498         NULL
3499 };
3500 
3501 int
3502 _init(void)
3503 {
3504         int retv;
3505 
3506         mac_init_ops(NULL, BRIDGE_DEV_NAME);
3507         bridge_inst_init();
3508         if ((retv = mod_install(&modlinkage)) != 0)
3509                 bridge_inst_fini();
3510         return (retv);
3511 }
3512 
3513 int
3514 _fini(void)
3515 {
3516         int retv;
3517 
3518         rw_enter(&bmac_rwlock, RW_READER);
3519         retv = list_is_empty(&bmac_list) ? 0 : EBUSY;
3520         rw_exit(&bmac_rwlock);
3521         if (retv == 0 &&
3522             (retv = mod_remove(&modlinkage)) == 0)
3523                 bridge_inst_fini();
3524         return (retv);
3525 }
3526 
3527 int
3528 _info(struct modinfo *modinfop)
3529 {
3530         return (mod_info(&modlinkage, modinfop));
3531 }