/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>

#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"

/* ILB kmem cache flag */
int ilb_kmem_flags = 0;

/*
 * The default sizes of the different hash tables.  These sizes are global
 * for all stacks; each stack allocates its own tables, but they all share
 * the same size.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/* Default NAT cache entry expiry times (in seconds). */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/* Default sticky entry expiry time (in seconds). */
static uint32_t ilb_sticky_expiry = 60;

/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define	ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
	*(addr)) & ((hash_size) - 1))
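
/*
 * Illustrative sketch (compiled out, not used by the module):
 * ILB_RULE_HASH() is a base-31 polynomial hash over the four bytes of a
 * v4 address (29791 = 31^3, 961 = 31^2), masked with the table size,
 * which must therefore be a power of 2.
 */
#ifdef ILB_EXAMPLE_SKETCH
static int
ilb_example_rule_hash(void)
{
	ipaddr_t v4 = htonl(0xc0a80a01);	/* 192.168.10.1 */

	/* Index into a 2048-bucket table; the mask keeps it in range. */
	return (ILB_RULE_HASH((uint8_t *)&v4, 2048));
}
#endif /* ILB_EXAMPLE_SKETCH */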

/*
 * Note on ILB delayed processing
 *
 * To avoid in line removal on some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
 * There are three types of ILB taskqs:
 *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
 * 2. conn hash handling: created at conn hash initialization time,
 *                        ilb_conn_hash_init()
 * 3. sticky hash handling: created at sticky hash initialization time,
 *                          ilb_sticky_hash_init()
 *
 * The rule taskq is for processing rule and server removal.  When a user
 * land rule/server removal request comes in, a task is dispatched after
 * removing the rule/server from all related hashes.  This task waits
 * until all references to the rule/server are gone before removing it,
 * so the user land thread requesting the removal does not need to wait
 * for the removal to complete.
 *
 * The conn hash/sticky hash taskqs are for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size
 * timers and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively.  Each timer is responsible for one
 * portion (all of the same size) of the hash table.  When a timer fires,
 * it dispatches a conn hash taskq to clean up its portion of the table,
 * as sketched below.  This avoids in line processing of the removal.
 *
 * There is one more piece of delayed processing, the cleanup of the NAT
 * source address table.  We use the timer to handle it directly instead
 * of using a taskq, since that table is small.
 */
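
/*
 * A minimal sketch (compiled out) of the timer-to-taskq pattern described
 * above.  The argument type, taskq and cleanup routine named here are
 * hypothetical stand-ins; the real ones live with the conn/sticky hash
 * code in ilb_conn.c.
 */
#ifdef ILB_EXAMPLE_SKETCH
typedef struct {
	size_t	et_start;	/* first bucket of this timer's portion */
	size_t	et_nbucket;	/* number of buckets in the portion */
} ilb_example_timer_arg_t;

static taskq_t *ilb_example_conn_taskq;

static void
ilb_example_conn_cleanup(void *arg)
{
	ilb_example_timer_arg_t *et = arg;
	size_t i;

	for (i = et->et_start; i < et->et_start + et->et_nbucket; i++) {
		/* Reap expired entries in bucket i of the conn hash. */
	}
}

static void
ilb_example_conn_timer(void *arg)
{
	/* Hand the portion to the taskq; never clean up in line. */
	(void) taskq_dispatch(ilb_example_conn_taskq,
	    ilb_example_conn_cleanup, arg, TQ_SLEEP);
}
#endif /* ILB_EXAMPLE_SKETCH */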

/* ILB rule taskq constants. */
#define	ILB_RULE_TASKQ_NUM_THR	20

/* Argument passed to ILB rule taskq routines. */
typedef struct {
	ilb_stack_t	*ilbs;
	ilb_rule_t	*rule;
} ilb_rule_tq_t;

/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);

/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);

static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
    int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);

/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);

/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);

/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define	IS_ADDR_UNSPEC(addr)						\
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :	\
	    IN6_IS_ADDR_UNSPECIFIED(addr))
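
/*
 * Illustrative sketch (compiled out): for a v4-mapped address the macro
 * treats ::ffff:0.0.0.0 (INADDR_ANY mapped into v6) as unspecified, while
 * a native v6 address is compared against ::.
 */
#ifdef ILB_EXAMPLE_SKETCH
static boolean_t
ilb_example_is_unspec(void)
{
	in6_addr_t v4any;

	IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v4any);
	return (IS_ADDR_UNSPEC(&v4any));	/* B_TRUE */
}
#endif /* ILB_EXAMPLE_SKETCH */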

/*
 * Global kstat instance counter.  When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"

static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ksp;
	ilb_g_kstat_t template = {
		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
	    KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);
	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
	ksp->ks_data = ilbs->ilbs_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)
		    ilbs->ilbs_ksp->ks_private);
		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
		ilbs->ilbs_ksp = NULL;
	}
}

static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
	kstat_t *ksp;
	ilb_rule_kstat_t template = {
		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &rule->ir_kstat, sizeof (template));
	ksp->ks_data = &rule->ir_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
	kstat_t *ksp;
	ilb_server_kstat_t template = {
		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
		{ "ip_address",		KSTAT_DATA_STRING, 0 }
	};
	char cname_buf[KSTAT_STRLEN];

	/* 7 is strlen("-sstat") plus 1 for the terminating NUL. */
	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &server->iser_kstat, sizeof (template));
	ksp->ks_data = &server->iser_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_named_setstr(&server->iser_kstat.ip_address,
	    server->iser_ip_addr);
	/* We never change the IP address. */
	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

	kstat_install(ksp);
	return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
	int i;

	/*
	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
	 * the next power of 2.
	 */
	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_rule_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_rule_hash_size = 1 << i;
	}
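
	/*
	 * Example (illustrative): a tuning of 3000 is not a power of 2
	 * (3000 & 2999 != 0); the loop above exits at i = 12 since
	 * 3000 < 4096, so the size becomes 1 << 12 = 4096.  The default
	 * of 2048 is already a power of 2 and is left unchanged.
	 */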
	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_g_hash == NULL)
		return;
	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
	int i;

	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
	rule->ir_hash_prev = NULL;
	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

	rule->ir_hash = &ilbs->ilbs_g_hash[i];
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table.  Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	mutex_enter(&rule->ir_hash->ilb_hash_lock);
	if (rule->ir_hash->ilb_hash_rule == rule) {
		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next =
			    rule->ir_hash_next;
		if (rule->ir_hash_next != NULL) {
			rule->ir_hash_next->ir_hash_prev =
			    rule->ir_hash_prev;
		}
	}
	mutex_exit(&rule->ir_hash->ilb_hash_lock);

	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing.  But this requires a
			 * mechanism in IP to re-start the packet
			 * processing.  So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}

/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_t *prev_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	prev_rule = NULL;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule == rule)
			break;
	}
	/* The lock is the caller's; do not release it on the way out. */
	if (tmp_rule == NULL)
		return;
	if (prev_rule == NULL)
		ilbs->ilbs_rule_head = tmp_rule->ir_next;
	else
		prev_rule->ir_next = tmp_rule->ir_next;
	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}

/*
 * Helper routine to calculate how many source addresses are in a given
 * range.
 */
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
	int64_t ret;
	uint32_t addr1, addr2;

	/*
	 * Here we assume that the max number of NAT source addresses
	 * cannot be so large that the two most significant s6_addr32
	 * fields differ; they must be equal.
	 */
	addr1 = ntohl(a1->s6_addr32[3]);
	addr2 = ntohl(a2->s6_addr32[3]);
	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
		return (-1);
	}
	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
		return (addr2 - addr1 + 1);
	} else {
		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
		ret <<= 32;
		ret = ret + (int64_t)addr2 - addr1;
		return (ret + 1);
	}
}
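
/*
 * Worked example (illustrative): for the range ::a:fffffffe through
 * ::b:00000001, s6_addr32[2] differs by 1 and the low words give
 * addr2 - addr1 = 1 - 0xfffffffe = -4294967293, so the count is
 * (1 << 32) - 4294967293 + 1 = 4, i.e. ...a:fffffffe, ...a:ffffffff,
 * ...b:00000000 and ...b:00000001.
 */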

/*
 * Add an ILB rule.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	netstackid_t stackid;
	int ret;
	in_port_t min_port, max_port;
	int64_t num_src;

	/* Sanity checks. */
	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
		return (EINVAL);

	/* Need to support SCTP... */
	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
		return (EINVAL);

	/* For full NAT, the NAT source must be supplied. */
	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
			return (EINVAL);
		}
	}

	/* Check for an invalid sticky mask. */
	if ((cmd->flags & ILB_RULE_STICKY) &&
	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
		return (EINVAL);
	}

	/* Port is passed in network byte order. */
	min_port = ntohs(cmd->min_port);
	max_port = ntohs(cmd->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports".  Make it so. */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}

	/* Funny address checking. */
	if (cmd->ip_ver == IPPROTO_IP) {
		in_addr_t v4_addr1, v4_addr2;

		v4_addr1 = cmd->vip.s6_addr32[3];
		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
		    v4_addr1 == INADDR_ANY ||
		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
			    v4_addr1 == INADDR_BROADCAST ||
			    v4_addr2 == INADDR_BROADCAST ||
			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			num_src = v4_addr2 - v4_addr1 + 1;
			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
				return (EINVAL);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
			    &cmd->nat_src_end)) < 0 ||
			    num_src > ILB_MAX_NAT_SRC) {
				return (EINVAL);
			}
		}
	}

	mutex_enter(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_g_hash == NULL)
		ilb_rule_hash_init(ilbs);
	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		ilb_conn_hash_init(ilbs);
		ilb_nat_src_init(ilbs);
	}

	/* Make sure that the new rule does not duplicate an existing one. */
	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
	    min_port, max_port, &cmd->vip)) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (EEXIST);
	}

	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (ENOMEM);
	}

	/* ir_name is all 0 to begin with. */
	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

	rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
		ret = ENOMEM;
		goto error;
	}

	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		rule->ir_nat_src_start = cmd->nat_src_start;
		rule->ir_nat_src_end = cmd->nat_src_end;
	}

	rule->ir_ipver = cmd->ip_ver;
	rule->ir_proto = cmd->proto;
	rule->ir_topo = cmd->topo;

	rule->ir_min_port = min_port;
	rule->ir_max_port = max_port;
	if (rule->ir_min_port != rule->ir_max_port)
		rule->ir_port_range = B_TRUE;
	else
		rule->ir_port_range = B_FALSE;

	rule->ir_zoneid = zoneid;

	rule->ir_target_v6 = cmd->vip;
	rule->ir_servers = NULL;

	/*
	 * The default connection drain timeout is indefinite (value 0),
	 * meaning we will wait for all connections to finish.  So we
	 * can assign cmd->conn_drain_timeout to it directly.
	 */
	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
	if (cmd->nat_expiry != 0) {
		rule->ir_nat_expiry = cmd->nat_expiry;
	} else {
		switch (rule->ir_proto) {
		case IPPROTO_TCP:
			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
			break;
		case IPPROTO_UDP:
			rule->ir_nat_expiry = ilb_conn_udp_expiry;
			break;
		default:
			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
			    (void *)rule);
			break;
		}
	}
	if (cmd->sticky_expiry != 0)
		rule->ir_sticky_expiry = cmd->sticky_expiry;
	else
		rule->ir_sticky_expiry = ilb_sticky_expiry;

	if (cmd->flags & ILB_RULE_STICKY) {
		rule->ir_flags |= ILB_RULE_STICKY;
		rule->ir_sticky_mask = cmd->sticky_mask;
		if (ilbs->ilbs_sticky_hash == NULL)
			ilb_sticky_hash_init(ilbs);
	}
	if (cmd->flags & ILB_RULE_ENABLED)
		rule->ir_flags |= ILB_RULE_ENABLED;

	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

	rule->ir_refcnt = 1;

	switch (cmd->algo) {
	case ILB_ALG_IMPL_ROUNDROBIN:
		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
		break;
	case ILB_ALG_IMPL_HASH_IP:
	case ILB_ALG_IMPL_HASH_IP_SPORT:
	case ILB_ALG_IMPL_HASH_IP_VIP:
		if ((rule->ir_alg = ilb_alg_hash_init(rule,
		    &cmd->algo)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = cmd->algo;
		break;
	default:
		ret = EINVAL;
		goto error;
	}

	/* Add it to the global list and hash array at the end. */
	ilb_rule_g_add(ilbs, rule);
	ilb_rule_hash_add(ilbs, rule, &cmd->vip);

	mutex_exit(&ilbs->ilbs_g_lock);

	return (0);

error:
	mutex_exit(&ilbs->ilbs_g_lock);
	if (rule->ir_ksp != NULL) {
		/* stackid must be initialized if ir_ksp != NULL */
		kstat_delete_netstack(rule->ir_ksp, stackid);
	}
	kmem_free(rule, sizeof (ilb_rule_t));
	return (ret);
}
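
/*
 * A compiled-out sketch of how a caller might fill in an ilb_rule_cmd_t
 * and call ilb_rule_add().  Only fields that ilb_rule_add() actually
 * inspects are shown; the exact layout of ilb_rule_cmd_t is defined in
 * the ILB headers, and the surrounding ioctl plumbing is omitted.
 */
#ifdef ILB_EXAMPLE_SKETCH
static int
ilb_example_add_rule(ilb_stack_t *ilbs, const in6_addr_t *vip)
{
	ilb_rule_cmd_t cmd;

	bzero(&cmd, sizeof (cmd));
	(void) strlcpy(cmd.name, "web-rule", sizeof (cmd.name));
	cmd.ip_ver = IPPROTO_IPV6;
	cmd.proto = IPPROTO_TCP;
	cmd.topo = ILB_TOPO_IMPL_DSR;		/* no NAT source needed */
	cmd.vip = *vip;
	cmd.min_port = htons(80);		/* network byte order */
	cmd.max_port = htons(80);
	cmd.algo = ILB_ALG_IMPL_ROUNDROBIN;
	cmd.flags = ILB_RULE_ENABLED;
	/* Zero timeouts pick the defaults (ilb_conn_*_expiry etc.). */

	return (ilb_rule_add(ilbs, GLOBAL_ZONEID, &cmd));
}
#endif /* ILB_EXAMPLE_SKETCH */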

/*
 * The final part in deleting a rule.  Called either directly or from the
 * dispatched taskq.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
	netstackid_t stackid;
	ilb_server_t *server;

	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

	/*
	 * Let the algorithm know that the rule is going away.  The
	 * algorithm fini routine will free all its resources with this
	 * rule.
	 */
	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

	while ((server = tmp_rule->ir_servers) != NULL) {
		mutex_enter(&server->iser_lock);
		ilb_destroy_nat_src(&server->iser_nat_src);
		if (tmp_rule->ir_conn_drain_timeout != 0) {
			/*
			 * The garbage collection thread checks this value
			 * without grabbing a lock.  So we need to use
			 * atomic_swap_64() to make sure that the value seen
			 * by the gc thread is intact.
			 */
			(void) atomic_swap_64(
			    (uint64_t *)&server->iser_die_time,
			    ddi_get_lbolt64() +
			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
		}
		while (server->iser_refcnt > 1)
			cv_wait(&server->iser_cv, &server->iser_lock);
		mutex_exit(&server->iser_lock);
		tmp_rule->ir_servers = server->iser_next;
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

	ASSERT(tmp_rule->ir_ksp != NULL);
	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

	kmem_free(tmp_rule, sizeof (ilb_rule_t));
}

/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

	mutex_enter(&rule->ir_lock);
	while (rule->ir_refcnt > 1)
		cv_wait(&rule->ir_cv, &rule->ir_lock);
	mutex_exit(&rule->ir_lock);
	ilb_rule_del_common(ilbs, rule);
	kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
	    &err)) == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * First remove the rule from the hash array and the global list so
	 * that no one can find this rule any more.
	 */
	ilb_rule_hash_del(tmp_rule);
	ilb_rule_g_del(ilbs, tmp_rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	ILB_RULE_REFRELE(tmp_rule);

	/*
	 * Now that no one can find this rule, we can remove it once all
	 * references to it are dropped and all references to the list
	 * of servers are dropped.  So dispatch a task to finish the deletion.
	 * We do this instead of letting the last one referencing the
	 * rule do it.  The reason is that the last one may be the
	 * interrupt thread.  We want to minimize the work it needs to
	 * do.  Rule deletion is not a critical task so it can be delayed.
	 */
	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = tmp_rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);

	return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (rule->ir_target_v6.s6_addr32[3] == addr) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;
		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
			mutex_enter(&tmp_rule->ir_lock);
			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&tmp_rule->ir_lock);
				*err = EINPROGRESS;
				return (NULL);
			}
			tmp_rule->ir_refcnt++;
			mutex_exit(&tmp_rule->ir_lock);
			*err = 0;
			return (tmp_rule);
		}
	}
	*err = ENOENT;
	return (NULL);
}

/* To find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (tmp_rule);
}

/* Try to match the given packet info and zone ID with a rule. */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		/*
		 * We don't allow the same name in different rules even if all
		 * the other rule components are different.
		 */
		if (strcasecmp(tmp_rule->ir_name, name) == 0)
			return (B_TRUE);

		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
			continue;

		/*
		 * ir_min_port and ir_max_port are the same if ir_port_range
		 * is B_FALSE.  In that case the rule conflicts only if its
		 * single port falls inside the given range.  In general,
		 * two port ranges overlap unless one ends before the other
		 * begins.
		 */
		if (tmp_rule->ir_max_port < min_port ||
		    tmp_rule->ir_min_port > max_port) {
			continue;
		}
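
		/*
		 * Example (illustrative): an existing rule on ports
		 * [80, 88] overlaps a request for [85, 90], since
		 * 88 >= 85 and 80 <= 90; a request for [90, 95] does
		 * not overlap and falls through to the continue above.
		 */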

		/*
		 * If l3 is IPv4, the addr passed in is assumed to be
		 * a v4-mapped address.
		 */
		if (V6_OR_V4_INADDR_ANY(*addr) ||
		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags |= ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up here (not passed in). */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up here (not passed in). */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules.  For
 * now, just add a simple loop for enable/disable/del.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		/*
		 * No need to hold the rule as we are holding the global
		 * lock so it won't go away.  Ignore the return value here
		 * as the rule is provided so the call cannot fail.
		 */
		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL;
	    rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;
	ilb_rule_tq_t *arg;

	mutex_enter(&ilbs->ilbs_g_lock);
	rule = ilbs->ilbs_rule_head;
	while (rule != NULL) {
		/* Skip over rules belonging to other zones. */
		if (rule->ir_zoneid != zoneid) {
			rule = rule->ir_next;
			continue;
		}
		ilb_rule_hash_del(rule);
		ilb_rule_g_del(ilbs, rule);
		mutex_exit(&ilbs->ilbs_g_lock);

		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
		arg->ilbs = ilbs;
		arg->rule = rule;
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
		    arg, TQ_SLEEP);

		/*
		 * The global lock was dropped while dispatching, so restart
		 * from the head of the (possibly changed) list.
		 */
		mutex_enter(&ilbs->ilbs_g_lock);
		rule = ilbs->ilbs_rule_head;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

/*
 * This is just an optimization, so don't grab the global lock.  The
 * worst case is that we miss a couple of packets.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
	return (ilbs->ilbs_rule_head != NULL);
}

static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
	ilb_server_t *tmp_server;
	int ret;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));

	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Once we get a hold on the rule, no server can be added/deleted. */
	for (tmp_server = rule->ir_servers; tmp_server != NULL;
	    tmp_server = tmp_server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
			break;
	}
	if (tmp_server == NULL) {
		ret = ENOENT;
		goto done;
	}

	if (enable) {
		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_TRUE;
			tmp_server->iser_die_time = 0;
		}
	} else {
		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_FALSE;
			if (rule->ir_conn_drain_timeout != 0) {
				(void) atomic_swap_64(
				    (uint64_t *)&tmp_server->iser_die_time,
				    ddi_get_lbolt64() + SEC_TO_TICK(
				    rule->ir_conn_drain_timeout));
			}
		}
	}

done:
	if (rule_name != NULL)
		ILB_RULE_REFRELE(rule);
	return (ret);
}

int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}

/*
 * Add a back end server to a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
	ilb_server_t	*server;
	netstackid_t	stackid;
	int		ret = 0;
	in_port_t	min_port, max_port;
	in_port_t	range;

	/* Port is passed in network byte order. */
	min_port = ntohs(info->min_port);
	max_port = ntohs(info->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports".  Make it so. */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}
	range = max_port - min_port;

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing server add/del, sleep and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}

	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other guys to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Sanity checks... */
	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IP) ||
	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IPV6)) {
		ret = EINVAL;
		goto end;
	}

	/*
	 * Check for a valid port range.
	 *
	 * For DSR, there can be no port shifting, so the server's port
	 * specification must be the same as the rule's.
	 *
	 * For half-NAT/NAT, the range must either be 0 (port collapsing)
	 * or span the same number of ports as the rule's range.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
		if (rule->ir_max_port != max_port ||
		    rule->ir_min_port != min_port) {
			ret = EINVAL;
			goto end;
		}
	} else {
		if ((range != rule->ir_max_port - rule->ir_min_port) &&
		    range != 0) {
			ret = EINVAL;
			goto end;
		}
	}

	/* Check for duplicate. */
	for (server = rule->ir_servers; server != NULL;
	    server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
		    strcasecmp(server->iser_name, info->name) == 0) {
			break;
		}
	}
	if (server != NULL) {
		ret = EEXIST;
		goto end;
	}

	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
		ret = ENOMEM;
		goto end;
	}

	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
	    sizeof (server->iser_ip_addr));
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
	if (server->iser_ksp == NULL) {
		kmem_free(server, sizeof (ilb_server_t));
		ret = EINVAL;
		goto end;
	}

	server->iser_stackid = stackid;
	server->iser_addr_v6 = info->addr;
	server->iser_min_port = min_port;
	server->iser_max_port = max_port;
	if (min_port != max_port)
		server->iser_port_range = B_TRUE;
	else
		server->iser_port_range = B_FALSE;

	/*
	 * If the rule uses NAT, find/create the NAT source entry to use
	 * for this server.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/*
		 * If the server uses a port range, our port allocation
		 * scheme needs to treat it as a wildcard.  Refer to the
		 * comments in ilb_nat.c about the scheme.
		 */
		if (server->iser_port_range)
			port = 0;
		else
			port = server->iser_min_port;

		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
		    num_nat_src_v6(&rule->ir_nat_src_start,
		    &rule->ir_nat_src_end))) != 0) {
			kstat_delete_netstack(server->iser_ksp, stackid);
			kmem_free(server, sizeof (ilb_server_t));
			goto end;
		}
	}

	/*
	 * The iser_lock is only used to protect iser_refcnt.  All the other
	 * fields in ilb_server_t should not change, except for iser_enabled.
	 * The worst thing that can happen if iser_enabled is messed up is
	 * that one or two packets may not be load balanced to a server
	 * correctly.
	 */
	server->iser_refcnt = 1;
	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
	    B_FALSE;
	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

	/* Let the load balancing algorithm know about the addition. */
	ASSERT(rule->ir_alg != NULL);
	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
		goto end;
	}

	/*
	 * No need to hold ir_lock since no other thread should manipulate
	 * the following fields until ILB_RULE_BUSY is cleared.
	 */
	if (rule->ir_servers == NULL) {
		server->iser_next = NULL;
	} else {
		server->iser_next = rule->ir_servers;
	}
	rule->ir_servers = server;
	ILB_R_KSTAT(rule, num_servers);

end:
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}
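
/*
 * A compiled-out sketch of adding one back end server to a rule with
 * ilb_server_add().  Only the ilb_server_info_t fields that
 * ilb_server_add() reads are filled in; an IPv4 server address must be
 * passed as a v4-mapped v6 address, and for a DSR rule the server's port
 * range must match the rule's.
 */
#ifdef ILB_EXAMPLE_SKETCH
static int
ilb_example_add_server(ilb_stack_t *ilbs, ilb_rule_t *rule, ipaddr_t v4)
{
	ilb_server_info_t info;

	bzero(&info, sizeof (info));
	(void) strlcpy(info.name, "websrv-1", sizeof (info.name));
	IN6_IPADDR_TO_V4MAPPED(v4, &info.addr);
	info.min_port = 0;		/* 0 means "all ports" */
	info.max_port = 0;
	info.flags = ILB_SERVER_ENABLED;

	return (ilb_server_add(ilbs, rule, &info));
}
#endif /* ILB_EXAMPLE_SKETCH */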
1395 
1396 /* The routine executed by the delayed rule processing taskq. */
1397 static void
1398 ilb_server_del_tq(void *arg)
1399 {
1400         ilb_server_t *server = (ilb_server_t *)arg;
1401 
1402         mutex_enter(&server->iser_lock);
1403         while (server->iser_refcnt > 1)
1404                 cv_wait(&server->iser_cv, &server->iser_lock);
1405         kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1406         kmem_free(server, sizeof (ilb_server_t));
1407 }
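
/*
 * A minimal sketch (not a function in this file) of what a reference
 * holder is assumed to do when dropping its reference, so that the wait
 * loop in ilb_server_del_tq() above can terminate:
 *
 *	mutex_enter(&server->iser_lock);
 *	if (--server->iser_refcnt == 1)
 *		cv_signal(&server->iser_cv);
 *	mutex_exit(&server->iser_lock);
 */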
1408 
1409 /*
1410  * Delete a back end server from a rule.  If the address is IPv4, it is assumed
1411  * to be passed in as a mapped address.
1412  */
1413 int
1414 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1415     ilb_rule_t *rule, in6_addr_t *addr)
1416 {
1417         ilb_server_t    *server;
1418         ilb_server_t    *prev_server;
1419         int             ret = 0;
1420 
1421         ASSERT((rule == NULL && rule_name != NULL) ||
1422             (rule != NULL && rule_name == NULL));
1423         if (rule == NULL) {
1424                 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1425                     &ret)) == NULL) {
1426                         return (ret);
1427                 }
1428         }
1429 
1430         mutex_enter(&rule->ir_lock);
	/* If someone is already doing a server add/del, sleep and wait. */
1432         while (rule->ir_flags & ILB_RULE_BUSY) {
1433                 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1434                         if (rule_name != NULL) {
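				/*
				 * Drop the reference taken by
				 * ilb_find_rule(); rule removal may be
				 * waiting for ir_refcnt to drop back
				 * to 2.
				 */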
1435                                 if (--rule->ir_refcnt <= 2)
1436                                         cv_signal(&rule->ir_cv);
1437                         }
1438                         mutex_exit(&rule->ir_lock);
1439                         return (EINTR);
1440                 }
1441         }
1442         /*
1443          * Set the rule to be busy to make sure that no new packet can
1444          * use this rule.
1445          */
1446         rule->ir_flags |= ILB_RULE_BUSY;
1447 
	/* Now wait for all other threads to finish their work. */
1449         while (rule->ir_refcnt > 2) {
1450                 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1451                         mutex_exit(&rule->ir_lock);
1452                         ret = EINTR;
1453                         goto end;
1454                 }
1455         }
1456         mutex_exit(&rule->ir_lock);
1457 
1458         prev_server = NULL;
1459         for (server = rule->ir_servers; server != NULL;
1460             prev_server = server, server = server->iser_next) {
1461                 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1462                         break;
1463         }
1464         if (server == NULL) {
1465                 ret = ENOENT;
1466                 goto end;
1467         }
1468 
1469         /*
1470          * Let the load balancing algorithm know about the removal.
1471          * The algorithm may disallow the removal...
1472          */
1473         if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1474             rule->ir_alg->ilb_alg_data)) != 0) {
1475                 goto end;
1476         }
1477 
1478         if (prev_server == NULL)
1479                 rule->ir_servers = server->iser_next;
1480         else
1481                 prev_server->iser_next = server->iser_next;
1482 
1483         ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1484 
1485         /*
1486          * Mark the server as disabled so that if there is any sticky cache
1487          * using this server around, it won't be used.
1488          */
1489         server->iser_enabled = B_FALSE;
1490 
1491         mutex_enter(&server->iser_lock);
1492 
	/*
	 * De-allocate the NAT source array.  The individual
	 * ilb_nat_src_entry_t may not go away if there is still a conn
	 * using it.  The NAT source timer will do the garbage collection.
	 */
1498         ilb_destroy_nat_src(&server->iser_nat_src);
1499 
1500         /* If there is a hard limit on when a server should die, set it. */
1501         if (rule->ir_conn_drain_timeout != 0) {
1502                 (void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1503                     ddi_get_lbolt64() +
1504                     SEC_TO_TICK(rule->ir_conn_drain_timeout));
1505         }
1506 
1507         if (server->iser_refcnt > 1) {
1508                 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1509                     server, TQ_SLEEP);
1510                 mutex_exit(&server->iser_lock);
1511         } else {
1512                 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1513                 kmem_free(server, sizeof (ilb_server_t));
1514         }
1515 
1516 end:
1517         mutex_enter(&rule->ir_lock);
1518         rule->ir_flags &= ~ILB_RULE_BUSY;
1519         if (rule_name != NULL)
1520                 rule->ir_refcnt--;
1521         cv_signal(&rule->ir_cv);
1522         mutex_exit(&rule->ir_lock);
1523         return (ret);
1524 }
1525 
1526 /*
1527  * First check if the destination of the ICMP message matches a VIP of
1528  * a rule.  If it does not, just return ILB_PASSED.
1529  *
1530  * If the destination matches a VIP:
1531  *
1532  * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1533  * server.
1534  *
 * For ICMP_DEST_UNREACHABLE with the fragmentation needed code, look inside
 * the payload to determine which back end server this message should be
 * sent to.  We need to do NAT on both the payload message and the outer
 * IP packet.
1538  *
1539  * For other ICMP messages, drop them.
1540  */
1541 /* ARGSUSED */
1542 static int
1543 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1544     icmph_t *icmph, ipaddr_t *lb_dst)
1545 {
1546         ipaddr_t vip;
1547         ilb_rule_t *rule;
1548         in6_addr_t addr6;
1549 
1550         if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1551                 return (ILB_PASSED);
1552 
1554         if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1555                 ILB_R_KSTAT(rule, icmp_dropped);
1556                 ILB_RULE_REFRELE(rule);
1557                 return (ILB_DROPPED);
1558         }
1559 
1560         switch (icmph->icmph_type) {
1561         case ICMP_ECHO_REQUEST:
1562                 ILB_R_KSTAT(rule, icmp_echo_processed);
1563                 ILB_RULE_REFRELE(rule);
1564 
1565                 icmph->icmph_type = ICMP_ECHO_REPLY;
1566                 icmph->icmph_checksum = 0;
1567                 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1568                 ipha->ipha_ttl =
1569                     ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1570                 *lb_dst = ipha->ipha_src;
1571                 vip = ipha->ipha_dst;
1572                 ipha->ipha_dst = ipha->ipha_src;
1573                 ipha->ipha_src = vip;
1574                 return (ILB_BALANCED);
1575         case ICMP_DEST_UNREACHABLE: {
1576                 int ret;
1577 
1578                 if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1579                         ILB_R_KSTAT(rule, icmp_dropped);
1580                         ILB_RULE_REFRELE(rule);
1581                         return (ILB_DROPPED);
1582                 }
1583                 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1584                     &addr6)) {
1585                         ILB_R_KSTAT(rule, icmp_2big_processed);
1586                         ret = ILB_BALANCED;
1587                 } else {
1588                         ILB_R_KSTAT(rule, icmp_2big_dropped);
1589                         ret = ILB_DROPPED;
1590                 }
1591                 ILB_RULE_REFRELE(rule);
1592                 IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1593                 return (ret);
1594         }
1595         default:
1596                 ILB_R_KSTAT(rule, icmp_dropped);
1597                 ILB_RULE_REFRELE(rule);
1598                 return (ILB_DROPPED);
1599         }
1600 }
1601 
1602 /* ARGSUSED */
1603 static int
1604 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1605     icmp6_t *icmp6, in6_addr_t *lb_dst)
1606 {
1607         ilb_rule_t *rule;
1608 
1609         if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1610                 return (ILB_PASSED);
1611 
1612         if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1613                 ILB_R_KSTAT(rule, icmp_dropped);
1614                 ILB_RULE_REFRELE(rule);
1615                 return (ILB_DROPPED);
1616         }
1617 
1618         switch (icmp6->icmp6_type) {
1619         case ICMP6_ECHO_REQUEST: {
1620                 int hdr_len;
1621 
1622                 ILB_R_KSTAT(rule, icmp_echo_processed);
1623                 ILB_RULE_REFRELE(rule);
1624 
1625                 icmp6->icmp6_type = ICMP6_ECHO_REPLY;
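		/*
		 * Seeding icmp6_cksum with ip6_plen folds the payload length
		 * into the IPv6 pseudo-header checksum; ilb_pseudo_sum_v6()
		 * is assumed to cover only the addresses and the next header
		 * value.
		 */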
1626                 icmp6->icmp6_cksum = ip6h->ip6_plen;
1627                 hdr_len = (char *)icmp6 - (char *)ip6h;
1628                 icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1629                     ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1630                 ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1631                 ip6h->ip6_hops =
1632                     ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1633                 *lb_dst = ip6h->ip6_src;
1634                 ip6h->ip6_src = ip6h->ip6_dst;
1635                 ip6h->ip6_dst = *lb_dst;
1636                 return (ILB_BALANCED);
1637         }
1638         case ICMP6_PACKET_TOO_BIG: {
1639                 int ret;
1640 
1641                 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1642                     lb_dst)) {
1643                         ILB_R_KSTAT(rule, icmp_2big_processed);
1644                         ret = ILB_BALANCED;
1645                 } else {
1646                         ILB_R_KSTAT(rule, icmp_2big_dropped);
1647                         ret = ILB_DROPPED;
1648                 }
1649                 ILB_RULE_REFRELE(rule);
1650                 return (ret);
1651         }
1652         default:
1653                 ILB_R_KSTAT(rule, icmp_dropped);
1654                 ILB_RULE_REFRELE(rule);
1655                 return (ILB_DROPPED);
1656         }
1657 }
1658 
1659 /*
1660  * Common routine to check an incoming packet and decide what to do with it.
 * Called by ilb_check_v4|v6().
1662  */
1663 static int
1664 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
1665     in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
1666     in6_addr_t *lb_dst)
1667 {
1668         in_port_t               sport, dport;
1669         tcpha_t                 *tcph;
1670         udpha_t                 *udph;
1671         ilb_rule_t              *rule;
1672         ilb_server_t            *server;
1673         boolean_t               balanced;
1674         struct ilb_sticky_s     *s = NULL;
1675         int                     ret;
1676         uint32_t                ip_sum, tp_sum;
1677         ilb_nat_info_t          info;
1678         uint16_t                nat_src_idx;
1679         boolean_t               busy;
1680 
1681         /*
	 * We don't really need to switch here since both protocols'
	 * ports are at the same offset.  Just prepare for future
	 * protocol-specific processing.
1685          */
1686         switch (l4) {
1687         case IPPROTO_TCP:
1688                 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1689                         return (ILB_DROPPED);
1690                 tcph = (tcpha_t *)tph;
1691                 sport = tcph->tha_lport;
1692                 dport = tcph->tha_fport;
1693                 break;
1694         case IPPROTO_UDP:
1695                 if (tph + sizeof (udpha_t) > mp->b_wptr)
1696                         return (ILB_DROPPED);
1697                 udph = (udpha_t *)tph;
1698                 sport = udph->uha_src_port;
1699                 dport = udph->uha_dst_port;
1700                 break;
1701         default:
1702                 return (ILB_PASSED);
1703         }
1704 
1705         /* Fast path, there is an existing conn. */
1706         if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1707             pkt_len, lb_dst)) {
1708                 return (ILB_BALANCED);
1709         }
1710 
1711         /*
1712          * If there is no existing connection for the incoming packet, check
1713          * to see if the packet matches a rule.  If not, just let IP decide
1714          * what to do with it.
1715          *
	 * Note: a reply from a back end server should not match a rule.  A
	 * reply should match an existing conn.
1718          */
1719         rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1720             pkt_len, &busy);
1721         if (rule == NULL) {
1722                 /* If the rule is busy, just drop the packet. */
1723                 if (busy)
1724                         return (ILB_DROPPED);
1725                 else
1726                         return (ILB_PASSED);
1727         }
1728 
1729         /*
1730          * The packet matches a rule, use the rule load balance algorithm
1731          * to find a server.
1732          */
1733         balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1734             rule->ir_alg->ilb_alg_data, &server);
1735         /*
1736          * This can only happen if there is no server in a rule or all
1737          * the servers are currently disabled.
1738          */
1739         if (!balanced)
1740                 goto no_server;
1741 
1742         /*
1743          * If the rule is sticky enabled, we need to check the sticky table.
1744          * If there is a sticky entry for the client, use the previous server
1745          * instead of the one found above (note that both can be the same).
1746          * If there is no entry for that client, add an entry to the sticky
1747          * table.  Both the find and add are done in ilb_sticky_find_add()
	 * to avoid checking for duplicates when adding an entry.
1749          */
1750         if (rule->ir_flags & ILB_RULE_STICKY) {
1751                 in6_addr_t addr;
1752 
1753                 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1754                 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1755                     &s, &nat_src_idx)) == NULL) {
1756                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1757                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1758                         goto no_server;
1759                 }
1760         }
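	/*
	 * Example (hypothetical addresses): with an IPv4 sticky mask of
	 * 255.255.255.0, requests from 192.168.1.1 and 192.168.1.2 are
	 * masked to the same address, share one sticky entry, and are
	 * therefore sent to the same back end server until the entry
	 * expires.
	 */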
1761 
1762         /*
1763          * We are holding a reference on the rule, so the server
1764          * cannot go away.
1765          */
1766         *lb_dst = server->iser_addr_v6;
1767         ILB_S_KSTAT(server, pkt_processed);
1768         ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1769 
1770         switch (rule->ir_topo) {
1771         case ILB_TOPO_IMPL_NAT: {
1772                 ilb_nat_src_entry_t     *src_ent;
1773                 uint16_t                *src_idx;
1774 
1775                 /*
1776                  * We create a cache even if it is not a SYN segment.
1777                  * The server should return a RST.  When we see the
1778                  * RST, we will destroy this cache.  But by having
1779                  * a cache, we know how to NAT the returned RST.
1780                  */
1781                 info.vip = *dst;
1782                 info.dport = dport;
1783                 info.src = *src;
1784                 info.sport = sport;
1785 
1786                 /* If stickiness is enabled, use the same source address */
1787                 if (s != NULL)
1788                         src_idx = &nat_src_idx;
1789                 else
1790                         src_idx = NULL;
1791 
1792                 if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
1793                     &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
1794                         if (s != NULL)
1795                                 ilb_sticky_refrele(s);
1796                         ILB_R_KSTAT(rule, pkt_dropped);
1797                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1798                         ILB_R_KSTAT(rule, noport_pkt_dropped);
1799                         ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
1800                         ret = ILB_DROPPED;
1801                         break;
1802                 }
1803                 info.src_ent = src_ent;
1804                 info.nat_dst = server->iser_addr_v6;
1805                 if (rule->ir_port_range && server->iser_port_range) {
1806                         info.nat_dport = htons(ntohs(dport) -
1807                             rule->ir_min_port + server->iser_min_port);
1808                 } else {
1809                         info.nat_dport = htons(server->iser_min_port);
1810                 }
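		/*
		 * Worked example (hypothetical numbers): if the rule covers
		 * ports 80-89 and the server's range is 8080-8089, a packet
		 * sent to port 85 is remapped to 8080 + (85 - 80) = 8085.
		 * If either side lacks a range, every port collapses to
		 * iser_min_port.
		 */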
1811 
1812                 /*
1813                  * If ilb_conn_add() fails, it will release the reference on
1814                  * sticky info and de-allocate the NAT source port allocated
1815                  * above.
1816                  */
1817                 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1818                     dport, &info, &ip_sum, &tp_sum, s) != 0) {
1819                         ILB_R_KSTAT(rule, pkt_dropped);
1820                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1821                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1822                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1823                         ret = ILB_DROPPED;
1824                         break;
1825                 }
1826                 ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1827                 ret = ILB_BALANCED;
1828                 break;
1829         }
1830         case ILB_TOPO_IMPL_HALF_NAT:
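		/*
		 * Half NAT rewrites only the destination (VIP -> back end
		 * server); the client's source address is preserved, so
		 * return traffic must still come back through ILB to be
		 * un-NATed (see the ilb_half_nat() call below).
		 */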
1831                 info.vip = *dst;
1832                 info.nat_dst = server->iser_addr_v6;
1833                 info.dport = dport;
1834                 if (rule->ir_port_range && server->iser_port_range) {
1835                         info.nat_dport = htons(ntohs(dport) -
1836                             rule->ir_min_port + server->iser_min_port);
1837                 } else {
1838                         info.nat_dport = htons(server->iser_min_port);
1839                 }
1840 
1841                 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1842                     dport, &info, &ip_sum, &tp_sum, s) != 0) {
1843                         ILB_R_KSTAT(rule, pkt_dropped);
1844                         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1845                         ILB_R_KSTAT(rule, nomem_pkt_dropped);
1846                         ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1847                         ret = ILB_DROPPED;
1848                         break;
1849                 }
1850                 ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1851 
1852                 ret = ILB_BALANCED;
1853                 break;
1854         case ILB_TOPO_IMPL_DSR:
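		/*
		 * DSR (direct server return): the packet is forwarded
		 * unmodified and no conn entry is created; the back end
		 * server replies to the client directly.
		 */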
1855                 /*
1856                  * By decrementing the sticky refcnt, the period of
1857                  * stickiness (life time of ilb_sticky_t) will be
1858                  * from now to (now + default expiry time).
1859                  */
1860                 if (s != NULL)
1861                         ilb_sticky_refrele(s);
1862                 ret = ILB_BALANCED;
1863                 break;
1864         default:
		cmn_err(CE_PANIC, "data corruption: unknown topology, rule %p",
		    (void *)rule);
1867                 break;
1868         }
1869         ILB_RULE_REFRELE(rule);
1870         return (ret);
1871 
1872 no_server:
	/* Reached when no server is available or the sticky add failed. */
1874         ILB_R_KSTAT(rule, pkt_dropped);
1875         ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1876         ILB_RULE_REFRELE(rule);
1877         return (ILB_DROPPED);
1878 }
1879 
1880 int
1881 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1882     uint8_t *tph, ipaddr_t *lb_dst)
1883 {
1884         in6_addr_t v6_src, v6_dst, v6_lb_dst;
1885         int ret;
1886 
1887         ASSERT(DB_REF(mp) == 1);
1888 
1889         if (l4 == IPPROTO_ICMP) {
1890                 return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1891                     lb_dst));
1892         }
1893 
1894         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1895         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1896         ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1897             tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1898         if (ret == ILB_BALANCED)
1899                 IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1900         return (ret);
1901 }
1902 
1903 int
1904 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1905     uint8_t *tph, in6_addr_t *lb_dst)
1906 {
1907         uint32_t pkt_len;
1908 
1909         ASSERT(DB_REF(mp) == 1);
1910 
1911         if (l4 == IPPROTO_ICMPV6) {
1912                 return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1913                     lb_dst));
1914         }
1915 
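	/* ip6_plen excludes the fixed 40-byte IPv6 header, so add it back. */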
1916         pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1917         return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1918             IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1919 }
1920 
1921 void
1922 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1923 {
1924         ilb_rule_t *tmp_rule;
1925 
1926         mutex_enter(&ilbs->ilbs_g_lock);
1927         *num_rules = 0;
1928         for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1929             tmp_rule = tmp_rule->ir_next) {
1930                 if (tmp_rule->ir_zoneid == zoneid)
1931                         *num_rules += 1;
1932         }
1933         mutex_exit(&ilbs->ilbs_g_lock);
1934 }
1935 
1936 int
1937 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1938     uint32_t *num_servers)
1939 {
1940         ilb_rule_t *rule;
1941         int err;
1942 
1943         if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1944                 return (err);
1945         *num_servers = rule->ir_kstat.num_servers.value.ui64;
1946         ILB_RULE_REFRELE(rule);
1947         return (0);
1948 }
1949 
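/*
 * Copy out at most *num_servers back end server entries of the named rule
 * into the caller-supplied array.  On return, *num_servers is the number
 * of entries actually copied; e.g. (hypothetical counts), a caller passing
 * *num_servers == 10 for a rule with 4 servers gets 4 entries back and
 * *num_servers set to 4.
 */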
1950 int
1951 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1952     ilb_server_info_t *servers, uint32_t *num_servers)
1953 {
1954         ilb_rule_t *rule;
1955         ilb_server_t *server;
1956         size_t cnt;
1957         int err;
1958 
1959         if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1960                 return (err);
1961         for (server = rule->ir_servers, cnt = *num_servers;
1962             server != NULL && cnt > 0;
1963             server = server->iser_next, cnt--, servers++) {
1964                 (void) memcpy(servers->name, server->iser_name,
1965                     ILB_SERVER_NAMESZ);
1966                 servers->addr = server->iser_addr_v6;
1967                 servers->min_port = htons(server->iser_min_port);
1968                 servers->max_port = htons(server->iser_max_port);
1969                 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1970                 servers->err = 0;
1971         }
1972         ILB_RULE_REFRELE(rule);
1973         *num_servers -= cnt;
1974 
1975         return (0);
1976 }
1977 
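/*
 * Copy up to *num_names rule names (ILB_RULE_NAMESZ bytes each) of the
 * given zone into buf; the caller is assumed to provide at least
 * *num_names * ILB_RULE_NAMESZ bytes of space.  *num_names is set to the
 * number of names actually copied.
 */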
1978 void
1979 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1980     char *buf)
1981 {
1982         ilb_rule_t *tmp_rule;
1983         int cnt;
1984 
1985         if (*num_names == 0)
1986                 return;
1987 
1988         mutex_enter(&ilbs->ilbs_g_lock);
1989         for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1990             tmp_rule = tmp_rule->ir_next) {
1991                 if (tmp_rule->ir_zoneid != zoneid)
1992                         continue;
1993 
1994                 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1995                 buf += ILB_RULE_NAMESZ;
1996                 if (++cnt == *num_names)
1997                         break;
1998         }
1999         mutex_exit(&ilbs->ilbs_g_lock);
2000         *num_names = cnt;
2001 }
2002 
2003 int
2004 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2005 {
2006         ilb_rule_t *rule;
2007         int err;
2008 
2009         if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2010                 return (err);
2011         }
2012 
2013         /*
	 * Except for the enabled flag, none of the following fields changes
	 * during the lifetime of a rule, so we don't hold the mutex when
	 * reading them.  The worst that can happen is to report a stale
	 * enabled flag.
2017          */
2018         cmd->ip_ver = rule->ir_ipver;
2019         cmd->proto = rule->ir_proto;
2020         cmd->min_port = htons(rule->ir_min_port);
2021         cmd->max_port = htons(rule->ir_max_port);
2022 
2023         cmd->vip = rule->ir_target_v6;
2024         cmd->algo = rule->ir_alg_type;
2025         cmd->topo = rule->ir_topo;
2026 
2027         cmd->nat_src_start = rule->ir_nat_src_start;
2028         cmd->nat_src_end = rule->ir_nat_src_end;
2029 
2030         cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2031         cmd->nat_expiry = rule->ir_nat_expiry;
2032         cmd->sticky_expiry = rule->ir_sticky_expiry;
2033 
2034         cmd->flags = 0;
2035         if (rule->ir_flags & ILB_RULE_ENABLED)
2036                 cmd->flags |= ILB_RULE_ENABLED;
2037         if (rule->ir_flags & ILB_RULE_STICKY) {
2038                 cmd->flags |= ILB_RULE_STICKY;
2039                 cmd->sticky_mask = rule->ir_sticky_mask;
2040         }
2041 
2042         ILB_RULE_REFRELE(rule);
2043         return (0);
2044 }
2045 
2046 static void *
2047 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2048 {
2049         ilb_stack_t *ilbs;
2050         char tq_name[TASKQ_NAMELEN];
2051 
2052         ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2053         ilbs->ilbs_netstack = ns;
2054 
2055         ilbs->ilbs_rule_head = NULL;
2056         ilbs->ilbs_g_hash = NULL;
2057         mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2058 
2059         ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
		kmem_free(ilbs, sizeof (ilb_stack_t));
		return (NULL);
	}
2064 
2065         /*
2066          * ilbs_conn/sticky_hash related info is initialized in
2067          * ilb_conn/sticky_hash_init().
2068          */
2069         ilbs->ilbs_conn_taskq = NULL;
2070         ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2071         ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2072         ilbs->ilbs_c2s_conn_hash = NULL;
2073         ilbs->ilbs_s2c_conn_hash = NULL;
2074         ilbs->ilbs_conn_timer_list = NULL;
2075 
2076         ilbs->ilbs_sticky_hash = NULL;
2077         ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2078         ilbs->ilbs_sticky_timer_list = NULL;
2079         ilbs->ilbs_sticky_taskq = NULL;
2080 
2081         /* The allocation is done later when there is a rule using NAT mode. */
2082         ilbs->ilbs_nat_src = NULL;
2083         ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2084         mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2085         ilbs->ilbs_nat_src_tid = 0;
2086 
2087         /* For listing the conn hash table */
2088         mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2089         cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2090         ilbs->ilbs_conn_list_busy = B_FALSE;
2091         ilbs->ilbs_conn_list_cur = 0;
2092         ilbs->ilbs_conn_list_connp = NULL;
2093 
2094         /* For listing the sticky hash table */
2095         mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2096         cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2097         ilbs->ilbs_sticky_list_busy = B_FALSE;
2098         ilbs->ilbs_sticky_list_cur = 0;
2099         ilbs->ilbs_sticky_list_curp = NULL;
2100 
2101         (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2102             (void *)ns);
2103         ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2104             minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2105 
2106         return (ilbs);
2107 }
2108 
2109 /* ARGSUSED */
2110 static void
2111 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2112 {
2113         ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2114         ilb_rule_t *tmp_rule;
2115 
2116         ilb_sticky_hash_fini(ilbs);
2117         ilb_conn_hash_fini(ilbs);
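	/*
	 * ilb_rule_del_common() may block waiting for rule references to
	 * go away, so ilbs_g_lock is dropped around the call and re-taken
	 * to pick up the next rule at the head of the list.
	 */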
2118         mutex_enter(&ilbs->ilbs_g_lock);
2119         while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2120                 ilb_rule_hash_del(tmp_rule);
2121                 ilb_rule_g_del(ilbs, tmp_rule);
2122                 mutex_exit(&ilbs->ilbs_g_lock);
2123                 ilb_rule_del_common(ilbs, tmp_rule);
2124                 mutex_enter(&ilbs->ilbs_g_lock);
2125         }
2126         mutex_exit(&ilbs->ilbs_g_lock);
2127         if (ilbs->ilbs_nat_src != NULL)
2128                 ilb_nat_src_fini(ilbs);
2129 }
2130 
2131 static void
ilb_stack_fini(netstackid_t stackid, void *arg)
2133 {
2134         ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2135 
2136         ilb_rule_hash_fini(ilbs);
2137         taskq_destroy(ilbs->ilbs_rule_taskq);
2138         ilb_kstat_g_fini(stackid, ilbs);
2139         kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2140         kmem_free(ilbs, sizeof (ilb_stack_t));
2141 }
2142 
2143 void
2144 ilb_ddi_g_init(void)
2145 {
2146         netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2147             ilb_stack_fini);
2148 }
2149 
2150 void
2151 ilb_ddi_g_destroy(void)
2152 {
2153         netstack_unregister(NS_ILB);
2154         ilb_conn_cache_fini();
2155         ilb_sticky_cache_fini();
2156 }