5255 uts shouldn't open-code ISP2
--- old/usr/src/uts/common/inet/ilb/ilb.c
+++ new/usr/src/uts/common/inet/ilb/ilb.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 +#include <sys/sysmacros.h>
27 28 #include <sys/kmem.h>
28 29 #include <sys/ksynch.h>
29 30 #include <sys/systm.h>
30 31 #include <sys/socket.h>
31 32 #include <sys/disp.h>
32 33 #include <sys/taskq.h>
33 34 #include <sys/cmn_err.h>
34 35 #include <sys/strsun.h>
35 36 #include <sys/sdt.h>
36 37 #include <sys/atomic.h>
37 38 #include <netinet/in.h>
38 39 #include <inet/ip.h>
39 40 #include <inet/ip6.h>
40 41 #include <inet/tcp.h>
41 42 #include <inet/udp_impl.h>
42 43 #include <inet/kstatcom.h>
43 44
44 45 #include <inet/ilb_ip.h>
45 46 #include "ilb_alg.h"
46 47 #include "ilb_nat.h"
47 48 #include "ilb_conn.h"
48 49
49 50 /* ILB kmem cache flag */
50 51 int ilb_kmem_flags = 0;
51 52
52 53 /*
53 54  * The default sizes for the different hash tables, global for all stacks.
54 55  * Each stack has its own tables; only the sizes are shared.
55 56 */
56 57 static size_t ilb_rule_hash_size = 2048;
57 58
58 59 static size_t ilb_conn_hash_size = 262144;
59 60
60 61 static size_t ilb_sticky_hash_size = 262144;
61 62
62 63 /* This should be a prime number. */
63 64 static size_t ilb_nat_src_hash_size = 97;
64 65
65 66 /* Default NAT cache entry expiry time. */
66 67 static uint32_t ilb_conn_tcp_expiry = 120;
67 68 static uint32_t ilb_conn_udp_expiry = 60;
68 69
69 70 /* Default sticky entry expiry time. */
70 71 static uint32_t ilb_sticky_expiry = 60;
71 72
72 73 /* addr is assumed to be a uint8_t * pointing to an ipaddr_t. */
73 74 #define ILB_RULE_HASH(addr, hash_size) \
74 75 ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
75 76 *(addr)) & ((hash_size) - 1))
76 77
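Review note: ILB_RULE_HASH is a base-31 polynomial hash over the four bytes
of the IPv4 address (31^3 = 29791, 31^2 = 961), masked with hash_size - 1.
The mask only distributes correctly when hash_size is a power of 2, which is
the invariant this change enforces with ISP2(). A minimal user-level sketch
of the same computation (illustrative only, not part of the change):

	#include <stdint.h>
	#include <stddef.h>

	/* Mirrors ILB_RULE_HASH; hash_size must be a power of 2. */
	static uint32_t
	rule_hash(const uint8_t *addr, size_t hash_size)
	{
		return ((addr[3] * 29791 + addr[2] * 961 +
		    addr[1] * 31 + addr[0]) & (hash_size - 1));
	}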
77 78 /*
78 79 * Note on ILB delayed processing
79 80 *
80 81  * To avoid inline removal of some of the data structures, such as rules,
81 82 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
82 83 * There are three types of ILB taskq:
83 84 *
84 85  * 1. rule handling: created at stack initialization time, ilb_stack_init()
85 86 * 2. conn hash handling: created at conn hash initialization time,
86 87 * ilb_conn_hash_init()
87 88 * 3. sticky hash handling: created at sticky hash initialization time,
88 89 * ilb_sticky_hash_init()
89 90 *
90 91 * The rule taskq is for processing rule and server removal. When a user
91 92  * land rule/server removal request comes in, a task is dispatched after
92 93  * removing the rule/server from all related hashes. This task will wait
93 94 * until all references to the rule/server are gone before removing it.
94 95 * So the user land thread requesting the removal does not need to wait
95 96 * for the removal completion.
96 97 *
97 98 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
98 99 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
99 100 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
100 101 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one
101 102 * portion (same size) of the hash table. When a timer fires, it dispatches
102 103  * a conn hash task to clean up its portion of the table. This avoids
103 104  * inline processing of the removal.
104 105 *
105 106  * There is one more piece of delayed processing: the cleanup of the
106 107  * NAT source address table. We use the timer to handle this directly
107 108  * instead of dispatching a taskq; the table is small, so doing the
108 109  * work in timer context is acceptable.
109 110 */
110 111
111 112 /* ILB rule taskq constants. */
112 113 #define ILB_RULE_TASKQ_NUM_THR 20
113 114
114 115 /* Argument passed to ILB rule taskq routines. */
115 116 typedef struct {
116 117 ilb_stack_t *ilbs;
117 118 ilb_rule_t *rule;
118 119 } ilb_rule_tq_t;
119 120
120 121 /* kstat handling routines. */
121 122 static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
122 123 static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
123 124 static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
124 125 static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
125 126 ilb_server_t *);
126 127
127 128 /* Rule hash handling routines. */
128 129 static void ilb_rule_hash_init(ilb_stack_t *);
129 130 static void ilb_rule_hash_fini(ilb_stack_t *);
130 131 static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
131 132 static void ilb_rule_hash_del(ilb_rule_t *);
132 133 static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
133 134 in_port_t, zoneid_t, uint32_t, boolean_t *);
134 135
135 136 static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
136 137 static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
137 138 static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
138 139 static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
139 140 int *);
140 141 static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
141 142 int, in_port_t, in_port_t, const in6_addr_t *);
142 143
143 144 /* Back end server handling routines. */
144 145 static void ilb_server_free(ilb_server_t *);
145 146
146 147 /* Network stack handling routines. */
147 148 static void *ilb_stack_init(netstackid_t, netstack_t *);
148 149 static void ilb_stack_shutdown(netstackid_t, void *);
149 150 static void ilb_stack_fini(netstackid_t, void *);
150 151
151 152 /* Sticky connection handling routines. */
152 153 static void ilb_rule_sticky_init(ilb_rule_t *);
153 154 static void ilb_rule_sticky_fini(ilb_rule_t *);
154 155
155 156 /* Handy macro to check for unspecified address. */
156 157 #define IS_ADDR_UNSPEC(addr) \
157 158 (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \
158 159 IN6_IS_ADDR_UNSPECIFIED(addr))
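Review note: the two-step check is needed because the unspecified form of a
v4-mapped address is ::ffff:0.0.0.0, which IN6_IS_ADDR_UNSPECIFIED() alone
would not catch (it matches only ::). For example, with in6 = ::ffff:0.0.0.0:

	IN6_IS_ADDR_UNSPECIFIED(&in6);	/* false: in6 is not :: */
	IS_ADDR_UNSPEC(&in6);		/* true: v4-mapped INADDR_ANY */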
159 160
160 161 /*
161 162 * Global kstat instance counter. When a rule is created, its kstat instance
162 163 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
163 164 * incremented.
164 165 */
165 166 static uint_t ilb_kstat_instance = 0;
166 167
167 168 /*
168 169 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
169 170 * A rule's kstat has ILB_RULE_KS_CNAME class name.
170 171 */
171 172 #define ILB_G_KS_NAME "global"
172 173 #define ILB_G_KS_CNAME "kstat"
173 174 #define ILB_RULE_KS_CNAME "rulestat"
174 175
175 176 static kstat_t *
176 177 ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
177 178 {
178 179 kstat_t *ksp;
179 180 ilb_g_kstat_t template = {
180 181 { "num_rules", KSTAT_DATA_UINT64, 0 },
181 182 { "ip_frag_in", KSTAT_DATA_UINT64, 0 },
182 183 { "ip_frag_dropped", KSTAT_DATA_UINT64, 0 }
183 184 };
184 185
185 186 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
186 187 ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
187 188 KSTAT_FLAG_VIRTUAL, stackid);
188 189 if (ksp == NULL)
189 190 return (NULL);
190 191 bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
191 192 ksp->ks_data = ilbs->ilbs_kstat;
192 193 ksp->ks_private = (void *)(uintptr_t)stackid;
193 194
194 195 kstat_install(ksp);
195 196 return (ksp);
196 197 }
197 198
198 199 static void
199 200 ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
200 201 {
201 202 if (ilbs->ilbs_ksp != NULL) {
202 203 ASSERT(stackid == (netstackid_t)(uintptr_t)
203 204 ilbs->ilbs_ksp->ks_private);
204 205 kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
205 206 ilbs->ilbs_ksp = NULL;
206 207 }
207 208 }
208 209
209 210 static kstat_t *
210 211 ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
211 212 {
212 213 kstat_t *ksp;
213 214 ilb_rule_kstat_t template = {
214 215 { "num_servers", KSTAT_DATA_UINT64, 0 },
215 216 { "bytes_not_processed", KSTAT_DATA_UINT64, 0 },
216 217 { "pkt_not_processed", KSTAT_DATA_UINT64, 0 },
217 218 { "bytes_dropped", KSTAT_DATA_UINT64, 0 },
218 219 { "pkt_dropped", KSTAT_DATA_UINT64, 0 },
219 220 { "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 },
220 221 { "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 },
221 222 { "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 },
222 223 { "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 },
223 224 { "icmp_echo_processed", KSTAT_DATA_UINT64, 0 },
224 225 { "icmp_dropped", KSTAT_DATA_UINT64, 0 },
225 226 { "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 },
226 227 { "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 }
227 228 };
228 229
229 230 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
230 231 rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
231 232 NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
232 233 if (ksp == NULL)
233 234 return (NULL);
234 235
235 236 bcopy(&template, &rule->ir_kstat, sizeof (template));
236 237 ksp->ks_data = &rule->ir_kstat;
237 238 ksp->ks_private = (void *)(uintptr_t)stackid;
238 239
239 240 kstat_install(ksp);
240 241 return (ksp);
241 242 }
242 243
243 244 static kstat_t *
244 245 ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
245 246 ilb_server_t *server)
246 247 {
247 248 kstat_t *ksp;
248 249 ilb_server_kstat_t template = {
249 250 { "bytes_processed", KSTAT_DATA_UINT64, 0 },
250 251 { "pkt_processed", KSTAT_DATA_UINT64, 0 },
251 252 { "ip_address", KSTAT_DATA_STRING, 0 }
252 253 };
253 254 char cname_buf[KSTAT_STRLEN];
254 255
255 256 	/* 7 is strlen("-sstat") plus 1 for the terminating NUL */
256 257 ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
257 258 (void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
258 259 ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
259 260 server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
260 261 NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
261 262 if (ksp == NULL)
262 263 return (NULL);
263 264
264 265 bcopy(&template, &server->iser_kstat, sizeof (template));
265 266 ksp->ks_data = &server->iser_kstat;
266 267 ksp->ks_private = (void *)(uintptr_t)stackid;
267 268
268 269 kstat_named_setstr(&server->iser_kstat.ip_address,
269 270 server->iser_ip_addr);
270 271 /* We never change the IP address */
271 272 ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;
272 273
273 274 kstat_install(ksp);
274 275 return (ksp);
275 276 }
276 277
277 278 /* Initialize the rule hash table. */
278 279 static void
279 280 ilb_rule_hash_init(ilb_stack_t *ilbs)
280 281 {
281 282 int i;
282 283
283 284 /*
284 285 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
285 286 * the next power of 2.
286 287 */
287 - if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
288 + if (!ISP2(ilbs->ilbs_rule_hash_size)) {
288 289 for (i = 0; i < 31; i++) {
289 290 if (ilbs->ilbs_rule_hash_size < (1 << i))
290 291 break;
291 292 }
292 293 ilbs->ilbs_rule_hash_size = 1 << i;
293 294 }
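Review note: this is the heart of the fix. ISP2() comes from the newly
included <sys/sysmacros.h>, where it is defined as (((x) & ((x) - 1)) == 0),
so !ISP2(x) is exactly the open-coded test being replaced and behavior is
unchanged (both forms also treat 0 as a power of 2). For example:

	ISP2(2048)	/* true:  2048 & 2047 == 0, a power of 2 */
	ISP2(2047)	/* false: 2047 & 2046 != 0 */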
294 295 ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
295 296 ilbs->ilbs_rule_hash_size, KM_SLEEP);
296 297 for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
297 298 mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
298 299 MUTEX_DEFAULT, NULL);
299 300 }
300 301 }
301 302
302 303 /* Clean up the rule hash table. */
303 304 static void
304 305 ilb_rule_hash_fini(ilb_stack_t *ilbs)
305 306 {
306 307 if (ilbs->ilbs_g_hash == NULL)
307 308 return;
308 309 kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
309 310 ilbs->ilbs_rule_hash_size);
310 311 }
311 312
312 313 /* Add a rule to the rule hash table. */
313 314 static void
314 315 ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
315 316 {
316 317 int i;
317 318
318 319 i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
319 320 ilbs->ilbs_rule_hash_size);
320 321 DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
321 322 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
322 323 rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
323 324 if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
324 325 ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
325 326 rule->ir_hash_prev = NULL;
326 327 ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;
327 328
328 329 rule->ir_hash = &ilbs->ilbs_g_hash[i];
329 330 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
330 331 }
331 332
332 333 /*
333 334 * Remove a rule from the rule hash table. Note that the rule is not freed
334 335 * in this routine.
335 336 */
336 337 static void
337 338 ilb_rule_hash_del(ilb_rule_t *rule)
338 339 {
339 340 mutex_enter(&rule->ir_hash->ilb_hash_lock);
340 341 if (rule->ir_hash->ilb_hash_rule == rule) {
341 342 rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
342 343 if (rule->ir_hash_next != NULL)
343 344 rule->ir_hash_next->ir_hash_prev = NULL;
344 345 } else {
345 346 if (rule->ir_hash_prev != NULL)
346 347 rule->ir_hash_prev->ir_hash_next =
347 348 rule->ir_hash_next;
348 349 if (rule->ir_hash_next != NULL) {
349 350 rule->ir_hash_next->ir_hash_prev =
350 351 rule->ir_hash_prev;
351 352 }
352 353 }
353 354 mutex_exit(&rule->ir_hash->ilb_hash_lock);
354 355
355 356 rule->ir_hash_next = NULL;
356 357 rule->ir_hash_prev = NULL;
357 358 rule->ir_hash = NULL;
358 359 }
359 360
360 361 /*
361 362 * Given the info of a packet, look for a match in the rule hash table.
362 363 */
363 364 static ilb_rule_t *
364 365 ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
365 366 in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
366 367 {
367 368 int i;
368 369 ilb_rule_t *rule;
369 370 ipaddr_t v4_addr;
370 371
371 372 *busy = B_FALSE;
372 373 IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
373 374 i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
374 375 port = ntohs(port);
375 376
376 377 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
377 378 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
378 379 rule = rule->ir_hash_next) {
379 380 if (!rule->ir_port_range) {
380 381 if (rule->ir_min_port != port)
381 382 continue;
382 383 } else {
383 384 if (port < rule->ir_min_port ||
384 385 port > rule->ir_max_port) {
385 386 continue;
386 387 }
387 388 }
388 389 if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
389 390 rule->ir_zoneid != zoneid) {
390 391 continue;
391 392 }
392 393
393 394 if (l3 == IPPROTO_IP) {
394 395 if (rule->ir_target_v4 != INADDR_ANY &&
395 396 rule->ir_target_v4 != v4_addr) {
396 397 continue;
397 398 }
398 399 } else {
399 400 if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
400 401 !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
401 402 continue;
402 403 }
403 404 }
404 405
405 406 /*
406 407 * Just update the stats if the rule is disabled.
407 408 */
408 409 mutex_enter(&rule->ir_lock);
409 410 if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
410 411 ILB_R_KSTAT(rule, pkt_not_processed);
411 412 ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
412 413 mutex_exit(&rule->ir_lock);
413 414 rule = NULL;
414 415 break;
415 416 } else if (rule->ir_flags & ILB_RULE_BUSY) {
416 417 /*
417 418 * If we are busy...
418 419 *
419 420 * XXX we should have a queue to postpone the
420 421 * packet processing. But this requires a
421 422 * mechanism in IP to re-start the packet
422 423 * processing. So for now, just drop the packet.
423 424 */
424 425 ILB_R_KSTAT(rule, pkt_dropped);
425 426 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
426 427 mutex_exit(&rule->ir_lock);
427 428 *busy = B_TRUE;
428 429 rule = NULL;
429 430 break;
430 431 } else {
431 432 rule->ir_refcnt++;
432 433 ASSERT(rule->ir_refcnt != 1);
433 434 mutex_exit(&rule->ir_lock);
434 435 break;
435 436 }
436 437 }
437 438 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
438 439 return (rule);
439 440 }
440 441
441 442 /*
442 443 * Add a rule to the global rule list. This list is for finding all rules
443 444 * in an IP stack. The caller is assumed to hold the ilbs_g_lock.
444 445 */
445 446 static void
446 447 ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
447 448 {
448 449 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
449 450 rule->ir_next = ilbs->ilbs_rule_head;
450 451 ilbs->ilbs_rule_head = rule;
451 452 ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
452 453 }
453 454
454 455 /* The caller is assumed to hold the ilbs_g_lock. */
455 456 static void
456 457 ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
457 458 {
458 459 ilb_rule_t *tmp_rule;
459 460 ilb_rule_t *prev_rule;
460 461
461 462 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
462 463 prev_rule = NULL;
463 464 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
464 465 prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
465 466 if (tmp_rule == rule)
466 467 break;
467 468 }
468 469 if (tmp_rule == NULL) {
469 470 mutex_exit(&ilbs->ilbs_g_lock);
470 471 return;
471 472 }
472 473 if (prev_rule == NULL)
473 474 ilbs->ilbs_rule_head = tmp_rule->ir_next;
474 475 else
475 476 prev_rule->ir_next = tmp_rule->ir_next;
476 477 ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
477 478 }
478 479
479 480 /*
480 481 * Helper routine to calculate how many source addresses are in a given
481 482 * range.
482 483 */
483 484 static int64_t
484 485 num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
485 486 {
486 487 int64_t ret;
487 488 uint32_t addr1, addr2;
488 489
489 490 /*
490 491 	 * Here we assume that the max number of NAT source addresses
491 492 	 * cannot be so large that the 2 most significant s6_addr32 words
492 493 	 * could differ; they must be equal.
493 494 */
494 495 addr1 = ntohl(a1->s6_addr32[3]);
495 496 addr2 = ntohl(a2->s6_addr32[3]);
496 497 if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
497 498 a1->s6_addr32[1] != a2->s6_addr32[1] ||
498 499 a1->s6_addr32[2] > a2->s6_addr32[2] ||
499 500 (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
500 501 return (-1);
501 502 }
502 503 if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
503 504 return (addr2 - addr1 + 1);
504 505 } else {
505 506 ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
506 507 ret <<= 32;
507 508 ret = ret + addr1 - addr2;
508 509 return (ret + 1);
509 510 }
510 511 }
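Review note: a worked example for the common branch, using hypothetical
addresses whose upper three s6_addr32 words are pairwise equal:

	/*
	 * a1 = 2001:db8::1, a2 = 2001:db8::14:
	 * s6_addr32[0..2] match, so the routine returns
	 * addr2 - addr1 + 1 = 0x14 - 0x1 + 1 = 20 source addresses.
	 */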
511 512
512 513 /*
513 514 * Add an ILB rule.
514 515 */
515 516 int
516 517 ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
517 518 {
518 519 ilb_rule_t *rule;
519 520 netstackid_t stackid;
520 521 int ret;
521 522 in_port_t min_port, max_port;
522 523 int64_t num_src;
523 524
524 525 /* Sanity checks. */
525 526 if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
526 527 return (EINVAL);
527 528
528 529 /* Need to support SCTP... */
529 530 if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
530 531 return (EINVAL);
531 532
532 533 /* For full NAT, the NAT source must be supplied. */
533 534 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
534 535 if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
535 536 IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
536 537 return (EINVAL);
537 538 }
538 539 }
539 540
540 541 /* Check invalid mask */
541 542 if ((cmd->flags & ILB_RULE_STICKY) &&
542 543 IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
543 544 return (EINVAL);
544 545 }
545 546
546 547 /* Port is passed in network byte order. */
547 548 min_port = ntohs(cmd->min_port);
548 549 max_port = ntohs(cmd->max_port);
549 550 if (min_port > max_port)
550 551 return (EINVAL);
551 552
552 553 /* min_port == 0 means "all ports". Make it so */
553 554 if (min_port == 0) {
554 555 min_port = 1;
555 556 max_port = 65535;
556 557 }
557 558
558 559 	/* Reject "funny" addresses: loopback, multicast, broadcast, unspecified. */
559 560 if (cmd->ip_ver == IPPROTO_IP) {
560 561 in_addr_t v4_addr1, v4_addr2;
561 562
562 563 v4_addr1 = cmd->vip.s6_addr32[3];
563 564 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
564 565 CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
565 566 v4_addr1 == INADDR_ANY ||
566 567 !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
567 568 return (EINVAL);
568 569 }
569 570
570 571 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
571 572 v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
572 573 v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
573 574 if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
574 575 (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
575 576 v4_addr1 == INADDR_BROADCAST ||
576 577 v4_addr2 == INADDR_BROADCAST ||
577 578 v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
578 579 CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
579 580 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
580 581 !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
581 582 return (EINVAL);
582 583 }
583 584
584 585 num_src = v4_addr2 - v4_addr1 + 1;
585 586 if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
586 587 return (EINVAL);
587 588 }
588 589 } else {
589 590 if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
590 591 IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
591 592 IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
592 593 IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
593 594 return (EINVAL);
594 595 }
595 596
596 597 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
597 598 if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
598 599 IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
599 600 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
600 601 IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
601 602 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
602 603 IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
603 604 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
604 605 IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
605 606 return (EINVAL);
606 607 }
607 608
608 609 if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
609 610 &cmd->nat_src_end)) < 0 ||
610 611 num_src > ILB_MAX_NAT_SRC) {
611 612 return (EINVAL);
612 613 }
613 614 }
614 615 }
615 616
616 617 mutex_enter(&ilbs->ilbs_g_lock);
617 618 if (ilbs->ilbs_g_hash == NULL)
618 619 ilb_rule_hash_init(ilbs);
619 620 if (ilbs->ilbs_c2s_conn_hash == NULL) {
620 621 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
621 622 ilb_conn_hash_init(ilbs);
622 623 ilb_nat_src_init(ilbs);
623 624 }
624 625
625 626 /* Make sure that the new rule does not duplicate an existing one. */
626 627 if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
627 628 min_port, max_port, &cmd->vip)) {
628 629 mutex_exit(&ilbs->ilbs_g_lock);
629 630 return (EEXIST);
630 631 }
631 632
632 633 rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
633 634 if (rule == NULL) {
634 635 mutex_exit(&ilbs->ilbs_g_lock);
635 636 return (ENOMEM);
636 637 }
637 638
638 639 /* ir_name is all 0 to begin with */
639 640 (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);
640 641
641 642 rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
642 643 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
643 644 if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
644 645 ret = ENOMEM;
645 646 goto error;
646 647 }
647 648
648 649 if (cmd->topo == ILB_TOPO_IMPL_NAT) {
649 650 rule->ir_nat_src_start = cmd->nat_src_start;
650 651 rule->ir_nat_src_end = cmd->nat_src_end;
651 652 }
652 653
653 654 rule->ir_ipver = cmd->ip_ver;
654 655 rule->ir_proto = cmd->proto;
655 656 rule->ir_topo = cmd->topo;
656 657
657 658 rule->ir_min_port = min_port;
658 659 rule->ir_max_port = max_port;
659 660 if (rule->ir_min_port != rule->ir_max_port)
660 661 rule->ir_port_range = B_TRUE;
661 662 else
662 663 rule->ir_port_range = B_FALSE;
663 664
664 665 rule->ir_zoneid = zoneid;
665 666
666 667 rule->ir_target_v6 = cmd->vip;
667 668 rule->ir_servers = NULL;
668 669
669 670 /*
670 671 * The default connection drain timeout is indefinite (value 0),
671 672 * meaning we will wait for all connections to finish. So we
672 673 * can assign cmd->conn_drain_timeout to it directly.
673 674 */
674 675 rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
675 676 if (cmd->nat_expiry != 0) {
676 677 rule->ir_nat_expiry = cmd->nat_expiry;
677 678 } else {
678 679 switch (rule->ir_proto) {
679 680 case IPPROTO_TCP:
680 681 rule->ir_nat_expiry = ilb_conn_tcp_expiry;
681 682 break;
682 683 case IPPROTO_UDP:
683 684 rule->ir_nat_expiry = ilb_conn_udp_expiry;
684 685 break;
685 686 default:
686 687 cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
687 688 (void *)rule);
688 689 break;
689 690 }
690 691 }
691 692 if (cmd->sticky_expiry != 0)
692 693 rule->ir_sticky_expiry = cmd->sticky_expiry;
693 694 else
694 695 rule->ir_sticky_expiry = ilb_sticky_expiry;
695 696
696 697 if (cmd->flags & ILB_RULE_STICKY) {
697 698 rule->ir_flags |= ILB_RULE_STICKY;
698 699 rule->ir_sticky_mask = cmd->sticky_mask;
699 700 if (ilbs->ilbs_sticky_hash == NULL)
700 701 ilb_sticky_hash_init(ilbs);
701 702 }
702 703 if (cmd->flags & ILB_RULE_ENABLED)
703 704 rule->ir_flags |= ILB_RULE_ENABLED;
704 705
705 706 mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
706 707 cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
707 708
708 709 rule->ir_refcnt = 1;
709 710
710 711 switch (cmd->algo) {
711 712 case ILB_ALG_IMPL_ROUNDROBIN:
712 713 if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
713 714 ret = ENOMEM;
714 715 goto error;
715 716 }
716 717 rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
717 718 break;
718 719 case ILB_ALG_IMPL_HASH_IP:
719 720 case ILB_ALG_IMPL_HASH_IP_SPORT:
720 721 case ILB_ALG_IMPL_HASH_IP_VIP:
721 722 if ((rule->ir_alg = ilb_alg_hash_init(rule,
722 723 &cmd->algo)) == NULL) {
723 724 ret = ENOMEM;
724 725 goto error;
725 726 }
726 727 rule->ir_alg_type = cmd->algo;
727 728 break;
728 729 default:
729 730 ret = EINVAL;
730 731 goto error;
731 732 }
732 733
733 734 /* Add it to the global list and hash array at the end. */
734 735 ilb_rule_g_add(ilbs, rule);
735 736 ilb_rule_hash_add(ilbs, rule, &cmd->vip);
736 737
737 738 mutex_exit(&ilbs->ilbs_g_lock);
738 739
739 740 return (0);
740 741
741 742 error:
742 743 mutex_exit(&ilbs->ilbs_g_lock);
743 744 if (rule->ir_ksp != NULL) {
744 745 /* stackid must be initialized if ir_ksp != NULL */
745 746 kstat_delete_netstack(rule->ir_ksp, stackid);
746 747 }
747 748 kmem_free(rule, sizeof (ilb_rule_t));
748 749 return (ret);
749 750 }
750 751
751 752 /*
752 753 * The final part in deleting a rule. Either called directly or by the
753 754  * dispatched taskq.
754 755 */
755 756 static void
756 757 ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
757 758 {
758 759 netstackid_t stackid;
759 760 ilb_server_t *server;
760 761
761 762 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
762 763
763 764 /*
764 765 * Let the algorithm know that the rule is going away. The
765 766 	 * algorithm fini routine will free all of its resources associated
766 767 	 * with this rule.
767 768 */
768 769 tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);
769 770
770 771 while ((server = tmp_rule->ir_servers) != NULL) {
771 772 mutex_enter(&server->iser_lock);
772 773 ilb_destroy_nat_src(&server->iser_nat_src);
773 774 if (tmp_rule->ir_conn_drain_timeout != 0) {
774 775 /*
775 776 * The garbage collection thread checks this value
776 777 			 * without grabbing a lock. So we need to use
777 778 * atomic_swap_64() to make sure that the value seen
778 779 * by gc thread is intact.
779 780 */
780 781 (void) atomic_swap_64(
781 782 (uint64_t *)&server->iser_die_time,
782 783 ddi_get_lbolt64() +
783 784 SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
784 785 }
785 786 while (server->iser_refcnt > 1)
786 787 cv_wait(&server->iser_cv, &server->iser_lock);
787 788 tmp_rule->ir_servers = server->iser_next;
788 789 kstat_delete_netstack(server->iser_ksp, stackid);
789 790 kmem_free(server, sizeof (ilb_server_t));
790 791 }
791 792
792 793 ASSERT(tmp_rule->ir_ksp != NULL);
793 794 kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
794 795
795 796 kmem_free(tmp_rule, sizeof (ilb_rule_t));
796 797 }
797 798
798 799 /* The routine executed by the delayed rule taskq. */
799 800 static void
800 801 ilb_rule_del_tq(void *arg)
801 802 {
802 803 ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
803 804 ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
804 805
805 806 mutex_enter(&rule->ir_lock);
806 807 while (rule->ir_refcnt > 1)
807 808 cv_wait(&rule->ir_cv, &rule->ir_lock);
808 809 ilb_rule_del_common(ilbs, rule);
809 810 kmem_free(arg, sizeof (ilb_rule_tq_t));
810 811 }
811 812
812 813 /* Routine to delete a rule. */
813 814 int
814 815 ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
815 816 {
816 817 ilb_rule_t *tmp_rule;
817 818 ilb_rule_tq_t *arg;
818 819 int err;
819 820
820 821 mutex_enter(&ilbs->ilbs_g_lock);
821 822 if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
822 823 &err)) == NULL) {
823 824 mutex_exit(&ilbs->ilbs_g_lock);
824 825 return (err);
825 826 }
826 827
827 828 /*
828 829 * First remove the rule from the hash array and the global list so
829 830 * that no one can find this rule any more.
830 831 */
831 832 ilb_rule_hash_del(tmp_rule);
832 833 ilb_rule_g_del(ilbs, tmp_rule);
833 834 mutex_exit(&ilbs->ilbs_g_lock);
834 835 ILB_RULE_REFRELE(tmp_rule);
835 836
836 837 /*
837 838 	 * Now that no one can find this rule, we can remove it once all
838 839 * references to it are dropped and all references to the list
839 840 * of servers are dropped. So dispatch a task to finish the deletion.
840 841 * We do this instead of letting the last one referencing the
841 842 * rule do it. The reason is that the last one may be the
842 843 * interrupt thread. We want to minimize the work it needs to
843 844 * do. Rule deletion is not a critical task so it can be delayed.
844 845 */
845 846 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
846 847 arg->ilbs = ilbs;
847 848 arg->rule = tmp_rule;
848 849 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
849 850 TQ_SLEEP);
850 851
851 852 return (0);
852 853 }
853 854
854 855 /*
855 856 * Given an IP address, check to see if there is a rule using this
856 857 * as the VIP. It can be used to check if we need to drop a fragment.
857 858 */
858 859 boolean_t
859 860 ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
860 861 {
861 862 int i;
862 863 ilb_rule_t *rule;
863 864 boolean_t ret = B_FALSE;
864 865
865 866 i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
866 867 ilbs->ilbs_rule_hash_size);
867 868 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
868 869 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
869 870 rule = rule->ir_hash_next) {
870 871 if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
871 872 mutex_enter(&rule->ir_lock);
872 873 if (rule->ir_flags & ILB_RULE_BUSY) {
873 874 mutex_exit(&rule->ir_lock);
874 875 break;
875 876 }
876 877 if (ret_rule != NULL) {
877 878 rule->ir_refcnt++;
878 879 mutex_exit(&rule->ir_lock);
879 880 *ret_rule = rule;
880 881 } else {
881 882 mutex_exit(&rule->ir_lock);
882 883 }
883 884 ret = B_TRUE;
884 885 break;
885 886 }
886 887 }
887 888 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
888 889 return (ret);
889 890 }
890 891
891 892 boolean_t
892 893 ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
893 894 {
894 895 int i;
895 896 ilb_rule_t *rule;
896 897 boolean_t ret = B_FALSE;
897 898
898 899 i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
899 900 mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
900 901 for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
901 902 rule = rule->ir_hash_next) {
902 903 if (rule->ir_target_v6.s6_addr32[3] == addr) {
903 904 mutex_enter(&rule->ir_lock);
904 905 if (rule->ir_flags & ILB_RULE_BUSY) {
905 906 mutex_exit(&rule->ir_lock);
906 907 break;
907 908 }
908 909 if (ret_rule != NULL) {
909 910 rule->ir_refcnt++;
910 911 mutex_exit(&rule->ir_lock);
911 912 *ret_rule = rule;
912 913 } else {
913 914 mutex_exit(&rule->ir_lock);
914 915 }
915 916 ret = B_TRUE;
916 917 break;
917 918 }
918 919 }
919 920 mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
920 921 return (ret);
921 922 }
922 923
923 924 static ilb_rule_t *
924 925 ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
925 926 int *err)
926 927 {
927 928 ilb_rule_t *tmp_rule;
928 929
929 930 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
930 931
931 932 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
932 933 tmp_rule = tmp_rule->ir_next) {
933 934 if (tmp_rule->ir_zoneid != zoneid)
934 935 continue;
935 936 if (strcasecmp(tmp_rule->ir_name, name) == 0) {
936 937 mutex_enter(&tmp_rule->ir_lock);
937 938 if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
938 939 mutex_exit(&tmp_rule->ir_lock);
939 940 *err = EINPROGRESS;
940 941 return (NULL);
941 942 }
942 943 tmp_rule->ir_refcnt++;
943 944 mutex_exit(&tmp_rule->ir_lock);
944 945 *err = 0;
945 946 return (tmp_rule);
946 947 }
947 948 }
948 949 *err = ENOENT;
949 950 return (NULL);
950 951 }
951 952
952 953 /* To find a rule with a given name and zone in the global rule list. */
953 954 ilb_rule_t *
954 955 ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
955 956 int *err)
956 957 {
957 958 ilb_rule_t *tmp_rule;
958 959
959 960 mutex_enter(&ilbs->ilbs_g_lock);
960 961 tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
961 962 mutex_exit(&ilbs->ilbs_g_lock);
962 963 return (tmp_rule);
963 964 }
964 965
965 966 /* Check if the given rule parameters and zone ID match an existing rule. */
966 967 static boolean_t
967 968 ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
968 969 int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
969 970 {
970 971 ilb_rule_t *tmp_rule;
971 972
972 973 ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
973 974
974 975 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
975 976 tmp_rule = tmp_rule->ir_next) {
976 977 if (tmp_rule->ir_zoneid != zoneid)
977 978 continue;
978 979
979 980 /*
980 981 * We don't allow the same name in different rules even if all
981 982 * the other rule components are different.
982 983 */
983 984 if (strcasecmp(tmp_rule->ir_name, name) == 0)
984 985 return (B_TRUE);
985 986
986 987 if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
987 988 continue;
988 989
989 990 /*
990 991 		 * Skip this rule if its port range does not overlap the given
991 992 		 * range. When ir_port_range is false, ir_min_port equals
992 993 		 * ir_max_port and the test degenerates to checking whether
993 994 		 * that single port falls outside the given range.
994 995 */
995 996 if (tmp_rule->ir_max_port < min_port ||
996 997 tmp_rule->ir_min_port > max_port) {
997 998 continue;
998 999 }
999 1000
1000 1001 /*
1001 1002 * If l3 is IPv4, the addr passed in is assumed to be
1002 1003 * mapped address.
1003 1004 */
1004 1005 if (V6_OR_V4_INADDR_ANY(*addr) ||
1005 1006 V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
1006 1007 IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
1007 1008 return (B_TRUE);
1008 1009 }
1009 1010 }
1010 1011 return (B_FALSE);
1011 1012 }
1012 1013
1013 1014 int
1014 1015 ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
1015 1016 const char *rule_name, ilb_rule_t *in_rule)
1016 1017 {
1017 1018 ilb_rule_t *rule;
1018 1019 int err;
1019 1020
1020 1021 ASSERT((in_rule == NULL && rule_name != NULL) ||
1021 1022 (in_rule != NULL && rule_name == NULL));
1022 1023 if ((rule = in_rule) == NULL) {
1023 1024 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1024 1025 &err)) == NULL) {
1025 1026 return (err);
1026 1027 }
1027 1028 }
1028 1029 mutex_enter(&rule->ir_lock);
1029 1030 rule->ir_flags |= ILB_RULE_ENABLED;
1030 1031 mutex_exit(&rule->ir_lock);
1031 1032
1032 1033 	/* Only refrele if we looked the rule up ourselves (none passed in). */
1033 1034 if (in_rule == NULL)
1034 1035 ILB_RULE_REFRELE(rule);
1035 1036 return (0);
1036 1037 }
1037 1038
1038 1039 int
1039 1040 ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
1040 1041 const char *rule_name, ilb_rule_t *in_rule)
1041 1042 {
1042 1043 ilb_rule_t *rule;
1043 1044 int err;
1044 1045
1045 1046 ASSERT((in_rule == NULL && rule_name != NULL) ||
1046 1047 (in_rule != NULL && rule_name == NULL));
1047 1048 if ((rule = in_rule) == NULL) {
1048 1049 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1049 1050 &err)) == NULL) {
1050 1051 return (err);
1051 1052 }
1052 1053 }
1053 1054 mutex_enter(&rule->ir_lock);
1054 1055 rule->ir_flags &= ~ILB_RULE_ENABLED;
1055 1056 mutex_exit(&rule->ir_lock);
1056 1057
1057 1058 	/* Only refrele if we looked the rule up ourselves (none passed in). */
1058 1059 if (in_rule == NULL)
1059 1060 ILB_RULE_REFRELE(rule);
1060 1061 return (0);
1061 1062 }
1062 1063
1063 1064 /*
1064 1065 * XXX We should probably have a walker function to walk all rules. For
1065 1066 * now, just add a simple loop for enable/disable/del.
1066 1067 */
1067 1068 void
1068 1069 ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1069 1070 {
1070 1071 ilb_rule_t *rule;
1071 1072
1072 1073 mutex_enter(&ilbs->ilbs_g_lock);
1073 1074 for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
1074 1075 if (rule->ir_zoneid != zoneid)
1075 1076 continue;
1076 1077 /*
1077 1078 * No need to hold the rule as we are holding the global
1078 1079 * lock so it won't go away. Ignore the return value here
1079 1080 * as the rule is provided so the call cannot fail.
1080 1081 */
1081 1082 (void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
1082 1083 }
1083 1084 mutex_exit(&ilbs->ilbs_g_lock);
1084 1085 }
1085 1086
1086 1087 void
1087 1088 ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1088 1089 {
1089 1090 ilb_rule_t *rule;
1090 1091
1091 1092 mutex_enter(&ilbs->ilbs_g_lock);
1092 1093 for (rule = ilbs->ilbs_rule_head; rule != NULL;
1093 1094 rule = rule->ir_next) {
1094 1095 if (rule->ir_zoneid != zoneid)
1095 1096 continue;
1096 1097 (void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
1097 1098 }
1098 1099 mutex_exit(&ilbs->ilbs_g_lock);
1099 1100 }
1100 1101
1101 1102 void
1102 1103 ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
1103 1104 {
1104 1105 ilb_rule_t *rule;
1105 1106 ilb_rule_tq_t *arg;
1106 1107
1107 1108 mutex_enter(&ilbs->ilbs_g_lock);
1108 1109 while ((rule = ilbs->ilbs_rule_head) != NULL) {
1109 1110 if (rule->ir_zoneid != zoneid)
1110 1111 continue;
1111 1112 ilb_rule_hash_del(rule);
1112 1113 ilb_rule_g_del(ilbs, rule);
1113 1114 mutex_exit(&ilbs->ilbs_g_lock);
1114 1115
1115 1116 arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
1116 1117 arg->ilbs = ilbs;
1117 1118 arg->rule = rule;
1118 1119 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
1119 1120 arg, TQ_SLEEP);
1120 1121
1121 1122 mutex_enter(&ilbs->ilbs_g_lock);
1122 1123 }
1123 1124 mutex_exit(&ilbs->ilbs_g_lock);
1124 1125 }
1125 1126
1126 1127 /*
1127 1128 * This is just an optimization, so don't grab the global lock. The
1128 1129  * worst case is that we miss a couple of packets.
1129 1130 */
1130 1131 boolean_t
1131 1132 ilb_has_rules(ilb_stack_t *ilbs)
1132 1133 {
1133 1134 return (ilbs->ilbs_rule_head != NULL);
1134 1135 }
1135 1136
1136 1137
1137 1138 static int
1138 1139 ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1139 1140 ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
1140 1141 {
1141 1142 ilb_server_t *tmp_server;
1142 1143 int ret;
1143 1144
1144 1145 ASSERT((rule == NULL && rule_name != NULL) ||
1145 1146 (rule != NULL && rule_name == NULL));
1146 1147
1147 1148 if (rule == NULL) {
1148 1149 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1149 1150 &ret)) == NULL) {
1150 1151 return (ret);
1151 1152 }
1152 1153 }
1153 1154
1154 1155 /* Once we get a hold on the rule, no server can be added/deleted. */
1155 1156 for (tmp_server = rule->ir_servers; tmp_server != NULL;
1156 1157 tmp_server = tmp_server->iser_next) {
1157 1158 if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
1158 1159 break;
1159 1160 }
1160 1161 if (tmp_server == NULL) {
1161 1162 ret = ENOENT;
1162 1163 goto done;
1163 1164 }
1164 1165
1165 1166 if (enable) {
1166 1167 ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
1167 1168 rule->ir_alg->ilb_alg_data);
1168 1169 if (ret == 0) {
1169 1170 tmp_server->iser_enabled = B_TRUE;
1170 1171 tmp_server->iser_die_time = 0;
1171 1172 }
1172 1173 } else {
1173 1174 ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
1174 1175 rule->ir_alg->ilb_alg_data);
1175 1176 if (ret == 0) {
1176 1177 tmp_server->iser_enabled = B_FALSE;
1177 1178 if (rule->ir_conn_drain_timeout != 0) {
1178 1179 (void) atomic_swap_64(
1179 1180 (uint64_t *)&tmp_server->iser_die_time,
1180 1181 ddi_get_lbolt64() + SEC_TO_TICK(
1181 1182 rule->ir_conn_drain_timeout));
1182 1183 }
1183 1184 }
1184 1185 }
1185 1186
1186 1187 done:
1187 1188 if (rule_name != NULL)
1188 1189 ILB_RULE_REFRELE(rule);
1189 1190 return (ret);
1190 1191 }
1191 1192 int
1192 1193 ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1193 1194 ilb_rule_t *rule, in6_addr_t *addr)
1194 1195 {
1195 1196 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
1196 1197 }
1197 1198
1198 1199 int
1199 1200 ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1200 1201 ilb_rule_t *rule, in6_addr_t *addr)
1201 1202 {
1202 1203 return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
1203 1204 }
1204 1205
1205 1206 /*
1206 1207 * Add a back end server to a rule. If the address is IPv4, it is assumed
1207 1208 * to be passed in as a mapped address.
1208 1209 */
1209 1210 int
1210 1211 ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
1211 1212 {
1212 1213 ilb_server_t *server;
1213 1214 netstackid_t stackid;
1214 1215 int ret = 0;
1215 1216 in_port_t min_port, max_port;
1216 1217 in_port_t range;
1217 1218
1218 1219 /* Port is passed in network byte order. */
1219 1220 min_port = ntohs(info->min_port);
1220 1221 max_port = ntohs(info->max_port);
1221 1222 if (min_port > max_port)
1222 1223 return (EINVAL);
1223 1224
1224 1225 /* min_port == 0 means "all ports". Make it so */
1225 1226 if (min_port == 0) {
1226 1227 min_port = 1;
1227 1228 max_port = 65535;
1228 1229 }
1229 1230 range = max_port - min_port;
1230 1231
1231 1232 mutex_enter(&rule->ir_lock);
1232 1233 	/* If someone is already doing a server add/del, sleep and wait. */
1233 1234 while (rule->ir_flags & ILB_RULE_BUSY) {
1234 1235 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1235 1236 mutex_exit(&rule->ir_lock);
1236 1237 return (EINTR);
1237 1238 }
1238 1239 }
1239 1240
1240 1241 /*
1241 1242 * Set the rule to be busy to make sure that no new packet can
1242 1243 * use this rule.
1243 1244 */
1244 1245 rule->ir_flags |= ILB_RULE_BUSY;
1245 1246
1246 1247 /* Now wait for all other guys to finish their work. */
1247 1248 while (rule->ir_refcnt > 2) {
1248 1249 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1249 1250 mutex_exit(&rule->ir_lock);
1250 1251 ret = EINTR;
1251 1252 goto end;
1252 1253 }
1253 1254 }
1254 1255 mutex_exit(&rule->ir_lock);
1255 1256
1256 1257 /* Sanity checks... */
1257 1258 if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1258 1259 rule->ir_ipver != IPPROTO_IP) ||
1259 1260 (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
1260 1261 rule->ir_ipver != IPPROTO_IPV6)) {
1261 1262 ret = EINVAL;
1262 1263 goto end;
1263 1264 }
1264 1265
1265 1266 /*
1266 1267 * Check for valid port range.
1267 1268 *
1268 1269 * For DSR, there can be no port shifting. Hence the server
1269 1270 * specification must be the same as the rule's.
1270 1271 *
1271 1272 * For half-NAT/NAT, the range must either be 0 (port collapsing) or
1272 1273 	 * it must equal the rule's port range.
1273 1274 *
1274 1275 */
1275 1276 if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
1276 1277 if (rule->ir_max_port != max_port ||
1277 1278 rule->ir_min_port != min_port) {
1278 1279 ret = EINVAL;
1279 1280 goto end;
1280 1281 }
1281 1282 } else {
1282 1283 if ((range != rule->ir_max_port - rule->ir_min_port) &&
1283 1284 range != 0) {
1284 1285 ret = EINVAL;
1285 1286 goto end;
1286 1287 }
1287 1288 }
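Review note: a hypothetical illustration of these checks (the collapsing and
shifting behavior itself is described in ilb_nat.c; the mapping shown here is
an assumption based on that comment):

	/*
	 * Rule ports 5000-5009 (range == 9):
	 *   DSR server:           must also be 5000-5009 (no shifting).
	 *   NAT server 8080:      range == 0, OK; every rule port
	 *                         collapses onto 8080.
	 *   NAT server 6000-6009: range == 9, OK; presumably a fixed
	 *                         offset shift (5000 -> 6000, ...).
	 *   NAT server 6000-6004: range == 4, rejected with EINVAL.
	 */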
1288 1289
1289 1290 /* Check for duplicate. */
1290 1291 for (server = rule->ir_servers; server != NULL;
1291 1292 server = server->iser_next) {
1292 1293 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
1293 1294 strcasecmp(server->iser_name, info->name) == 0) {
1294 1295 break;
1295 1296 }
1296 1297 }
1297 1298 if (server != NULL) {
1298 1299 ret = EEXIST;
1299 1300 goto end;
1300 1301 }
1301 1302
1302 1303 if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
1303 1304 ret = ENOMEM;
1304 1305 goto end;
1305 1306 }
1306 1307
1307 1308 (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
1308 1309 (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
1309 1310 sizeof (server->iser_ip_addr));
1310 1311 stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
1311 1312 server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
1312 1313 if (server->iser_ksp == NULL) {
1313 1314 kmem_free(server, sizeof (ilb_server_t));
1314 1315 ret = EINVAL;
1315 1316 goto end;
1316 1317 }
1317 1318
1318 1319 server->iser_stackid = stackid;
1319 1320 server->iser_addr_v6 = info->addr;
1320 1321 server->iser_min_port = min_port;
1321 1322 server->iser_max_port = max_port;
1322 1323 if (min_port != max_port)
1323 1324 server->iser_port_range = B_TRUE;
1324 1325 else
1325 1326 server->iser_port_range = B_FALSE;
1326 1327
1327 1328 /*
1328 1329 * If the rule uses NAT, find/create the NAT source entry to use
1329 1330 * for this server.
1330 1331 */
1331 1332 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
1332 1333 in_port_t port;
1333 1334
1334 1335 /*
1335 1336 * If the server uses a port range, our port allocation
1336 1337 * scheme needs to treat it as a wildcard. Refer to the
1337 1338 * comments in ilb_nat.c about the scheme.
1338 1339 */
1339 1340 if (server->iser_port_range)
1340 1341 port = 0;
1341 1342 else
1342 1343 port = server->iser_min_port;
1343 1344
1344 1345 if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
1345 1346 &server->iser_addr_v6, port, &rule->ir_nat_src_start,
1346 1347 num_nat_src_v6(&rule->ir_nat_src_start,
1347 1348 &rule->ir_nat_src_end))) != 0) {
1348 1349 kstat_delete_netstack(server->iser_ksp, stackid);
1349 1350 kmem_free(server, sizeof (ilb_server_t));
1350 1351 goto end;
1351 1352 }
1352 1353 }
1353 1354
1354 1355 /*
1355 1356 * The iser_lock is only used to protect iser_refcnt. All the other
1356 1357 * fields in ilb_server_t should not change, except for iser_enabled.
1357 1358 * The worst thing that can happen if iser_enabled is messed up is
1358 1359 * that one or two packets may not be load balanced to a server
1359 1360 * correctly.
1360 1361 */
1361 1362 server->iser_refcnt = 1;
1362 1363 server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
1363 1364 B_FALSE;
1364 1365 mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
1365 1366 cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);
1366 1367
1367 1368 /* Let the load balancing algorithm know about the addition. */
1368 1369 ASSERT(rule->ir_alg != NULL);
1369 1370 if ((ret = rule->ir_alg->ilb_alg_server_add(server,
1370 1371 rule->ir_alg->ilb_alg_data)) != 0) {
1371 1372 kstat_delete_netstack(server->iser_ksp, stackid);
1372 1373 kmem_free(server, sizeof (ilb_server_t));
1373 1374 goto end;
1374 1375 }
1375 1376
1376 1377 /*
1377 1378 * No need to hold ir_lock since no other thread should manipulate
1378 1379 * the following fields until ILB_RULE_BUSY is cleared.
1379 1380 */
1380 1381 if (rule->ir_servers == NULL) {
1381 1382 server->iser_next = NULL;
1382 1383 } else {
1383 1384 server->iser_next = rule->ir_servers;
1384 1385 }
1385 1386 rule->ir_servers = server;
1386 1387 ILB_R_KSTAT(rule, num_servers);
1387 1388
1388 1389 end:
1389 1390 mutex_enter(&rule->ir_lock);
1390 1391 rule->ir_flags &= ~ILB_RULE_BUSY;
1391 1392 cv_signal(&rule->ir_cv);
1392 1393 mutex_exit(&rule->ir_lock);
1393 1394 return (ret);
1394 1395 }
1395 1396
1396 1397 /* The routine executed by the delayed rule processing taskq. */
1397 1398 static void
1398 1399 ilb_server_del_tq(void *arg)
1399 1400 {
1400 1401 ilb_server_t *server = (ilb_server_t *)arg;
1401 1402
1402 1403 mutex_enter(&server->iser_lock);
1403 1404 while (server->iser_refcnt > 1)
1404 1405 cv_wait(&server->iser_cv, &server->iser_lock);
1405 1406 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1406 1407 kmem_free(server, sizeof (ilb_server_t));
1407 1408 }
1408 1409
1409 1410 /*
1410 1411 * Delete a back end server from a rule. If the address is IPv4, it is assumed
1411 1412 * to be passed in as a mapped address.
1412 1413 */
1413 1414 int
1414 1415 ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
1415 1416 ilb_rule_t *rule, in6_addr_t *addr)
1416 1417 {
1417 1418 ilb_server_t *server;
1418 1419 ilb_server_t *prev_server;
1419 1420 int ret = 0;
1420 1421
1421 1422 ASSERT((rule == NULL && rule_name != NULL) ||
1422 1423 (rule != NULL && rule_name == NULL));
1423 1424 if (rule == NULL) {
1424 1425 if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
1425 1426 &ret)) == NULL) {
1426 1427 return (ret);
1427 1428 }
1428 1429 }
1429 1430
1430 1431 mutex_enter(&rule->ir_lock);
1431 1432 	/* If someone is already doing a server add/del, sleep and wait. */
1432 1433 while (rule->ir_flags & ILB_RULE_BUSY) {
1433 1434 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1434 1435 if (rule_name != NULL) {
1435 1436 if (--rule->ir_refcnt <= 2)
1436 1437 cv_signal(&rule->ir_cv);
1437 1438 }
1438 1439 mutex_exit(&rule->ir_lock);
1439 1440 return (EINTR);
1440 1441 }
1441 1442 }
1442 1443 /*
1443 1444 * Set the rule to be busy to make sure that no new packet can
1444 1445 * use this rule.
1445 1446 */
1446 1447 rule->ir_flags |= ILB_RULE_BUSY;
1447 1448
1448 1449 /* Now wait for all other guys to finish their work. */
1449 1450 while (rule->ir_refcnt > 2) {
1450 1451 if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
1451 1452 mutex_exit(&rule->ir_lock);
1452 1453 ret = EINTR;
1453 1454 goto end;
1454 1455 }
1455 1456 }
1456 1457 mutex_exit(&rule->ir_lock);
1457 1458
1458 1459 prev_server = NULL;
1459 1460 for (server = rule->ir_servers; server != NULL;
1460 1461 prev_server = server, server = server->iser_next) {
1461 1462 if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
1462 1463 break;
1463 1464 }
1464 1465 if (server == NULL) {
1465 1466 ret = ENOENT;
1466 1467 goto end;
1467 1468 }
1468 1469
1469 1470 /*
1470 1471 * Let the load balancing algorithm know about the removal.
1471 1472 * The algorithm may disallow the removal...
1472 1473 */
1473 1474 if ((ret = rule->ir_alg->ilb_alg_server_del(server,
1474 1475 rule->ir_alg->ilb_alg_data)) != 0) {
1475 1476 goto end;
1476 1477 }
1477 1478
1478 1479 if (prev_server == NULL)
1479 1480 rule->ir_servers = server->iser_next;
1480 1481 else
1481 1482 prev_server->iser_next = server->iser_next;
1482 1483
1483 1484 ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
1484 1485
1485 1486 /*
1486 1487 * Mark the server as disabled so that if there is any sticky cache
1487 1488 * using this server around, it won't be used.
1488 1489 */
1489 1490 server->iser_enabled = B_FALSE;
1490 1491
1491 1492 mutex_enter(&server->iser_lock);
1492 1493
1493 1494 /*
1494 1495 	 * De-allocate the NAT source array. The individual ilb_nat_src_entry_t
1495 1496 * may not go away if there is still a conn using it. The NAT source
1496 1497 * timer will do the garbage collection.
1497 1498 */
1498 1499 ilb_destroy_nat_src(&server->iser_nat_src);
1499 1500
1500 1501 /* If there is a hard limit on when a server should die, set it. */
1501 1502 if (rule->ir_conn_drain_timeout != 0) {
1502 1503 (void) atomic_swap_64((uint64_t *)&server->iser_die_time,
1503 1504 ddi_get_lbolt64() +
1504 1505 SEC_TO_TICK(rule->ir_conn_drain_timeout));
1505 1506 }
1506 1507
1507 1508 if (server->iser_refcnt > 1) {
1508 1509 (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
1509 1510 server, TQ_SLEEP);
1510 1511 mutex_exit(&server->iser_lock);
1511 1512 } else {
1512 1513 kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
1513 1514 kmem_free(server, sizeof (ilb_server_t));
1514 1515 }
1515 1516
1516 1517 end:
1517 1518 mutex_enter(&rule->ir_lock);
1518 1519 rule->ir_flags &= ~ILB_RULE_BUSY;
1519 1520 if (rule_name != NULL)
1520 1521 rule->ir_refcnt--;
1521 1522 cv_signal(&rule->ir_cv);
1522 1523 mutex_exit(&rule->ir_lock);
1523 1524 return (ret);
1524 1525 }
1525 1526
1526 1527 /*
1527 1528 * First check if the destination of the ICMP message matches a VIP of
1528 1529 * a rule. If it does not, just return ILB_PASSED.
1529 1530 *
1530 1531 * If the destination matches a VIP:
1531 1532 *
1532 1533 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
1533 1534 * server.
1534 1535 *
1535 1536  * For ICMP_DEST_UNREACHABLE with the fragmentation needed code, look
1536 1537  * inside the payload to decide which back end server should receive the
1537 1538  * message, and apply NAT to both the payload message and the outer IP packet.
1538 1539 *
1539 1540 * For other ICMP messages, drop them.
1540 1541 */
1541 1542 /* ARGSUSED */
1542 1543 static int
1543 1544 ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1544 1545 icmph_t *icmph, ipaddr_t *lb_dst)
1545 1546 {
1546 1547 ipaddr_t vip;
1547 1548 ilb_rule_t *rule;
1548 1549 in6_addr_t addr6;
1549 1550
1550 1551 if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
1551 1552 return (ILB_PASSED);
1552 1553
1553 1554
1554 1555 if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
1555 1556 ILB_R_KSTAT(rule, icmp_dropped);
1556 1557 ILB_RULE_REFRELE(rule);
1557 1558 return (ILB_DROPPED);
1558 1559 }
1559 1560
1560 1561 switch (icmph->icmph_type) {
1561 1562 case ICMP_ECHO_REQUEST:
1562 1563 ILB_R_KSTAT(rule, icmp_echo_processed);
1563 1564 ILB_RULE_REFRELE(rule);
1564 1565
1565 1566 icmph->icmph_type = ICMP_ECHO_REPLY;
1566 1567 icmph->icmph_checksum = 0;
1567 1568 icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
1568 1569 ipha->ipha_ttl =
1569 1570 ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
1570 1571 *lb_dst = ipha->ipha_src;
1571 1572 vip = ipha->ipha_dst;
1572 1573 ipha->ipha_dst = ipha->ipha_src;
1573 1574 ipha->ipha_src = vip;
1574 1575 return (ILB_BALANCED);
1575 1576 case ICMP_DEST_UNREACHABLE: {
1576 1577 int ret;
1577 1578
1578 1579 if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
1579 1580 ILB_R_KSTAT(rule, icmp_dropped);
1580 1581 ILB_RULE_REFRELE(rule);
1581 1582 return (ILB_DROPPED);
1582 1583 }
1583 1584 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
1584 1585 &addr6)) {
1585 1586 ILB_R_KSTAT(rule, icmp_2big_processed);
1586 1587 ret = ILB_BALANCED;
1587 1588 } else {
1588 1589 ILB_R_KSTAT(rule, icmp_2big_dropped);
1589 1590 ret = ILB_DROPPED;
1590 1591 }
1591 1592 ILB_RULE_REFRELE(rule);
1592 1593 IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
1593 1594 return (ret);
1594 1595 }
1595 1596 default:
1596 1597 ILB_R_KSTAT(rule, icmp_dropped);
1597 1598 ILB_RULE_REFRELE(rule);
1598 1599 return (ILB_DROPPED);
1599 1600 }
1600 1601 }
1601 1602
1602 1603 /* ARGSUSED */
1603 1604 static int
1604 1605 ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
1605 1606 icmp6_t *icmp6, in6_addr_t *lb_dst)
1606 1607 {
1607 1608 ilb_rule_t *rule;
1608 1609
1609 1610 if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
1610 1611 return (ILB_PASSED);
1611 1612
1612 1613 if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
1613 1614 ILB_R_KSTAT(rule, icmp_dropped);
1614 1615 ILB_RULE_REFRELE(rule);
1615 1616 return (ILB_DROPPED);
1616 1617 }
1617 1618
1618 1619 switch (icmp6->icmp6_type) {
1619 1620 case ICMP6_ECHO_REQUEST: {
1620 1621 int hdr_len;
1621 1622
1622 1623 ILB_R_KSTAT(rule, icmp_echo_processed);
1623 1624 ILB_RULE_REFRELE(rule);
1624 1625
1625 1626 icmp6->icmp6_type = ICMP6_ECHO_REPLY;
1626 1627 icmp6->icmp6_cksum = ip6h->ip6_plen;
1627 1628 hdr_len = (char *)icmp6 - (char *)ip6h;
1628 1629 icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
1629 1630 ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
1630 1631 ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
1631 1632 ip6h->ip6_hops =
1632 1633 ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
1633 1634 *lb_dst = ip6h->ip6_src;
1634 1635 ip6h->ip6_src = ip6h->ip6_dst;
1635 1636 ip6h->ip6_dst = *lb_dst;
1636 1637 return (ILB_BALANCED);
1637 1638 }
1638 1639 case ICMP6_PACKET_TOO_BIG: {
1639 1640 int ret;
1640 1641
1641 1642 if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
1642 1643 lb_dst)) {
1643 1644 ILB_R_KSTAT(rule, icmp_2big_processed);
1644 1645 ret = ILB_BALANCED;
1645 1646 } else {
1646 1647 ILB_R_KSTAT(rule, icmp_2big_dropped);
1647 1648 ret = ILB_DROPPED;
1648 1649 }
1649 1650 ILB_RULE_REFRELE(rule);
1650 1651 return (ret);
1651 1652 }
1652 1653 default:
1653 1654 ILB_R_KSTAT(rule, icmp_dropped);
1654 1655 ILB_RULE_REFRELE(rule);
1655 1656 return (ILB_DROPPED);
1656 1657 }
1657 1658 }
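
ICMPv6 checksums, unlike ICMPv4, cover a pseudo-header of the source address, destination address, upper-layer length, and next-header value. The echo path above seeds the length term by storing ip6_plen into icmp6_cksum before summing, and supplies the rest via ilb_pseudo_sum_v6(). Below is a userland sketch of such a pseudo-header partial sum; it is an assumed, simplified stand-in for ilb_pseudo_sum_v6(), not its actual implementation:

	#include <stdint.h>
	#include <netinet/in.h>

	static uint32_t
	pseudo_sum_v6(const struct in6_addr *src, const struct in6_addr *dst,
	    uint8_t nxt)
	{
		const uint16_t *p;
		uint32_t sum = 0;
		int i;

		/* Addresses are already in network order; sum 16-bit words. */
		p = (const uint16_t *)src->s6_addr;
		for (i = 0; i < 8; i++)
			sum += p[i];
		p = (const uint16_t *)dst->s6_addr;
		for (i = 0; i < 8; i++)
			sum += p[i];

		/*
		 * Next-header value as a network-order word.  The payload
		 * length term is assumed to be seeded separately, as the
		 * code above does by storing ip6_plen into icmp6_cksum.
		 */
		sum += htons(nxt);
		return (sum);
	}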
1658 1659
1659 1660 /*
1660 1661 * Common routine to check an incoming packet and decide what to do with it.
1661 1662  * Called by ilb_check_v4() and ilb_check_v6().
1662 1663 */
1663 1664 static int
1664 1665 ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
1665 1666 in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
1666 1667 in6_addr_t *lb_dst)
1667 1668 {
1668 1669 in_port_t sport, dport;
1669 1670 tcpha_t *tcph;
1670 1671 udpha_t *udph;
1671 1672 ilb_rule_t *rule;
1672 1673 ilb_server_t *server;
1673 1674 boolean_t balanced;
1674 1675 struct ilb_sticky_s *s = NULL;
1675 1676 int ret;
1676 1677 uint32_t ip_sum, tp_sum;
1677 1678 ilb_nat_info_t info;
1678 1679 uint16_t nat_src_idx;
1679 1680 boolean_t busy;
1680 1681
1681 1682 /*
1682 1683 	 * We don't really need to switch here since both protocols'
1683 1684 	 * ports are at the same offset.  Just prepare for future
1684 1685 	 * protocol-specific processing.
1685 1686 */
1686 1687 switch (l4) {
1687 1688 case IPPROTO_TCP:
1688 1689 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1689 1690 return (ILB_DROPPED);
1690 1691 tcph = (tcpha_t *)tph;
1691 1692 sport = tcph->tha_lport;
1692 1693 dport = tcph->tha_fport;
1693 1694 break;
1694 1695 case IPPROTO_UDP:
1695 1696 if (tph + sizeof (udpha_t) > mp->b_wptr)
1696 1697 return (ILB_DROPPED);
1697 1698 udph = (udpha_t *)tph;
1698 1699 sport = udph->uha_src_port;
1699 1700 dport = udph->uha_dst_port;
1700 1701 break;
1701 1702 default:
1702 1703 return (ILB_PASSED);
1703 1704 }
1704 1705
1705 1706 /* Fast path, there is an existing conn. */
1706 1707 if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1707 1708 pkt_len, lb_dst)) {
1708 1709 return (ILB_BALANCED);
1709 1710 }
1710 1711
1711 1712 /*
1712 1713 * If there is no existing connection for the incoming packet, check
1713 1714 * to see if the packet matches a rule. If not, just let IP decide
1714 1715 * what to do with it.
1715 1716 *
1716 1717 	 * Note: a reply from a back end server should not match a rule;
1717 1718 	 * it should match an existing conn.
1718 1719 */
1719 1720 rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1720 1721 pkt_len, &busy);
1721 1722 if (rule == NULL) {
1722 1723 /* If the rule is busy, just drop the packet. */
1723 1724 if (busy)
1724 1725 return (ILB_DROPPED);
1725 1726 else
1726 1727 return (ILB_PASSED);
1727 1728 }
1728 1729
1729 1730 /*
1730 1731 * The packet matches a rule, use the rule load balance algorithm
1731 1732 * to find a server.
1732 1733 */
1733 1734 balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1734 1735 rule->ir_alg->ilb_alg_data, &server);
1735 1736 /*
1736 1737 	 * This can only happen if there is no server in the rule or all
1737 1738 * the servers are currently disabled.
1738 1739 */
1739 1740 if (!balanced)
1740 1741 goto no_server;
1741 1742
1742 1743 /*
1743 1744 * If the rule is sticky enabled, we need to check the sticky table.
1744 1745 * If there is a sticky entry for the client, use the previous server
1745 1746 * instead of the one found above (note that both can be the same).
1746 1747 * If there is no entry for that client, add an entry to the sticky
1747 1748 * table. Both the find and add are done in ilb_sticky_find_add()
1748 1749 	 * to avoid checking for duplicates when adding an entry.
1749 1750 */
1750 1751 if (rule->ir_flags & ILB_RULE_STICKY) {
1751 1752 in6_addr_t addr;
1752 1753
1753 1754 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1754 1755 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1755 1756 &s, &nat_src_idx)) == NULL) {
1756 1757 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1757 1758 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1758 1759 goto no_server;
1759 1760 }
1760 1761 }
1761 1762
1762 1763 /*
1763 1764 * We are holding a reference on the rule, so the server
1764 1765 * cannot go away.
1765 1766 */
1766 1767 *lb_dst = server->iser_addr_v6;
1767 1768 ILB_S_KSTAT(server, pkt_processed);
1768 1769 ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1769 1770
1770 1771 switch (rule->ir_topo) {
1771 1772 case ILB_TOPO_IMPL_NAT: {
1772 1773 ilb_nat_src_entry_t *src_ent;
1773 1774 uint16_t *src_idx;
1774 1775
1775 1776 /*
1776 1777 		 * We create a conn cache entry even if this is not a SYN
1777 1778 		 * segment.  The server should respond with a RST; when we
1778 1779 		 * see that RST, we will destroy this entry.  By having the
1779 1780 		 * entry, we know how to NAT the returned RST.
1780 1781 */
1781 1782 info.vip = *dst;
1782 1783 info.dport = dport;
1783 1784 info.src = *src;
1784 1785 info.sport = sport;
1785 1786
1786 1787 /* If stickiness is enabled, use the same source address */
1787 1788 if (s != NULL)
1788 1789 src_idx = &nat_src_idx;
1789 1790 else
1790 1791 src_idx = NULL;
1791 1792
1792 1793 if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
1793 1794 &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
1794 1795 if (s != NULL)
1795 1796 ilb_sticky_refrele(s);
1796 1797 ILB_R_KSTAT(rule, pkt_dropped);
1797 1798 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1798 1799 ILB_R_KSTAT(rule, noport_pkt_dropped);
1799 1800 ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
1800 1801 ret = ILB_DROPPED;
1801 1802 break;
1802 1803 }
1803 1804 info.src_ent = src_ent;
1804 1805 info.nat_dst = server->iser_addr_v6;
1805 1806 if (rule->ir_port_range && server->iser_port_range) {
1806 1807 info.nat_dport = htons(ntohs(dport) -
1807 1808 rule->ir_min_port + server->iser_min_port);
1808 1809 } else {
1809 1810 info.nat_dport = htons(server->iser_min_port);
1810 1811 }
1811 1812
1812 1813 /*
1813 1814 * If ilb_conn_add() fails, it will release the reference on
1814 1815 * sticky info and de-allocate the NAT source port allocated
1815 1816 * above.
1816 1817 */
1817 1818 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1818 1819 dport, &info, &ip_sum, &tp_sum, s) != 0) {
1819 1820 ILB_R_KSTAT(rule, pkt_dropped);
1820 1821 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1821 1822 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1822 1823 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1823 1824 ret = ILB_DROPPED;
1824 1825 break;
1825 1826 }
1826 1827 ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1827 1828 ret = ILB_BALANCED;
1828 1829 break;
1829 1830 }
1830 1831 case ILB_TOPO_IMPL_HALF_NAT:
1831 1832 info.vip = *dst;
1832 1833 info.nat_dst = server->iser_addr_v6;
1833 1834 info.dport = dport;
1834 1835 if (rule->ir_port_range && server->iser_port_range) {
1835 1836 info.nat_dport = htons(ntohs(dport) -
1836 1837 rule->ir_min_port + server->iser_min_port);
1837 1838 } else {
1838 1839 info.nat_dport = htons(server->iser_min_port);
1839 1840 }
1840 1841
1841 1842 if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
1842 1843 dport, &info, &ip_sum, &tp_sum, s) != 0) {
1843 1844 ILB_R_KSTAT(rule, pkt_dropped);
1844 1845 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1845 1846 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1846 1847 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1847 1848 ret = ILB_DROPPED;
1848 1849 break;
1849 1850 }
1850 1851 ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
1851 1852
1852 1853 ret = ILB_BALANCED;
1853 1854 break;
1854 1855 case ILB_TOPO_IMPL_DSR:
1855 1856 /*
1856 1857 * By decrementing the sticky refcnt, the period of
1857 1858 		 * stickiness (lifetime of ilb_sticky_t) will be
1858 1859 * from now to (now + default expiry time).
1859 1860 */
1860 1861 if (s != NULL)
1861 1862 ilb_sticky_refrele(s);
1862 1863 ret = ILB_BALANCED;
1863 1864 break;
1864 1865 default:
1865 1866 		cmn_err(CE_PANIC, "data corruption: unknown topology: %p",
1866 1867 		    (void *)rule);
1867 1868 break;
1868 1869 }
1869 1870 ILB_RULE_REFRELE(rule);
1870 1871 return (ret);
1871 1872
1872 1873 no_server:
1873 1874 /* This can only happen if there is no server available. */
1874 1875 ILB_R_KSTAT(rule, pkt_dropped);
1875 1876 ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
1876 1877 ILB_RULE_REFRELE(rule);
1877 1878 return (ILB_DROPPED);
1878 1879 }
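
Both NAT cases above translate the destination port with the same arithmetic: when the rule and the server each define a port range, the port keeps its offset within the range; otherwise all traffic collapses onto the server's single minimum port. A standalone sketch of that mapping (the function is hypothetical; rule_min and server_min are host-order values, as the kernel code's use of ntohs()/htons() implies):

	#include <stdint.h>
	#include <arpa/inet.h>

	static uint16_t
	nat_dport(uint16_t dport_net, uint16_t rule_min, uint16_t server_min,
	    int has_port_range)
	{
		if (has_port_range) {
			/* Preserve the offset into the rule's range. */
			return (htons(ntohs(dport_net) - rule_min + server_min));
		}
		/* No range: everything goes to the server's one port. */
		return (htons(server_min));
	}

For example, with a rule range of 80-89 mapped onto a server range of 8080-8089, a client packet to port 83 is NATed to server port 8083.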
1879 1880
1880 1881 int
1881 1882 ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
1882 1883 uint8_t *tph, ipaddr_t *lb_dst)
1883 1884 {
1884 1885 in6_addr_t v6_src, v6_dst, v6_lb_dst;
1885 1886 int ret;
1886 1887
1887 1888 ASSERT(DB_REF(mp) == 1);
1888 1889
1889 1890 if (l4 == IPPROTO_ICMP) {
1890 1891 return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
1891 1892 lb_dst));
1892 1893 }
1893 1894
1894 1895 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
1895 1896 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
1896 1897 ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
1897 1898 tph, ntohs(ipha->ipha_length), &v6_lb_dst);
1898 1899 if (ret == ILB_BALANCED)
1899 1900 IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
1900 1901 return (ret);
1901 1902 }
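
ilb_check_v4() reuses the common in6_addr_t-based ilb_check() by promoting the IPv4 addresses to IPv4-mapped IPv6 form (::ffff:a.b.c.d) on the way in and demoting the balanced destination on the way out. For illustration, userland equivalents of the two mapping macros (hypothetical function names; addresses stay in network byte order throughout):

	#include <stdint.h>
	#include <string.h>
	#include <netinet/in.h>

	/* IN6_IPADDR_TO_V4MAPPED: v4 (network order) -> ::ffff:a.b.c.d */
	static void
	v4_to_mapped(uint32_t v4_net, struct in6_addr *v6)
	{
		memset(v6, 0, sizeof (*v6));
		v6->s6_addr[10] = 0xff;
		v6->s6_addr[11] = 0xff;
		memcpy(&v6->s6_addr[12], &v4_net, sizeof (v4_net));
	}

	/* IN6_V4MAPPED_TO_IPADDR: extract the low 32 bits back out. */
	static uint32_t
	mapped_to_v4(const struct in6_addr *v6)
	{
		uint32_t v4_net;

		memcpy(&v4_net, &v6->s6_addr[12], sizeof (v4_net));
		return (v4_net);
	}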
1902 1903
1903 1904 int
1904 1905 ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
1905 1906 uint8_t *tph, in6_addr_t *lb_dst)
1906 1907 {
1907 1908 uint32_t pkt_len;
1908 1909
1909 1910 ASSERT(DB_REF(mp) == 1);
1910 1911
1911 1912 if (l4 == IPPROTO_ICMPV6) {
1912 1913 return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
1913 1914 lb_dst));
1914 1915 }
1915 1916
1916 1917 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
1917 1918 return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
1918 1919 IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
1919 1920 }
1920 1921
1921 1922 void
1922 1923 ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
1923 1924 {
1924 1925 ilb_rule_t *tmp_rule;
1925 1926
1926 1927 mutex_enter(&ilbs->ilbs_g_lock);
1927 1928 *num_rules = 0;
1928 1929 for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1929 1930 tmp_rule = tmp_rule->ir_next) {
1930 1931 if (tmp_rule->ir_zoneid == zoneid)
1931 1932 *num_rules += 1;
1932 1933 }
1933 1934 mutex_exit(&ilbs->ilbs_g_lock);
1934 1935 }
1935 1936
1936 1937 int
1937 1938 ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1938 1939 uint32_t *num_servers)
1939 1940 {
1940 1941 ilb_rule_t *rule;
1941 1942 int err;
1942 1943
1943 1944 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1944 1945 return (err);
1945 1946 *num_servers = rule->ir_kstat.num_servers.value.ui64;
1946 1947 ILB_RULE_REFRELE(rule);
1947 1948 return (0);
1948 1949 }
1949 1950
1950 1951 int
1951 1952 ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
1952 1953 ilb_server_info_t *servers, uint32_t *num_servers)
1953 1954 {
1954 1955 ilb_rule_t *rule;
1955 1956 ilb_server_t *server;
1956 1957 size_t cnt;
1957 1958 int err;
1958 1959
1959 1960 if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
1960 1961 return (err);
1961 1962 for (server = rule->ir_servers, cnt = *num_servers;
1962 1963 server != NULL && cnt > 0;
1963 1964 server = server->iser_next, cnt--, servers++) {
1964 1965 (void) memcpy(servers->name, server->iser_name,
1965 1966 ILB_SERVER_NAMESZ);
1966 1967 servers->addr = server->iser_addr_v6;
1967 1968 servers->min_port = htons(server->iser_min_port);
1968 1969 servers->max_port = htons(server->iser_max_port);
1969 1970 servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
1970 1971 servers->err = 0;
1971 1972 }
1972 1973 ILB_RULE_REFRELE(rule);
1973 1974 *num_servers -= cnt;
1974 1975
1975 1976 return (0);
1976 1977 }
1977 1978
1978 1979 void
1979 1980 ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
1980 1981 char *buf)
1981 1982 {
1982 1983 ilb_rule_t *tmp_rule;
1983 1984 int cnt;
1984 1985
1985 1986 if (*num_names == 0)
1986 1987 return;
1987 1988
1988 1989 mutex_enter(&ilbs->ilbs_g_lock);
1989 1990 for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
1990 1991 tmp_rule = tmp_rule->ir_next) {
1991 1992 if (tmp_rule->ir_zoneid != zoneid)
1992 1993 continue;
1993 1994
1994 1995 (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
1995 1996 buf += ILB_RULE_NAMESZ;
1996 1997 if (++cnt == *num_names)
1997 1998 break;
1998 1999 }
1999 2000 mutex_exit(&ilbs->ilbs_g_lock);
2000 2001 *num_names = cnt;
2001 2002 }
2002 2003
2003 2004 int
2004 2005 ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
2005 2006 {
2006 2007 ilb_rule_t *rule;
2007 2008 int err;
2008 2009
2009 2010 if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
2010 2011 return (err);
2011 2012 }
2012 2013
2013 2014 /*
2014 2015 	 * Except for the enabled flag, none of the following will change
2015 2016 	 * during the lifetime of a rule, so we don't hold the mutex when
2016 2017 	 * reading them.  The worst case is reporting a stale enabled flag.
2017 2018 */
2018 2019 cmd->ip_ver = rule->ir_ipver;
2019 2020 cmd->proto = rule->ir_proto;
2020 2021 cmd->min_port = htons(rule->ir_min_port);
2021 2022 cmd->max_port = htons(rule->ir_max_port);
2022 2023
2023 2024 cmd->vip = rule->ir_target_v6;
2024 2025 cmd->algo = rule->ir_alg_type;
2025 2026 cmd->topo = rule->ir_topo;
2026 2027
2027 2028 cmd->nat_src_start = rule->ir_nat_src_start;
2028 2029 cmd->nat_src_end = rule->ir_nat_src_end;
2029 2030
2030 2031 cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
2031 2032 cmd->nat_expiry = rule->ir_nat_expiry;
2032 2033 cmd->sticky_expiry = rule->ir_sticky_expiry;
2033 2034
2034 2035 cmd->flags = 0;
2035 2036 if (rule->ir_flags & ILB_RULE_ENABLED)
2036 2037 cmd->flags |= ILB_RULE_ENABLED;
2037 2038 if (rule->ir_flags & ILB_RULE_STICKY) {
2038 2039 cmd->flags |= ILB_RULE_STICKY;
2039 2040 cmd->sticky_mask = rule->ir_sticky_mask;
2040 2041 }
2041 2042
2042 2043 ILB_RULE_REFRELE(rule);
2043 2044 return (0);
2044 2045 }
2045 2046
2046 2047 static void *
2047 2048 ilb_stack_init(netstackid_t stackid, netstack_t *ns)
2048 2049 {
2049 2050 ilb_stack_t *ilbs;
2050 2051 char tq_name[TASKQ_NAMELEN];
2051 2052
2052 2053 ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
2053 2054 ilbs->ilbs_netstack = ns;
2054 2055
2055 2056 ilbs->ilbs_rule_head = NULL;
2056 2057 ilbs->ilbs_g_hash = NULL;
2057 2058 mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);
2058 2059
2059 2060 ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
2060 2061 if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
2062 +		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2061 2062 		kmem_free(ilbs, sizeof (ilb_stack_t));
2062 2063 return (NULL);
2063 2064 }
2064 2065
2065 2066 /*
2066 2067 * ilbs_conn/sticky_hash related info is initialized in
2067 2068 * ilb_conn/sticky_hash_init().
2068 2069 */
2069 2070 ilbs->ilbs_conn_taskq = NULL;
2070 2071 ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
2071 2072 ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
2072 2073 ilbs->ilbs_c2s_conn_hash = NULL;
2073 2074 ilbs->ilbs_s2c_conn_hash = NULL;
2074 2075 ilbs->ilbs_conn_timer_list = NULL;
2075 2076
2076 2077 ilbs->ilbs_sticky_hash = NULL;
2077 2078 ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
2078 2079 ilbs->ilbs_sticky_timer_list = NULL;
2079 2080 ilbs->ilbs_sticky_taskq = NULL;
2080 2081
2081 2082 /* The allocation is done later when there is a rule using NAT mode. */
2082 2083 ilbs->ilbs_nat_src = NULL;
2083 2084 ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
2084 2085 mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
2085 2086 ilbs->ilbs_nat_src_tid = 0;
2086 2087
2087 2088 /* For listing the conn hash table */
2088 2089 mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
2089 2090 cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
2090 2091 ilbs->ilbs_conn_list_busy = B_FALSE;
2091 2092 ilbs->ilbs_conn_list_cur = 0;
2092 2093 ilbs->ilbs_conn_list_connp = NULL;
2093 2094
2094 2095 /* For listing the sticky hash table */
2095 2096 mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
2096 2097 cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
2097 2098 ilbs->ilbs_sticky_list_busy = B_FALSE;
2098 2099 ilbs->ilbs_sticky_list_cur = 0;
2099 2100 ilbs->ilbs_sticky_list_curp = NULL;
2100 2101
2101 2102 (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
2102 2103 (void *)ns);
2103 2104 ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
2104 2105 minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
2105 2106
2106 2107 return (ilbs);
2107 2108 }
2108 2109
2109 2110 /* ARGSUSED */
2110 2111 static void
2111 2112 ilb_stack_shutdown(netstackid_t stackid, void *arg)
2112 2113 {
2113 2114 ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2114 2115 ilb_rule_t *tmp_rule;
2115 2116
2116 2117 ilb_sticky_hash_fini(ilbs);
2117 2118 ilb_conn_hash_fini(ilbs);
2118 2119 mutex_enter(&ilbs->ilbs_g_lock);
2119 2120 while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
2120 2121 ilb_rule_hash_del(tmp_rule);
2121 2122 ilb_rule_g_del(ilbs, tmp_rule);
2122 2123 mutex_exit(&ilbs->ilbs_g_lock);
2123 2124 ilb_rule_del_common(ilbs, tmp_rule);
2124 2125 mutex_enter(&ilbs->ilbs_g_lock);
2125 2126 }
2126 2127 mutex_exit(&ilbs->ilbs_g_lock);
2127 2128 if (ilbs->ilbs_nat_src != NULL)
2128 2129 ilb_nat_src_fini(ilbs);
2129 2130 }
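
The shutdown loop above is careful to drop ilbs_g_lock around ilb_rule_del_common(), since rule deletion may sleep, and to re-read ilbs_rule_head after re-acquiring the lock rather than caching a next pointer. The same drop-the-lock-while-destroying pattern in a self-contained pthreads sketch (node_t and destroy_node() are hypothetical stand-ins):

	#include <pthread.h>
	#include <stddef.h>

	typedef struct node {
		struct node *next;
	} node_t;

	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static node_t *list_head;

	extern void destroy_node(node_t *);	/* may block */

	static void
	drain_list(void)
	{
		node_t *n;

		pthread_mutex_lock(&list_lock);
		while ((n = list_head) != NULL) {
			list_head = n->next;	/* unlink under the lock */
			pthread_mutex_unlock(&list_lock);
			destroy_node(n);	/* blocking work, unlocked */
			pthread_mutex_lock(&list_lock);
		}
		pthread_mutex_unlock(&list_lock);
	}

Re-reading the head on every iteration keeps the walk safe even if other threads unlink entries while the lock is dropped.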
2130 2131
2131 2132 static void
2132 2133 ilb_stack_fini(netstackid_t stackid, void *arg)
2133 2134 {
2134 2135 ilb_stack_t *ilbs = (ilb_stack_t *)arg;
2135 2136
2136 2137 ilb_rule_hash_fini(ilbs);
2137 2138 taskq_destroy(ilbs->ilbs_rule_taskq);
2138 2139 ilb_kstat_g_fini(stackid, ilbs);
2139 2140 kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
2140 2141 kmem_free(ilbs, sizeof (ilb_stack_t));
2141 2142 }
2142 2143
2143 2144 void
2144 2145 ilb_ddi_g_init(void)
2145 2146 {
2146 2147 netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
2147 2148 ilb_stack_fini);
2148 2149 }
2149 2150
2150 2151 void
2151 2152 ilb_ddi_g_destroy(void)
2152 2153 {
2153 2154 netstack_unregister(NS_ILB);
2154 2155 ilb_conn_cache_fini();
2155 2156 ilb_sticky_cache_fini();
2156 2157 }