Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_dce.c
+++ new/usr/src/uts/common/inet/ip/ip_dce.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/types.h>
28 28 #include <sys/stream.h>
29 29 #include <sys/strsun.h>
30 30 #include <sys/zone.h>
31 31 #include <sys/ddi.h>
32 32 #include <sys/disp.h>
33 33 #include <sys/sunddi.h>
34 34 #include <sys/cmn_err.h>
35 35 #include <sys/debug.h>
36 36 #include <sys/atomic.h>
37 37 #include <sys/callb.h>
38 38 #define _SUN_TPI_VERSION 2
39 39 #include <sys/tihdr.h>
40 40
41 41 #include <inet/common.h>
42 42 #include <inet/mi.h>
43 43 #include <inet/mib2.h>
44 44 #include <inet/snmpcom.h>
45 45
46 46 #include <netinet/ip6.h>
47 47 #include <netinet/icmp6.h>
48 48
49 49 #include <inet/ip.h>
50 50 #include <inet/ip_impl.h>
51 51 #include <inet/ip6.h>
52 52 #include <inet/ip6_asp.h>
53 53 #include <inet/ip_multi.h>
54 54 #include <inet/ip_if.h>
55 55 #include <inet/ip_ire.h>
56 56 #include <inet/ip_ftable.h>
57 57 #include <inet/ip_rts.h>
58 58 #include <inet/ip_ndp.h>
59 59 #include <inet/ipclassifier.h>
60 60 #include <inet/ip_listutils.h>
61 61
62 62 #include <sys/sunddi.h>
63 63
64 64 /*
65 65 * Routines for handling destination cache entries.
66 66 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
67 67 * That entry holds both the IP ident value and the dce generation number.
68 68 *
69 69 * Any time a DCE is changed significantly (different path MTU, but NOT
70 70 * different ULP info!), the dce_generation number is increased.
71 71 * Also, when a new DCE is created, the dce_generation number in the default
72 72 * DCE is bumped. That allows the dce_t information to be cached efficiently
73 73 * as long as the entity caching the dce_t also caches the dce_generation,
74 74 * and compares the cached generation to detect any changes.
75 75 * Furthermore, when a DCE is deleted, if there are any outstanding references
76 76 * to the DCE it will be marked as condemned. The condemned mark is
77 77 * a designated generation number which is never otherwise used, hence
78 78 * the single comparison with the generation number captures that as well.
79 79 *
80 80 * An example of code which caches is as follows:
81 81 *
82 82 * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
83 83 * The DCE has changed
84 84 * mystruct->my_dce = dce_lookup_pkt(mp, ixa,
85 85 * &mystruct->my_dce_generation);
86 86 * Not needed in practice, since we have the default DCE:
87 87 * if (DCE_IS_CONDEMNED(mystruct->my_dce))
88 88 * return failure;
89 89 * }
90 90 *
91 91 * Note that for IPv6 link-local addresses we record the ifindex since the
92 92 * link-locals are not globally unique.
93 93 */
94 94
95 95 /*
96 96 * Hash bucket structure for DCEs
97 97 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;	/* protects dcb_dce list membership */
	uint32_t	dcb_cnt;	/* entries in this bucket (atomic) */
	dce_t		*dcb_dce;	/* head of ptpn-linked DCE list */
} dcb_t;
103 103
/* Forward declarations for helpers defined later in this file */
static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

/* kmem cache used for every dce_t allocation (shared by all netstacks) */
static kmem_cache_t	*dce_cache;
/*
 * Periodic reclaim worker thread plus the lock/cv/flag used for its
 * startup/shutdown handshake (see dce_g_init()/dce_g_destroy()).
 */
static kthread_t	*dce_reclaim_thread;
static kmutex_t		dce_reclaim_lock;
static kcondvar_t	dce_reclaim_cv;
static int		dce_reclaim_shutdown;

/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;

/* Operates on a uint64_t; mixes all four 16-bit chunks into the low bits */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
124 124
/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 *
 * Entries are deleted pseudo-randomly: each DCE's pointer is mixed with a
 * per-call hrtime seed, and an entry survives when the hash is not a
 * multiple of the fraction (4x larger fraction, i.e. 1/4 the deletion
 * probability, for entries that carry path-MTU state).  Once "retained"
 * reaches the hard threshold (threshold * ip_dce_reclaim_threshold_hard),
 * every remaining entry is deleted unconditionally.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		/* Capture next before a possible dce_delete_locked() */
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			/* Let cached users notice the PMTU went away */
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		/* max == 0 means no hard cap: always roll the dice */
		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		/* Drop the hash-list hold; caller holds no other ref here */
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
178 178
179 179 /*
180 180 * kmem_cache callback to free up memory.
181 181 *
182 182 */
183 183 static void
184 184 ip_dce_reclaim_stack(ip_stack_t *ipst)
185 185 {
186 186 int i;
187 187
188 188 IP_STAT(ipst, ip_dce_reclaim_calls);
189 189 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
190 190 dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
191 191 ipst->ips_ip_dce_reclaim_fraction);
192 192
193 193 dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
194 194 ipst->ips_ip_dce_reclaim_fraction);
195 195 }
196 196
197 197 /*
198 198 * Walk all CONNs that can have a reference on an ire, nce or dce.
199 199 * Get them to update any stale references to drop any refholds they
200 200 * have.
201 201 */
202 202 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
203 203 }
204 204
/*
 * Called by dce_reclaim_worker() below, and no one else. Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 *
 * Walks every netstack and reclaims DCEs for each stack whose
 * ips_dce_reclaim_needed flag was set; the atomic swap both tests and
 * clears the flag so a request is serviced exactly once.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}
235 235
/*
 * Body of the periodic reclaim thread started by dce_g_init().  Wakes up
 * every ip_dce_reclaim_interval seconds (or when signalled) and runs
 * ip_dce_reclaim().  Participates in CPR (suspend/resume) via callb_cpr.
 * On shutdown it clears dce_reclaim_thread and broadcasts so
 * dce_g_destroy() can observe the exit.
 */
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		/* Drop the lock while doing the actual (slow) reclaim work */
		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}
268 268
269 269 void
270 270 dce_g_init(void)
271 271 {
272 272 dce_cache = kmem_cache_create("dce_cache",
273 273 sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
274 274
275 275 mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
276 276 cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);
277 277
278 278 dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
279 279 NULL, 0, &p0, TS_RUN, minclsyspri);
280 280 }
281 281
/*
 * Global DCE teardown.  Signals the reclaim worker to exit, waits until
 * the worker has cleared dce_reclaim_thread (its exit handshake), then
 * destroys the synchronization objects and the kmem cache.
 */
void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	/* Worker sets dce_reclaim_thread = NULL and broadcasts on exit */
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}
297 297
298 298 /*
299 299 * Allocate a default DCE and a hash table for per-IP address DCEs
300 300 */
301 301 void
302 302 dce_stack_init(ip_stack_t *ipst)
303 303 {
304 304 int i;
305 305
306 306 ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
307 307 bzero(ipst->ips_dce_default, sizeof (dce_t));
308 308 ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
309 309 ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
310 310 ipst->ips_dce_default->dce_last_change_time =
311 311 TICK_TO_SEC(ddi_get_lbolt64());
312 312 ipst->ips_dce_default->dce_refcnt = 1; /* Should never go away */
313 313 ipst->ips_dce_default->dce_ipst = ipst;
314 314
315 315 /* This must be a power of two since we are using IRE_ADDR_HASH macro */
316 316 ipst->ips_dce_hashsize = ip_dce_hash_size;
317 317 ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
318 318 sizeof (dcb_t), KM_SLEEP);
319 319 ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
320 320 sizeof (dcb_t), KM_SLEEP);
321 321 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
322 322 rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
323 323 NULL);
324 324 rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
325 325 NULL);
326 326 }
327 327 }
328 328
329 329 void
330 330 dce_stack_destroy(ip_stack_t *ipst)
331 331 {
332 332 int i;
333 333 for (i = 0; i < ipst->ips_dce_hashsize; i++) {
334 334 rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
335 335 rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
336 336 }
337 337 kmem_free(ipst->ips_dce_hash_v4,
338 338 ipst->ips_dce_hashsize * sizeof (dcb_t));
339 339 ipst->ips_dce_hash_v4 = NULL;
340 340 kmem_free(ipst->ips_dce_hash_v6,
341 341 ipst->ips_dce_hashsize * sizeof (dcb_t));
342 342 ipst->ips_dce_hash_v6 = NULL;
343 343 ipst->ips_dce_hashsize = 0;
344 344
345 345 ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
346 346 kmem_cache_free(dce_cache, ipst->ips_dce_default);
347 347 ipst->ips_dce_default = NULL;
348 348 }
349 349
350 350 /* When any DCE is good enough */
351 351 dce_t *
352 352 dce_get_default(ip_stack_t *ipst)
353 353 {
354 354 dce_t *dce;
355 355
356 356 dce = ipst->ips_dce_default;
357 357 dce_refhold(dce);
358 358 return (dce);
359 359 }
360 360
/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 *
 * Dispatches on IXAF_IS_IPV4 to the per-family lookup using the packet's
 * final destination (accounting for source routing / routing headers).
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		/* Link-locals are scoped, so qualify the lookup by ifindex */
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
399 399
/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 *
 * Looks up a per-destination DCE for "dst"; when no (non-condemned) match
 * exists the stack's default DCE is returned with a hold, along with the
 * default DCE's generation so a later addition is detectable.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			/* dce_lock serializes the condemned check + hold */
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
438 438
/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals
 *
 * IPv6 counterpart of dce_lookup_v4(): the match key is the address plus
 * dce_ifindex (link-locals are not globally unique).  Falls back to the
 * default DCE with a hold when no non-condemned match exists.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			/* dce_lock serializes the condemned check + hold */
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
480 480
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 * (Unlocked read of dcb_cnt; a stale value only delays or repeats
	 * the reclaim request, which is harmless.)
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	/* Not found; create one while still holding the bucket as WRITER */
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
545 545
/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 * (Unlocked read of dcb_cnt; a stale value only delays or repeats
	 * the reclaim request, which is harmless.)
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	/* Not found; create one while still holding the bucket as WRITER */
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
616 616
/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If a cached value exists, average it with the new sample;
		 * otherwise initialize conservatively to 1.5 * new value.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		/* Same average-or-1.5x policy for the RTT deviation */
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		/* A new PMTU can only shrink an existing one */
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		/* Average with any existing slow-start threshold */
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
678 678
679 679
680 680 int
681 681 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
682 682 {
683 683 dce_t *dce;
684 684
685 685 dce = dce_lookup_and_add_v4(dst, ipst);
686 686 if (dce == NULL)
687 687 return (ENOMEM);
688 688
689 689 dce_setuinfo(dce, uinfo);
690 690 dce_refrele(dce);
691 691 return (0);
692 692 }
693 693
694 694 int
695 695 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
696 696 ip_stack_t *ipst)
697 697 {
698 698 dce_t *dce;
699 699
700 700 dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
701 701 if (dce == NULL)
702 702 return (ENOMEM);
703 703
704 704 dce_setuinfo(dce, uinfo);
705 705 dce_refrele(dce);
706 706 return (0);
707 707 }
708 708
709 709 /* Common routine for IPv4 and IPv6 */
710 710 int
711 711 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
712 712 ip_stack_t *ipst)
713 713 {
714 714 ipaddr_t dst4;
715 715
716 716 if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
717 717 IN6_V4MAPPED_TO_IPADDR(dst, dst4);
718 718 return (dce_update_uinfo_v4(dst4, uinfo, ipst));
719 719 } else {
720 720 return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
721 721 }
722 722 }
723 723
↓ open down ↓ |
106 lines elided |
↑ open up ↑ |
/*
 * Mark a DCE as condemned by setting its generation to the reserved
 * DCE_GENERATION_CONDEMNED value; cached users detect this via their
 * generation comparison.  Must only be called once per DCE.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_dce_condemned);
}
736 736
/*
 * Increment the generation avoiding the special condemned value
 * (and, per the ASSERT, the VERIFY value).  A condemned DCE's
 * generation is left untouched.
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		/* Skip over the reserved condemned sentinel on wraparound */
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
755 755
/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 *
 * NOTE(review): the loop below bumps every non-condemned DCE, not just
 * those with DCEF_PMTU — the header comment appears broader than the
 * code requires; confirm intent before tightening either.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}
782 782
/*
 * Unlink a DCE from its hash bucket and condemn it.  The bucket's
 * dcb_lock must be held as WRITER.
 *
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	/* Standard ptpn (pointer-to-previous-next) doubly-linked unlink */
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_dec_32(&dcb->dcb_cnt);
	dce_make_condemned(dce);
}
799 799
/*
 * Final teardown once the last reference is dropped: the DCE must
 * already be unlinked from its bucket (and must never be the default).
 */
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_dec_32(&ipst->ips_num_dce_condemned);

	kmem_cache_free(dce_cache, dce);
}
815 815
/*
 * Drop a reference; free the DCE when this was the last one.
 */
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}
823 823
/*
 * Take a reference on a DCE the caller already holds a valid pointer to.
 */
void
dce_refhold(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}
830 830
/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}
839 839
/* Untraced variant of dce_refhold(); see dce_refrele_notr() above. */
void
dce_refhold_notr(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}
846 846
/*
 * Report both the IPv4 and IPv6 DCEs.
 *
 * SNMP/MIB reporting (EXPER_IP_DCE): fills one dest_cache_entry_t per
 * DCE into the control message.  The IPv4 reply is sent via qreply();
 * the IPv6 data is built into a copy of the original message, which is
 * returned to the caller (NULL if the copy failed).
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			/* Only report a PMTU that is actually valid */
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	/* Keep a second copy to hand back for the next MIB section */
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			/* ifindex is only meaningful for scoped addresses */
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
948 948
/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			/* Capture next before a possible delete */
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				/* Drop the hash-list hold outside dcb order */
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}
↓ open down ↓ |
121 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX