Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_ndp.c
+++ new/usr/src/uts/common/inet/ip/ip_ndp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/types.h>
26 26 #include <sys/stream.h>
27 27 #include <sys/stropts.h>
28 28 #include <sys/strsun.h>
29 29 #include <sys/sysmacros.h>
30 30 #include <sys/errno.h>
31 31 #include <sys/dlpi.h>
32 32 #include <sys/socket.h>
33 33 #include <sys/ddi.h>
34 34 #include <sys/sunddi.h>
35 35 #include <sys/cmn_err.h>
36 36 #include <sys/debug.h>
37 37 #include <sys/vtrace.h>
38 38 #include <sys/kmem.h>
39 39 #include <sys/zone.h>
40 40 #include <sys/ethernet.h>
41 41 #include <sys/sdt.h>
42 42 #include <sys/mac.h>
43 43
44 44 #include <net/if.h>
45 45 #include <net/if_types.h>
46 46 #include <net/if_dl.h>
47 47 #include <net/route.h>
48 48 #include <netinet/in.h>
49 49 #include <netinet/ip6.h>
50 50 #include <netinet/icmp6.h>
51 51
52 52 #include <inet/common.h>
53 53 #include <inet/mi.h>
54 54 #include <inet/mib2.h>
55 55 #include <inet/nd.h>
56 56 #include <inet/ip.h>
57 57 #include <inet/ip_impl.h>
58 58 #include <inet/ipclassifier.h>
59 59 #include <inet/ip_if.h>
60 60 #include <inet/ip_ire.h>
61 61 #include <inet/ip_rts.h>
62 62 #include <inet/ip6.h>
63 63 #include <inet/ip_ndp.h>
64 64 #include <inet/sctp_ip.h>
65 65 #include <inet/ip_arp.h>
66 66 #include <inet/ip2mac_impl.h>
67 67
/*
 * Interval selectors: choose the IPv6 (NDP) or the IPv4 (ARP) tunable
 * depending on isv6.  Both macros expect a variable named `ipst' to be
 * in scope at the point of use.  The argument is parenthesized so that
 * any expression (including a conditional one) can be passed safely.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)
75 75
76 76 /* Non-tunable probe interval, based on link capabilities */
77 77 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
78 78
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at ptr are 169 and 254.  The argument is
 * parenthesized so that pointer arithmetic in the caller's expression is
 * performed before the cast to a byte pointer, not after.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)
85 85
86 86 /*
87 87 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
88 88 * in to the ncec*add* functions.
89 89 *
90 90 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
91 91 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
92 92 * that we will respond to requests for the protocol address.
93 93 */
94 94 #define NCE_EXTERNAL_FLAGS_MASK \
95 95 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
96 96 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
97 97 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
98 98
99 99 /*
100 100 * Lock ordering:
101 101 *
102 102 * ndp_g_lock -> ill_lock -> ncec_lock
103 103 *
104 104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105 105 * ncec_next. ncec_lock protects the contents of the NCE (particularly
106 106 * ncec_refcnt).
107 107 */
108 108
109 109 static void nce_cleanup_list(ncec_t *ncec);
110 110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
111 111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
112 112 ncec_t *);
113 113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
114 114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
115 115 uint16_t ncec_flags, nce_t **newnce);
116 116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
117 117 uint16_t ncec_flags, nce_t **newnce);
118 118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
119 119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
120 120 const in6_addr_t *target, int flag);
121 121 static void ncec_refhold_locked(ncec_t *);
122 122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
123 123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
124 124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
125 125 uint16_t, uint16_t, nce_t **);
126 126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
127 127 static nce_t *nce_add(ill_t *, ncec_t *);
128 128 static void nce_inactive(nce_t *);
129 129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
130 130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
131 131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
132 132 uint16_t, uint16_t, nce_t **);
133 133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
134 134 uint16_t, uint16_t, nce_t **);
135 135 static int nce_add_v6_postprocess(nce_t *);
136 136 static int nce_add_v4_postprocess(nce_t *);
137 137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
138 138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
139 139 static void nce_resolv_ipmp_ok(ncec_t *);
140 140 static void nce_walk_common(ill_t *, pfi_t, void *);
141 141 static void nce_start_timer(ncec_t *, uint_t);
142 142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
143 143 static void nce_fastpath_trigger(nce_t *);
144 144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
145 145
146 146 #ifdef DEBUG
147 147 static void ncec_trace_cleanup(const ncec_t *);
148 148 #endif
149 149
150 150 #define NCE_HASH_PTR_V4(ipst, addr) \
151 151 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
152 152
153 153 #define NCE_HASH_PTR_V6(ipst, addr) \
154 154 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
155 155 NCE_TABLE_SIZE)]))
156 156
157 157 extern kmem_cache_t *ncec_cache;
158 158 extern kmem_cache_t *nce_cache;
159 159
160 160 /*
161 161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162 162 * If src_ill is not null, the ncec_addr is bound to src_ill. The
163 163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164 164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165 165 * IPMP cast_ill (in the IPMP case).
166 166 *
167 167 * Note that the probe interval is based on the src_ill for IPv6, and
168 168 * the ncec_xmit_interval for IPv4.
169 169 */
170 170 static void
171 171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
172 172 {
173 173 boolean_t dropped;
174 174 uint32_t probe_interval;
175 175
176 176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 178 if (ncec->ncec_ipversion == IPV6_VERSION) {
179 179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 180 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 182 probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 183 } else {
184 184 /* IPv4 DAD delay the initial probe. */
185 185 if (send_probe)
186 186 dropped = arp_probe(ncec);
187 187 else
188 188 dropped = B_TRUE;
189 189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 190 !send_probe);
191 191 }
192 192 if (!dropped) {
193 193 mutex_enter(&ncec->ncec_lock);
194 194 ncec->ncec_pcnt--;
195 195 mutex_exit(&ncec->ncec_lock);
196 196 }
197 197 nce_restart_timer(ncec, probe_interval);
198 198 }
199 199
200 200 /*
201 201 * Compute default flags to use for an advertisement of this ncec's address.
202 202 */
203 203 static int
204 204 nce_advert_flags(const ncec_t *ncec)
205 205 {
206 206 int flag = 0;
207 207
208 208 if (ncec->ncec_flags & NCE_F_ISROUTER)
209 209 flag |= NDP_ISROUTER;
210 210 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 211 flag |= NDP_ORIDE;
212 212
213 213 return (flag);
214 214 }
215 215
216 216 /*
217 217 * NDP Cache Entry creation routine.
218 218 * This routine must always be called with ndp6->ndp_g_lock held.
219 219 */
220 220 int
221 221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222 222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
223 223 {
224 224 int err;
225 225 nce_t *nce;
226 226
227 227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 228 ASSERT(ill != NULL && ill->ill_isv6);
229 229
230 230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 231 &nce);
232 232 if (err != 0)
233 233 return (err);
234 234 ASSERT(newnce != NULL);
235 235 *newnce = nce;
236 236 return (err);
237 237 }
238 238
239 239 /*
240 240 * Post-processing routine to be executed after nce_add_v6(). This function
241 241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242 242 * and must be called without any locks held.
243 243 */
244 244 int
245 245 nce_add_v6_postprocess(nce_t *nce)
246 246 {
247 247 ncec_t *ncec = nce->nce_common;
248 248 boolean_t dropped = B_FALSE;
249 249 uchar_t *hw_addr = ncec->ncec_lladdr;
250 250 uint_t hw_addr_len = ncec->ncec_lladdr_length;
251 251 ill_t *ill = ncec->ncec_ill;
252 252 int err = 0;
253 253 uint16_t flags = ncec->ncec_flags;
254 254 ip_stack_t *ipst = ill->ill_ipst;
255 255 boolean_t trigger_fastpath = B_TRUE;
256 256
257 257 /*
258 258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 259 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 260 * We call nce_fastpath from nce_update if the link layer address of
261 261 * the peer changes from nce_update
262 262 */
263 263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 265 trigger_fastpath = B_FALSE;
266 266
267 267 if (trigger_fastpath)
268 268 nce_fastpath_trigger(nce);
269 269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 270 ill_t *hwaddr_ill;
271 271 /*
272 272 * Unicast entry that needs DAD.
273 273 */
274 274 if (IS_IPMP(ill)) {
275 275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 276 hw_addr, hw_addr_len);
277 277 } else {
278 278 hwaddr_ill = ill;
279 279 }
280 280 nce_dad(ncec, hwaddr_ill, B_TRUE);
281 281 err = EINPROGRESS;
282 282 } else if (flags & NCE_F_UNSOL_ADV) {
283 283 /*
284 284 * We account for the transmit below by assigning one
285 285 * less than the ndd variable. Subsequent decrements
286 286 * are done in nce_timer.
287 287 */
288 288 mutex_enter(&ncec->ncec_lock);
289 289 ncec->ncec_unsolicit_count =
290 290 ipst->ips_ip_ndp_unsolicit_count - 1;
291 291 mutex_exit(&ncec->ncec_lock);
292 292 dropped = ndp_xmit(ill,
293 293 ND_NEIGHBOR_ADVERT,
294 294 hw_addr,
295 295 hw_addr_len,
296 296 &ncec->ncec_addr, /* Source and target of the adv */
297 297 &ipv6_all_hosts_mcast, /* Destination of the packet */
298 298 nce_advert_flags(ncec));
299 299 mutex_enter(&ncec->ncec_lock);
300 300 if (dropped)
301 301 ncec->ncec_unsolicit_count++;
302 302 else
303 303 ncec->ncec_last_time_defended = ddi_get_lbolt();
304 304 if (ncec->ncec_unsolicit_count != 0) {
305 305 nce_start_timer(ncec,
306 306 ipst->ips_ip_ndp_unsolicit_interval);
307 307 }
308 308 mutex_exit(&ncec->ncec_lock);
309 309 }
310 310 return (err);
311 311 }
312 312
313 313 /*
314 314 * Atomically lookup and add (if needed) Neighbor Cache information for
315 315 * an address.
316 316 *
317 317 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
318 318 * are always added pointing at the ipmp_ill. Thus, when the ill passed
319 319 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
320 320 * entries will be created, both pointing at the same ncec_t. The nce_t
321 321 * entries will have their nce_ill set to the ipmp_ill and the under_ill
322 322 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
323 323 * Local addresses are always created on the ill passed to nce_add_v6.
324 324 */
325 325 int
326 326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
327 327 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
328 328 {
329 329 int err = 0;
330 330 ip_stack_t *ipst = ill->ill_ipst;
331 331 nce_t *nce, *upper_nce = NULL;
332 332 ill_t *in_ill = ill;
333 333 boolean_t need_ill_refrele = B_FALSE;
334 334
335 335 if (flags & NCE_F_MCAST) {
336 336 /*
337 337 * hw_addr will be figured out in nce_set_multicast_v6;
338 338 * caller has to select the cast_ill
339 339 */
340 340 ASSERT(hw_addr == NULL);
341 341 ASSERT(!IS_IPMP(ill));
342 342 err = nce_set_multicast_v6(ill, addr, flags, newnce);
343 343 return (err);
344 344 }
345 345 ASSERT(ill->ill_isv6);
346 346 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
347 347 ill = ipmp_ill_hold_ipmp_ill(ill);
348 348 if (ill == NULL)
349 349 return (ENXIO);
350 350 need_ill_refrele = B_TRUE;
351 351 }
352 352
353 353 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
354 354 nce = nce_lookup_addr(ill, addr);
355 355 if (nce == NULL) {
356 356 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
357 357 &nce);
358 358 } else {
359 359 err = EEXIST;
360 360 }
361 361 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
362 362 if (err == 0)
363 363 err = nce_add_v6_postprocess(nce);
364 364 if (in_ill != ill && nce != NULL) {
365 365 nce_t *under_nce = NULL;
366 366
367 367 /*
368 368 * in_ill was the under_ill. Try to create the under_nce.
369 369 * Hold the ill_g_lock to prevent changes to group membership
370 370 * until we are done.
371 371 */
372 372 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
373 373 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
374 374 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
375 375 ill_t *, ill);
376 376 rw_exit(&ipst->ips_ill_g_lock);
377 377 err = ENXIO;
378 378 nce_refrele(nce);
379 379 nce = NULL;
380 380 goto bail;
381 381 }
382 382 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
383 383 if (under_nce == NULL) {
384 384 rw_exit(&ipst->ips_ill_g_lock);
385 385 err = EINVAL;
386 386 nce_refrele(nce);
387 387 nce = NULL;
388 388 goto bail;
389 389 }
390 390 rw_exit(&ipst->ips_ill_g_lock);
391 391 upper_nce = nce;
392 392 nce = under_nce; /* will be returned to caller */
393 393 if (NCE_ISREACHABLE(nce->nce_common))
394 394 nce_fastpath_trigger(under_nce);
395 395 }
396 396 /* nce_refrele is deferred until the lock is dropped */
397 397 if (nce != NULL) {
398 398 if (newnce != NULL)
399 399 *newnce = nce;
400 400 else
401 401 nce_refrele(nce);
402 402 }
403 403 bail:
404 404 if (upper_nce != NULL)
405 405 nce_refrele(upper_nce);
406 406 if (need_ill_refrele)
407 407 ill_refrele(ill);
408 408 return (err);
409 409 }
410 410
411 411 /*
412 412 * Remove all the CONDEMNED nces from the appropriate hash table.
413 413 * We create a private list of NCEs, these may have ires pointing
414 414 * to them, so the list will be passed through to clean up dependent
415 415 * ires and only then we can do ncec_refrele() which can make NCE inactive.
416 416 */
417 417 static void
418 418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
419 419 {
420 420 ncec_t *ncec1;
421 421 ncec_t **ptpn;
422 422
423 423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 424 ASSERT(ndp->ndp_g_walker == 0);
425 425 for (; ncec; ncec = ncec1) {
426 426 ncec1 = ncec->ncec_next;
427 427 mutex_enter(&ncec->ncec_lock);
428 428 if (NCE_ISCONDEMNED(ncec)) {
429 429 ptpn = ncec->ncec_ptpn;
430 430 ncec1 = ncec->ncec_next;
431 431 if (ncec1 != NULL)
432 432 ncec1->ncec_ptpn = ptpn;
433 433 *ptpn = ncec1;
434 434 ncec->ncec_ptpn = NULL;
435 435 ncec->ncec_next = NULL;
436 436 ncec->ncec_next = *free_nce_list;
437 437 *free_nce_list = ncec;
438 438 }
439 439 mutex_exit(&ncec->ncec_lock);
440 440 }
441 441 }
442 442
443 443 /*
444 444 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
445 445 * will return this NCE. Also no new timeouts will
446 446 * be started (See nce_restart_timer).
447 447 * 2. Cancel any currently running timeouts.
448 448 * 3. If there is an ndp walker, return. The walker will do the cleanup.
449 449 * This ensures that walkers see a consistent list of NCEs while walking.
450 450 * 4. Otherwise remove the NCE from the list of NCEs
451 451 */
452 452 void
453 453 ncec_delete(ncec_t *ncec)
454 454 {
455 455 ncec_t **ptpn;
456 456 ncec_t *ncec1;
457 457 int ipversion = ncec->ncec_ipversion;
458 458 ndp_g_t *ndp;
459 459 ip_stack_t *ipst = ncec->ncec_ipst;
460 460
461 461 if (ipversion == IPV4_VERSION)
462 462 ndp = ipst->ips_ndp4;
463 463 else
464 464 ndp = ipst->ips_ndp6;
465 465
466 466 /* Serialize deletes */
467 467 mutex_enter(&ncec->ncec_lock);
468 468 if (NCE_ISCONDEMNED(ncec)) {
469 469 /* Some other thread is doing the delete */
470 470 mutex_exit(&ncec->ncec_lock);
471 471 return;
↓ open down ↓ |
471 lines elided |
↑ open up ↑ |
472 472 }
473 473 /*
474 474 * Caller has a refhold. Also 1 ref for being in the list. Thus
475 475 * refcnt has to be >= 2
476 476 */
477 477 ASSERT(ncec->ncec_refcnt >= 2);
478 478 ncec->ncec_flags |= NCE_F_CONDEMNED;
479 479 mutex_exit(&ncec->ncec_lock);
480 480
481 481 /* Count how many condemned ires for kmem_cache callback */
482 - atomic_add_32(&ipst->ips_num_nce_condemned, 1);
482 + atomic_inc_32(&ipst->ips_num_nce_condemned);
483 483 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
484 484
485 485 /* Complete any waiting callbacks */
486 486 ncec_cb_dispatch(ncec);
487 487
488 488 /*
489 489 * Cancel any running timer. Timeout can't be restarted
490 490 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
491 491 * Passing invalid timeout id is fine.
492 492 */
493 493 if (ncec->ncec_timeout_id != 0) {
494 494 (void) untimeout(ncec->ncec_timeout_id);
495 495 ncec->ncec_timeout_id = 0;
496 496 }
497 497
498 498 mutex_enter(&ndp->ndp_g_lock);
499 499 if (ncec->ncec_ptpn == NULL) {
500 500 /*
501 501 * The last ndp walker has already removed this ncec from
502 502 * the list after we marked the ncec CONDEMNED and before
503 503 * we grabbed the global lock.
504 504 */
505 505 mutex_exit(&ndp->ndp_g_lock);
506 506 return;
507 507 }
508 508 if (ndp->ndp_g_walker > 0) {
509 509 /*
510 510 * Can't unlink. The walker will clean up
511 511 */
512 512 ndp->ndp_g_walker_cleanup = B_TRUE;
513 513 mutex_exit(&ndp->ndp_g_lock);
514 514 return;
515 515 }
516 516
517 517 /*
518 518 * Now remove the ncec from the list. nce_restart_timer won't restart
519 519 * the timer since it is marked CONDEMNED.
520 520 */
521 521 ptpn = ncec->ncec_ptpn;
522 522 ncec1 = ncec->ncec_next;
523 523 if (ncec1 != NULL)
524 524 ncec1->ncec_ptpn = ptpn;
525 525 *ptpn = ncec1;
526 526 ncec->ncec_ptpn = NULL;
527 527 ncec->ncec_next = NULL;
528 528 mutex_exit(&ndp->ndp_g_lock);
529 529
530 530 /* Removed from ncec_ptpn/ncec_next list */
531 531 ncec_refrele_notr(ncec);
532 532 }
533 533
534 534 void
535 535 ncec_inactive(ncec_t *ncec)
536 536 {
537 537 mblk_t **mpp;
538 538 ill_t *ill = ncec->ncec_ill;
539 539 ip_stack_t *ipst = ncec->ncec_ipst;
540 540
541 541 ASSERT(ncec->ncec_refcnt == 0);
542 542 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
543 543
544 544 /* Count how many condemned nces for kmem_cache callback */
545 545 if (NCE_ISCONDEMNED(ncec))
546 546 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
547 547
548 548 /* Free all allocated messages */
549 549 mpp = &ncec->ncec_qd_mp;
550 550 while (*mpp != NULL) {
551 551 mblk_t *mp;
552 552
553 553 mp = *mpp;
554 554 *mpp = mp->b_next;
555 555
556 556 inet_freemsg(mp);
557 557 }
558 558 /*
559 559 * must have been cleaned up in ncec_delete
560 560 */
561 561 ASSERT(list_is_empty(&ncec->ncec_cb));
562 562 list_destroy(&ncec->ncec_cb);
563 563 /*
564 564 * free the ncec_lladdr if one was allocated in nce_add_common()
565 565 */
566 566 if (ncec->ncec_lladdr_length > 0)
567 567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
568 568
569 569 #ifdef DEBUG
570 570 ncec_trace_cleanup(ncec);
571 571 #endif
572 572
573 573 mutex_enter(&ill->ill_lock);
574 574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 575 (char *), "ncec", (void *), ncec);
576 576 ill->ill_ncec_cnt--;
577 577 ncec->ncec_ill = NULL;
578 578 /*
579 579 * If the number of ncec's associated with this ill have dropped
580 580 * to zero, check whether we need to restart any operation that
581 581 * is waiting for this to happen.
582 582 */
583 583 if (ILL_DOWN_OK(ill)) {
584 584 /* ipif_ill_refrele_tail drops the ill_lock */
585 585 ipif_ill_refrele_tail(ill);
586 586 } else {
587 587 mutex_exit(&ill->ill_lock);
588 588 }
589 589
590 590 mutex_destroy(&ncec->ncec_lock);
591 591 kmem_cache_free(ncec_cache, ncec);
592 592 }
593 593
594 594 /*
595 595 * ncec_walk routine. Delete the ncec if it is associated with the ill
596 596 * that is going away. Always called as a writer.
597 597 */
598 598 void
599 599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
600 600 {
601 601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
602 602 ncec_delete(ncec);
603 603 }
604 604 }
605 605
606 606 /*
607 607 * Neighbor Cache cleanup logic for a list of ncec_t entries.
608 608 */
609 609 static void
610 610 nce_cleanup_list(ncec_t *ncec)
611 611 {
612 612 ncec_t *ncec_next;
613 613
614 614 ASSERT(ncec != NULL);
615 615 while (ncec != NULL) {
616 616 ncec_next = ncec->ncec_next;
617 617 ncec->ncec_next = NULL;
618 618
619 619 /*
620 620 * It is possible for the last ndp walker (this thread)
621 621 * to come here after ncec_delete has marked the ncec CONDEMNED
622 622 * and before it has removed the ncec from the fastpath list
623 623 * or called untimeout. So we need to do it here. It is safe
624 624 * for both ncec_delete and this thread to do it twice or
625 625 * even simultaneously since each of the threads has a
626 626 * reference on the ncec.
627 627 */
628 628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
629 629 /*
630 630 * Cancel any running timer. Timeout can't be restarted
631 631 * since CONDEMNED is set. The ncec_lock can't be
632 632 * held across untimeout though passing invalid timeout
633 633 * id is fine.
634 634 */
635 635 if (ncec->ncec_timeout_id != 0) {
636 636 (void) untimeout(ncec->ncec_timeout_id);
637 637 ncec->ncec_timeout_id = 0;
638 638 }
639 639 /* Removed from ncec_ptpn/ncec_next list */
640 640 ncec_refrele_notr(ncec);
641 641 ncec = ncec_next;
642 642 }
643 643 }
644 644
645 645 /*
646 646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
647 647 */
648 648 boolean_t
649 649 nce_restart_dad(ncec_t *ncec)
650 650 {
651 651 boolean_t started;
652 652 ill_t *ill, *hwaddr_ill;
653 653
654 654 if (ncec == NULL)
655 655 return (B_FALSE);
656 656 ill = ncec->ncec_ill;
657 657 mutex_enter(&ncec->ncec_lock);
658 658 if (ncec->ncec_state == ND_PROBE) {
659 659 mutex_exit(&ncec->ncec_lock);
660 660 started = B_TRUE;
661 661 } else if (ncec->ncec_state == ND_REACHABLE) {
662 662 ASSERT(ncec->ncec_lladdr != NULL);
663 663 ncec->ncec_state = ND_PROBE;
664 664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
665 665 /*
666 666 * Slight cheat here: we don't use the initial probe delay
667 667 * for IPv4 in this obscure case.
668 668 */
669 669 mutex_exit(&ncec->ncec_lock);
670 670 if (IS_IPMP(ill)) {
671 671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 672 ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 673 } else {
674 674 hwaddr_ill = ill;
675 675 }
676 676 nce_dad(ncec, hwaddr_ill, B_TRUE);
677 677 started = B_TRUE;
678 678 } else {
679 679 mutex_exit(&ncec->ncec_lock);
680 680 started = B_FALSE;
681 681 }
682 682 return (started);
683 683 }
684 684
685 685 /*
686 686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
687 687 * If one is found, the refcnt on the ncec will be incremented.
688 688 */
689 689 ncec_t *
690 690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
691 691 {
692 692 ncec_t *ncec;
693 693 ip_stack_t *ipst = ill->ill_ipst;
694 694
695 695 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
697 697
698 698 /* Get head of v6 hash table */
699 699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 700 ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 702 rw_exit(&ipst->ips_ill_g_lock);
703 703 return (ncec);
704 704 }
705 705 /*
706 706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
707 707 * If one is found, the refcnt on the ncec will be incremented.
708 708 */
709 709 ncec_t *
710 710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
711 711 {
712 712 ncec_t *ncec = NULL;
713 713 in6_addr_t addr6;
714 714 ip_stack_t *ipst = ill->ill_ipst;
715 715
716 716 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
718 718
719 719 /* Get head of v4 hash table */
720 720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 724 rw_exit(&ipst->ips_ill_g_lock);
725 725 return (ncec);
726 726 }
727 727
728 728 /*
729 729 * Cache entry lookup. Try to find an ncec matching the parameters passed.
730 730 * If an ncec is found, increment the hold count on that ncec.
731 731 * The caller passes in the start of the appropriate hash table, and must
732 732 * be holding the appropriate global lock (ndp_g_lock). In addition, since
733 733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734 734 * must be held as reader.
735 735 *
736 736 * This function always matches across the ipmp group.
737 737 */
738 738 ncec_t *
739 739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
740 740 {
741 741 ndp_g_t *ndp;
742 742 ip_stack_t *ipst = ill->ill_ipst;
743 743
744 744 if (ill->ill_isv6)
745 745 ndp = ipst->ips_ndp6;
746 746 else
747 747 ndp = ipst->ips_ndp4;
748 748
749 749 ASSERT(ill != NULL);
750 750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 751 if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 752 return (NULL);
753 753 for (; ncec != NULL; ncec = ncec->ncec_next) {
754 754 if (ncec->ncec_ill == ill ||
755 755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 757 mutex_enter(&ncec->ncec_lock);
758 758 if (!NCE_ISCONDEMNED(ncec)) {
759 759 ncec_refhold_locked(ncec);
760 760 mutex_exit(&ncec->ncec_lock);
761 761 break;
762 762 }
763 763 mutex_exit(&ncec->ncec_lock);
764 764 }
765 765 }
766 766 }
767 767 return (ncec);
768 768 }
769 769
770 770 /*
771 771 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772 772 * entries for ill only, i.e., when ill is part of an ipmp group,
773 773 * nce_lookup_v4 will never try to match across the group.
774 774 */
775 775 nce_t *
776 776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
777 777 {
778 778 nce_t *nce;
779 779 in6_addr_t addr6;
780 780 ip_stack_t *ipst = ill->ill_ipst;
781 781
782 782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 784 nce = nce_lookup_addr(ill, &addr6);
785 785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 786 return (nce);
787 787 }
788 788
789 789 /*
790 790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791 791 * entries for ill only, i.e., when ill is part of an ipmp group,
792 792 * nce_lookup_v6 will never try to match across the group.
793 793 */
794 794 nce_t *
795 795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
796 796 {
797 797 nce_t *nce;
798 798 ip_stack_t *ipst = ill->ill_ipst;
799 799
800 800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 801 nce = nce_lookup_addr(ill, addr6);
802 802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 803 return (nce);
804 804 }
805 805
806 806 static nce_t *
807 807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
808 808 {
809 809 nce_t *nce;
810 810
811 811 ASSERT(ill != NULL);
812 812 #ifdef DEBUG
813 813 if (ill->ill_isv6)
814 814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 815 else
816 816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 817 #endif
818 818 mutex_enter(&ill->ill_lock);
819 819 nce = nce_lookup(ill, addr);
820 820 mutex_exit(&ill->ill_lock);
821 821 return (nce);
822 822 }
823 823
824 824
825 825 /*
826 826 * Router turned to host. We need to make sure that cached copies of the ncec
827 827 * are not used for forwarding packets if they were derived from the default
828 828 * route, and that the default route itself is removed, as required by
829 829 * section 7.2.5 of RFC 2461.
830 830 *
831 831 * Note that the ncec itself probably has valid link-layer information for the
832 832 * nexthop, so that there is no reason to delete the ncec, as long as the
833 833 * ISROUTER flag is turned off.
834 834 */
835 835 static void
836 836 ncec_router_to_host(ncec_t *ncec)
837 837 {
838 838 ire_t *ire;
839 839 ip_stack_t *ipst = ncec->ncec_ipst;
840 840
841 841 mutex_enter(&ncec->ncec_lock);
842 842 ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 843 mutex_exit(&ncec->ncec_lock);
844 844
845 845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
847 847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 848 if (ire != NULL) {
849 849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 850 ire_delete(ire);
851 851 ire_refrele(ire);
852 852 }
853 853 }
854 854
855 855 /*
856 856 * Process passed in parameters either from an incoming packet or via
857 857 * user ioctl.
858 858 */
859 859 void
860 860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
861 861 {
862 862 ill_t *ill = ncec->ncec_ill;
863 863 uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 864 boolean_t ll_updated = B_FALSE;
865 865 boolean_t ll_changed;
866 866 nce_t *nce;
867 867
868 868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
869 869 /*
870 870 * No updates of link layer address or the neighbor state is
871 871 * allowed, when the cache is in NONUD state. This still
872 872 * allows for responding to reachability solicitation.
873 873 */
874 874 mutex_enter(&ncec->ncec_lock);
875 875 if (ncec->ncec_state == ND_INCOMPLETE) {
876 876 if (hw_addr == NULL) {
877 877 mutex_exit(&ncec->ncec_lock);
878 878 return;
879 879 }
880 880 nce_set_ll(ncec, hw_addr);
881 881 /*
882 882 * Update ncec state and send the queued packets
883 883 * back to ip this time ire will be added.
884 884 */
885 885 if (flag & ND_NA_FLAG_SOLICITED) {
886 886 nce_update(ncec, ND_REACHABLE, NULL);
887 887 } else {
888 888 nce_update(ncec, ND_STALE, NULL);
889 889 }
890 890 mutex_exit(&ncec->ncec_lock);
891 891 nce = nce_fastpath(ncec, B_TRUE, NULL);
892 892 nce_resolv_ok(ncec);
893 893 if (nce != NULL)
894 894 nce_refrele(nce);
895 895 return;
896 896 }
897 897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 898 if (!is_adv) {
899 899 /* If this is a SOLICITATION request only */
900 900 if (ll_changed)
901 901 nce_update(ncec, ND_STALE, hw_addr);
902 902 mutex_exit(&ncec->ncec_lock);
903 903 ncec_cb_dispatch(ncec);
904 904 return;
905 905 }
906 906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 907 /* If in any other state than REACHABLE, ignore */
908 908 if (ncec->ncec_state == ND_REACHABLE) {
909 909 nce_update(ncec, ND_STALE, NULL);
910 910 }
911 911 mutex_exit(&ncec->ncec_lock);
912 912 ncec_cb_dispatch(ncec);
913 913 return;
914 914 } else {
915 915 if (ll_changed) {
916 916 nce_update(ncec, ND_UNCHANGED, hw_addr);
917 917 ll_updated = B_TRUE;
918 918 }
919 919 if (flag & ND_NA_FLAG_SOLICITED) {
920 920 nce_update(ncec, ND_REACHABLE, NULL);
921 921 } else {
922 922 if (ll_updated) {
923 923 nce_update(ncec, ND_STALE, NULL);
924 924 }
925 925 }
926 926 mutex_exit(&ncec->ncec_lock);
927 927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 928 NCE_F_ISROUTER)) {
929 929 ncec_router_to_host(ncec);
930 930 } else {
931 931 ncec_cb_dispatch(ncec);
932 932 }
933 933 }
934 934 }
935 935
936 936 /*
937 937 * Pass arg1 to the pfi supplied, along with each ncec in existence.
938 938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
939 939 * walking the hash list.
940 940 */
941 941 void
942 942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
943 943 boolean_t trace)
944 944 {
945 945 ncec_t *ncec;
946 946 ncec_t *ncec1;
947 947 ncec_t **ncep;
948 948 ncec_t *free_nce_list = NULL;
949 949
950 950 mutex_enter(&ndp->ndp_g_lock);
951 951 /* Prevent ncec_delete from unlink and free of NCE */
952 952 ndp->ndp_g_walker++;
953 953 mutex_exit(&ndp->ndp_g_lock);
954 954 for (ncep = ndp->nce_hash_tbl;
955 955 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
956 956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
957 957 ncec1 = ncec->ncec_next;
958 958 if (ill == NULL || ncec->ncec_ill == ill) {
959 959 if (trace) {
960 960 ncec_refhold(ncec);
961 961 (*pfi)(ncec, arg1);
962 962 ncec_refrele(ncec);
963 963 } else {
964 964 ncec_refhold_notr(ncec);
965 965 (*pfi)(ncec, arg1);
966 966 ncec_refrele_notr(ncec);
967 967 }
968 968 }
969 969 }
970 970 }
971 971 mutex_enter(&ndp->ndp_g_lock);
972 972 ndp->ndp_g_walker--;
973 973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 974 /* Time to delete condemned entries */
975 975 for (ncep = ndp->nce_hash_tbl;
976 976 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 977 ncec = *ncep;
978 978 if (ncec != NULL) {
979 979 nce_remove(ndp, ncec, &free_nce_list);
980 980 }
981 981 }
982 982 ndp->ndp_g_walker_cleanup = B_FALSE;
983 983 }
984 984
985 985 mutex_exit(&ndp->ndp_g_lock);
986 986
987 987 if (free_nce_list != NULL) {
988 988 nce_cleanup_list(free_nce_list);
989 989 }
990 990 }
991 991
992 992 /*
993 993 * Walk everything.
994 994 * Note that ill can be NULL hence can't derive the ipst from it.
995 995 */
996 996 void
997 997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
998 998 {
999 999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000 1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1001 1001 }
1002 1002
1003 1003 /*
1004 1004 * For each interface an entry is added for the unspecified multicast group.
1005 1005 * Here that mapping is used to form the multicast cache entry for a particular
1006 1006 * multicast destination.
1007 1007 */
1008 1008 static int
1009 1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010 1010 uint16_t flags, nce_t **newnce)
1011 1011 {
1012 1012 uchar_t *hw_addr;
1013 1013 int err = 0;
1014 1014 ip_stack_t *ipst = ill->ill_ipst;
1015 1015 nce_t *nce;
1016 1016
1017 1017 ASSERT(ill != NULL);
1018 1018 ASSERT(ill->ill_isv6);
1019 1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020 1020
1021 1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 1022 nce = nce_lookup_addr(ill, dst);
1023 1023 if (nce != NULL) {
1024 1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 1025 goto done;
1026 1026 }
1027 1027 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028 1028 /*
1029 1029 * For IRE_IF_RESOLVER a hardware mapping can be
1030 1030 * generated.
1031 1031 */
1032 1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 1033 if (hw_addr == NULL) {
1034 1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 1035 return (ENOMEM);
1036 1036 }
1037 1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 1038 } else {
1039 1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 1040 hw_addr = NULL;
1041 1041 }
1042 1042 ASSERT((flags & NCE_F_MCAST) != 0);
1043 1043 ASSERT((flags & NCE_F_NONUD) != 0);
1044 1044 /* nce_state will be computed by nce_add_common() */
1045 1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 1046 ND_UNCHANGED, &nce);
1047 1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 1048 if (err == 0)
1049 1049 err = nce_add_v6_postprocess(nce);
1050 1050 if (hw_addr != NULL)
1051 1051 kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 1052 if (err != 0) {
1053 1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 1054 return (err);
1055 1055 }
1056 1056 done:
1057 1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 1058 if (newnce != NULL)
1059 1059 *newnce = nce;
1060 1060 else
1061 1061 nce_refrele(nce);
1062 1062 return (0);
1063 1063 }
1064 1064
1065 1065 /*
1066 1066 * Return the link layer address, and any flags of a ncec.
1067 1067 */
1068 1068 int
1069 1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 1070 {
1071 1071 ncec_t *ncec;
1072 1072 in6_addr_t *addr;
1073 1073 sin6_t *sin6;
1074 1074
1075 1075 ASSERT(ill != NULL && ill->ill_isv6);
1076 1076 sin6 = (sin6_t *)&lnr->lnr_addr;
1077 1077 addr = &sin6->sin6_addr;
1078 1078
1079 1079 /*
1080 1080 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1082 1082 * addresses for the data addresses on an IPMP interface even though
1083 1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084 1084 */
1085 1085 ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 1086 if (ncec == NULL)
1087 1087 return (ESRCH);
1088 1088 /* If no link layer address is available yet, return ESRCH */
1089 1089 if (!NCE_ISREACHABLE(ncec)) {
1090 1090 ncec_refrele(ncec);
1091 1091 return (ESRCH);
1092 1092 }
1093 1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 1095 lnr->lnr_hdw_len);
1096 1096 if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 1097 lnr->lnr_flags = NDF_ISROUTER_ON;
1098 1098 if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 1099 lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 1100 ncec_refrele(ncec);
1101 1101 return (0);
1102 1102 }
1103 1103
1104 1104 /*
1105 1105 * Finish setting up the Enable/Disable multicast for the driver.
1106 1106 */
1107 1107 mblk_t *
1108 1108 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1109 1109 uint32_t hw_addr_offset, mblk_t *mp)
1110 1110 {
1111 1111 uchar_t *hw_addr;
1112 1112 ipaddr_t v4group;
1113 1113 uchar_t *addr;
1114 1114
1115 1115 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1116 1116 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1117 1117 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1118 1118
1119 1119 ASSERT(CLASSD(v4group));
1120 1120 ASSERT(!(ill->ill_isv6));
1121 1121
1122 1122 addr = (uchar_t *)&v4group;
1123 1123 } else {
1124 1124 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1125 1125 ASSERT(ill->ill_isv6);
1126 1126
1127 1127 addr = (uchar_t *)v6group;
1128 1128 }
1129 1129 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1130 1130 if (hw_addr == NULL) {
1131 1131 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1132 1132 freemsg(mp);
1133 1133 return (NULL);
1134 1134 }
1135 1135
1136 1136 ip_mcast_mapping(ill, addr, hw_addr);
1137 1137 return (mp);
1138 1138 }
1139 1139
1140 1140 void
1141 1141 ip_ndp_resolve(ncec_t *ncec)
1142 1142 {
1143 1143 in_addr_t sender4 = INADDR_ANY;
1144 1144 in6_addr_t sender6 = ipv6_all_zeros;
1145 1145 ill_t *src_ill;
1146 1146 uint32_t ms;
1147 1147
1148 1148 src_ill = nce_resolve_src(ncec, &sender6);
1149 1149 if (src_ill == NULL) {
1150 1150 /* Make sure we try again later */
1151 1151 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1152 1152 nce_restart_timer(ncec, (clock_t)ms);
1153 1153 return;
1154 1154 }
1155 1155 if (ncec->ncec_ipversion == IPV4_VERSION)
1156 1156 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1157 1157 mutex_enter(&ncec->ncec_lock);
1158 1158 if (ncec->ncec_ipversion == IPV6_VERSION)
1159 1159 ms = ndp_solicit(ncec, sender6, src_ill);
1160 1160 else
1161 1161 ms = arp_request(ncec, sender4, src_ill);
1162 1162 mutex_exit(&ncec->ncec_lock);
1163 1163 if (ms == 0) {
1164 1164 if (ncec->ncec_state != ND_REACHABLE) {
1165 1165 if (ncec->ncec_ipversion == IPV6_VERSION)
1166 1166 ndp_resolv_failed(ncec);
1167 1167 else
1168 1168 arp_resolv_failed(ncec);
1169 1169 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1170 1170 nce_make_unreachable(ncec);
1171 1171 ncec_delete(ncec);
1172 1172 }
1173 1173 } else {
1174 1174 nce_restart_timer(ncec, (clock_t)ms);
1175 1175 }
1176 1176 done:
1177 1177 ill_refrele(src_ill);
1178 1178 }
1179 1179
1180 1180 /*
1181 1181 * Send an IPv6 neighbor solicitation.
1182 1182 * Returns number of milliseconds after which we should either rexmit or abort.
1183 1183 * Return of zero means we should abort.
1184 1184 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1185 1185 * The optional source address is used as a hint to ndp_solicit for
1186 1186 * which source to use in the packet.
1187 1187 *
1188 1188 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1189 1189 * the packet.
1190 1190 */
1191 1191 uint32_t
1192 1192 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1193 1193 {
1194 1194 in6_addr_t dst;
1195 1195 boolean_t dropped = B_FALSE;
1196 1196
1197 1197 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1198 1198 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1199 1199
1200 1200 if (ncec->ncec_rcnt == 0)
1201 1201 return (0);
1202 1202
1203 1203 dst = ncec->ncec_addr;
1204 1204 ncec->ncec_rcnt--;
1205 1205 mutex_exit(&ncec->ncec_lock);
1206 1206 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1207 1207 ill->ill_phys_addr_length, &src, &dst, 0);
1208 1208 mutex_enter(&ncec->ncec_lock);
1209 1209 if (dropped)
1210 1210 ncec->ncec_rcnt++;
1211 1211 return (ncec->ncec_ill->ill_reachable_retrans_time);
1212 1212 }
1213 1213
1214 1214 /*
1215 1215 * Attempt to recover an address on an interface that's been marked as a
1216 1216 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1217 1217 * no easy way to just probe the address and have the right thing happen if
1218 1218 * it's no longer in use. Instead, we just bring it up normally and allow the
1219 1219 * regular interface start-up logic to probe for a remaining duplicate and take
1220 1220 * us back down if necessary.
1221 1221 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1222 1222 * ip_ndp_excl.
1223 1223 */
1224 1224 /* ARGSUSED */
1225 1225 void
1226 1226 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1227 1227 {
1228 1228 ill_t *ill = rq->q_ptr;
1229 1229 ipif_t *ipif;
1230 1230 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1231 1231 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1232 1232 boolean_t addr_equal;
1233 1233
1234 1234 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1235 1235 /*
1236 1236 * We do not support recovery of proxy ARP'd interfaces,
1237 1237 * because the system lacks a complete proxy ARP mechanism.
1238 1238 */
1239 1239 if (ill->ill_isv6) {
1240 1240 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1241 1241 addr6);
1242 1242 } else {
1243 1243 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1244 1244 }
1245 1245
1246 1246 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1247 1247 continue;
1248 1248
1249 1249 /*
1250 1250 * If we have already recovered or if the interface is going
1251 1251 * away, then ignore.
1252 1252 */
1253 1253 mutex_enter(&ill->ill_lock);
1254 1254 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1255 1255 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1256 1256 mutex_exit(&ill->ill_lock);
1257 1257 continue;
1258 1258 }
1259 1259
1260 1260 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1261 1261 ill->ill_ipif_dup_count--;
1262 1262 mutex_exit(&ill->ill_lock);
1263 1263 ipif->ipif_was_dup = B_TRUE;
1264 1264
1265 1265 if (ill->ill_isv6) {
1266 1266 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1267 1267 (void) ipif_up_done_v6(ipif);
1268 1268 } else {
1269 1269 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1270 1270 EINPROGRESS);
1271 1271 (void) ipif_up_done(ipif);
1272 1272 }
1273 1273 }
1274 1274 freeb(mp);
1275 1275 }
1276 1276
1277 1277 /*
1278 1278 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1279 1279 * As long as someone else holds the address, the interface will stay down.
1280 1280 * When that conflict goes away, the interface is brought back up. This is
1281 1281 * done so that accidental shutdowns of addresses aren't made permanent. Your
1282 1282 * server will recover from a failure.
1283 1283 *
1284 1284 * For DHCP and temporary addresses, recovery is not done in the kernel.
1285 1285 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1286 1286 *
1287 1287 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1288 1288 */
1289 1289 void
1290 1290 ipif_dup_recovery(void *arg)
1291 1291 {
1292 1292 ipif_t *ipif = arg;
1293 1293
1294 1294 ipif->ipif_recovery_id = 0;
1295 1295 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1296 1296 return;
1297 1297
1298 1298 /*
1299 1299 * No lock, because this is just an optimization.
1300 1300 */
1301 1301 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1302 1302 return;
1303 1303
1304 1304 /* If the link is down, we'll retry this later */
1305 1305 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1306 1306 return;
1307 1307
1308 1308 ipif_do_recovery(ipif);
1309 1309 }
1310 1310
1311 1311 /*
1312 1312 * Perform interface recovery by forcing the duplicate interfaces up and
1313 1313 * allowing the system to determine which ones should stay up.
1314 1314 *
1315 1315 * Called both by recovery timer expiry and link-up notification.
1316 1316 */
1317 1317 void
1318 1318 ipif_do_recovery(ipif_t *ipif)
1319 1319 {
1320 1320 ill_t *ill = ipif->ipif_ill;
1321 1321 mblk_t *mp;
1322 1322 ip_stack_t *ipst = ill->ill_ipst;
1323 1323 size_t mp_size;
1324 1324
1325 1325 if (ipif->ipif_isv6)
1326 1326 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1327 1327 else
1328 1328 mp_size = sizeof (ipif->ipif_lcl_addr);
1329 1329 mp = allocb(mp_size, BPRI_MED);
1330 1330 if (mp == NULL) {
1331 1331 mutex_enter(&ill->ill_lock);
1332 1332 if (ipst->ips_ip_dup_recovery > 0 &&
1333 1333 ipif->ipif_recovery_id == 0 &&
1334 1334 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1335 1335 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1336 1336 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1337 1337 }
1338 1338 mutex_exit(&ill->ill_lock);
1339 1339 } else {
1340 1340 /*
1341 1341 * A recovery timer may still be running if we got here from
1342 1342 * ill_restart_dad(); cancel that timer.
1343 1343 */
1344 1344 if (ipif->ipif_recovery_id != 0)
1345 1345 (void) untimeout(ipif->ipif_recovery_id);
1346 1346 ipif->ipif_recovery_id = 0;
1347 1347
1348 1348 if (ipif->ipif_isv6) {
1349 1349 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1350 1350 sizeof (ipif->ipif_v6lcl_addr));
1351 1351 } else {
1352 1352 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1353 1353 sizeof (ipif->ipif_lcl_addr));
1354 1354 }
1355 1355 ill_refhold(ill);
1356 1356 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1357 1357 B_FALSE);
1358 1358 }
1359 1359 }
1360 1360
1361 1361 /*
1362 1362 * Find the MAC and IP addresses in an NA/NS message.
1363 1363 */
1364 1364 static void
1365 1365 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1366 1366 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1367 1367 {
1368 1368 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1369 1369 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1370 1370 uchar_t *addr;
1371 1371 int alen;
1372 1372
1373 1373 /* icmp_inbound_v6 ensures this */
1374 1374 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1375 1375
1376 1376 addr = ira->ira_l2src;
1377 1377 alen = ill->ill_phys_addr_length;
1378 1378 if (alen > 0) {
1379 1379 *haddr = addr;
1380 1380 *haddrlenp = alen;
1381 1381 } else {
1382 1382 *haddr = NULL;
1383 1383 *haddrlenp = 0;
1384 1384 }
1385 1385
1386 1386 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1387 1387 *targp = ns->nd_ns_target;
1388 1388 }
1389 1389
1390 1390 /*
1391 1391 * This is for exclusive changes due to NDP duplicate address detection
1392 1392 * failure.
1393 1393 */
1394 1394 /* ARGSUSED */
1395 1395 static void
1396 1396 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1397 1397 {
1398 1398 ill_t *ill = rq->q_ptr;
1399 1399 ipif_t *ipif;
1400 1400 uchar_t *haddr;
1401 1401 uint_t haddrlen;
1402 1402 ip_stack_t *ipst = ill->ill_ipst;
1403 1403 in6_addr_t targ;
1404 1404 ip_recv_attr_t iras;
1405 1405 mblk_t *attrmp;
1406 1406
1407 1407 attrmp = mp;
1408 1408 mp = mp->b_cont;
1409 1409 attrmp->b_cont = NULL;
1410 1410 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1411 1411 /* The ill or ip_stack_t disappeared on us */
1412 1412 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1413 1413 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1414 1414 freemsg(mp);
1415 1415 ira_cleanup(&iras, B_TRUE);
1416 1416 return;
1417 1417 }
1418 1418
1419 1419 ASSERT(ill == iras.ira_rill);
1420 1420
1421 1421 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1422 1422 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1423 1423 /*
1424 1424 * Ignore conflicts generated by misbehaving switches that
1425 1425 * just reflect our own messages back to us. For IPMP, we may
1426 1426 * see reflections across any ill in the illgrp.
1427 1427 *
1428 1428 * RFC2462 and revisions tried to detect both the case
1429 1429 * when a statically configured IPv6 address is a duplicate,
1430 1430 * and the case when the L2 address itself is a duplicate. The
1431 1431 * later is important because, with stateles address autoconf,
1432 1432 * if the L2 address is a duplicate, the resulting IPv6
1433 1433 * address(es) would also be duplicates. We rely on DAD of the
1434 1434 * IPv6 address itself to detect the latter case.
1435 1435 */
1436 1436 /* For an under ill_grp can change under lock */
1437 1437 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1438 1438 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1439 1439 IS_UNDER_IPMP(ill) &&
1440 1440 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1441 1441 haddrlen) != NULL) {
1442 1442 rw_exit(&ipst->ips_ill_g_lock);
1443 1443 goto ignore_conflict;
1444 1444 }
1445 1445 rw_exit(&ipst->ips_ill_g_lock);
1446 1446 }
1447 1447
1448 1448 /*
1449 1449 * Look up the appropriate ipif.
1450 1450 */
1451 1451 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1452 1452 if (ipif == NULL)
1453 1453 goto ignore_conflict;
1454 1454
1455 1455 /* Reload the ill to match the ipif */
1456 1456 ill = ipif->ipif_ill;
1457 1457
1458 1458 /* If it's already duplicate or ineligible, then don't do anything. */
1459 1459 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1460 1460 ipif_refrele(ipif);
1461 1461 goto ignore_conflict;
1462 1462 }
1463 1463
1464 1464 /*
1465 1465 * If this is a failure during duplicate recovery, then don't
1466 1466 * complain. It may take a long time to recover.
1467 1467 */
1468 1468 if (!ipif->ipif_was_dup) {
1469 1469 char ibuf[LIFNAMSIZ];
1470 1470 char hbuf[MAC_STR_LEN];
1471 1471 char sbuf[INET6_ADDRSTRLEN];
1472 1472
1473 1473 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1474 1474 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1475 1475 " disabled", ibuf,
1476 1476 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1477 1477 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1478 1478 }
1479 1479 mutex_enter(&ill->ill_lock);
1480 1480 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1481 1481 ipif->ipif_flags |= IPIF_DUPLICATE;
1482 1482 ill->ill_ipif_dup_count++;
1483 1483 mutex_exit(&ill->ill_lock);
1484 1484 (void) ipif_down(ipif, NULL, NULL);
1485 1485 (void) ipif_down_tail(ipif);
1486 1486 mutex_enter(&ill->ill_lock);
1487 1487 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1488 1488 ill->ill_net_type == IRE_IF_RESOLVER &&
1489 1489 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1490 1490 ipst->ips_ip_dup_recovery > 0) {
1491 1491 ASSERT(ipif->ipif_recovery_id == 0);
1492 1492 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1493 1493 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1494 1494 }
1495 1495 mutex_exit(&ill->ill_lock);
1496 1496 ipif_refrele(ipif);
1497 1497
1498 1498 ignore_conflict:
1499 1499 freemsg(mp);
1500 1500 ira_cleanup(&iras, B_TRUE);
1501 1501 }
1502 1502
1503 1503 /*
1504 1504 * Handle failure by tearing down the ipifs with the specified address. Note
1505 1505 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1506 1506 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1507 1507 * we start a timer on the ipif.
1508 1508 * Caller has to free mp;
1509 1509 */
1510 1510 static void
1511 1511 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1512 1512 {
1513 1513 const uchar_t *haddr;
1514 1514 ill_t *ill = ira->ira_rill;
1515 1515
1516 1516 /*
1517 1517 * Ignore conflicts generated by misbehaving switches that just
1518 1518 * reflect our own messages back to us.
1519 1519 */
1520 1520
1521 1521 /* icmp_inbound_v6 ensures this */
1522 1522 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1523 1523 haddr = ira->ira_l2src;
1524 1524 if (haddr != NULL &&
1525 1525 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1526 1526 return;
1527 1527 }
1528 1528
1529 1529 if ((mp = copymsg(mp)) != NULL) {
1530 1530 mblk_t *attrmp;
1531 1531
1532 1532 attrmp = ip_recv_attr_to_mblk(ira);
1533 1533 if (attrmp == NULL) {
1534 1534 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1535 1535 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1536 1536 freemsg(mp);
1537 1537 } else {
1538 1538 ASSERT(attrmp->b_cont == NULL);
1539 1539 attrmp->b_cont = mp;
1540 1540 mp = attrmp;
1541 1541 ill_refhold(ill);
1542 1542 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1543 1543 B_FALSE);
1544 1544 }
1545 1545 }
1546 1546 }
1547 1547
1548 1548 /*
1549 1549 * Handle a discovered conflict: some other system is advertising that it owns
1550 1550 * one of our IP addresses. We need to defend ourselves, or just shut down the
1551 1551 * interface.
1552 1552 *
1553 1553 * Handles both IPv4 and IPv6
1554 1554 */
1555 1555 boolean_t
1556 1556 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1557 1557 {
1558 1558 ipif_t *ipif;
1559 1559 clock_t now;
1560 1560 uint_t maxdefense;
1561 1561 uint_t defs;
1562 1562 ill_t *ill = ira->ira_ill;
1563 1563 ip_stack_t *ipst = ill->ill_ipst;
1564 1564 uint32_t elapsed;
1565 1565 boolean_t isv6 = ill->ill_isv6;
1566 1566 ipaddr_t ncec_addr;
1567 1567
1568 1568 if (isv6) {
1569 1569 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1570 1570 ipst);
1571 1571 } else {
1572 1572 if (arp_no_defense) {
1573 1573 /*
1574 1574 * Yes, there is a conflict, but no, we do not
1575 1575 * defend ourself.
1576 1576 */
1577 1577 return (B_TRUE);
1578 1578 }
1579 1579 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1580 1580 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1581 1581 ipst);
1582 1582 }
1583 1583 if (ipif == NULL)
1584 1584 return (B_FALSE);
1585 1585
1586 1586 /*
1587 1587 * First, figure out if this address is disposable.
1588 1588 */
1589 1589 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1590 1590 maxdefense = ipst->ips_ip_max_temp_defend;
1591 1591 else
1592 1592 maxdefense = ipst->ips_ip_max_defend;
1593 1593
1594 1594 /*
1595 1595 * Now figure out how many times we've defended ourselves. Ignore
1596 1596 * defenses that happened long in the past.
1597 1597 */
1598 1598 now = ddi_get_lbolt();
1599 1599 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1600 1600 mutex_enter(&ncec->ncec_lock);
1601 1601 if ((defs = ncec->ncec_defense_count) > 0 &&
1602 1602 elapsed > ipst->ips_ip_defend_interval) {
1603 1603 /*
1604 1604 * ip_defend_interval has elapsed.
1605 1605 * reset the defense count.
1606 1606 */
1607 1607 ncec->ncec_defense_count = defs = 0;
1608 1608 }
1609 1609 ncec->ncec_defense_count++;
1610 1610 ncec->ncec_last_time_defended = now;
1611 1611 mutex_exit(&ncec->ncec_lock);
1612 1612 ipif_refrele(ipif);
1613 1613
1614 1614 /*
1615 1615 * If we've defended ourselves too many times already, then give up and
1616 1616 * tear down the interface(s) using this address.
1617 1617 * Otherwise, caller has to defend by sending out an announce.
1618 1618 */
1619 1619 if (defs >= maxdefense) {
1620 1620 if (isv6)
1621 1621 ndp_failure(mp, ira);
1622 1622 else
1623 1623 arp_failure(mp, ira);
1624 1624 } else {
1625 1625 return (B_TRUE); /* caller must defend this address */
1626 1626 }
1627 1627 return (B_FALSE);
1628 1628 }
1629 1629
1630 1630 /*
1631 1631 * Handle reception of Neighbor Solicitation messages.
1632 1632 */
1633 1633 static void
1634 1634 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1635 1635 {
1636 1636 ill_t *ill = ira->ira_ill, *under_ill;
1637 1637 nd_neighbor_solicit_t *ns;
1638 1638 uint32_t hlen = ill->ill_phys_addr_length;
1639 1639 uchar_t *haddr = NULL;
1640 1640 icmp6_t *icmp_nd;
1641 1641 ip6_t *ip6h;
1642 1642 ncec_t *our_ncec = NULL;
1643 1643 in6_addr_t target;
1644 1644 in6_addr_t src;
1645 1645 int len;
1646 1646 int flag = 0;
1647 1647 nd_opt_hdr_t *opt = NULL;
1648 1648 boolean_t bad_solicit = B_FALSE;
1649 1649 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1650 1650 boolean_t need_ill_refrele = B_FALSE;
1651 1651
1652 1652 ip6h = (ip6_t *)mp->b_rptr;
1653 1653 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1654 1654 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1655 1655 src = ip6h->ip6_src;
1656 1656 ns = (nd_neighbor_solicit_t *)icmp_nd;
1657 1657 target = ns->nd_ns_target;
1658 1658 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1659 1659 IN6_IS_ADDR_LOOPBACK(&target)) {
1660 1660 if (ip_debug > 2) {
1661 1661 /* ip1dbg */
1662 1662 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1663 1663 AF_INET6, &target);
1664 1664 }
1665 1665 bad_solicit = B_TRUE;
1666 1666 goto done;
1667 1667 }
1668 1668 if (len > sizeof (nd_neighbor_solicit_t)) {
1669 1669 /* Options present */
1670 1670 opt = (nd_opt_hdr_t *)&ns[1];
1671 1671 len -= sizeof (nd_neighbor_solicit_t);
1672 1672 if (!ndp_verify_optlen(opt, len)) {
1673 1673 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1674 1674 bad_solicit = B_TRUE;
1675 1675 goto done;
1676 1676 }
1677 1677 }
1678 1678 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1679 1679 /* Check to see if this is a valid DAD solicitation */
1680 1680 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1681 1681 if (ip_debug > 2) {
1682 1682 /* ip1dbg */
1683 1683 pr_addr_dbg("ndp_input_solicit: IPv6 "
1684 1684 "Destination is not solicited node "
1685 1685 "multicast %s\n", AF_INET6,
1686 1686 &ip6h->ip6_dst);
1687 1687 }
1688 1688 bad_solicit = B_TRUE;
1689 1689 goto done;
1690 1690 }
1691 1691 }
1692 1692
1693 1693 /*
1694 1694 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1695 1695 * received this packet if it's multicast) is not the ill tied to
1696 1696 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1697 1697 * to ensure we find the associated NCE.
1698 1698 */
1699 1699 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1700 1700 /*
1701 1701 * If this is a valid Solicitation for an address we are publishing,
1702 1702 * then a PUBLISH entry should exist in the cache
1703 1703 */
1704 1704 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1705 1705 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1706 1706 "ifname=%s ", ill->ill_name));
1707 1707 if (ip_debug > 2) {
1708 1708 /* ip1dbg */
1709 1709 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1710 1710 }
1711 1711 if (our_ncec == NULL)
1712 1712 bad_solicit = B_TRUE;
1713 1713 goto done;
1714 1714 }
1715 1715
1716 1716 /* At this point we should have a verified NS per spec */
1717 1717 if (opt != NULL) {
1718 1718 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1719 1719 if (opt != NULL) {
1720 1720 haddr = (uchar_t *)&opt[1];
1721 1721 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1722 1722 hlen == 0) {
1723 1723 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1724 1724 bad_solicit = B_TRUE;
1725 1725 goto done;
1726 1726 }
1727 1727 }
1728 1728 }
1729 1729
1730 1730 /* If sending directly to peer, set the unicast flag */
1731 1731 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1732 1732 flag |= NDP_UNICAST;
1733 1733
1734 1734 /*
1735 1735 * Create/update the entry for the soliciting node on the ipmp_ill.
1736 1736 * or respond to outstanding queries, don't if
1737 1737 * the source is unspecified address.
1738 1738 */
1739 1739 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1740 1740 int err;
1741 1741 nce_t *nnce;
1742 1742
1743 1743 ASSERT(ill->ill_isv6);
1744 1744 /*
1745 1745 * Regular solicitations *must* include the Source Link-Layer
1746 1746 * Address option. Ignore messages that do not.
1747 1747 */
1748 1748 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1749 1749 ip1dbg(("ndp_input_solicit: source link-layer address "
1750 1750 "option missing with a specified source.\n"));
1751 1751 bad_solicit = B_TRUE;
1752 1752 goto done;
1753 1753 }
1754 1754
1755 1755 /*
1756 1756 * This is a regular solicitation. If we're still in the
1757 1757 * process of verifying the address, then don't respond at all
1758 1758 * and don't keep track of the sender.
1759 1759 */
1760 1760 if (our_ncec->ncec_state == ND_PROBE)
1761 1761 goto done;
1762 1762
1763 1763 /*
1764 1764 * If the solicitation doesn't have sender hardware address
1765 1765 * (legal for unicast solicitation), then process without
1766 1766 * installing the return NCE. Either we already know it, or
1767 1767 * we'll be forced to look it up when (and if) we reply to the
1768 1768 * packet.
1769 1769 */
1770 1770 if (haddr == NULL)
1771 1771 goto no_source;
1772 1772
1773 1773 under_ill = ill;
1774 1774 if (IS_UNDER_IPMP(under_ill)) {
1775 1775 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1776 1776 if (ill == NULL)
1777 1777 ill = under_ill;
1778 1778 else
1779 1779 need_ill_refrele = B_TRUE;
1780 1780 }
1781 1781 err = nce_lookup_then_add_v6(ill,
1782 1782 haddr, hlen,
1783 1783 &src, /* Soliciting nodes address */
1784 1784 0,
1785 1785 ND_STALE,
1786 1786 &nnce);
1787 1787
1788 1788 if (need_ill_refrele) {
1789 1789 ill_refrele(ill);
1790 1790 ill = under_ill;
1791 1791 need_ill_refrele = B_FALSE;
1792 1792 }
1793 1793 switch (err) {
1794 1794 case 0:
1795 1795 /* done with this entry */
1796 1796 nce_refrele(nnce);
1797 1797 break;
1798 1798 case EEXIST:
1799 1799 /*
1800 1800 * B_FALSE indicates this is not an an advertisement.
1801 1801 */
1802 1802 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1803 1803 nce_refrele(nnce);
1804 1804 break;
1805 1805 default:
1806 1806 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1807 1807 err));
1808 1808 goto done;
1809 1809 }
1810 1810 no_source:
1811 1811 flag |= NDP_SOLICITED;
1812 1812 } else {
1813 1813 /*
1814 1814 * No source link layer address option should be present in a
1815 1815 * valid DAD request.
1816 1816 */
1817 1817 if (haddr != NULL) {
1818 1818 ip1dbg(("ndp_input_solicit: source link-layer address "
1819 1819 "option present with an unspecified source.\n"));
1820 1820 bad_solicit = B_TRUE;
1821 1821 goto done;
1822 1822 }
1823 1823 if (our_ncec->ncec_state == ND_PROBE) {
1824 1824 /*
1825 1825 * Internally looped-back probes will have
1826 1826 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1827 1827 * transmissions.
1828 1828 */
1829 1829 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1830 1830 /*
1831 1831 * If someone else is probing our address, then
1832 1832 * we've crossed wires. Declare failure.
1833 1833 */
1834 1834 ndp_failure(mp, ira);
1835 1835 }
1836 1836 goto done;
1837 1837 }
1838 1838 /*
1839 1839 * This is a DAD probe. Multicast the advertisement to the
1840 1840 * all-nodes address.
1841 1841 */
1842 1842 src = ipv6_all_hosts_mcast;
1843 1843 }
1844 1844 flag |= nce_advert_flags(our_ncec);
1845 1845 (void) ndp_xmit(ill,
1846 1846 ND_NEIGHBOR_ADVERT,
1847 1847 our_ncec->ncec_lladdr,
1848 1848 our_ncec->ncec_lladdr_length,
1849 1849 &target, /* Source and target of the advertisement pkt */
1850 1850 &src, /* IP Destination (source of original pkt) */
1851 1851 flag);
1852 1852 done:
1853 1853 if (bad_solicit)
1854 1854 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1855 1855 if (our_ncec != NULL)
1856 1856 ncec_refrele(our_ncec);
1857 1857 }
1858 1858
1859 1859 /*
1860 1860 * Handle reception of Neighbor Solicitation messages
1861 1861 */
1862 1862 void
1863 1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1864 1864 {
1865 1865 ill_t *ill = ira->ira_ill;
1866 1866 nd_neighbor_advert_t *na;
1867 1867 uint32_t hlen = ill->ill_phys_addr_length;
1868 1868 uchar_t *haddr = NULL;
1869 1869 icmp6_t *icmp_nd;
1870 1870 ip6_t *ip6h;
1871 1871 ncec_t *dst_ncec = NULL;
1872 1872 in6_addr_t target;
1873 1873 nd_opt_hdr_t *opt = NULL;
1874 1874 int len;
1875 1875 ip_stack_t *ipst = ill->ill_ipst;
1876 1876 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1877 1877
1878 1878 ip6h = (ip6_t *)mp->b_rptr;
1879 1879 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1880 1880 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1881 1881 na = (nd_neighbor_advert_t *)icmp_nd;
1882 1882
1883 1883 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1884 1884 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1885 1885 ip1dbg(("ndp_input_advert: Target is multicast but the "
1886 1886 "solicited flag is not zero\n"));
1887 1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888 1888 return;
1889 1889 }
1890 1890 target = na->nd_na_target;
1891 1891 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1892 1892 IN6_IS_ADDR_LOOPBACK(&target)) {
1893 1893 if (ip_debug > 2) {
1894 1894 /* ip1dbg */
1895 1895 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1896 1896 AF_INET6, &target);
1897 1897 }
1898 1898 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1899 1899 return;
1900 1900 }
1901 1901 if (len > sizeof (nd_neighbor_advert_t)) {
1902 1902 opt = (nd_opt_hdr_t *)&na[1];
1903 1903 if (!ndp_verify_optlen(opt,
1904 1904 len - sizeof (nd_neighbor_advert_t))) {
1905 1905 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1906 1906 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1907 1907 return;
1908 1908 }
1909 1909 /* At this point we have a verified NA per spec */
1910 1910 len -= sizeof (nd_neighbor_advert_t);
1911 1911 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1912 1912 if (opt != NULL) {
1913 1913 haddr = (uchar_t *)&opt[1];
1914 1914 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1915 1915 hlen == 0) {
1916 1916 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1917 1917 BUMP_MIB(mib,
1918 1918 ipv6IfIcmpInBadNeighborAdvertisements);
1919 1919 return;
1920 1920 }
1921 1921 }
1922 1922 }
1923 1923
1924 1924 /*
1925 1925 * NOTE: we match across the illgrp since we need to do DAD for all of
1926 1926 * our local addresses, and those are spread across all the active
1927 1927 * ills in the group.
1928 1928 */
1929 1929 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1930 1930 return;
1931 1931
1932 1932 if (NCE_PUBLISH(dst_ncec)) {
1933 1933 /*
1934 1934 * Someone just advertised an addresses that we publish. First,
1935 1935 * check it it was us -- if so, we can safely ignore it.
1936 1936 * We don't get the haddr from the ira_l2src because, in the
1937 1937 * case that the packet originated from us, on an IPMP group,
1938 1938 * the ira_l2src may would be the link-layer address of the
1939 1939 * cast_ill used to send the packet, which may not be the same
1940 1940 * as the dst_ncec->ncec_lladdr of the address.
1941 1941 */
1942 1942 if (haddr != NULL) {
1943 1943 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1944 1944 goto out;
1945 1945
1946 1946 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1947 1947 goto out; /* from us -- no conflict */
1948 1948
1949 1949 /*
1950 1950 * If we're in an IPMP group, check if this is an echo
1951 1951 * from another ill in the group. Use the double-
1952 1952 * checked locking pattern to avoid grabbing
1953 1953 * ill_g_lock in the non-IPMP case.
1954 1954 */
1955 1955 if (IS_UNDER_IPMP(ill)) {
1956 1956 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1957 1957 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1958 1958 ill->ill_grp, haddr, hlen) != NULL) {
1959 1959 rw_exit(&ipst->ips_ill_g_lock);
1960 1960 goto out;
1961 1961 }
1962 1962 rw_exit(&ipst->ips_ill_g_lock);
1963 1963 }
1964 1964 }
1965 1965
1966 1966 /*
1967 1967 * This appears to be a real conflict. If we're trying to
1968 1968 * configure this NCE (ND_PROBE), then shut it down.
1969 1969 * Otherwise, handle the discovered conflict.
1970 1970 */
1971 1971 if (dst_ncec->ncec_state == ND_PROBE) {
1972 1972 ndp_failure(mp, ira);
1973 1973 } else {
1974 1974 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1975 1975 char hbuf[MAC_STR_LEN];
1976 1976 char sbuf[INET6_ADDRSTRLEN];
1977 1977
1978 1978 cmn_err(CE_WARN,
1979 1979 "node '%s' is using %s on %s",
1980 1980 inet_ntop(AF_INET6, &target, sbuf,
1981 1981 sizeof (sbuf)),
1982 1982 haddr == NULL ? "<none>" :
1983 1983 mac_colon_addr(haddr, hlen, hbuf,
1984 1984 sizeof (hbuf)), ill->ill_name);
1985 1985 /*
1986 1986 * RFC 4862, Section 5.4.4 does not mandate
1987 1987 * any specific behavior when an NA matches
1988 1988 * a non-tentative address assigned to the
1989 1989 * receiver. We make the choice of defending
1990 1990 * our address, based on the assumption that
1991 1991 * the sender has not detected the Duplicate.
1992 1992 *
1993 1993 * ncec_last_time_defended has been adjusted
1994 1994 * in ip_nce_conflict()
1995 1995 */
1996 1996 (void) ndp_announce(dst_ncec);
1997 1997 }
1998 1998 }
1999 1999 } else {
2000 2000 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2001 2001 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2002 2002
2003 2003 /* B_TRUE indicates this an advertisement */
2004 2004 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2005 2005 }
2006 2006 out:
2007 2007 ncec_refrele(dst_ncec);
2008 2008 }
2009 2009
2010 2010 /*
2011 2011 * Process NDP neighbor solicitation/advertisement messages.
2012 2012 * The checksum has already checked o.k before reaching here.
2013 2013 * Information about the datalink header is contained in ira_l2src, but
2014 2014 * that should be ignored for loopback packets.
2015 2015 */
2016 2016 void
2017 2017 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2018 2018 {
2019 2019 ill_t *ill = ira->ira_rill;
2020 2020 icmp6_t *icmp_nd;
2021 2021 ip6_t *ip6h;
2022 2022 int len;
2023 2023 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2024 2024 ill_t *orig_ill = NULL;
2025 2025
2026 2026 /*
2027 2027 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2028 2028 * and make it be the IPMP upper so avoid being confused by a packet
2029 2029 * addressed to a unicast address on a different ill.
2030 2030 */
2031 2031 if (IS_UNDER_IPMP(ill)) {
2032 2032 orig_ill = ill;
2033 2033 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2034 2034 if (ill == NULL) {
2035 2035 ill = orig_ill;
2036 2036 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2037 2037 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2038 2038 mp, ill);
2039 2039 freemsg(mp);
2040 2040 return;
2041 2041 }
2042 2042 ASSERT(ill != orig_ill);
2043 2043 orig_ill = ira->ira_ill;
2044 2044 ira->ira_ill = ill;
2045 2045 mib = ill->ill_icmp6_mib;
2046 2046 }
2047 2047 if (!pullupmsg(mp, -1)) {
2048 2048 ip1dbg(("ndp_input: pullupmsg failed\n"));
2049 2049 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2050 2050 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2051 2051 goto done;
2052 2052 }
2053 2053 ip6h = (ip6_t *)mp->b_rptr;
2054 2054 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2055 2055 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2056 2056 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2057 2057 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2058 2058 goto done;
2059 2059 }
2060 2060 /*
2061 2061 * NDP does not accept any extension headers between the
2062 2062 * IP header and the ICMP header since e.g. a routing
2063 2063 * header could be dangerous.
2064 2064 * This assumes that any AH or ESP headers are removed
2065 2065 * by ip prior to passing the packet to ndp_input.
2066 2066 */
2067 2067 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2068 2068 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2069 2069 ip6h->ip6_nxt));
2070 2070 ip_drop_input("Wrong next header", mp, ill);
2071 2071 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2072 2072 goto done;
2073 2073 }
2074 2074 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2075 2075 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2076 2076 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2077 2077 if (icmp_nd->icmp6_code != 0) {
2078 2078 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2079 2079 ip_drop_input("code non-zero", mp, ill);
2080 2080 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2081 2081 goto done;
2082 2082 }
2083 2083 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2084 2084 /*
2085 2085 * Make sure packet length is large enough for either
2086 2086 * a NS or a NA icmp packet.
2087 2087 */
2088 2088 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2089 2089 ip1dbg(("ndp_input: packet too short\n"));
2090 2090 ip_drop_input("packet too short", mp, ill);
2091 2091 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2092 2092 goto done;
2093 2093 }
2094 2094 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2095 2095 ndp_input_solicit(mp, ira);
2096 2096 } else {
2097 2097 ndp_input_advert(mp, ira);
2098 2098 }
2099 2099 done:
2100 2100 freemsg(mp);
2101 2101 if (orig_ill != NULL) {
2102 2102 ill_refrele(ill);
2103 2103 ira->ira_ill = orig_ill;
2104 2104 }
2105 2105 }
2106 2106
2107 2107 /*
2108 2108 * ndp_xmit is called to form and transmit a ND solicitation or
2109 2109 * advertisement ICMP packet.
2110 2110 *
2111 2111 * If the source address is unspecified and this isn't a probe (used for
2112 2112 * duplicate address detection), an appropriate source address and link layer
2113 2113 * address will be chosen here. The link layer address option is included if
2114 2114 * the source is specified (i.e., all non-probe packets), and omitted (per the
2115 2115 * specification) otherwise.
2116 2116 *
2117 2117 * It returns B_FALSE only if it does a successful put() to the
2118 2118 * corresponding ill's ill_wq otherwise returns B_TRUE.
2119 2119 */
2120 2120 static boolean_t
2121 2121 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2122 2122 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2123 2123 {
2124 2124 uint32_t len;
2125 2125 icmp6_t *icmp6;
2126 2126 mblk_t *mp;
2127 2127 ip6_t *ip6h;
2128 2128 nd_opt_hdr_t *opt;
2129 2129 uint_t plen;
2130 2130 zoneid_t zoneid = GLOBAL_ZONEID;
2131 2131 ill_t *hwaddr_ill = ill;
2132 2132 ip_xmit_attr_t ixas;
2133 2133 ip_stack_t *ipst = ill->ill_ipst;
2134 2134 boolean_t need_refrele = B_FALSE;
2135 2135 boolean_t probe = B_FALSE;
2136 2136
2137 2137 if (IS_UNDER_IPMP(ill)) {
2138 2138 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2139 2139 /*
2140 2140 * We send non-probe packets on the upper IPMP interface.
2141 2141 * ip_output_simple() will use cast_ill for sending any
2142 2142 * multicast packets. Note that we can't follow the same
2143 2143 * logic for probe packets because all interfaces in the ipmp
2144 2144 * group may have failed, so that we really want to only try
2145 2145 * to send the ND packet on the ill corresponding to the src
2146 2146 * address.
2147 2147 */
2148 2148 if (!probe) {
2149 2149 ill = ipmp_ill_hold_ipmp_ill(ill);
2150 2150 if (ill != NULL)
2151 2151 need_refrele = B_TRUE;
2152 2152 else
2153 2153 ill = hwaddr_ill;
2154 2154 }
2155 2155 }
2156 2156
2157 2157 /*
2158 2158 * If we have a unspecified source(sender) address, select a
2159 2159 * proper source address for the solicitation here itself so
2160 2160 * that we can initialize the h/w address correctly.
2161 2161 *
2162 2162 * If the sender is specified then we use this address in order
2163 2163 * to lookup the zoneid before calling ip_output_v6(). This is to
2164 2164 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2165 2165 * by IP (we cannot guarantee that the global zone has an interface
2166 2166 * route to the destination).
2167 2167 *
2168 2168 * Note that the NA never comes here with the unspecified source
2169 2169 * address.
2170 2170 */
2171 2171
2172 2172 /*
2173 2173 * Probes will have unspec src at this point.
2174 2174 */
2175 2175 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2176 2176 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2177 2177 /*
2178 2178 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2179 2179 * ALL_ZONES if it cannot find a matching ipif for the address
2180 2180 * we are trying to use. In this case we err on the side of
2181 2181 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2182 2182 */
2183 2183 if (zoneid == ALL_ZONES)
2184 2184 zoneid = GLOBAL_ZONEID;
2185 2185 }
2186 2186
2187 2187 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2188 2188 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2189 2189 mp = allocb(len, BPRI_LO);
2190 2190 if (mp == NULL) {
2191 2191 if (need_refrele)
2192 2192 ill_refrele(ill);
2193 2193 return (B_TRUE);
2194 2194 }
2195 2195
2196 2196 bzero((char *)mp->b_rptr, len);
2197 2197 mp->b_wptr = mp->b_rptr + len;
2198 2198
2199 2199 bzero(&ixas, sizeof (ixas));
2200 2200 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2201 2201
2202 2202 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2203 2203 ixas.ixa_ipst = ipst;
2204 2204 ixas.ixa_cred = kcred;
2205 2205 ixas.ixa_cpid = NOPID;
2206 2206 ixas.ixa_tsl = NULL;
2207 2207 ixas.ixa_zoneid = zoneid;
2208 2208
2209 2209 ip6h = (ip6_t *)mp->b_rptr;
2210 2210 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2211 2211 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2212 2212 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2213 2213 ip6h->ip6_hops = IPV6_MAX_HOPS;
2214 2214 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2215 2215 ip6h->ip6_dst = *target;
2216 2216 icmp6 = (icmp6_t *)&ip6h[1];
2217 2217
2218 2218 if (hw_addr_len != 0) {
2219 2219 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2220 2220 sizeof (nd_neighbor_advert_t));
2221 2221 } else {
2222 2222 opt = NULL;
2223 2223 }
2224 2224 if (operation == ND_NEIGHBOR_SOLICIT) {
2225 2225 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2226 2226
2227 2227 if (opt != NULL && !(flag & NDP_PROBE)) {
2228 2228 /*
2229 2229 * Note that we don't send out SLLA for ND probes
2230 2230 * per RFC 4862, even though we do send out the src
2231 2231 * haddr for IPv4 DAD probes, even though both IPv4
2232 2232 * and IPv6 go out with the unspecified/INADDR_ANY
2233 2233 * src IP addr.
2234 2234 */
2235 2235 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2236 2236 }
2237 2237 ip6h->ip6_src = *sender;
2238 2238 ns->nd_ns_target = *target;
2239 2239 if (!(flag & NDP_UNICAST)) {
2240 2240 /* Form multicast address of the target */
2241 2241 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2242 2242 ip6h->ip6_dst.s6_addr32[3] |=
2243 2243 ns->nd_ns_target.s6_addr32[3];
2244 2244 }
2245 2245 } else {
2246 2246 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2247 2247
2248 2248 ASSERT(!(flag & NDP_PROBE));
2249 2249 if (opt != NULL)
2250 2250 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2251 2251 ip6h->ip6_src = *sender;
2252 2252 na->nd_na_target = *sender;
2253 2253 if (flag & NDP_ISROUTER)
2254 2254 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2255 2255 if (flag & NDP_SOLICITED)
2256 2256 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2257 2257 if (flag & NDP_ORIDE)
2258 2258 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2259 2259 }
2260 2260
2261 2261 if (!(flag & NDP_PROBE)) {
2262 2262 if (hw_addr != NULL && opt != NULL) {
2263 2263 /* Fill in link layer address and option len */
2264 2264 opt->nd_opt_len = (uint8_t)plen;
2265 2265 bcopy(hw_addr, &opt[1], hw_addr_len);
2266 2266 }
2267 2267 }
2268 2268 if (opt != NULL && opt->nd_opt_type == 0) {
2269 2269 /* If there's no link layer address option, then strip it. */
2270 2270 len -= plen * 8;
2271 2271 mp->b_wptr = mp->b_rptr + len;
2272 2272 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2273 2273 }
2274 2274
2275 2275 icmp6->icmp6_type = (uint8_t)operation;
2276 2276 icmp6->icmp6_code = 0;
2277 2277 /*
2278 2278 * Prepare for checksum by putting icmp length in the icmp
2279 2279 * checksum field. The checksum is calculated in ip_output.c.
2280 2280 */
2281 2281 icmp6->icmp6_cksum = ip6h->ip6_plen;
2282 2282
2283 2283 (void) ip_output_simple(mp, &ixas);
2284 2284 ixa_cleanup(&ixas);
2285 2285 if (need_refrele)
2286 2286 ill_refrele(ill);
2287 2287 return (B_FALSE);
2288 2288 }
2289 2289
2290 2290 /*
2291 2291 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2292 2292 * The datapath uses this as an indication that there
2293 2293 * is a problem (as opposed to a NCE that was just
2294 2294 * reclaimed due to lack of memory.
2295 2295 * Note that static ARP entries never become unreachable.
2296 2296 */
2297 2297 void
2298 2298 nce_make_unreachable(ncec_t *ncec)
2299 2299 {
2300 2300 mutex_enter(&ncec->ncec_lock);
2301 2301 ncec->ncec_state = ND_UNREACHABLE;
2302 2302 mutex_exit(&ncec->ncec_lock);
2303 2303 }
2304 2304
/*
 * NCE retransmit timer.  Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 *
 * arg is the ncec_t the timeout was armed for.  The handler takes its own
 * hold on the entry, clears ncec_timeout_id, and then acts on the entry's
 * current ncec_state (ND_DELAY, ND_PROBE, ND_INCOMPLETE, ND_REACHABLE).
 * Every path leaves the switch with ncec_lock released.
 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		/* Try again after another retransmit interval. */
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	/* For IPv4 the chosen sender is carried v4-mapped in sender6. */
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		/* The delay expired: move to PROBE and send the first probe. */
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * Note the asymmetry: ncec_lock is dropped before ndp_xmit()
		 * but is still held across arp_request().
		 */
		if (isv6) {
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		/* Only count the probe if it was not dropped. */
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6? AF_INET6: AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/* Going negative routes the next expiry to deletion. */
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				/* Report recovery of a former duplicate. */
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/* Begin defending our new address */
			/*
			 * NOTE(review): unlike the ND_REACHABLE case below,
			 * the unsolicit count and last-defended time are
			 * updated here without ncec_lock held -- confirm
			 * this is intentional.
			 */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				/* A dropped announcement does not count. */
				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them. IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				/* Free the probe and unlink it in place. */
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		/*
		 * Only act if unsolicited announcements are still owed, or
		 * this is a published entry with address defense enabled.
		 */
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					/* Skip this round; retry later. */
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		mutex_exit(&ncec->ncec_lock);
		break;
	}
	/* NOTE(review): no goto in this function targets this label. */
done:
	/*
	 * Drop the hold taken at entry, and release src_ill (presumably
	 * returned held by nce_resolve_src() -- confirm against that
	 * function).
	 */
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}
2625 2625
2626 2626 /*
2627 2627 * Set a link layer address from the ll_addr passed in.
2628 2628 * Copy SAP from ill.
2629 2629 */
2630 2630 static void
2631 2631 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2632 2632 {
2633 2633 ill_t *ill = ncec->ncec_ill;
2634 2634
2635 2635 ASSERT(ll_addr != NULL);
2636 2636 if (ill->ill_phys_addr_length > 0) {
2637 2637 /*
2638 2638 * The bcopy() below used to be called for the physical address
2639 2639 * length rather than the link layer address length. For
2640 2640 * ethernet and many other media, the phys_addr and lla are
2641 2641 * identical.
2642 2642 *
2643 2643 * The phys_addr and lla may not be the same for devices that
2644 2644 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2645 2645 * no known instances of these.
2646 2646 *
2647 2647 * For PPP or other interfaces with a zero length
2648 2648 * physical address, don't do anything here.
2649 2649 * The bcopy() with a zero phys_addr length was previously
2650 2650 * a no-op for interfaces with a zero-length physical address.
2651 2651 * Using the lla for them would change the way they operate.
2652 2652 * Doing nothing in such cases preserves expected behavior.
2653 2653 */
2654 2654 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2655 2655 }
2656 2656 }
2657 2657
2658 2658 boolean_t
2659 2659 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2660 2660 uint32_t ll_addr_len)
2661 2661 {
2662 2662 ASSERT(ncec->ncec_lladdr != NULL);
2663 2663 if (ll_addr == NULL)
2664 2664 return (B_FALSE);
2665 2665 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2666 2666 return (B_TRUE);
2667 2667 return (B_FALSE);
2668 2668 }
2669 2669
2670 2670 /*
2671 2671 * Updates the link layer address or the reachability state of
2672 2672 * a cache entry. Reset probe counter if needed.
2673 2673 */
2674 2674 void
2675 2675 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2676 2676 {
2677 2677 ill_t *ill = ncec->ncec_ill;
2678 2678 boolean_t need_stop_timer = B_FALSE;
2679 2679 boolean_t need_fastpath_update = B_FALSE;
2680 2680 nce_t *nce = NULL;
2681 2681 timeout_id_t tid;
2682 2682
2683 2683 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2684 2684 /*
2685 2685 * If this interface does not do NUD, there is no point
2686 2686 * in allowing an update to the cache entry. Although
2687 2687 * we will respond to NS.
2688 2688 * The only time we accept an update for a resolver when
2689 2689 * NUD is turned off is when it has just been created.
2690 2690 * Non-Resolvers will always be created as REACHABLE.
2691 2691 */
2692 2692 if (new_state != ND_UNCHANGED) {
2693 2693 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2694 2694 (ncec->ncec_state != ND_INCOMPLETE))
2695 2695 return;
2696 2696 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2697 2697 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2698 2698 need_stop_timer = B_TRUE;
2699 2699 if (new_state == ND_REACHABLE)
2700 2700 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2701 2701 else {
2702 2702 /* We force NUD in this case */
2703 2703 ncec->ncec_last = 0;
2704 2704 }
2705 2705 ncec->ncec_state = new_state;
2706 2706 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2707 2707 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2708 2708 new_state == ND_INCOMPLETE);
2709 2709 }
2710 2710 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2711 2711 tid = ncec->ncec_timeout_id;
2712 2712 ncec->ncec_timeout_id = 0;
2713 2713 }
2714 2714 /*
2715 2715 * Re-trigger fastpath probe and
2716 2716 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2717 2717 * whatever packets that happens to be transmitting at the time.
2718 2718 */
2719 2719 if (new_ll_addr != NULL) {
2720 2720 bcopy(new_ll_addr, ncec->ncec_lladdr,
2721 2721 ill->ill_phys_addr_length);
2722 2722 need_fastpath_update = B_TRUE;
2723 2723 }
2724 2724 mutex_exit(&ncec->ncec_lock);
2725 2725 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2726 2726 if (tid != 0)
2727 2727 (void) untimeout(tid);
2728 2728 }
2729 2729 if (need_fastpath_update) {
2730 2730 /*
2731 2731 * Delete any existing existing dlur_mp and fp_mp information.
2732 2732 * For IPMP interfaces, all underlying ill's must be checked
2733 2733 * and purged.
2734 2734 */
2735 2735 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2736 2736 /*
2737 2737 * add the new dlur_mp and fp_mp
2738 2738 */
2739 2739 nce = nce_fastpath(ncec, B_TRUE, NULL);
2740 2740 if (nce != NULL)
2741 2741 nce_refrele(nce);
2742 2742 }
2743 2743 mutex_enter(&ncec->ncec_lock);
2744 2744 }
2745 2745
2746 2746 static void
2747 2747 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2748 2748 {
2749 2749 uint_t count = 0;
2750 2750 mblk_t **mpp, *tmp;
2751 2751
2752 2752 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2753 2753
2754 2754 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2755 2755 if (++count > ncec->ncec_ill->ill_max_buf) {
2756 2756 tmp = ncec->ncec_qd_mp->b_next;
2757 2757 ncec->ncec_qd_mp->b_next = NULL;
2758 2758 /*
2759 2759 * if we never create data addrs on the under_ill
2760 2760 * does this matter?
2761 2761 */
2762 2762 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2763 2763 ipIfStatsOutDiscards);
2764 2764 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2765 2765 ncec->ncec_ill);
2766 2766 freemsg(ncec->ncec_qd_mp);
2767 2767 ncec->ncec_qd_mp = tmp;
2768 2768 }
2769 2769 }
2770 2770
2771 2771 if (head_insert) {
2772 2772 ncec->ncec_nprobes++;
2773 2773 mp->b_next = ncec->ncec_qd_mp;
2774 2774 ncec->ncec_qd_mp = mp;
2775 2775 } else {
2776 2776 *mpp = mp;
2777 2777 }
2778 2778 }
2779 2779
2780 2780 /*
2781 2781 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2782 2782 * queued at the head or tail of the queue based on the input argument
2783 2783 * 'head_insert'. The caller should specify this argument as B_TRUE if this
2784 2784 * packet is an IPMP probe packet, in which case the following happens:
2785 2785 *
2786 2786 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
2787 2787 * (non-ipmp_probe) load-speading case where the source address of the ND
2788 2788 * packet is not tied to ncec_ill. If the ill bound to the source address
2789 2789 * cannot receive, the response to the ND packet will not be received.
2790 2790 * However, if ND packets for ncec_ill's probes are queued behind that ND
2791 2791 * packet, those probes will also fail to be sent, and thus in.mpathd will
2792 2792 * erroneously conclude that ncec_ill has also failed.
2793 2793 *
2794 2794 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2795 2795 * the first attempt. This ensures that ND problems do not manifest as
2796 2796 * probe RTT spikes.
2797 2797 *
2798 2798 * We achieve this by inserting ipmp_probe() packets at the head of the
2799 2799 * nce_queue.
2800 2800 *
2801 2801 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2802 2802 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2803 2803 */
2804 2804 void
2805 2805 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2806 2806 {
2807 2807 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2808 2808 nce_queue_mp_common(ncec, mp, head_insert);
2809 2809 }
2810 2810
2811 2811 /*
2812 2812 * Called when address resolution failed due to a timeout.
2813 2813 * Send an ICMP unreachable in response to all queued packets.
2814 2814 */
2815 2815 void
2816 2816 ndp_resolv_failed(ncec_t *ncec)
2817 2817 {
2818 2818 mblk_t *mp, *nxt_mp;
2819 2819 char buf[INET6_ADDRSTRLEN];
2820 2820 ill_t *ill = ncec->ncec_ill;
2821 2821 ip_recv_attr_t iras;
2822 2822
2823 2823 bzero(&iras, sizeof (iras));
2824 2824 iras.ira_flags = 0;
2825 2825 /*
2826 2826 * we are setting the ira_rill to the ipmp_ill (instead of
2827 2827 * the actual ill on which the packet was received), but this
2828 2828 * is ok because we don't actually need the real ira_rill.
2829 2829 * to send the icmp unreachable to the sender.
2830 2830 */
2831 2831 iras.ira_ill = iras.ira_rill = ill;
2832 2832 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2833 2833 iras.ira_rifindex = iras.ira_ruifindex;
2834 2834
2835 2835 ip1dbg(("ndp_resolv_failed: dst %s\n",
2836 2836 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2837 2837 mutex_enter(&ncec->ncec_lock);
2838 2838 mp = ncec->ncec_qd_mp;
2839 2839 ncec->ncec_qd_mp = NULL;
2840 2840 ncec->ncec_nprobes = 0;
2841 2841 mutex_exit(&ncec->ncec_lock);
2842 2842 while (mp != NULL) {
2843 2843 nxt_mp = mp->b_next;
2844 2844 mp->b_next = NULL;
2845 2845
2846 2846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847 2847 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2848 2848 mp, ill);
2849 2849 icmp_unreachable_v6(mp,
2850 2850 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2851 2851 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2852 2852 mp = nxt_mp;
2853 2853 }
2854 2854 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2855 2855 }
2856 2856
2857 2857 /*
2858 2858 * Handle the completion of NDP and ARP resolution.
2859 2859 */
2860 2860 void
2861 2861 nce_resolv_ok(ncec_t *ncec)
2862 2862 {
2863 2863 mblk_t *mp;
2864 2864 uint_t pkt_len;
2865 2865 iaflags_t ixaflags = IXAF_NO_TRACE;
2866 2866 nce_t *nce;
2867 2867 ill_t *ill = ncec->ncec_ill;
2868 2868 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2869 2869 ip_stack_t *ipst = ill->ill_ipst;
2870 2870
2871 2871 if (IS_IPMP(ncec->ncec_ill)) {
2872 2872 nce_resolv_ipmp_ok(ncec);
2873 2873 return;
2874 2874 }
2875 2875 /* non IPMP case */
2876 2876
2877 2877 mutex_enter(&ncec->ncec_lock);
2878 2878 ASSERT(ncec->ncec_nprobes == 0);
2879 2879 mp = ncec->ncec_qd_mp;
2880 2880 ncec->ncec_qd_mp = NULL;
2881 2881 mutex_exit(&ncec->ncec_lock);
2882 2882
2883 2883 while (mp != NULL) {
2884 2884 mblk_t *nxt_mp;
2885 2885
2886 2886 if (ill->ill_isv6) {
2887 2887 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2888 2888
2889 2889 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2890 2890 } else {
2891 2891 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2892 2892
2893 2893 ixaflags |= IXAF_IS_IPV4;
2894 2894 pkt_len = ntohs(ipha->ipha_length);
2895 2895 }
2896 2896 nxt_mp = mp->b_next;
2897 2897 mp->b_next = NULL;
2898 2898 /*
2899 2899 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2900 2900 * longer available, but it's ok to drop this flag because TCP
2901 2901 * has its own flow-control in effect, so TCP packets
2902 2902 * are not likely to get here when flow-control is in effect.
2903 2903 */
2904 2904 mutex_enter(&ill->ill_lock);
2905 2905 nce = nce_lookup(ill, &ncec->ncec_addr);
2906 2906 mutex_exit(&ill->ill_lock);
2907 2907
2908 2908 if (nce == NULL) {
2909 2909 if (isv6) {
2910 2910 BUMP_MIB(&ipst->ips_ip6_mib,
2911 2911 ipIfStatsOutDiscards);
2912 2912 } else {
2913 2913 BUMP_MIB(&ipst->ips_ip_mib,
2914 2914 ipIfStatsOutDiscards);
2915 2915 }
2916 2916 ip_drop_output("ipIfStatsOutDiscards - no nce",
2917 2917 mp, NULL);
2918 2918 freemsg(mp);
2919 2919 } else {
2920 2920 /*
2921 2921 * We don't know the zoneid, but
2922 2922 * ip_xmit does not care since IXAF_NO_TRACE
2923 2923 * is set. (We traced the packet the first
2924 2924 * time through ip_xmit.)
2925 2925 */
2926 2926 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2927 2927 ALL_ZONES, 0, NULL);
2928 2928 nce_refrele(nce);
2929 2929 }
2930 2930 mp = nxt_mp;
2931 2931 }
2932 2932
2933 2933 ncec_cb_dispatch(ncec); /* complete callbacks */
2934 2934 }
2935 2935
2936 2936 /*
2937 2937 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2938 2938 * and the corresponding attributes.
2939 2939 * Disallow states other than ND_REACHABLE or ND_STALE.
2940 2940 */
2941 2941 int
2942 2942 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2943 2943 {
2944 2944 sin6_t *sin6;
2945 2945 in6_addr_t *addr;
2946 2946 ncec_t *ncec;
2947 2947 nce_t *nce;
2948 2948 int err = 0;
2949 2949 uint16_t new_flags = 0;
2950 2950 uint16_t old_flags = 0;
2951 2951 int inflags = lnr->lnr_flags;
2952 2952 ip_stack_t *ipst = ill->ill_ipst;
2953 2953 boolean_t do_postprocess = B_FALSE;
2954 2954
2955 2955 ASSERT(ill->ill_isv6);
2956 2956 if ((lnr->lnr_state_create != ND_REACHABLE) &&
2957 2957 (lnr->lnr_state_create != ND_STALE))
2958 2958 return (EINVAL);
2959 2959
2960 2960 sin6 = (sin6_t *)&lnr->lnr_addr;
2961 2961 addr = &sin6->sin6_addr;
2962 2962
2963 2963 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2964 2964 ASSERT(!IS_UNDER_IPMP(ill));
2965 2965 nce = nce_lookup_addr(ill, addr);
2966 2966 if (nce != NULL)
2967 2967 new_flags = nce->nce_common->ncec_flags;
2968 2968
2969 2969 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2970 2970 case NDF_ISROUTER_ON:
2971 2971 new_flags |= NCE_F_ISROUTER;
2972 2972 break;
2973 2973 case NDF_ISROUTER_OFF:
2974 2974 new_flags &= ~NCE_F_ISROUTER;
2975 2975 break;
2976 2976 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2977 2977 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2978 2978 if (nce != NULL)
2979 2979 nce_refrele(nce);
2980 2980 return (EINVAL);
2981 2981 }
2982 2982 if (inflags & NDF_STATIC)
2983 2983 new_flags |= NCE_F_STATIC;
2984 2984
2985 2985 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2986 2986 case NDF_ANYCAST_ON:
2987 2987 new_flags |= NCE_F_ANYCAST;
2988 2988 break;
2989 2989 case NDF_ANYCAST_OFF:
2990 2990 new_flags &= ~NCE_F_ANYCAST;
2991 2991 break;
2992 2992 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2993 2993 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2994 2994 if (nce != NULL)
2995 2995 nce_refrele(nce);
2996 2996 return (EINVAL);
2997 2997 }
2998 2998
2999 2999 if (nce == NULL) {
3000 3000 err = nce_add_v6(ill,
3001 3001 (uchar_t *)lnr->lnr_hdw_addr,
3002 3002 ill->ill_phys_addr_length,
3003 3003 addr,
3004 3004 new_flags,
3005 3005 lnr->lnr_state_create,
3006 3006 &nce);
3007 3007 if (err != 0) {
3008 3008 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3009 3009 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3010 3010 return (err);
3011 3011 } else {
3012 3012 do_postprocess = B_TRUE;
3013 3013 }
3014 3014 }
3015 3015 ncec = nce->nce_common;
3016 3016 old_flags = ncec->ncec_flags;
3017 3017 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3018 3018 ncec_router_to_host(ncec);
3019 3019 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3020 3020 if (do_postprocess)
3021 3021 err = nce_add_v6_postprocess(nce);
3022 3022 nce_refrele(nce);
3023 3023 return (0);
3024 3024 }
3025 3025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 3026
3027 3027 if (do_postprocess)
3028 3028 err = nce_add_v6_postprocess(nce);
3029 3029 /*
3030 3030 * err cannot be anything other than 0 because we don't support
3031 3031 * proxy arp of static addresses.
3032 3032 */
3033 3033 ASSERT(err == 0);
3034 3034
3035 3035 mutex_enter(&ncec->ncec_lock);
3036 3036 ncec->ncec_flags = new_flags;
3037 3037 mutex_exit(&ncec->ncec_lock);
3038 3038 /*
3039 3039 * Note that we ignore the state at this point, which
3040 3040 * should be either STALE or REACHABLE. Instead we let
3041 3041 * the link layer address passed in to determine the state
3042 3042 * much like incoming packets.
3043 3043 */
3044 3044 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3045 3045 nce_refrele(nce);
3046 3046 return (0);
3047 3047 }
3048 3048
3049 3049 /*
3050 3050 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3051 3051 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3052 3052 * be held to ensure that they are in the same group.
3053 3053 */
3054 3054 static nce_t *
3055 3055 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3056 3056 {
3057 3057
3058 3058 nce_t *nce;
3059 3059
3060 3060 nce = nce_ill_lookup_then_add(ill, ncec);
3061 3061
3062 3062 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3063 3063 return (nce);
3064 3064
3065 3065 /*
3066 3066 * hold the ncec_lock to synchronize with nce_update() so that,
3067 3067 * at the end of this function, the contents of nce_dlur_mp are
3068 3068 * consistent with ncec->ncec_lladdr, even though some intermediate
3069 3069 * packet may have been sent out with a mangled address, which would
3070 3070 * only be a transient condition.
3071 3071 */
3072 3072 mutex_enter(&ncec->ncec_lock);
3073 3073 if (ncec->ncec_lladdr != NULL) {
3074 3074 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3075 3075 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3076 3076 } else {
3077 3077 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3078 3078 ill->ill_sap_length);
3079 3079 }
3080 3080 mutex_exit(&ncec->ncec_lock);
3081 3081 return (nce);
3082 3082 }
3083 3083
3084 3084 /*
3085 3085 * we make nce_fp_mp to have an M_DATA prepend.
3086 3086 * The caller ensures there is hold on ncec for this function.
3087 3087 * Note that since ill_fastpath_probe() copies the mblk there is
3088 3088 * no need to hold the nce or ncec beyond this function.
3089 3089 *
3090 3090 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3091 3091 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3092 3092 * and will be returned back by this function, so that no extra nce_refrele
3093 3093 * is required for the caller. The calls from nce_add_common() use this
3094 3094 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3095 3095 * nce_refrele of the returned nce (when it is non-null).
3096 3096 */
3097 3097 nce_t *
3098 3098 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3099 3099 {
3100 3100 nce_t *nce;
3101 3101 ill_t *ill = ncec->ncec_ill;
3102 3102
3103 3103 ASSERT(ill != NULL);
3104 3104
3105 3105 if (IS_IPMP(ill) && trigger_fp_req) {
3106 3106 trigger_fp_req = B_FALSE;
3107 3107 ipmp_ncec_refresh_nce(ncec);
3108 3108 }
3109 3109
3110 3110 /*
3111 3111 * If the caller already has the nce corresponding to the ill, use
3112 3112 * that one. Otherwise we have to lookup/add the nce. Calls from
3113 3113 * nce_add_common() fall in the former category, and have just done
3114 3114 * the nce lookup/add that can be reused.
3115 3115 */
3116 3116 if (ncec_nce == NULL)
3117 3117 nce = nce_fastpath_create(ill, ncec);
3118 3118 else
3119 3119 nce = ncec_nce;
3120 3120
3121 3121 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3122 3122 return (nce);
3123 3123
3124 3124 if (trigger_fp_req)
3125 3125 nce_fastpath_trigger(nce);
3126 3126 return (nce);
3127 3127 }
3128 3128
3129 3129 /*
3130 3130 * Trigger fastpath on nce. No locks may be held.
3131 3131 */
3132 3132 static void
3133 3133 nce_fastpath_trigger(nce_t *nce)
3134 3134 {
3135 3135 int res;
3136 3136 ill_t *ill = nce->nce_ill;
3137 3137 ncec_t *ncec = nce->nce_common;
3138 3138
3139 3139 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3140 3140 /*
3141 3141 * EAGAIN is an indication of a transient error
3142 3142 * i.e. allocation failure etc. leave the ncec in the list it
3143 3143 * will be updated when another probe happens for another ire
3144 3144 * if not it will be taken out of the list when the ire is
3145 3145 * deleted.
3146 3146 */
3147 3147 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3148 3148 nce_fastpath_list_delete(ill, ncec, NULL);
3149 3149 }
3150 3150
3151 3151 /*
3152 3152 * Add ncec to the nce fastpath list on ill.
3153 3153 */
3154 3154 static nce_t *
3155 3155 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3156 3156 {
3157 3157 nce_t *nce = NULL;
3158 3158
3159 3159 ASSERT(MUTEX_HELD(&ill->ill_lock));
3160 3160 /*
3161 3161 * Atomically ensure that the ill is not CONDEMNED and is not going
3162 3162 * down, before adding the NCE.
3163 3163 */
3164 3164 if (ill->ill_state_flags & ILL_CONDEMNED)
3165 3165 return (NULL);
3166 3166 mutex_enter(&ncec->ncec_lock);
3167 3167 /*
3168 3168 * if ncec has not been deleted and
3169 3169 * is not already in the list add it.
3170 3170 */
3171 3171 if (!NCE_ISCONDEMNED(ncec)) {
3172 3172 nce = nce_lookup(ill, &ncec->ncec_addr);
3173 3173 if (nce != NULL)
3174 3174 goto done;
3175 3175 nce = nce_add(ill, ncec);
3176 3176 }
3177 3177 done:
3178 3178 mutex_exit(&ncec->ncec_lock);
3179 3179 return (nce);
3180 3180 }
3181 3181
3182 3182 nce_t *
3183 3183 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3184 3184 {
3185 3185 nce_t *nce;
3186 3186
3187 3187 mutex_enter(&ill->ill_lock);
3188 3188 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3189 3189 mutex_exit(&ill->ill_lock);
3190 3190 return (nce);
3191 3191 }
3192 3192
3193 3193
3194 3194 /*
3195 3195 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3196 3196 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3197 3197 * entry after all locks have been dropped.
3198 3198 */
3199 3199 void
3200 3200 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3201 3201 {
3202 3202 nce_t *nce;
3203 3203
3204 3204 ASSERT(ill != NULL);
3205 3205
3206 3206 /* delete any nces referencing the ncec from underlying ills */
3207 3207 if (IS_IPMP(ill))
3208 3208 ipmp_ncec_delete_nce(ncec);
3209 3209
3210 3210 /* now the ill itself */
3211 3211 mutex_enter(&ill->ill_lock);
3212 3212 for (nce = list_head(&ill->ill_nce); nce != NULL;
3213 3213 nce = list_next(&ill->ill_nce, nce)) {
3214 3214 if (nce->nce_common == ncec) {
3215 3215 nce_refhold(nce);
3216 3216 nce_delete(nce);
3217 3217 break;
3218 3218 }
3219 3219 }
3220 3220 mutex_exit(&ill->ill_lock);
3221 3221 if (nce != NULL) {
3222 3222 if (dead == NULL)
3223 3223 nce_refrele(nce);
3224 3224 else
3225 3225 list_insert_tail(dead, nce);
3226 3226 }
3227 3227 }
3228 3228
3229 3229 /*
3230 3230 * when the fastpath response does not fit in the datab
3231 3231 * associated with the existing nce_fp_mp, we delete and
3232 3232 * add the nce to retrigger fastpath based on the information
3233 3233 * in the ncec_t.
3234 3234 */
3235 3235 static nce_t *
3236 3236 nce_delete_then_add(nce_t *nce)
3237 3237 {
3238 3238 ill_t *ill = nce->nce_ill;
3239 3239 nce_t *newnce = NULL;
3240 3240
3241 3241 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3242 3242 (void *)nce, ill->ill_name));
3243 3243 mutex_enter(&ill->ill_lock);
3244 3244 mutex_enter(&nce->nce_common->ncec_lock);
3245 3245 nce_delete(nce);
3246 3246 /*
3247 3247 * Make sure that ncec is not condemned before adding. We hold the
3248 3248 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3249 3249 * ipmp_ncec_delete_nce()
3250 3250 */
3251 3251 if (!NCE_ISCONDEMNED(nce->nce_common))
3252 3252 newnce = nce_add(ill, nce->nce_common);
3253 3253 mutex_exit(&nce->nce_common->ncec_lock);
3254 3254 mutex_exit(&ill->ill_lock);
3255 3255 nce_refrele(nce);
3256 3256 return (newnce); /* could be null if nomem */
3257 3257 }
3258 3258
3259 3259 typedef struct nce_fp_match_s {
3260 3260 nce_t *nce_fp_match_res;
3261 3261 mblk_t *nce_fp_match_ack_mp;
3262 3262 } nce_fp_match_t;
3263 3263
3264 3264 /* ARGSUSED */
3265 3265 static int
3266 3266 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3267 3267 {
3268 3268 nce_fp_match_t *nce_fp_marg = arg;
3269 3269 ncec_t *ncec = nce->nce_common;
3270 3270 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3271 3271 uchar_t *mp_rptr, *ud_mp_rptr;
3272 3272 mblk_t *ud_mp = nce->nce_dlur_mp;
3273 3273 ptrdiff_t cmplen;
3274 3274
3275 3275 /*
3276 3276 * mp is the mp associated with the fastpath ack.
3277 3277 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3278 3278 * under consideration. If the contents match, then the
3279 3279 * fastpath ack is used to update the nce.
3280 3280 */
3281 3281 if (ud_mp == NULL)
3282 3282 return (0);
3283 3283 mp_rptr = mp->b_rptr;
3284 3284 cmplen = mp->b_wptr - mp_rptr;
3285 3285 ASSERT(cmplen >= 0);
3286 3286
3287 3287 ud_mp_rptr = ud_mp->b_rptr;
3288 3288 /*
3289 3289 * The ncec is locked here to prevent any other threads from accessing
3290 3290 * and changing nce_dlur_mp when the address becomes resolved to an
3291 3291 * lla while we're in the middle of looking at and comparing the
3292 3292 * hardware address (lla). It is also locked to prevent multiple
3293 3293 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3294 3294 * time.
3295 3295 */
3296 3296 mutex_enter(&ncec->ncec_lock);
3297 3297 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3298 3298 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3299 3299 nce_fp_marg->nce_fp_match_res = nce;
3300 3300 mutex_exit(&ncec->ncec_lock);
3301 3301 nce_refhold(nce);
3302 3302 return (1);
3303 3303 }
3304 3304 mutex_exit(&ncec->ncec_lock);
3305 3305 return (0);
3306 3306 }
3307 3307
3308 3308 /*
3309 3309 * Update all NCE's that are not in fastpath mode and
3310 3310 * have an nce_fp_mp that matches mp. mp->b_cont contains
3311 3311 * the fastpath header.
3312 3312 *
3313 3313 * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3314 3314 */
3315 3315 void
3316 3316 nce_fastpath_update(ill_t *ill, mblk_t *mp)
3317 3317 {
3318 3318 nce_fp_match_t nce_fp_marg;
3319 3319 nce_t *nce;
3320 3320 mblk_t *nce_fp_mp, *fp_mp;
3321 3321
3322 3322 nce_fp_marg.nce_fp_match_res = NULL;
3323 3323 nce_fp_marg.nce_fp_match_ack_mp = mp;
3324 3324
3325 3325 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3326 3326
3327 3327 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3328 3328 return;
3329 3329
3330 3330 mutex_enter(&nce->nce_lock);
3331 3331 nce_fp_mp = nce->nce_fp_mp;
3332 3332
3333 3333 if (nce_fp_mp != NULL) {
3334 3334 fp_mp = mp->b_cont;
3335 3335 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3336 3336 nce_fp_mp->b_datap->db_lim) {
3337 3337 mutex_exit(&nce->nce_lock);
3338 3338 nce = nce_delete_then_add(nce);
3339 3339 if (nce == NULL) {
3340 3340 return;
3341 3341 }
3342 3342 mutex_enter(&nce->nce_lock);
3343 3343 nce_fp_mp = nce->nce_fp_mp;
3344 3344 }
3345 3345 }
3346 3346
3347 3347 /* Matched - install mp as the fastpath mp */
3348 3348 if (nce_fp_mp == NULL) {
3349 3349 fp_mp = dupb(mp->b_cont);
3350 3350 nce->nce_fp_mp = fp_mp;
3351 3351 } else {
3352 3352 fp_mp = mp->b_cont;
3353 3353 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3354 3354 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3355 3355 + MBLKL(fp_mp);
3356 3356 }
3357 3357 mutex_exit(&nce->nce_lock);
3358 3358 nce_refrele(nce);
3359 3359 }
3360 3360
3361 3361 /*
3362 3362 * Return a pointer to a given option in the packet.
3363 3363 * Assumes that option part of the packet have already been validated.
3364 3364 */
3365 3365 nd_opt_hdr_t *
3366 3366 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3367 3367 {
3368 3368 while (optlen > 0) {
3369 3369 if (opt->nd_opt_type == opt_type)
3370 3370 return (opt);
3371 3371 optlen -= 8 * opt->nd_opt_len;
3372 3372 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3373 3373 }
3374 3374 return (NULL);
3375 3375 }
3376 3376
3377 3377 /*
3378 3378 * Verify all option lengths present are > 0, also check to see
3379 3379 * if the option lengths and packet length are consistent.
3380 3380 */
3381 3381 boolean_t
3382 3382 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3383 3383 {
3384 3384 ASSERT(opt != NULL);
3385 3385 while (optlen > 0) {
3386 3386 if (opt->nd_opt_len == 0)
3387 3387 return (B_FALSE);
3388 3388 optlen -= 8 * opt->nd_opt_len;
3389 3389 if (optlen < 0)
3390 3390 return (B_FALSE);
3391 3391 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3392 3392 }
3393 3393 return (B_TRUE);
3394 3394 }
3395 3395
3396 3396 /*
3397 3397 * ncec_walk function.
3398 3398 * Free a fraction of the NCE cache entries.
3399 3399 *
3400 3400 * A possible optimization here would be to use ncec_last where possible, and
3401 3401 * delete the least-frequently used entry, which would require more complex
3402 3402 * computation as we walk through the ncec's (e.g., track ncec entries by
3403 3403 * order of ncec_last and/or maintain state)
3404 3404 */
3405 3405 static void
3406 3406 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3407 3407 {
3408 3408 ip_stack_t *ipst = ncec->ncec_ipst;
3409 3409 uint_t fraction = *(uint_t *)arg;
3410 3410 uint_t rand;
3411 3411
3412 3412 if ((ncec->ncec_flags &
3413 3413 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3414 3414 return;
3415 3415 }
3416 3416
3417 3417 rand = (uint_t)ddi_get_lbolt() +
3418 3418 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3419 3419 if ((rand/fraction)*fraction == rand) {
3420 3420 IP_STAT(ipst, ip_nce_reclaim_deleted);
3421 3421 ncec_delete(ncec);
3422 3422 }
3423 3423 }
3424 3424
3425 3425 /*
3426 3426 * kmem_cache callback to free up memory.
3427 3427 *
3428 3428 * For now we just delete a fixed fraction.
3429 3429 */
3430 3430 static void
3431 3431 ip_nce_reclaim_stack(ip_stack_t *ipst)
3432 3432 {
3433 3433 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
3434 3434
3435 3435 IP_STAT(ipst, ip_nce_reclaim_calls);
3436 3436
3437 3437 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3438 3438
3439 3439 /*
3440 3440 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3441 3441 * Get them to update any stale references to drop any refholds they
3442 3442 * have.
3443 3443 */
3444 3444 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3445 3445 }
3446 3446
3447 3447 /*
3448 3448 * Called by the memory allocator subsystem directly, when the system
3449 3449 * is running low on memory.
3450 3450 */
3451 3451 /* ARGSUSED */
3452 3452 void
3453 3453 ip_nce_reclaim(void *args)
3454 3454 {
3455 3455 netstack_handle_t nh;
3456 3456 netstack_t *ns;
3457 3457 ip_stack_t *ipst;
3458 3458
3459 3459 netstack_next_init(&nh);
3460 3460 while ((ns = netstack_next(&nh)) != NULL) {
3461 3461 /*
3462 3462 * netstack_next() can return a netstack_t with a NULL
3463 3463 * netstack_ip at boot time.
3464 3464 */
3465 3465 if ((ipst = ns->netstack_ip) == NULL) {
3466 3466 netstack_rele(ns);
3467 3467 continue;
3468 3468 }
3469 3469 ip_nce_reclaim_stack(ipst);
3470 3470 netstack_rele(ns);
3471 3471 }
3472 3472 netstack_next_fini(&nh);
3473 3473 }
3474 3474
#ifdef DEBUG
/*
 * Record a reference on ncec via th_trace_ref(). If recording fails,
 * tracing is disabled for this ncec and the trace state is cleaned up.
 * The caller must hold ncec_lock.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable &&
	    !th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/*
 * Drop a recorded reference on ncec, unless tracing has been disabled for
 * it. The caller must hold ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	th_trace_unref(ncec);
}

/*
 * Hand the trace state of ncec to th_trace_cleanup().
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
3505 3505
3506 3506 /*
3507 3507 * Called when address resolution fails due to a timeout.
3508 3508 * Send an ICMP unreachable in response to all queued packets.
3509 3509 */
3510 3510 void
3511 3511 arp_resolv_failed(ncec_t *ncec)
3512 3512 {
3513 3513 mblk_t *mp, *nxt_mp;
3514 3514 char buf[INET6_ADDRSTRLEN];
3515 3515 struct in_addr ipv4addr;
3516 3516 ill_t *ill = ncec->ncec_ill;
3517 3517 ip_stack_t *ipst = ncec->ncec_ipst;
3518 3518 ip_recv_attr_t iras;
3519 3519
3520 3520 bzero(&iras, sizeof (iras));
3521 3521 iras.ira_flags = IRAF_IS_IPV4;
3522 3522 /*
3523 3523 * we are setting the ira_rill to the ipmp_ill (instead of
3524 3524 * the actual ill on which the packet was received), but this
3525 3525 * is ok because we don't actually need the real ira_rill.
3526 3526 * to send the icmp unreachable to the sender.
3527 3527 */
3528 3528 iras.ira_ill = iras.ira_rill = ill;
3529 3529 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3530 3530 iras.ira_rifindex = iras.ira_ruifindex;
3531 3531
3532 3532 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3533 3533 ip3dbg(("arp_resolv_failed: dst %s\n",
3534 3534 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3535 3535 mutex_enter(&ncec->ncec_lock);
3536 3536 mp = ncec->ncec_qd_mp;
3537 3537 ncec->ncec_qd_mp = NULL;
3538 3538 ncec->ncec_nprobes = 0;
3539 3539 mutex_exit(&ncec->ncec_lock);
3540 3540 while (mp != NULL) {
3541 3541 nxt_mp = mp->b_next;
3542 3542 mp->b_next = NULL;
3543 3543
3544 3544 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3545 3545 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3546 3546 mp, ill);
3547 3547 if (ipst->ips_ip_arp_icmp_error) {
3548 3548 ip3dbg(("arp_resolv_failed: "
3549 3549 "Calling icmp_unreachable\n"));
3550 3550 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3551 3551 } else {
3552 3552 freemsg(mp);
3553 3553 }
3554 3554 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3555 3555 mp = nxt_mp;
3556 3556 }
3557 3557 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3558 3558 }
3559 3559
3560 3560 /*
3561 3561 * if ill is an under_ill, translate it to the ipmp_ill and add the
3562 3562 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3563 3563 * one on the underlying in_ill) will be created for the
3564 3564 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3565 3565 */
3566 3566 int
3567 3567 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3568 3568 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3569 3569 {
3570 3570 int err;
3571 3571 in6_addr_t addr6;
3572 3572 ip_stack_t *ipst = ill->ill_ipst;
3573 3573 nce_t *nce, *upper_nce = NULL;
3574 3574 ill_t *in_ill = ill, *under = NULL;
3575 3575 boolean_t need_ill_refrele = B_FALSE;
3576 3576
3577 3577 if (flags & NCE_F_MCAST) {
3578 3578 /*
3579 3579 * hw_addr will be figured out in nce_set_multicast_v4;
3580 3580 * caller needs to pass in the cast_ill for ipmp
3581 3581 */
3582 3582 ASSERT(hw_addr == NULL);
3583 3583 ASSERT(!IS_IPMP(ill));
3584 3584 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3585 3585 return (err);
3586 3586 }
3587 3587
3588 3588 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3589 3589 ill = ipmp_ill_hold_ipmp_ill(ill);
3590 3590 if (ill == NULL)
3591 3591 return (ENXIO);
3592 3592 need_ill_refrele = B_TRUE;
3593 3593 }
3594 3594 if ((flags & NCE_F_BCAST) != 0) {
3595 3595 /*
3596 3596 * IPv4 broadcast ncec: compute the hwaddr.
3597 3597 */
3598 3598 if (IS_IPMP(ill)) {
3599 3599 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3600 3600 if (under == NULL) {
3601 3601 if (need_ill_refrele)
3602 3602 ill_refrele(ill);
3603 3603 return (ENETDOWN);
3604 3604 }
3605 3605 hw_addr = under->ill_bcast_mp->b_rptr +
3606 3606 NCE_LL_ADDR_OFFSET(under);
3607 3607 hw_addr_len = under->ill_phys_addr_length;
3608 3608 } else {
3609 3609 hw_addr = ill->ill_bcast_mp->b_rptr +
3610 3610 NCE_LL_ADDR_OFFSET(ill),
3611 3611 hw_addr_len = ill->ill_phys_addr_length;
3612 3612 }
3613 3613 }
3614 3614
3615 3615 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3616 3616 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3617 3617 nce = nce_lookup_addr(ill, &addr6);
3618 3618 if (nce == NULL) {
3619 3619 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3620 3620 state, &nce);
3621 3621 } else {
3622 3622 err = EEXIST;
3623 3623 }
3624 3624 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3625 3625 if (err == 0)
3626 3626 err = nce_add_v4_postprocess(nce);
3627 3627
3628 3628 if (in_ill != ill && nce != NULL) {
3629 3629 nce_t *under_nce = NULL;
3630 3630
3631 3631 /*
3632 3632 * in_ill was the under_ill. Try to create the under_nce.
3633 3633 * Hold the ill_g_lock to prevent changes to group membership
3634 3634 * until we are done.
3635 3635 */
3636 3636 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3637 3637 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3638 3638 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3639 3639 ill_t *, ill);
3640 3640 rw_exit(&ipst->ips_ill_g_lock);
3641 3641 err = ENXIO;
3642 3642 nce_refrele(nce);
3643 3643 nce = NULL;
3644 3644 goto bail;
3645 3645 }
3646 3646 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3647 3647 if (under_nce == NULL) {
3648 3648 rw_exit(&ipst->ips_ill_g_lock);
3649 3649 err = EINVAL;
3650 3650 nce_refrele(nce);
3651 3651 nce = NULL;
3652 3652 goto bail;
3653 3653 }
3654 3654 rw_exit(&ipst->ips_ill_g_lock);
3655 3655 upper_nce = nce;
3656 3656 nce = under_nce; /* will be returned to caller */
3657 3657 if (NCE_ISREACHABLE(nce->nce_common))
3658 3658 nce_fastpath_trigger(under_nce);
3659 3659 }
3660 3660 if (nce != NULL) {
3661 3661 if (newnce != NULL)
3662 3662 *newnce = nce;
3663 3663 else
3664 3664 nce_refrele(nce);
3665 3665 }
3666 3666 bail:
3667 3667 if (under != NULL)
3668 3668 ill_refrele(under);
3669 3669 if (upper_nce != NULL)
3670 3670 nce_refrele(upper_nce);
3671 3671 if (need_ill_refrele)
3672 3672 ill_refrele(ill);
3673 3673
3674 3674 return (err);
3675 3675 }
3676 3676
3677 3677 /*
3678 3678 * NDP Cache Entry creation routine for IPv4.
3679 3679 * This routine must always be called with ndp4->ndp_g_lock held.
3680 3680 * Prior to return, ncec_refcnt is incremented.
3681 3681 *
3682 3682 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3683 3683 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3684 3684 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3685 3685 * entries will be created, both pointing at the same ncec_t. The nce_t
3686 3686 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3687 3687 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3688 3688 * Local addresses are always created on the ill passed to nce_add_v4.
3689 3689 */
3690 3690 int
3691 3691 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3692 3692 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3693 3693 {
3694 3694 int err;
3695 3695 boolean_t is_multicast = (flags & NCE_F_MCAST);
3696 3696 struct in6_addr addr6;
3697 3697 nce_t *nce;
3698 3698
3699 3699 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3700 3700 ASSERT(!ill->ill_isv6);
3701 3701 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3702 3702
3703 3703 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3704 3704 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3705 3705 &nce);
3706 3706 ASSERT(newnce != NULL);
3707 3707 *newnce = nce;
3708 3708 return (err);
3709 3709 }
3710 3710
3711 3711 /*
3712 3712 * Post-processing routine to be executed after nce_add_v4(). This function
3713 3713 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3714 3714 * and must be called without any locks held.
3715 3715 *
3716 3716 * Always returns 0, but we return an int to keep this symmetric with the
3717 3717 * IPv6 counter-part.
3718 3718 */
3719 3719 int
3720 3720 nce_add_v4_postprocess(nce_t *nce)
3721 3721 {
3722 3722 ncec_t *ncec = nce->nce_common;
3723 3723 uint16_t flags = ncec->ncec_flags;
3724 3724 boolean_t ndp_need_dad = B_FALSE;
3725 3725 boolean_t dropped;
3726 3726 clock_t delay;
3727 3727 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3728 3728 uchar_t *hw_addr = ncec->ncec_lladdr;
3729 3729 boolean_t trigger_fastpath = B_TRUE;
3730 3730
3731 3731 /*
3732 3732 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3733 3733 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3734 3734 * We call nce_fastpath from nce_update if the link layer address of
3735 3735 * the peer changes from nce_update
3736 3736 */
3737 3737 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3738 3738 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3739 3739 trigger_fastpath = B_FALSE;
3740 3740
3741 3741 if (trigger_fastpath)
3742 3742 nce_fastpath_trigger(nce);
3743 3743
3744 3744 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3745 3745 /*
3746 3746 * Either the caller (by passing in ND_PROBE)
3747 3747 * or nce_add_common() (by the internally computed state
3748 3748 * based on ncec_addr and ill_net_type) has determined
3749 3749 * that this unicast entry needs DAD. Trigger DAD.
3750 3750 */
3751 3751 ndp_need_dad = B_TRUE;
3752 3752 } else if (flags & NCE_F_UNSOL_ADV) {
3753 3753 /*
3754 3754 * We account for the transmit below by assigning one
3755 3755 * less than the ndd variable. Subsequent decrements
3756 3756 * are done in nce_timer.
3757 3757 */
3758 3758 mutex_enter(&ncec->ncec_lock);
3759 3759 ncec->ncec_unsolicit_count =
3760 3760 ipst->ips_ip_arp_publish_count - 1;
3761 3761 mutex_exit(&ncec->ncec_lock);
3762 3762 dropped = arp_announce(ncec);
3763 3763 mutex_enter(&ncec->ncec_lock);
3764 3764 if (dropped)
3765 3765 ncec->ncec_unsolicit_count++;
3766 3766 else
3767 3767 ncec->ncec_last_time_defended = ddi_get_lbolt();
3768 3768 if (ncec->ncec_unsolicit_count != 0) {
3769 3769 nce_start_timer(ncec,
3770 3770 ipst->ips_ip_arp_publish_interval);
3771 3771 }
3772 3772 mutex_exit(&ncec->ncec_lock);
3773 3773 }
3774 3774
3775 3775 /*
3776 3776 * If ncec_xmit_interval is 0, user has configured us to send the first
3777 3777 * probe right away. Do so, and set up for the subsequent probes.
3778 3778 */
3779 3779 if (ndp_need_dad) {
3780 3780 mutex_enter(&ncec->ncec_lock);
3781 3781 if (ncec->ncec_pcnt == 0) {
3782 3782 /*
3783 3783 * DAD probes and announce can be
3784 3784 * administratively disabled by setting the
3785 3785 * probe_count to zero. Restart the timer in
3786 3786 * this case to mark the ipif as ready.
3787 3787 */
3788 3788 ncec->ncec_unsolicit_count = 0;
3789 3789 mutex_exit(&ncec->ncec_lock);
3790 3790 nce_restart_timer(ncec, 0);
3791 3791 } else {
3792 3792 mutex_exit(&ncec->ncec_lock);
3793 3793 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3794 3794 ipst->ips_arp_probe_delay :
3795 3795 ipst->ips_arp_fastprobe_delay);
3796 3796 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3797 3797 }
3798 3798 }
3799 3799 return (0);
3800 3800 }
3801 3801
3802 3802 /*
3803 3803 * ncec_walk routine to update all entries that have a given destination or
3804 3804 * gateway address and cached link layer (MAC) address. This is used when ARP
3805 3805 * informs us that a network-to-link-layer mapping may have changed.
3806 3806 */
3807 3807 void
3808 3808 nce_update_hw_changed(ncec_t *ncec, void *arg)
3809 3809 {
3810 3810 nce_hw_map_t *hwm = arg;
3811 3811 ipaddr_t ncec_addr;
3812 3812
3813 3813 if (ncec->ncec_state != ND_REACHABLE)
3814 3814 return;
3815 3815
3816 3816 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3817 3817 if (ncec_addr != hwm->hwm_addr)
3818 3818 return;
3819 3819
3820 3820 mutex_enter(&ncec->ncec_lock);
3821 3821 if (hwm->hwm_flags != 0)
3822 3822 ncec->ncec_flags = hwm->hwm_flags;
3823 3823 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3824 3824 mutex_exit(&ncec->ncec_lock);
3825 3825 }
3826 3826
3827 3827 void
3828 3828 ncec_refhold(ncec_t *ncec)
3829 3829 {
3830 3830 mutex_enter(&(ncec)->ncec_lock);
3831 3831 (ncec)->ncec_refcnt++;
3832 3832 ASSERT((ncec)->ncec_refcnt != 0);
3833 3833 #ifdef DEBUG
3834 3834 ncec_trace_ref(ncec);
3835 3835 #endif
3836 3836 mutex_exit(&(ncec)->ncec_lock);
3837 3837 }
3838 3838
3839 3839 void
3840 3840 ncec_refhold_notr(ncec_t *ncec)
3841 3841 {
3842 3842 mutex_enter(&(ncec)->ncec_lock);
3843 3843 (ncec)->ncec_refcnt++;
3844 3844 ASSERT((ncec)->ncec_refcnt != 0);
3845 3845 mutex_exit(&(ncec)->ncec_lock);
3846 3846 }
3847 3847
3848 3848 static void
3849 3849 ncec_refhold_locked(ncec_t *ncec)
3850 3850 {
3851 3851 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3852 3852 (ncec)->ncec_refcnt++;
3853 3853 #ifdef DEBUG
3854 3854 ncec_trace_ref(ncec);
3855 3855 #endif
3856 3856 }
3857 3857
3858 3858 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3859 3859 void
3860 3860 ncec_refrele(ncec_t *ncec)
3861 3861 {
3862 3862 mutex_enter(&(ncec)->ncec_lock);
3863 3863 #ifdef DEBUG
3864 3864 ncec_untrace_ref(ncec);
3865 3865 #endif
3866 3866 ASSERT((ncec)->ncec_refcnt != 0);
3867 3867 if (--(ncec)->ncec_refcnt == 0) {
3868 3868 ncec_inactive(ncec);
3869 3869 } else {
3870 3870 mutex_exit(&(ncec)->ncec_lock);
3871 3871 }
3872 3872 }
3873 3873
3874 3874 void
3875 3875 ncec_refrele_notr(ncec_t *ncec)
3876 3876 {
3877 3877 mutex_enter(&(ncec)->ncec_lock);
3878 3878 ASSERT((ncec)->ncec_refcnt != 0);
3879 3879 if (--(ncec)->ncec_refcnt == 0) {
3880 3880 ncec_inactive(ncec);
3881 3881 } else {
3882 3882 mutex_exit(&(ncec)->ncec_lock);
3883 3883 }
3884 3884 }
3885 3885
3886 3886 /*
3887 3887 * Common to IPv4 and IPv6.
3888 3888 */
3889 3889 void
3890 3890 nce_restart_timer(ncec_t *ncec, uint_t ms)
3891 3891 {
3892 3892 timeout_id_t tid;
3893 3893
3894 3894 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3895 3895
3896 3896 /* First cancel any running timer */
3897 3897 mutex_enter(&ncec->ncec_lock);
3898 3898 tid = ncec->ncec_timeout_id;
3899 3899 ncec->ncec_timeout_id = 0;
3900 3900 if (tid != 0) {
3901 3901 mutex_exit(&ncec->ncec_lock);
3902 3902 (void) untimeout(tid);
3903 3903 mutex_enter(&ncec->ncec_lock);
3904 3904 }
3905 3905
3906 3906 /* Restart timer */
3907 3907 nce_start_timer(ncec, ms);
3908 3908 mutex_exit(&ncec->ncec_lock);
3909 3909 }
3910 3910
3911 3911 static void
3912 3912 nce_start_timer(ncec_t *ncec, uint_t ms)
3913 3913 {
3914 3914 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3915 3915 /*
3916 3916 * Don't start the timer if the ncec has been deleted, or if the timer
3917 3917 * is already running
3918 3918 */
3919 3919 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3920 3920 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3921 3921 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3922 3922 }
3923 3923 }
3924 3924
3925 3925 int
3926 3926 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3927 3927 uint16_t flags, nce_t **newnce)
3928 3928 {
3929 3929 uchar_t *hw_addr;
3930 3930 int err = 0;
3931 3931 ip_stack_t *ipst = ill->ill_ipst;
3932 3932 in6_addr_t dst6;
3933 3933 nce_t *nce;
3934 3934
3935 3935 ASSERT(!ill->ill_isv6);
3936 3936
3937 3937 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3938 3938 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3939 3939 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3940 3940 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3941 3941 goto done;
3942 3942 }
3943 3943 if (ill->ill_net_type == IRE_IF_RESOLVER) {
3944 3944 /*
3945 3945 * For IRE_IF_RESOLVER a hardware mapping can be
3946 3946 * generated, for IRE_IF_NORESOLVER, resolution cookie
3947 3947 * in the ill is copied in nce_add_v4().
3948 3948 */
3949 3949 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3950 3950 if (hw_addr == NULL) {
3951 3951 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952 3952 return (ENOMEM);
3953 3953 }
3954 3954 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3955 3955 } else {
3956 3956 /*
3957 3957 * IRE_IF_NORESOLVER type simply copies the resolution
3958 3958 * cookie passed in. So no hw_addr is needed.
3959 3959 */
3960 3960 hw_addr = NULL;
3961 3961 }
3962 3962 ASSERT(flags & NCE_F_MCAST);
3963 3963 ASSERT(flags & NCE_F_NONUD);
3964 3964 /* nce_state will be computed by nce_add_common() */
3965 3965 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3966 3966 ND_UNCHANGED, &nce);
3967 3967 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3968 3968 if (err == 0)
3969 3969 err = nce_add_v4_postprocess(nce);
3970 3970 if (hw_addr != NULL)
3971 3971 kmem_free(hw_addr, ill->ill_phys_addr_length);
3972 3972 if (err != 0) {
3973 3973 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3974 3974 return (err);
3975 3975 }
3976 3976 done:
3977 3977 if (newnce != NULL)
3978 3978 *newnce = nce;
3979 3979 else
3980 3980 nce_refrele(nce);
3981 3981 return (0);
3982 3982 }
3983 3983
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.  NCE_RESCHED_LIST_LEN is the size of one batch.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	/* ill being scanned; set by nce_ill_reschedule() */
	ill_t *ncert_ill;
	/* number of valid entries in ncert_nces[] */
	uint_t ncert_num;
	/* the batch; each entry is refheld by ncec_reschedule() */
	ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
3996 3996
3997 3997 /*
3998 3998 * Pick the longest waiting NCEs for defense.
3999 3999 */
4000 4000 /* ARGSUSED */
4001 4001 static int
4002 4002 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4003 4003 {
4004 4004 nce_resched_t *ncert = arg;
4005 4005 ncec_t **ncecs;
4006 4006 ncec_t **ncec_max;
4007 4007 ncec_t *ncec_temp;
4008 4008 ncec_t *ncec = nce->nce_common;
4009 4009
4010 4010 ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4011 4011 /*
4012 4012 * Only reachable entries that are ready for announcement are eligible.
4013 4013 */
4014 4014 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4015 4015 return (0);
4016 4016 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4017 4017 ncec_refhold(ncec);
4018 4018 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4019 4019 } else {
4020 4020 ncecs = ncert->ncert_nces;
4021 4021 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4022 4022 ncec_refhold(ncec);
4023 4023 for (; ncecs < ncec_max; ncecs++) {
4024 4024 ASSERT(ncec != NULL);
4025 4025 if ((*ncecs)->ncec_last_time_defended >
4026 4026 ncec->ncec_last_time_defended) {
4027 4027 ncec_temp = *ncecs;
4028 4028 *ncecs = ncec;
4029 4029 ncec = ncec_temp;
4030 4030 }
4031 4031 }
4032 4032 ncec_refrele(ncec);
4033 4033 }
4034 4034 return (0);
4035 4035 }
4036 4036
4037 4037 /*
4038 4038 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
4039 4039 * doesn't happen very often (if at all), and thus it needn't be highly
4040 4040 * optimized. (Note, though, that it's actually O(N) complexity, because the
4041 4041 * outer loop is bounded by a constant rather than by the length of the list.)
4042 4042 */
4043 4043 static void
4044 4044 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4045 4045 {
4046 4046 ncec_t *ncec;
4047 4047 ip_stack_t *ipst = ill->ill_ipst;
4048 4048 uint_t i, defend_rate;
4049 4049
4050 4050 i = ill->ill_defend_count;
4051 4051 ill->ill_defend_count = 0;
4052 4052 if (ill->ill_isv6)
4053 4053 defend_rate = ipst->ips_ndp_defend_rate;
4054 4054 else
4055 4055 defend_rate = ipst->ips_arp_defend_rate;
4056 4056 /* If none could be sitting around, then don't reschedule */
4057 4057 if (i < defend_rate) {
4058 4058 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4059 4059 return;
4060 4060 }
4061 4061 ncert->ncert_ill = ill;
4062 4062 while (ill->ill_defend_count < defend_rate) {
4063 4063 nce_walk_common(ill, ncec_reschedule, ncert);
4064 4064 for (i = 0; i < ncert->ncert_num; i++) {
4065 4065
4066 4066 ncec = ncert->ncert_nces[i];
4067 4067 mutex_enter(&ncec->ncec_lock);
4068 4068 ncec->ncec_flags |= NCE_F_DELAYED;
4069 4069 mutex_exit(&ncec->ncec_lock);
4070 4070 /*
4071 4071 * we plan to schedule this ncec, so incr the
4072 4072 * defend_count in anticipation.
4073 4073 */
4074 4074 if (++ill->ill_defend_count >= defend_rate)
4075 4075 break;
4076 4076 }
4077 4077 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4078 4078 break;
4079 4079 }
4080 4080 }
4081 4081
4082 4082 /*
4083 4083 * Check if the current rate-limiting parameters permit the sending
4084 4084 * of another address defense announcement for both IPv4 and IPv6.
4085 4085 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4086 4086 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4087 4087 * determines how many address defense announcements are permitted
4088 4088 * in any `defense_perio' interval.
4089 4089 */
4090 4090 static boolean_t
4091 4091 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4092 4092 {
4093 4093 clock_t now = ddi_get_lbolt();
4094 4094 ip_stack_t *ipst = ill->ill_ipst;
4095 4095 clock_t start = ill->ill_defend_start;
4096 4096 uint32_t elapsed, defend_period, defend_rate;
4097 4097 nce_resched_t ncert;
4098 4098 boolean_t ret;
4099 4099 int i;
4100 4100
4101 4101 if (ill->ill_isv6) {
4102 4102 defend_period = ipst->ips_ndp_defend_period;
4103 4103 defend_rate = ipst->ips_ndp_defend_rate;
4104 4104 } else {
4105 4105 defend_period = ipst->ips_arp_defend_period;
4106 4106 defend_rate = ipst->ips_arp_defend_rate;
4107 4107 }
4108 4108 if (defend_rate == 0)
4109 4109 return (B_TRUE);
4110 4110 bzero(&ncert, sizeof (ncert));
4111 4111 mutex_enter(&ill->ill_lock);
4112 4112 if (start > 0) {
4113 4113 elapsed = now - start;
4114 4114 if (elapsed > SEC_TO_TICK(defend_period)) {
4115 4115 ill->ill_defend_start = now;
4116 4116 /*
4117 4117 * nce_ill_reschedule will attempt to
4118 4118 * prevent starvation by reschduling the
4119 4119 * oldest entries, which are marked with
4120 4120 * the NCE_F_DELAYED flag.
4121 4121 */
4122 4122 nce_ill_reschedule(ill, &ncert);
4123 4123 }
4124 4124 } else {
4125 4125 ill->ill_defend_start = now;
4126 4126 }
4127 4127 ASSERT(ill->ill_defend_count <= defend_rate);
4128 4128 mutex_enter(&ncec->ncec_lock);
4129 4129 if (ncec->ncec_flags & NCE_F_DELAYED) {
4130 4130 /*
4131 4131 * This ncec was rescheduled as one of the really old
4132 4132 * entries needing on-going defense. The
4133 4133 * ill_defend_count was already incremented in
4134 4134 * nce_ill_reschedule. Go ahead and send the announce.
4135 4135 */
4136 4136 ncec->ncec_flags &= ~NCE_F_DELAYED;
4137 4137 mutex_exit(&ncec->ncec_lock);
4138 4138 ret = B_FALSE;
4139 4139 goto done;
4140 4140 }
4141 4141 mutex_exit(&ncec->ncec_lock);
4142 4142 if (ill->ill_defend_count < defend_rate)
4143 4143 ill->ill_defend_count++;
4144 4144 if (ill->ill_defend_count == defend_rate) {
4145 4145 /*
4146 4146 * we are no longer allowed to send unbidden defense
4147 4147 * messages. Wait for rescheduling.
4148 4148 */
4149 4149 ret = B_TRUE;
4150 4150 } else {
4151 4151 ret = B_FALSE;
4152 4152 }
4153 4153 done:
4154 4154 mutex_exit(&ill->ill_lock);
4155 4155 /*
4156 4156 * After all the locks have been dropped we can restart nce timer,
4157 4157 * and refrele the delayed ncecs
4158 4158 */
4159 4159 for (i = 0; i < ncert.ncert_num; i++) {
4160 4160 clock_t xmit_interval;
4161 4161 ncec_t *tmp;
4162 4162
4163 4163 tmp = ncert.ncert_nces[i];
4164 4164 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4165 4165 B_FALSE);
4166 4166 nce_restart_timer(tmp, xmit_interval);
4167 4167 ncec_refrele(tmp);
4168 4168 }
4169 4169 return (ret);
4170 4170 }
4171 4171
4172 4172 boolean_t
4173 4173 ndp_announce(ncec_t *ncec)
4174 4174 {
4175 4175 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4176 4176 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4177 4177 nce_advert_flags(ncec)));
4178 4178 }
4179 4179
4180 4180 ill_t *
4181 4181 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4182 4182 {
4183 4183 mblk_t *mp;
4184 4184 in6_addr_t src6;
4185 4185 ipaddr_t src4;
4186 4186 ill_t *ill = ncec->ncec_ill;
4187 4187 ill_t *src_ill = NULL;
4188 4188 ipif_t *ipif = NULL;
4189 4189 boolean_t is_myaddr = NCE_MYADDR(ncec);
4190 4190 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4191 4191
4192 4192 ASSERT(src != NULL);
4193 4193 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4194 4194 src6 = *src;
4195 4195 if (is_myaddr) {
4196 4196 src6 = ncec->ncec_addr;
4197 4197 if (!isv6)
4198 4198 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4199 4199 } else {
4200 4200 /*
4201 4201 * try to find one from the outgoing packet.
4202 4202 */
4203 4203 mutex_enter(&ncec->ncec_lock);
4204 4204 mp = ncec->ncec_qd_mp;
4205 4205 if (mp != NULL) {
4206 4206 if (isv6) {
4207 4207 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4208 4208
4209 4209 src6 = ip6h->ip6_src;
4210 4210 } else {
4211 4211 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4212 4212
4213 4213 src4 = ipha->ipha_src;
4214 4214 IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4215 4215 }
4216 4216 }
4217 4217 mutex_exit(&ncec->ncec_lock);
4218 4218 }
4219 4219
4220 4220 /*
4221 4221 * For outgoing packets, if the src of outgoing packet is one
4222 4222 * of the assigned interface addresses use it, otherwise we
4223 4223 * will pick the source address below.
4224 4224 * For local addresses (is_myaddr) doing DAD, NDP announce
4225 4225 * messages are mcast. So we use the (IPMP) cast_ill or the
4226 4226 * (non-IPMP) ncec_ill for these message types. The only case
4227 4227 * of unicast DAD messages are for IPv6 ND probes, for which
4228 4228 * we find the ipif_bound_ill corresponding to the ncec_addr.
4229 4229 */
4230 4230 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4231 4231 if (isv6) {
4232 4232 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4233 4233 ill->ill_ipst);
4234 4234 } else {
4235 4235 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4236 4236 ill->ill_ipst);
4237 4237 }
4238 4238
4239 4239 /*
4240 4240 * If no relevant ipif can be found, then it's not one of our
4241 4241 * addresses. Reset to :: and try to find a src for the NS or
4242 4242 * ARP request using ipif_select_source_v[4,6] below.
4243 4243 * If an ipif can be found, but it's not yet done with
4244 4244 * DAD verification, and we are not being invoked for
4245 4245 * DAD (i.e., !is_myaddr), then just postpone this
4246 4246 * transmission until later.
4247 4247 */
4248 4248 if (ipif == NULL) {
4249 4249 src6 = ipv6_all_zeros;
4250 4250 src4 = INADDR_ANY;
4251 4251 } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4252 4252 DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4253 4253 ncec_t *, ncec, ipif_t *, ipif);
4254 4254 ipif_refrele(ipif);
4255 4255 return (NULL);
4256 4256 }
4257 4257 }
4258 4258
4259 4259 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4260 4260 /*
4261 4261 * Pick a source address for this solicitation, but
4262 4262 * restrict the selection to addresses assigned to the
4263 4263 * output interface. We do this because the destination will
4264 4264 * create a neighbor cache entry for the source address of
4265 4265 * this packet, so the source address had better be a valid
4266 4266 * neighbor.
4267 4267 */
4268 4268 if (isv6) {
4269 4269 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4270 4270 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4271 4271 B_FALSE, NULL);
4272 4272 } else {
4273 4273 ipaddr_t nce_addr;
4274 4274
4275 4275 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4276 4276 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4277 4277 B_FALSE, NULL);
4278 4278 }
4279 4279 if (ipif == NULL && IS_IPMP(ill)) {
4280 4280 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4281 4281
4282 4282 if (send_ill != NULL) {
4283 4283 if (isv6) {
4284 4284 ipif = ipif_select_source_v6(send_ill,
4285 4285 &ncec->ncec_addr, B_TRUE,
4286 4286 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4287 4287 B_FALSE, NULL);
4288 4288 } else {
4289 4289 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4290 4290 src4);
4291 4291 ipif = ipif_select_source_v4(send_ill,
4292 4292 src4, ALL_ZONES, B_TRUE, NULL);
4293 4293 }
4294 4294 ill_refrele(send_ill);
4295 4295 }
4296 4296 }
4297 4297
4298 4298 if (ipif == NULL) {
4299 4299 char buf[INET6_ADDRSTRLEN];
4300 4300
4301 4301 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4302 4302 inet_ntop((isv6 ? AF_INET6 : AF_INET),
4303 4303 (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4304 4304 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4305 4305 return (NULL);
4306 4306 }
4307 4307 src6 = ipif->ipif_v6lcl_addr;
4308 4308 }
4309 4309 *src = src6;
4310 4310 if (ipif != NULL) {
4311 4311 src_ill = ipif->ipif_ill;
4312 4312 if (IS_IPMP(src_ill))
4313 4313 src_ill = ipmp_ipif_hold_bound_ill(ipif);
4314 4314 else
4315 4315 ill_refhold(src_ill);
4316 4316 ipif_refrele(ipif);
4317 4317 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4318 4318 ill_t *, src_ill);
4319 4319 }
4320 4320 return (src_ill);
4321 4321 }
4322 4322
4323 4323 void
4324 4324 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4325 4325 uchar_t *hwaddr, int hwaddr_len, int flags)
4326 4326 {
4327 4327 ill_t *ill;
4328 4328 ncec_t *ncec;
4329 4329 nce_t *nce;
4330 4330 uint16_t new_state;
4331 4331
4332 4332 ill = (ipif ? ipif->ipif_ill : NULL);
4333 4333 if (ill != NULL) {
4334 4334 /*
4335 4335 * only one ncec is possible
4336 4336 */
4337 4337 nce = nce_lookup_v4(ill, addr);
4338 4338 if (nce != NULL) {
4339 4339 ncec = nce->nce_common;
4340 4340 mutex_enter(&ncec->ncec_lock);
4341 4341 if (NCE_ISREACHABLE(ncec))
4342 4342 new_state = ND_UNCHANGED;
4343 4343 else
4344 4344 new_state = ND_STALE;
4345 4345 ncec->ncec_flags = flags;
4346 4346 nce_update(ncec, new_state, hwaddr);
4347 4347 mutex_exit(&ncec->ncec_lock);
4348 4348 nce_refrele(nce);
4349 4349 return;
4350 4350 }
4351 4351 } else {
4352 4352 /*
4353 4353 * ill is wildcard; clean up all ncec's and ire's
4354 4354 * that match on addr.
4355 4355 */
4356 4356 nce_hw_map_t hwm;
4357 4357
4358 4358 hwm.hwm_addr = *addr;
4359 4359 hwm.hwm_hwlen = hwaddr_len;
4360 4360 hwm.hwm_hwaddr = hwaddr;
4361 4361 hwm.hwm_flags = flags;
4362 4362
4363 4363 ncec_walk_common(ipst->ips_ndp4, NULL,
4364 4364 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4365 4365 }
4366 4366 }
4367 4367
4368 4368 /*
4369 4369 * Common function to add ncec entries.
4370 4370 * we always add the ncec with ncec_ill == ill, and always create
4371 4371 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4372 4372 * ncec is !reachable.
4373 4373 *
4374 4374 * When the caller passes in an nce_state of ND_UNCHANGED,
4375 4375 * nce_add_common() will determine the state of the created nce based
4376 4376 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4377 4377 * be created with state set to the passed in nce_state.
4378 4378 */
4379 4379 static int
4380 4380 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4381 4381 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4382 4382 {
4383 4383 static ncec_t nce_nil;
4384 4384 uchar_t *template = NULL;
4385 4385 int err;
4386 4386 ncec_t *ncec;
4387 4387 ncec_t **ncep;
4388 4388 ip_stack_t *ipst = ill->ill_ipst;
4389 4389 uint16_t state;
4390 4390 boolean_t fastprobe = B_FALSE;
4391 4391 struct ndp_g_s *ndp;
4392 4392 nce_t *nce = NULL;
4393 4393 mblk_t *dlur_mp = NULL;
4394 4394
4395 4395 if (ill->ill_isv6)
4396 4396 ndp = ill->ill_ipst->ips_ndp6;
4397 4397 else
4398 4398 ndp = ill->ill_ipst->ips_ndp4;
4399 4399
4400 4400 *retnce = NULL;
4401 4401
4402 4402 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4403 4403
4404 4404 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4405 4405 ip0dbg(("nce_add_common: no addr\n"));
4406 4406 return (EINVAL);
4407 4407 }
4408 4408 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4409 4409 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4410 4410 return (EINVAL);
4411 4411 }
4412 4412
4413 4413 if (ill->ill_isv6) {
4414 4414 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4415 4415 } else {
4416 4416 ipaddr_t v4addr;
4417 4417
4418 4418 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4419 4419 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4420 4420 }
4421 4421
4422 4422 /*
4423 4423 * The caller has ensured that there is no nce on ill, but there could
4424 4424 * still be an nce_common_t for the address, so that we find existing
4425 4425 * ncec_t structures first, and atomically add a new nce_t if
4426 4426 * one is found. The ndp_g_lock ensures that we don't cross threads
4427 4427 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4428 4428 * compare for matches across the illgrp because this function is
4429 4429 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4430 4430 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4431 4431 * appropriate.
4432 4432 */
4433 4433 ncec = *ncep;
4434 4434 for (; ncec != NULL; ncec = ncec->ncec_next) {
4435 4435 if (ncec->ncec_ill == ill) {
4436 4436 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4437 4437 /*
4438 4438 * We should never find *retnce to be
4439 4439 * MYADDR, since the caller may then
4440 4440 * incorrectly restart a DAD timer that's
4441 4441 * already running. However, if we are in
4442 4442 * forwarding mode, and the interface is
4443 4443 * moving in/out of groups, the data
4444 4444 * path ire lookup (e.g., ire_revalidate_nce)
4445 4445 * may have determined that some destination
4446 4446 * is offlink while the control path is adding
4447 4447 * that address as a local address.
4448 4448 * Recover from this case by failing the
4449 4449 * lookup
4450 4450 */
4451 4451 if (NCE_MYADDR(ncec))
4452 4452 return (ENXIO);
4453 4453 *retnce = nce_ill_lookup_then_add(ill, ncec);
4454 4454 if (*retnce != NULL)
4455 4455 break;
4456 4456 }
4457 4457 }
4458 4458 }
4459 4459 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4460 4460 return (0);
4461 4461
4462 4462 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4463 4463 if (ncec == NULL)
4464 4464 return (ENOMEM);
4465 4465 *ncec = nce_nil;
4466 4466 ncec->ncec_ill = ill;
4467 4467 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4468 4468 ncec->ncec_flags = flags;
4469 4469 ncec->ncec_ipst = ipst; /* No netstack_hold */
4470 4470
4471 4471 if (!ill->ill_isv6) {
4472 4472 ipaddr_t addr4;
4473 4473
4474 4474 /*
4475 4475 * DAD probe interval and probe count are set based on
4476 4476 * fast/slow probe settings. If the underlying link doesn't
4477 4477 * have reliably up/down notifications or if we're working
4478 4478 * with IPv4 169.254.0.0/16 Link Local Address space, then
4479 4479 * don't use the fast timers. Otherwise, use them.
4480 4480 */
4481 4481 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4482 4482 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4483 4483 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4484 4484 fastprobe = B_TRUE;
4485 4485 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4486 4486 !IS_IPV4_LL_SPACE(&addr4)) {
4487 4487 ill_t *hwaddr_ill;
4488 4488
4489 4489 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4490 4490 hw_addr_len);
4491 4491 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4492 4492 fastprobe = B_TRUE;
4493 4493 }
4494 4494 if (fastprobe) {
4495 4495 ncec->ncec_xmit_interval =
4496 4496 ipst->ips_arp_fastprobe_interval;
4497 4497 ncec->ncec_pcnt =
4498 4498 ipst->ips_arp_fastprobe_count;
4499 4499 ncec->ncec_flags |= NCE_F_FAST;
4500 4500 } else {
4501 4501 ncec->ncec_xmit_interval =
4502 4502 ipst->ips_arp_probe_interval;
4503 4503 ncec->ncec_pcnt =
4504 4504 ipst->ips_arp_probe_count;
4505 4505 }
4506 4506 if (NCE_PUBLISH(ncec)) {
4507 4507 ncec->ncec_unsolicit_count =
4508 4508 ipst->ips_ip_arp_publish_count;
4509 4509 }
4510 4510 } else {
4511 4511 /*
4512 4512 * probe interval is constant: ILL_PROBE_INTERVAL
4513 4513 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4514 4514 */
4515 4515 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4516 4516 if (NCE_PUBLISH(ncec)) {
4517 4517 ncec->ncec_unsolicit_count =
4518 4518 ipst->ips_ip_ndp_unsolicit_count;
4519 4519 }
4520 4520 }
4521 4521 ncec->ncec_rcnt = ill->ill_xmit_count;
4522 4522 ncec->ncec_addr = *addr;
4523 4523 ncec->ncec_qd_mp = NULL;
4524 4524 ncec->ncec_refcnt = 1; /* for ncec getting created */
4525 4525 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4526 4526 ncec->ncec_trace_disable = B_FALSE;
4527 4527
4528 4528 /*
4529 4529 * ncec_lladdr holds link layer address
4530 4530 */
4531 4531 if (hw_addr_len > 0) {
4532 4532 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4533 4533 if (template == NULL) {
4534 4534 err = ENOMEM;
4535 4535 goto err_ret;
4536 4536 }
4537 4537 ncec->ncec_lladdr = template;
4538 4538 ncec->ncec_lladdr_length = hw_addr_len;
4539 4539 bzero(ncec->ncec_lladdr, hw_addr_len);
4540 4540 }
4541 4541 if ((flags & NCE_F_BCAST) != 0) {
4542 4542 state = ND_REACHABLE;
4543 4543 ASSERT(hw_addr_len > 0);
4544 4544 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4545 4545 state = ND_INITIAL;
4546 4546 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4547 4547 /*
4548 4548 * NORESOLVER entries are always created in the REACHABLE
4549 4549 * state.
4550 4550 */
4551 4551 state = ND_REACHABLE;
4552 4552 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4553 4553 ill->ill_mactype != DL_IPV4 &&
4554 4554 ill->ill_mactype != DL_6TO4) {
4555 4555 /*
4556 4556 * We create a nce_res_mp with the IP nexthop address
4557 4557 * as the destination address if the physical length
4558 4558 * is exactly 4 bytes for point-to-multipoint links
4559 4559 * that do their own resolution from IP to link-layer
4560 4560 * address (e.g. IP over X.25).
4561 4561 */
4562 4562 bcopy((uchar_t *)addr,
4563 4563 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4564 4564 }
4565 4565 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4566 4566 ill->ill_mactype != DL_IPV6) {
4567 4567 /*
4568 4568 * We create a nce_res_mp with the IP nexthop address
4569 4569 * as the destination address if the physical length
4570 4570 * is exactly 16 bytes for point-to-multipoint links
4571 4571 * that do their own resolution from IP to link-layer
4572 4572 * address.
4573 4573 */
4574 4574 bcopy((uchar_t *)addr,
4575 4575 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4576 4576 }
4577 4577 /*
4578 4578 * Since NUD is not part of the base IPv4 protocol definition,
4579 4579 * IPv4 neighbor entries on NORESOLVER interfaces will never
4580 4580 * age, and are marked NCE_F_NONUD.
4581 4581 */
4582 4582 if (!ill->ill_isv6)
4583 4583 ncec->ncec_flags |= NCE_F_NONUD;
4584 4584 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4585 4585 state = ND_REACHABLE;
4586 4586 }
4587 4587
4588 4588 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4589 4589 /*
4590 4590 * We are adding an ncec with a deterministic hw_addr,
4591 4591 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4592 4592 *
4593 4593 * if we are adding a unicast ncec for the local address
4594 4594 * it would be REACHABLE; we would be adding a ND_STALE entry
4595 4595 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4596 4596 * addresses are added in PROBE to trigger DAD.
4597 4597 */
4598 4598 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4599 4599 ill->ill_net_type == IRE_IF_NORESOLVER)
4600 4600 state = ND_REACHABLE;
4601 4601 else if (!NCE_PUBLISH(ncec))
4602 4602 state = ND_STALE;
4603 4603 else
4604 4604 state = ND_PROBE;
4605 4605 if (hw_addr != NULL)
4606 4606 nce_set_ll(ncec, hw_addr);
4607 4607 }
4608 4608 /* caller overrides internally computed state */
4609 4609 if (nce_state != ND_UNCHANGED)
4610 4610 state = nce_state;
4611 4611
4612 4612 if (state == ND_PROBE)
4613 4613 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4614 4614
4615 4615 ncec->ncec_state = state;
4616 4616
4617 4617 if (state == ND_REACHABLE) {
4618 4618 ncec->ncec_last = ncec->ncec_init_time =
4619 4619 TICK_TO_MSEC(ddi_get_lbolt64());
4620 4620 } else {
4621 4621 ncec->ncec_last = 0;
4622 4622 if (state == ND_INITIAL)
4623 4623 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4624 4624 }
4625 4625 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4626 4626 offsetof(ncec_cb_t, ncec_cb_node));
4627 4627 /*
4628 4628 * have all the memory allocations out of the way before taking locks
4629 4629 * and adding the nce.
4630 4630 */
4631 4631 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4632 4632 if (nce == NULL) {
4633 4633 err = ENOMEM;
4634 4634 goto err_ret;
4635 4635 }
4636 4636 if (ncec->ncec_lladdr != NULL ||
4637 4637 ill->ill_net_type == IRE_IF_NORESOLVER) {
4638 4638 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4639 4639 ill->ill_phys_addr_length, ill->ill_sap,
4640 4640 ill->ill_sap_length);
4641 4641 if (dlur_mp == NULL) {
4642 4642 err = ENOMEM;
4643 4643 goto err_ret;
4644 4644 }
4645 4645 }
4646 4646
4647 4647 /*
4648 4648 * Atomically ensure that the ill is not CONDEMNED, before
4649 4649 * adding the NCE.
4650 4650 */
4651 4651 mutex_enter(&ill->ill_lock);
4652 4652 if (ill->ill_state_flags & ILL_CONDEMNED) {
4653 4653 mutex_exit(&ill->ill_lock);
4654 4654 err = EINVAL;
4655 4655 goto err_ret;
4656 4656 }
4657 4657 if (!NCE_MYADDR(ncec) &&
4658 4658 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4659 4659 mutex_exit(&ill->ill_lock);
4660 4660 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4661 4661 err = EINVAL;
4662 4662 goto err_ret;
4663 4663 }
4664 4664 /*
4665 4665 * Acquire the ncec_lock even before adding the ncec to the list
4666 4666 * so that it cannot get deleted after the ncec is added, but
4667 4667 * before we add the nce.
4668 4668 */
4669 4669 mutex_enter(&ncec->ncec_lock);
4670 4670 if ((ncec->ncec_next = *ncep) != NULL)
4671 4671 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4672 4672 *ncep = ncec;
4673 4673 ncec->ncec_ptpn = ncep;
4674 4674
4675 4675 /* Bump up the number of ncec's referencing this ill */
4676 4676 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4677 4677 (char *), "ncec", (void *), ncec);
4678 4678 ill->ill_ncec_cnt++;
4679 4679 /*
4680 4680 * Since we hold the ncec_lock at this time, the ncec cannot be
4681 4681 * condemned, and we can safely add the nce.
4682 4682 */
4683 4683 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4684 4684 mutex_exit(&ncec->ncec_lock);
4685 4685 mutex_exit(&ill->ill_lock);
4686 4686
4687 4687 /* caller must trigger fastpath on *retnce */
4688 4688 return (0);
4689 4689
4690 4690 err_ret:
4691 4691 if (ncec != NULL)
4692 4692 kmem_cache_free(ncec_cache, ncec);
4693 4693 if (nce != NULL)
4694 4694 kmem_cache_free(nce_cache, nce);
4695 4695 freemsg(dlur_mp);
4696 4696 if (template != NULL)
4697 4697 kmem_free(template, ill->ill_phys_addr_length);
4698 4698 return (err);
4699 4699 }
4700 4700
4701 4701 /*
4702 4702 * take a ref on the nce
4703 4703 */
4704 4704 void
4705 4705 nce_refhold(nce_t *nce)
4706 4706 {
4707 4707 mutex_enter(&nce->nce_lock);
4708 4708 nce->nce_refcnt++;
4709 4709 ASSERT((nce)->nce_refcnt != 0);
4710 4710 mutex_exit(&nce->nce_lock);
4711 4711 }
4712 4712
4713 4713 /*
4714 4714 * release a ref on the nce; In general, this
4715 4715 * cannot be called with locks held because nce_inactive
4716 4716 * may result in nce_inactive which will take the ill_lock,
4717 4717 * do ipif_ill_refrele_tail etc. Thus the one exception
4718 4718 * where this can be called with locks held is when the caller
4719 4719 * is certain that the nce_refcnt is sufficient to prevent
4720 4720 * the invocation of nce_inactive.
4721 4721 */
4722 4722 void
4723 4723 nce_refrele(nce_t *nce)
4724 4724 {
4725 4725 ASSERT((nce)->nce_refcnt != 0);
4726 4726 mutex_enter(&nce->nce_lock);
4727 4727 if (--nce->nce_refcnt == 0)
4728 4728 nce_inactive(nce); /* destroys the mutex */
4729 4729 else
4730 4730 mutex_exit(&nce->nce_lock);
4731 4731 }
4732 4732
4733 4733 /*
4734 4734 * free the nce after all refs have gone away.
4735 4735 */
4736 4736 static void
4737 4737 nce_inactive(nce_t *nce)
4738 4738 {
4739 4739 ill_t *ill = nce->nce_ill;
4740 4740
4741 4741 ASSERT(nce->nce_refcnt == 0);
4742 4742
4743 4743 ncec_refrele_notr(nce->nce_common);
4744 4744 nce->nce_common = NULL;
4745 4745 freemsg(nce->nce_fp_mp);
4746 4746 freemsg(nce->nce_dlur_mp);
4747 4747
4748 4748 mutex_enter(&ill->ill_lock);
4749 4749 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4750 4750 (char *), "nce", (void *), nce);
4751 4751 ill->ill_nce_cnt--;
4752 4752 nce->nce_ill = NULL;
4753 4753 /*
4754 4754 * If the number of ncec's associated with this ill have dropped
4755 4755 * to zero, check whether we need to restart any operation that
4756 4756 * is waiting for this to happen.
4757 4757 */
4758 4758 if (ILL_DOWN_OK(ill)) {
4759 4759 /* ipif_ill_refrele_tail drops the ill_lock */
4760 4760 ipif_ill_refrele_tail(ill);
4761 4761 } else {
4762 4762 mutex_exit(&ill->ill_lock);
4763 4763 }
4764 4764
4765 4765 mutex_destroy(&nce->nce_lock);
4766 4766 kmem_cache_free(nce_cache, nce);
4767 4767 }
4768 4768
4769 4769 /*
4770 4770 * Add an nce to the ill_nce list.
4771 4771 */
4772 4772 static nce_t *
4773 4773 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4774 4774 {
4775 4775 bzero(nce, sizeof (*nce));
4776 4776 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4777 4777 nce->nce_common = ncec;
4778 4778 nce->nce_addr = ncec->ncec_addr;
4779 4779 nce->nce_ill = ill;
4780 4780 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4781 4781 (char *), "nce", (void *), nce);
4782 4782 ill->ill_nce_cnt++;
4783 4783
4784 4784 nce->nce_refcnt = 1; /* for the thread */
4785 4785 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4786 4786 nce->nce_dlur_mp = dlur_mp;
4787 4787
4788 4788 /* add nce to the ill's fastpath list. */
4789 4789 nce->nce_refcnt++; /* for the list */
4790 4790 list_insert_head(&ill->ill_nce, nce);
4791 4791 return (nce);
4792 4792 }
4793 4793
4794 4794 static nce_t *
4795 4795 nce_add(ill_t *ill, ncec_t *ncec)
4796 4796 {
4797 4797 nce_t *nce;
4798 4798 mblk_t *dlur_mp = NULL;
4799 4799
4800 4800 ASSERT(MUTEX_HELD(&ill->ill_lock));
4801 4801 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4802 4802
4803 4803 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4804 4804 if (nce == NULL)
4805 4805 return (NULL);
4806 4806 if (ncec->ncec_lladdr != NULL ||
4807 4807 ill->ill_net_type == IRE_IF_NORESOLVER) {
4808 4808 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4809 4809 ill->ill_phys_addr_length, ill->ill_sap,
4810 4810 ill->ill_sap_length);
4811 4811 if (dlur_mp == NULL) {
4812 4812 kmem_cache_free(nce_cache, nce);
4813 4813 return (NULL);
4814 4814 }
4815 4815 }
4816 4816 return (nce_add_impl(ill, ncec, nce, dlur_mp));
4817 4817 }
4818 4818
4819 4819 /*
4820 4820 * remove the nce from the ill_faspath list
4821 4821 */
4822 4822 void
4823 4823 nce_delete(nce_t *nce)
4824 4824 {
4825 4825 ill_t *ill = nce->nce_ill;
4826 4826
4827 4827 ASSERT(MUTEX_HELD(&ill->ill_lock));
4828 4828
4829 4829 mutex_enter(&nce->nce_lock);
4830 4830 if (nce->nce_is_condemned) {
4831 4831 /*
4832 4832 * some other thread has removed this nce from the ill_nce list
4833 4833 */
4834 4834 mutex_exit(&nce->nce_lock);
4835 4835 return;
4836 4836 }
4837 4837 nce->nce_is_condemned = B_TRUE;
4838 4838 mutex_exit(&nce->nce_lock);
4839 4839
4840 4840 list_remove(&ill->ill_nce, nce);
4841 4841 /*
4842 4842 * even though we are holding the ill_lock, it is ok to
4843 4843 * call nce_refrele here because we know that we should have
4844 4844 * at least 2 refs on the nce: one for the thread, and one
4845 4845 * for the list. The refrele below will release the one for
4846 4846 * the list.
4847 4847 */
4848 4848 nce_refrele(nce);
4849 4849 }
4850 4850
4851 4851 nce_t *
4852 4852 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4853 4853 {
4854 4854 nce_t *nce = NULL;
4855 4855
4856 4856 ASSERT(ill != NULL);
4857 4857 ASSERT(MUTEX_HELD(&ill->ill_lock));
4858 4858
4859 4859 for (nce = list_head(&ill->ill_nce); nce != NULL;
4860 4860 nce = list_next(&ill->ill_nce, nce)) {
4861 4861 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4862 4862 break;
4863 4863 }
4864 4864
4865 4865 /*
4866 4866 * if we found the nce on the ill_nce list while holding
4867 4867 * the ill_lock, then it cannot be condemned yet.
4868 4868 */
4869 4869 if (nce != NULL) {
4870 4870 ASSERT(!nce->nce_is_condemned);
4871 4871 nce_refhold(nce);
4872 4872 }
4873 4873 return (nce);
4874 4874 }
4875 4875
4876 4876 /*
4877 4877 * Walk the ill_nce list on ill. The callback function func() cannot perform
4878 4878 * any destructive actions.
4879 4879 */
4880 4880 static void
4881 4881 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4882 4882 {
4883 4883 nce_t *nce = NULL, *nce_next;
4884 4884
4885 4885 ASSERT(MUTEX_HELD(&ill->ill_lock));
4886 4886 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4887 4887 nce_next = list_next(&ill->ill_nce, nce);
4888 4888 if (func(ill, nce, arg) != 0)
4889 4889 break;
4890 4890 nce = nce_next;
4891 4891 }
4892 4892 }
4893 4893
4894 4894 void
4895 4895 nce_walk(ill_t *ill, pfi_t func, void *arg)
4896 4896 {
4897 4897 mutex_enter(&ill->ill_lock);
4898 4898 nce_walk_common(ill, func, arg);
4899 4899 mutex_exit(&ill->ill_lock);
4900 4900 }
4901 4901
4902 4902 void
4903 4903 nce_flush(ill_t *ill, boolean_t flushall)
4904 4904 {
4905 4905 nce_t *nce, *nce_next;
4906 4906 list_t dead;
4907 4907
4908 4908 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4909 4909 mutex_enter(&ill->ill_lock);
4910 4910 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4911 4911 nce_next = list_next(&ill->ill_nce, nce);
4912 4912 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4913 4913 nce = nce_next;
4914 4914 continue;
4915 4915 }
4916 4916 /*
4917 4917 * nce_delete requires that the caller should either not
4918 4918 * be holding locks, or should hold a ref to ensure that
4919 4919 * we wont hit ncec_inactive. So take a ref and clean up
4920 4920 * after the list is flushed.
4921 4921 */
4922 4922 nce_refhold(nce);
4923 4923 nce_delete(nce);
4924 4924 list_insert_tail(&dead, nce);
4925 4925 nce = nce_next;
4926 4926 }
4927 4927 mutex_exit(&ill->ill_lock);
4928 4928 while ((nce = list_head(&dead)) != NULL) {
4929 4929 list_remove(&dead, nce);
4930 4930 nce_refrele(nce);
4931 4931 }
4932 4932 ASSERT(list_is_empty(&dead));
4933 4933 list_destroy(&dead);
4934 4934 }
4935 4935
4936 4936 /* Return an interval that is anywhere in the [1 .. intv] range */
4937 4937 static clock_t
4938 4938 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4939 4939 {
4940 4940 clock_t rnd, frac;
4941 4941
4942 4942 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4943 4943 /* Note that clock_t is signed; must chop off bits */
4944 4944 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4945 4945 if (initial_time) {
4946 4946 if (intv <= 0)
4947 4947 intv = 1;
4948 4948 else
4949 4949 intv = (rnd % intv) + 1;
4950 4950 } else {
4951 4951 /* Compute 'frac' as 20% of the configured interval */
4952 4952 if ((frac = intv / 5) <= 1)
4953 4953 frac = 2;
4954 4954 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4955 4955 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4956 4956 intv = 1;
4957 4957 }
4958 4958 return (intv);
4959 4959 }
4960 4960
4961 4961 void
4962 4962 nce_resolv_ipmp_ok(ncec_t *ncec)
4963 4963 {
4964 4964 mblk_t *mp;
4965 4965 uint_t pkt_len;
4966 4966 iaflags_t ixaflags = IXAF_NO_TRACE;
4967 4967 nce_t *under_nce;
4968 4968 ill_t *ill = ncec->ncec_ill;
4969 4969 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4970 4970 ipif_t *src_ipif = NULL;
4971 4971 ip_stack_t *ipst = ill->ill_ipst;
4972 4972 ill_t *send_ill;
4973 4973 uint_t nprobes;
4974 4974
4975 4975 ASSERT(IS_IPMP(ill));
4976 4976
4977 4977 mutex_enter(&ncec->ncec_lock);
4978 4978 nprobes = ncec->ncec_nprobes;
4979 4979 mp = ncec->ncec_qd_mp;
4980 4980 ncec->ncec_qd_mp = NULL;
4981 4981 ncec->ncec_nprobes = 0;
4982 4982 mutex_exit(&ncec->ncec_lock);
4983 4983
4984 4984 while (mp != NULL) {
4985 4985 mblk_t *nxt_mp;
4986 4986
4987 4987 nxt_mp = mp->b_next;
4988 4988 mp->b_next = NULL;
4989 4989 if (isv6) {
4990 4990 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4991 4991
4992 4992 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4993 4993 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4994 4994 ill, ALL_ZONES, ipst);
4995 4995 } else {
4996 4996 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4997 4997
4998 4998 ixaflags |= IXAF_IS_IPV4;
4999 4999 pkt_len = ntohs(ipha->ipha_length);
5000 5000 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5001 5001 ill, ALL_ZONES, ipst);
5002 5002 }
5003 5003
5004 5004 /*
5005 5005 * find a new nce based on an under_ill. The first IPMP probe
5006 5006 * packet gets queued, so we could still find a src_ipif that
5007 5007 * matches an IPMP test address.
5008 5008 */
5009 5009 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5010 5010 /*
5011 5011 * if src_ipif is null, this could be either a
5012 5012 * forwarded packet or a probe whose src got deleted.
5013 5013 * We identify the former case by looking for the
5014 5014 * ncec_nprobes: the first ncec_nprobes packets are
5015 5015 * probes;
5016 5016 */
5017 5017 if (src_ipif == NULL && nprobes > 0)
5018 5018 goto drop_pkt;
5019 5019
5020 5020 /*
5021 5021 * For forwarded packets, we use the ipmp rotor
5022 5022 * to find send_ill.
5023 5023 */
5024 5024 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5025 5025 B_TRUE);
5026 5026 } else {
5027 5027 send_ill = src_ipif->ipif_ill;
5028 5028 ill_refhold(send_ill);
5029 5029 }
5030 5030
5031 5031 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5032 5032 (ncec_t *), ncec, (ipif_t *),
5033 5033 src_ipif, (ill_t *), send_ill);
5034 5034
5035 5035 if (send_ill == NULL) {
5036 5036 if (src_ipif != NULL)
5037 5037 ipif_refrele(src_ipif);
5038 5038 goto drop_pkt;
5039 5039 }
5040 5040 /* create an under_nce on send_ill */
5041 5041 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5042 5042 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5043 5043 under_nce = nce_fastpath_create(send_ill, ncec);
5044 5044 else
5045 5045 under_nce = NULL;
5046 5046 rw_exit(&ipst->ips_ill_g_lock);
5047 5047 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5048 5048 nce_fastpath_trigger(under_nce);
5049 5049
5050 5050 ill_refrele(send_ill);
5051 5051 if (src_ipif != NULL)
5052 5052 ipif_refrele(src_ipif);
5053 5053
5054 5054 if (under_nce != NULL) {
5055 5055 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5056 5056 ALL_ZONES, 0, NULL);
5057 5057 nce_refrele(under_nce);
5058 5058 if (nprobes > 0)
5059 5059 nprobes--;
5060 5060 mp = nxt_mp;
5061 5061 continue;
5062 5062 }
5063 5063 drop_pkt:
5064 5064 if (isv6) {
5065 5065 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5066 5066 } else {
5067 5067 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5068 5068 }
5069 5069 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5070 5070 freemsg(mp);
5071 5071 if (nprobes > 0)
5072 5072 nprobes--;
5073 5073 mp = nxt_mp;
5074 5074 }
5075 5075 ncec_cb_dispatch(ncec); /* complete callbacks */
5076 5076 }
↓ open down ↓ |
4584 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX