Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
+++ new/usr/src/uts/common/io/ib/clients/rdsv3/cong.c
1 1 /*
2 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3 3 */
4 4
5 5 /*
6 6 * This file contains code imported from the OFED rds source file cong.c
7 7 * Oracle elects to have and use the contents of cong.c under and governed
8 8 * by the OpenIB.org BSD license (see below for full license text). However,
9 9 * the following notice accompanied the original version of this file:
10 10 */
11 11
12 12
13 13 /*
14 14 * Copyright (c) 2007 Oracle. All rights reserved.
15 15 *
16 16 * This software is available to you under a choice of one of two
17 17 * licenses. You may choose to be licensed under the terms of the GNU
18 18 * General Public License (GPL) Version 2, available from the file
19 19 * COPYING in the main directory of this source tree, or the
20 20 * OpenIB.org BSD license below:
21 21 *
22 22 * Redistribution and use in source and binary forms, with or
23 23 * without modification, are permitted provided that the following
24 24 * conditions are met:
25 25 *
26 26 * - Redistributions of source code must retain the above
27 27 * copyright notice, this list of conditions and the following
28 28 * disclaimer.
29 29 *
30 30 * - Redistributions in binary form must reproduce the above
31 31 * copyright notice, this list of conditions and the following
32 32 * disclaimer in the documentation and/or other materials
33 33 * provided with the distribution.
34 34 *
35 35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36 36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37 37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38 38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39 39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40 40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41 41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 42 * SOFTWARE.
43 43 *
44 44 */
45 45 #include <sys/rds.h>
46 46
47 47 #include <sys/ib/clients/rdsv3/rdsv3.h>
48 48 #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
49 49 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
50 50
51 51 /*
52 52 * This file implements the receive side of the unconventional congestion
53 53 * management in RDS.
54 54 *
55 55 * Messages waiting in the receive queue on the receiving socket are accounted
56 56 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
57 57 * message are accounted for. If the number of bytes queued equals or exceeds
58 58 * rcvbuf then the socket is congested. All sends attempted to this socket's
59 59 * address should block or return -EWOULDBLOCK.
60 60 *
61 61 * Applications are expected to be reasonably tuned such that this situation
62 62 * very rarely occurs. An application encountering this "back-pressure" is
63 63 * considered a bug.
64 64 *
65 65 * This is implemented by having each node maintain bitmaps which indicate
66 66 * which ports on bound addresses are congested. As the bitmap changes it is
67 67 * sent through all the connections which terminate in the local address of the
68 68 * bitmap which changed.
69 69 *
70 70 * The bitmaps are allocated as connections are brought up. This avoids
71 71 * allocation in the interrupt handling path which queues messages on sockets.
72 72 * The dense bitmaps let transports send the entire bitmap on any bitmap change
73 73 * reasonably efficiently. This is much easier to implement than some
74 74 * finer-grained communication of per-port congestion. The sender does a very
75 75 * inexpensive bit test to test if the port it's about to send to is congested
76 76 * or not.
77 77 */
78 78
79 79 /*
80 80 * Interaction with poll is a tad tricky. We want all processes stuck in
81 81 * poll to wake up and check whether a congested destination became uncongested.
82 82 * The really sad thing is we have no idea which destinations the application
83 83 * wants to send to - we don't even know which rdsv3_connections are involved.
84 84 * So until we implement a more flexible rds poll interface, we have to make
85 85 * do with this:
86 86 * We maintain a global counter that is incremented each time a congestion map
87 87 * update is received. Each rds socket tracks this value, and if rdsv3_poll
88 88 * finds that the saved generation number is smaller than the global generation
89 89 * number, it wakes up the process.
90 90 */
/*
 * Incremented by rdsv3_cong_map_updated() and sampled by
 * rdsv3_cong_updated_since().
 */
91 91 static atomic_t rdsv3_cong_generation = ATOMIC_INIT(0);
92 92
93 93 /*
94 94 * Congestion monitoring
95 95 */
/* rdsv3_sock entries added by rdsv3_cong_add_socket(), via rs_cong_list. */
96 96 static struct list rdsv3_cong_monitor;
/* Guards rdsv3_cong_monitor: RW_READER to walk it, RW_WRITER to change it. */
97 97 static krwlock_t rdsv3_cong_monitor_lock;
98 98
99 99 /*
100 100 * Yes, a global lock. It's used so infrequently that it's worth keeping it
101 101 * global to simplify the locking. It's only used in the following
102 102 * circumstances:
103 103 *
104 104 * - on connection buildup to associate a conn with its maps
105 105 * - on map changes to inform conns of a new map to send
106 106 *
107 107 * It's sadly ordered under the socket callback lock and the connection lock.
108 108 * Receive paths can mark ports congested from interrupt context so the
109 109 * lock masks interrupts.
110 110 */
/* Taken around tree lookups/inserts and m_conn_list insert/remove/walk. */
111 111 static kmutex_t rdsv3_cong_lock;
/* Every congestion map, ordered by m_addr (see rdsv3_cong_compare()). */
112 112 static struct avl_tree rdsv3_cong_tree;
113 113
114 114 static struct rdsv3_cong_map *
115 115 rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
116 116 {
117 117 struct rdsv3_cong_map *map;
118 118 avl_index_t where;
119 119
120 120 if (insert) {
121 121 map = avl_find(&rdsv3_cong_tree, insert, &where);
122 122 if (map == NULL) {
123 123 avl_insert(&rdsv3_cong_tree, insert, where);
124 124 return (NULL);
125 125 }
126 126 } else {
127 127 struct rdsv3_cong_map map1;
128 128 map1.m_addr = addr;
129 129 map = avl_find(&rdsv3_cong_tree, &map1, &where);
130 130 }
131 131
132 132 return (map);
133 133 }
134 134
135 135 /*
136 136 * There is only ever one bitmap for any address. Connections try and allocate
137 137 * these bitmaps in the process getting pointers to them. The bitmaps are only
138 138 * ever freed as the module is removed after all connections have been freed.
139 139 */
140 140 static struct rdsv3_cong_map *
141 141 rdsv3_cong_from_addr(uint32_be_t addr)
142 142 {
143 143 struct rdsv3_cong_map *map;
144 144 struct rdsv3_cong_map *ret = NULL;
145 145 unsigned long zp;
146 146 unsigned long i;
147 147
148 148 RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
149 149
150 150 map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
151 151 if (!map)
152 152 return (NULL);
153 153
154 154 map->m_addr = addr;
155 155 rdsv3_init_waitqueue(&map->m_waitq);
156 156 list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
157 157 offsetof(struct rdsv3_connection, c_map_item));
158 158
159 159 for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
160 160 zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
161 161 if (zp == 0)
162 162 goto out;
163 163 map->m_page_addrs[i] = zp;
164 164 }
165 165
166 166 mutex_enter(&rdsv3_cong_lock);
167 167 ret = rdsv3_cong_tree_walk(addr, map);
168 168 mutex_exit(&rdsv3_cong_lock);
169 169
170 170 if (!ret) {
171 171 ret = map;
172 172 map = NULL;
173 173 }
174 174
175 175 out:
176 176 if (map) {
177 177 for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
178 178 i++)
179 179 kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
180 180 kmem_free(map, sizeof (*map));
181 181 }
182 182
183 183 RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
184 184 ret, ntohl(addr));
185 185
186 186 return (ret);
187 187 }
188 188
189 189 /*
190 190 * Put the conn on its local map's list. This is called when the conn is
191 191 * really added to the hash. It's nested under the rdsv3_conn_lock, sadly.
192 192 */
193 193 void
194 194 rdsv3_cong_add_conn(struct rdsv3_connection *conn)
195 195 {
196 196 RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
197 197
198 198 RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
199 199 conn, conn->c_lcong);
200 200 mutex_enter(&rdsv3_cong_lock);
201 201 list_insert_tail(&conn->c_lcong->m_conn_list, conn);
202 202 mutex_exit(&rdsv3_cong_lock);
203 203
204 204 RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
205 205 }
206 206
207 207 void
208 208 rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
209 209 {
210 210 RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
211 211
212 212 RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
213 213 conn, conn->c_lcong);
214 214 mutex_enter(&rdsv3_cong_lock);
215 215 list_remove_node(&conn->c_map_item);
216 216 mutex_exit(&rdsv3_cong_lock);
217 217
218 218 RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
219 219 }
220 220
221 221 int
222 222 rdsv3_cong_get_maps(struct rdsv3_connection *conn)
223 223 {
224 224 conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
225 225 conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
226 226
227 227 if (!(conn->c_lcong && conn->c_fcong))
228 228 return (-ENOMEM);
229 229
230 230 return (0);
231 231 }
232 232
233 233 void
234 234 rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
235 235 {
236 236 struct rdsv3_connection *conn;
237 237
238 238 RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
239 239
240 240 mutex_enter(&rdsv3_cong_lock);
241 241
242 242 RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
243 243 if (!test_and_set_bit(0, &conn->c_map_queued)) {
244 244 rdsv3_stats_inc(s_cong_update_queued);
245 245 (void) rdsv3_send_xmit(conn);
246 246 }
247 247 }
248 248
249 249 mutex_exit(&rdsv3_cong_lock);
250 250
251 251 RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
↓ open down ↓ |
251 lines elided |
↑ open up ↑ |
252 252 }
253 253
254 254 void
255 255 rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
256 256 {
257 257 RDSV3_DPRINTF4("rdsv3_cong_map_updated",
258 258 "waking map %p for %u.%u.%u.%u",
259 259 map, NIPQUAD(map->m_addr));
260 260
261 261 rdsv3_stats_inc(s_cong_update_received);
262 - atomic_add_32(&rdsv3_cong_generation, 1);
262 + atomic_inc_32(&rdsv3_cong_generation);
263 263 #if 0
264 264 XXX
265 265 if (waitqueue_active(&map->m_waitq))
266 266 #endif
267 267 rdsv3_wake_up(&map->m_waitq);
268 268
269 269 if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
270 270 struct rdsv3_sock *rs;
271 271
272 272 rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
273 273 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
274 274 rs_cong_list) {
275 275 mutex_enter(&rs->rs_lock);
276 276 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
277 277 rs->rs_cong_mask &= ~portmask;
278 278 mutex_exit(&rs->rs_lock);
279 279 if (rs->rs_cong_notify)
280 280 rdsv3_wake_sk_sleep(rs);
281 281 }
282 282 rw_exit(&rdsv3_cong_monitor_lock);
283 283 }
284 284
285 285 RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
286 286 }
287 287
288 288 int
289 289 rdsv3_cong_updated_since(unsigned long *recent)
290 290 {
291 291 unsigned long gen = atomic_get(&rdsv3_cong_generation);
292 292
293 293 if (*recent == gen)
294 294 return (0);
295 295 *recent = gen;
296 296 return (1);
297 297 }
298 298
299 299 /*
300 300 * We're called under the locking that protects the sockets receive buffer
301 301 * consumption. This makes it a lot easier for the caller to only call us
302 302 * when it knows that an existing set bit needs to be cleared, and vice versa.
303 303 * We can't block and we need to deal with concurrent sockets working against
304 304 * the same per-address map.
305 305 */
306 306 void
307 307 rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
308 308 {
309 309 unsigned long i;
310 310 unsigned long off;
311 311
312 312 RDSV3_DPRINTF4("rdsv3_cong_set_bit",
313 313 "setting congestion for %u.%u.%u.%u:%u in map %p",
314 314 NIPQUAD(map->m_addr), ntohs(port), map);
315 315
316 316 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
317 317 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
318 318 set_le_bit(off, (void *)map->m_page_addrs[i]);
319 319 }
320 320
321 321 void
322 322 rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
323 323 {
324 324 unsigned long i;
325 325 unsigned long off;
326 326
327 327 RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
328 328 "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
329 329 NIPQUAD(map->m_addr), ntohs(port), map);
330 330
331 331 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
332 332 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
333 333 clear_le_bit(off, (void *)map->m_page_addrs[i]);
334 334 }
335 335
336 336 static int
337 337 rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
338 338 {
339 339 unsigned long i;
340 340 unsigned long off;
341 341
342 342 i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
343 343 off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
344 344
345 345 RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
346 346 ntohs(port), i, off);
347 347
348 348 return (test_le_bit(off, (void *)map->m_page_addrs[i]));
349 349 }
350 350
351 351 void
352 352 rdsv3_cong_add_socket(struct rdsv3_sock *rs)
353 353 {
354 354 RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
355 355
356 356 rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
357 357 if (!list_link_active(&rs->rs_cong_list))
358 358 list_insert_head(&rdsv3_cong_monitor, rs);
359 359 rw_exit(&rdsv3_cong_monitor_lock);
360 360 }
361 361
362 362 void
363 363 rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
364 364 {
365 365 struct rdsv3_cong_map *map;
366 366
367 367 RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
368 368
369 369 rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
370 370 list_remove_node(&rs->rs_cong_list);
371 371 rw_exit(&rdsv3_cong_monitor_lock);
372 372
373 373 /* update congestion map for now-closed port */
374 374 mutex_enter(&rdsv3_cong_lock);
375 375 map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
376 376 mutex_exit(&rdsv3_cong_lock);
377 377
378 378 if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
379 379 rdsv3_cong_clear_bit(map, rs->rs_bound_port);
380 380 rdsv3_cong_queue_updates(map);
381 381 }
382 382 }
383 383
384 384 int
385 385 rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
386 386 struct rdsv3_sock *rs)
387 387 {
388 388 int ret = 0;
389 389
390 390 RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
391 391 rs, nonblock);
392 392
393 393 if (!rdsv3_cong_test_bit(map, port))
394 394 return (0);
395 395 if (nonblock) {
396 396 if (rs && rs->rs_cong_monitor) {
397 397 /*
398 398 * It would have been nice to have an atomic set_bit on
399 399 * a uint64_t.
400 400 */
401 401 mutex_enter(&rs->rs_lock);
402 402 rs->rs_cong_mask |=
403 403 RDS_CONG_MONITOR_MASK(ntohs(port));
404 404 mutex_exit(&rs->rs_lock);
405 405
406 406 /*
407 407 * Test again - a congestion update may have arrived in
408 408 * the meantime.
409 409 */
410 410 if (!rdsv3_cong_test_bit(map, port))
411 411 return (0);
412 412 }
413 413 rdsv3_stats_inc(s_cong_send_error);
414 414 return (-ENOBUFS);
415 415 }
416 416
417 417 rdsv3_stats_inc(s_cong_send_blocked);
418 418 RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
419 419 map, ntohs(port));
420 420
421 421 #if 0
422 422 ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
423 423 if (ret == 0)
424 424 return (-ERESTART);
425 425 return (0);
426 426 #else
427 427 mutex_enter(&map->m_waitq.waitq_mutex);
428 428 map->m_waitq.waitq_waiters++;
429 429 while (rdsv3_cong_test_bit(map, port)) {
430 430 ret = cv_wait_sig(&map->m_waitq.waitq_cv,
431 431 &map->m_waitq.waitq_mutex);
432 432 if (ret == 0) {
433 433 ret = -EINTR;
434 434 break;
435 435 }
436 436 }
437 437 map->m_waitq.waitq_waiters--;
438 438 mutex_exit(&map->m_waitq.waitq_mutex);
439 439 return (ret);
440 440 #endif
441 441 }
442 442
443 443 void
444 444 rdsv3_cong_exit(void)
445 445 {
446 446 struct rdsv3_cong_map *map;
447 447 unsigned long i;
448 448
449 449 RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
450 450
451 451 while ((map = avl_first(&rdsv3_cong_tree))) {
452 452 RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
453 453 avl_remove(&rdsv3_cong_tree, map);
454 454 for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
455 455 i++)
456 456 kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
457 457 kmem_free(map, sizeof (*map));
458 458 }
459 459
460 460 RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
461 461 }
462 462
463 463 /*
464 464 * Allocate a RDS message containing a congestion update.
465 465 */
466 466 struct rdsv3_message *
467 467 rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
468 468 {
469 469 struct rdsv3_cong_map *map = conn->c_lcong;
470 470 struct rdsv3_message *rm;
471 471
472 472 rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
473 473 if (!IS_ERR(rm))
474 474 rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
475 475
476 476 return (rm);
477 477 }
478 478
479 479 static int
480 480 rdsv3_cong_compare(const void *map1, const void *map2)
481 481 {
482 482 #define addr1 ((struct rdsv3_cong_map *)map1)->m_addr
483 483 #define addr2 ((struct rdsv3_cong_map *)map2)->m_addr
484 484
485 485 if (addr1 < addr2)
486 486 return (-1);
487 487 if (addr1 > addr2)
488 488 return (1);
489 489 return (0);
490 490 }
491 491
492 492 void
493 493 rdsv3_cong_init(void)
494 494 {
495 495 list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
496 496 offsetof(struct rdsv3_sock, rs_cong_list));
497 497 rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
498 498 mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
499 499 avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
500 500 sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
501 501 m_rb_node));
502 502 }
↓ open down ↓ |
230 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX