/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements for contributions from developers:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

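/* 20049 is the IANA-registered port for NFS over RDMA ("nfsrdma"). */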
#define NFS_RDMA_PORT   20049


/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
        void    *ri_list;       /* pointer to list of addresses */
        uint_t  ri_count;       /* number of addresses in list */
        uint_t  ri_size;        /* size of ri_list in bytes */
} rpcib_ipaddrs_t;

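/*
 * Path information for a remote RDMA service; filled in by
 * rib_ping_srv() and consumed by rib_connect() when establishing
 * a channel to the server.
 */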
typedef struct rpcib_ping {
        rib_hca_t  *hca;
        ibt_path_info_t path;
        ibt_ip_addr_t srcip;
        ibt_ip_addr_t dstip;
} rpcib_ping_t;

/*
 * Prototype declarations for driver ops
 */
static int      rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int      rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
                                void *, void **);
static int      rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int      rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);
static void rib_stop_hca_services(rib_hca_t *);
static void rib_attach_hca(void);
static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
                struct netbuf *d_svcaddr, CONN **conn);

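/*
 * Server-side buffer cache statistics, exported as the "rpcib_cache"
 * kstat in rpcib_open_hcas() and refreshed by rpcib_cache_kstat_update().
 */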
struct {
        kstat_named_t cache_limit;
        kstat_named_t cache_allocation;
        kstat_named_t cache_hits;
        kstat_named_t cache_misses;
        kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
        {"cache_limit",                 KSTAT_DATA_UINT64 },
        {"cache_allocation",            KSTAT_DATA_UINT64 },
        {"cache_hits",                  KSTAT_DATA_UINT64 },
        {"cache_misses",                KSTAT_DATA_UINT64 },
        {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};

/* rpcib cb_ops */
static struct cb_ops rpcib_cbops = {
        nulldev,                /* open */
        nulldev,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        nodev,                  /* write */
        nodev,                  /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        ddi_prop_op,            /* prop_op */
        NULL,                   /* stream */
        D_MP,                   /* cb_flag */
        CB_REV,                 /* rev */
        nodev,                  /* int (*cb_aread)() */
        nodev                   /* int (*cb_awrite)() */
};

/*
 * Device options
 */
static struct dev_ops rpcib_ops = {
        DEVO_REV,               /* devo_rev, */
        0,                      /* refcnt  */
        rpcib_getinfo,          /* info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        rpcib_attach,           /* attach */
        rpcib_detach,           /* detach */
        nodev,                  /* reset */
        &rpcib_cbops,           /* driver ops - devctl interfaces */
        NULL,                   /* bus operations */
        NULL,                   /* power */
        ddi_quiesce_not_needed, /* quiesce */
};

/*
 * Module linkage information.
 */

static struct modldrv rib_modldrv = {
        &mod_driverops,         /* Driver module */
        "RPCIB plugin driver",  /* Driver name and version */
        &rpcib_ops,             /* Driver ops */
};

static struct modlinkage rib_modlinkage = {
        MODREV_1,
        (void *)&rib_modldrv,
        NULL
};

typedef struct rib_lrc_entry {
        struct rib_lrc_entry *forw;
        struct rib_lrc_entry *back;
        char *lrc_buf;

        uint32_t lrc_len;
        void  *avl_node;
        bool_t registered;

        struct mrc lrc_mhandle;
        bool_t lrc_on_freed_list;
} rib_lrc_entry_t;

typedef struct cache_struct     {
        rib_lrc_entry_t         r;
        uint32_t                len;
        uint32_t                elements;
        kmutex_t                node_lock;
        avl_node_t              avl_link;
} cache_avl_struct_t;

uint64_t        cache_limit = 100 * 1024 * 1024;
static uint64_t cache_watermark = 80 * 1024 * 1024;
static bool_t   stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;
int nfs_rdma_port = NFS_RDMA_PORT;

#define RIBNETID_TCP    "tcp"
#define RIBNETID_TCP6   "tcp6"

/*
 * rib_stat: private data pointer used when registering
 *      with the IBTF.  It is returned to the consumer
 *      in all callbacks.
 */
static rpcib_state_t *rib_stat = NULL;

#define RNR_RETRIES     IBT_RNR_RETRY_1
#define MAX_PORTS       2
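/*
 * Completions whose work request id is RDMA_DUMMY_WRID carry no
 * send_wid state; the send CQ handlers skip over them (see
 * rib_clnt_scq_handler() and rib_svc_scq_handler()).
 */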
#define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
#define RDMA_CONN_REAP_RETRY    10      /* 10 secs */

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * Old cards with the Tavor driver have a limited memory footprint
 * when booted in 32-bit mode. The rib_max_rbufs tunable can be
 * raised if more buffers are needed.
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif  /* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;         /* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * indicates the state of the plugin for a particular type of service,
 * such as NFS, NLM, or the NFSv4 callback daemon. The plugin might be
 * in the accept state for one and in the no_accept state for another.
 */
int             plugin_state;
kmutex_t        plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
        uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
        struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
                caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
                struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
        uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
        void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
        struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
        caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
        int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
        rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);
static void rib_recv_rele(rib_qp_t *);
static rdma_stat rib_conn_release_locked(CONN *conn);

/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
        rib_reachable,
        rib_conn_get,
        rib_conn_release,
        rib_listen,
        rib_listen_stop,
        rib_registermem,
        rib_deregistermem,
        rib_registermemsync,
        rib_deregistermemsync,
        rib_syncmem,
        rib_reg_buf_alloc,
        rib_reg_buf_free,
        rib_send,
        rib_send_resp,
        rib_post_resp,
        rib_post_resp_remove,
        rib_post_recv,
        rib_recv,
        rib_read,
        rib_write,
        rib_getinfo,
};

/*
 * RDMATF RPCIB plugin details
 */
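/*
 * rib_mod is handed to rdma_register_mod() in rpcib_attach(); its
 * rdma_count field tracks the number of initialized HCAs (see
 * rpcib_open_hcas() and rpcib_free_hca_list()).
 */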
static rdma_mod_t rib_mod = {
        "ibtf",         /* api name */
        RDMATF_VERS_1,
        0,
        &rib_ops,       /* rdma op vector for ibtf */
};

static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
        ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
        ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
        rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
        rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
        ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(ibt_hca_hdl_t);
static void rib_close_a_channel(CONN *);
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);

/*
 * Registration with IBTF as a consumer
 */
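/* Passed to ibt_attach() in rpcib_attach(). */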
static struct ibt_clnt_modinfo_s rib_modinfo = {
        IBTI_V_CURR,
        IBT_GENERIC,
        rib_async_handler,      /* async event handler */
        NULL,                   /* Memory Region Handler */
        "nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
        dev_info_t      *rpcib_dip;
        kmutex_t        rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to increase
 * the amount of debugging messages produced.
 */
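/*
 * For example, a hypothetical /etc/system entry to enable
 * verbose debugging:
 *
 *      set rpcib:rib_debug = 2
 */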
int rib_debug = 0;

int
_init(void)
{
        int error;

        error = mod_install((struct modlinkage *)&rib_modlinkage);
        if (error != 0) {
                /*
                 * Could not load module
                 */
                return (error);
        }
        mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
        return (0);
}

int
_fini()
{
        int status;

        /*
         * Remove module
         */
        if ((status = mod_remove(&rib_modlinkage)) != 0) {
                return (status);
        }
        mutex_destroy(&plugin_state_lock);
        return (0);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&rib_modlinkage, modinfop));
}

 468 
 469 /*
 470  * rpcib_getinfo()
 471  * Given the device number, return the devinfo pointer or the
 472  * instance number.
 473  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 474  */
 475 
 476 /*ARGSUSED*/
 477 static int
 478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
 479 {
 480         int ret = DDI_SUCCESS;
 481 
 482         switch (cmd) {
 483         case DDI_INFO_DEVT2DEVINFO:
 484                 if (rpcib.rpcib_dip != NULL)
 485                         *result = rpcib.rpcib_dip;
 486                 else {
 487                         *result = NULL;
 488                         ret = DDI_FAILURE;
 489                 }
 490                 break;
 491 
 492         case DDI_INFO_DEVT2INSTANCE:
 493                 *result = NULL;
 494                 break;
 495 
 496         default:
 497                 ret = DDI_FAILURE;
 498         }
 499         return (ret);
 500 }

static void
rpcib_free_hca_list()
{
        rib_hca_t *hca, *hcap;

        rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
        hca = rib_stat->hcas_list;
        rib_stat->hcas_list = NULL;
        rw_exit(&rib_stat->hcas_list_lock);
        while (hca != NULL) {
                rw_enter(&hca->state_lock, RW_WRITER);
                hcap = hca;
                hca = hca->next;
                rib_stat->nhca_inited--;
                rib_mod.rdma_count--;
                hcap->state = HCA_DETACHED;
                rw_exit(&hcap->state_lock);
                rib_stop_hca_services(hcap);

                kmem_free(hcap, sizeof (*hcap));
        }
}

static rdma_stat
rpcib_free_service_list()
{
        rib_service_t *service;
        ibt_status_t ret;

        rw_enter(&rib_stat->service_list_lock, RW_WRITER);
        while (rib_stat->service_list != NULL) {
                service = rib_stat->service_list;
                ret = ibt_unbind_all_services(service->srv_hdl);
                if (ret != IBT_SUCCESS) {
                        rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
                        cmn_err(CE_NOTE, "rpcib_free_service_list: "
                            "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
                        return (RDMA_FAILED);
                }
                ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
                    service->srv_hdl);
                if (ret != IBT_SUCCESS) {
                        rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
                        cmn_err(CE_NOTE, "rpcib_free_service_list: "
                            "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
                        return (RDMA_FAILED);
                }
                rib_stat->service_list = service->next;
                kmem_free(service, sizeof (rib_service_t));
        }
        rw_exit(&rib_stat->service_list_lock);

        return (RDMA_SUCCESS);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        ibt_status_t    ibt_status;
        rdma_stat       r_status;

        switch (cmd) {
        case DDI_ATTACH:
                break;
        case DDI_RESUME:
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }

        mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

        mutex_enter(&rpcib.rpcib_mutex);
        if (rpcib.rpcib_dip != NULL) {
                mutex_exit(&rpcib.rpcib_mutex);
                return (DDI_FAILURE);
        }
        rpcib.rpcib_dip = dip;
        mutex_exit(&rpcib.rpcib_mutex);
        /*
         * Create the "rpcib" minor-node.
         */
        if (ddi_create_minor_node(dip,
            "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
                /* No cmn_err here; the message would print on the console */
                return (DDI_FAILURE);
        }

        if (rib_stat == NULL) {
                rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
                mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
                rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
                mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
        }

        rib_stat->hca_count = ibt_get_hca_list(NULL);
        if (rib_stat->hca_count < 1) {
                mutex_destroy(&rib_stat->listen_lock);
                rw_destroy(&rib_stat->hcas_list_lock);
                mutex_destroy(&rib_stat->open_hca_lock);
                kmem_free(rib_stat, sizeof (*rib_stat));
                rib_stat = NULL;
                return (DDI_FAILURE);
        }

        ibt_status = ibt_attach(&rib_modinfo, dip,
            (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

        if (ibt_status != IBT_SUCCESS) {
                mutex_destroy(&rib_stat->listen_lock);
                rw_destroy(&rib_stat->hcas_list_lock);
                mutex_destroy(&rib_stat->open_hca_lock);
                kmem_free(rib_stat, sizeof (*rib_stat));
                rib_stat = NULL;
                return (DDI_FAILURE);
        }

        rib_stat->service_list = NULL;
        rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
        mutex_enter(&rib_stat->open_hca_lock);
        if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
                mutex_exit(&rib_stat->open_hca_lock);
                goto open_fail;
        }
        mutex_exit(&rib_stat->open_hca_lock);

        if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
            DDI_PROP_SUCCESS) {
                cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
                    "failed.");
                goto register_fail;
        }

        /*
         * Register with rdmatf
         */
        r_status = rdma_register_mod(&rib_mod);
        if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
                cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
                    "status = %d", r_status);
                goto register_fail;
        }

        return (DDI_SUCCESS);

register_fail:

open_fail:
        (void) ibt_detach(rib_stat->ibt_clnt_hdl);
        rpcib_free_hca_list();
        (void) rpcib_free_service_list();
        mutex_destroy(&rib_stat->listen_lock);
        rw_destroy(&rib_stat->hcas_list_lock);
        mutex_destroy(&rib_stat->open_hca_lock);
        rw_destroy(&rib_stat->service_list_lock);
        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;
        return (DDI_FAILURE);
}

/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        switch (cmd) {

        case DDI_DETACH:
                break;

        case DDI_SUSPEND:
        default:
                return (DDI_FAILURE);
        }

        /*
         * Detach the hca and free resources
         */
        mutex_enter(&plugin_state_lock);
        plugin_state = NO_ACCEPT;
        mutex_exit(&plugin_state_lock);

        if (rpcib_free_service_list() != RDMA_SUCCESS)
                return (DDI_FAILURE);
        rpcib_free_hca_list();

        (void) ibt_detach(rib_stat->ibt_clnt_hdl);
        mutex_destroy(&rib_stat->listen_lock);
        rw_destroy(&rib_stat->hcas_list_lock);
        mutex_destroy(&rib_stat->open_hca_lock);
        rw_destroy(&rib_stat->service_list_lock);

        kmem_free(rib_stat, sizeof (*rib_stat));
        rib_stat = NULL;

        mutex_enter(&rpcib.rpcib_mutex);
        rpcib.rpcib_dip = NULL;
        mutex_exit(&rpcib.rpcib_mutex);
        mutex_destroy(&rpcib.rpcib_mutex);
        return (DDI_SUCCESS);
}


static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);


/*
 * One CQ pair per HCA
 */
static rdma_stat
rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
        rib_cq_t **cqp)
{
        rib_cq_t        *cq;
        ibt_cq_attr_t   cq_attr;
        uint32_t        real_size;
        ibt_status_t    status;
        rdma_stat       error = RDMA_SUCCESS;

        cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
        cq->rib_hca = hca;
        bzero(&cq_attr, sizeof (cq_attr));
        cq_attr.cq_size = cq_size;
        cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
        status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
            &real_size);
        if (status != IBT_SUCCESS) {
                cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
                    " status=%d", status);
                error = RDMA_FAILED;
                goto fail;
        }
        ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
        /*
         * Enable CQ callbacks. CQ callbacks are single-shot
         * (i.e., you have to call ibt_enable_cq_notify()
         * after each callback to get another one).
         */
        status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
        if (status != IBT_SUCCESS) {
                cmn_err(CE_WARN, "rib_create_cq: "
                    "enable_cq_notify failed, status %d", status);
                error = RDMA_FAILED;
                goto fail;
        }
        *cqp = cq;

        return (error);
fail:
        if (cq->rib_cq_hdl)
                (void) ibt_free_cq(cq->rib_cq_hdl);
        if (cq)
                kmem_free(cq, sizeof (rib_cq_t));
        return (error);
}

/*
 * rpcib_find_hca
 *
 * Caller must already hold hcas_list_lock when calling
 * this function.
 */
static rib_hca_t *
rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
{
        rib_hca_t *hca = ribstat->hcas_list;

        while (hca && hca->hca_guid != guid)
                hca = hca->next;

        return (hca);
}

static rdma_stat
rpcib_open_hcas(rpcib_state_t *ribstat)
{
        rib_hca_t               *hca;
        ibt_status_t            ibt_status;
        rdma_stat               status;
        ibt_hca_portinfo_t      *pinfop;
        ibt_pd_flags_t          pd_flags = IBT_PD_NO_FLAGS;
        uint_t                  size, cq_size;
        int                     i;
        kstat_t *ksp;
        cache_avl_struct_t example_avl_node;
        char rssc_name[32];
        int old_nhca_inited = ribstat->nhca_inited;
        ib_guid_t               *hca_guids;

        ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

        ribstat->hca_count = ibt_get_hca_list(&hca_guids);
        if (ribstat->hca_count == 0)
                return (RDMA_FAILED);

        rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
        /*
         * Open a hca and setup for RDMA
         */
        for (i = 0; i < ribstat->hca_count; i++) {
                if (rpcib_find_hca(ribstat, hca_guids[i]))
                        continue;
                hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);

                ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
                    hca_guids[i], &hca->hca_hdl);
                if (ibt_status != IBT_SUCCESS) {
                        kmem_free(hca, sizeof (rib_hca_t));
                        continue;
                }
                hca->hca_guid = hca_guids[i];
                hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
                hca->state = HCA_INITED;

                /*
                 * query HCA info
                 */
                ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
                if (ibt_status != IBT_SUCCESS) {
                        goto fail1;
                }

                /*
                 * One PD (Protection Domain) per HCA.
                 * A qp is allowed to access a memory region
                 * only when it's in the same PD as that of
                 * the memory region.
                 */
                ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
                if (ibt_status != IBT_SUCCESS) {
                        goto fail1;
                }

                /*
                 * query HCA ports
                 */
                ibt_status = ibt_query_hca_ports(hca->hca_hdl,
                    0, &pinfop, &hca->hca_nports, &size);
                if (ibt_status != IBT_SUCCESS) {
                        goto fail2;
                }
                hca->hca_ports = pinfop;
                hca->hca_pinfosz = size;
                pinfop = NULL;

                cq_size = DEF_CQ_SIZE; /* default cq size */
                /*
                 * Create 2 pairs of CQs (one pair for the client
                 * and the other pair for the server) on this HCA.
                 * If the number of QPs gets too large, several
                 * more CQs will be needed.
                 */
                status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
                    &hca->svc_rcq);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
                    &hca->svc_scq);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
                    &hca->clnt_rcq);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
                    &hca->clnt_scq);
                if (status != RDMA_SUCCESS) {
                        goto fail3;
                }

                /*
                 * Create buffer pools.
                 * Note rib_rbuf_create also allocates memory windows.
                 */
                hca->recv_pool = rib_rbufpool_create(hca,
                    RECV_BUFFER, rib_max_rbufs);
                if (hca->recv_pool == NULL) {
                        goto fail3;
                }

                hca->send_pool = rib_rbufpool_create(hca,
                    SEND_BUFFER, rib_max_rbufs);
                if (hca->send_pool == NULL) {
                        rib_rbufpool_destroy(hca, RECV_BUFFER);
                        goto fail3;
                }

                if (hca->server_side_cache == NULL) {
                        (void) sprintf(rssc_name,
                            "rib_srvr_cache_%llx",
                            (long long unsigned int) hca->hca_guid);
                        hca->server_side_cache = kmem_cache_create(
                            rssc_name,
                            sizeof (cache_avl_struct_t), 0,
                            NULL,
                            NULL,
                            rib_server_side_cache_reclaim,
                            hca, NULL, 0);
                }

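                /*
                 * The offset argument to avl_create() below is the byte
                 * offset of avl_link within cache_avl_struct_t, computed
                 * with pointer arithmetic (equivalent to
                 * offsetof(cache_avl_struct_t, avl_link)).
                 */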
                avl_create(&hca->avl_tree,
                    avl_compare,
                    sizeof (cache_avl_struct_t),
                    (uint_t)(uintptr_t)&example_avl_node.avl_link-
                    (uint_t)(uintptr_t)&example_avl_node);

                rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
                    hca->iblock);
                rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
                rw_init(&hca->avl_rw_lock,
                    NULL, RW_DRIVER, hca->iblock);
                mutex_init(&hca->cache_allocation_lock,
                    NULL, MUTEX_DRIVER, NULL);
                hca->avl_init = TRUE;

                /* Create kstats for the cache */
                ASSERT(INGLOBALZONE(curproc));

                if (!stats_enabled) {
                        ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
                            KSTAT_TYPE_NAMED,
                            sizeof (rpcib_kstat) / sizeof (kstat_named_t),
                            KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
                            GLOBAL_ZONEID);
                        if (ksp) {
                                ksp->ks_data = (void *) &rpcib_kstat;
                                ksp->ks_update = rpcib_cache_kstat_update;
                                kstat_install(ksp);
                                stats_enabled = TRUE;
                        }
                }
                if (hca->cleanup_helper == NULL) {
                        char tq_name[sizeof (hca->hca_guid) * 2 + 1];

                        (void) snprintf(tq_name, sizeof (tq_name), "%llX",
                            (unsigned long long int) hca->hca_guid);
                        hca->cleanup_helper = ddi_taskq_create(NULL,
                            tq_name, 1, TASKQ_DEFAULTPRI, 0);
                }

                mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
                cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
                rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
                    hca->iblock);
                rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
                    hca->iblock);
                mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
                hca->inuse = TRUE;

                hca->next = ribstat->hcas_list;
                ribstat->hcas_list = hca;
                ribstat->nhca_inited++;
                ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
                continue;

fail3:
                ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
                (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
                (void) ibt_close_hca(hca->hca_hdl);
                kmem_free(hca, sizeof (rib_hca_t));
        }
        rw_exit(&ribstat->hcas_list_lock);
        ibt_free_hca_list(hca_guids, ribstat->hca_count);
        rib_mod.rdma_count = rib_stat->nhca_inited;

        /*
         * return success if at least one new hca has been configured.
         */
        if (ribstat->nhca_inited != old_nhca_inited)
                return (RDMA_SUCCESS);
        else
                return (RDMA_FAILED);
}

/*
 * Callback routines
 */

/*
 * SCQ handlers
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct send_wid *wd;
        CONN            *conn;
        rib_qp_t        *qp;
        int             i;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /*
                 * Got a send completion
                 */
                if (wc.wc_id != RDMA_DUMMY_WRID) {
                        wd = (struct send_wid *)(uintptr_t)wc.wc_id;
                        qp = wd->qp;
                        conn = qptoc(qp);

                        mutex_enter(&wd->sendwait_lock);
                        switch (wc.wc_status) {
                        case IBT_WC_SUCCESS:
                                wd->status = RDMA_SUCCESS;
                                break;
                        default:
/*
 *    RC Send Q Error Code              Local state     Remote State
 *    ====================              ===========     ============
 *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
 *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
 *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
 *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
 *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
 *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
 *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
 *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
 */
                                /*
                                 * Channel in error state. Set connection to
                                 * ERROR and cleanup will happen either from
                                 * conn_release or from rib_conn_get
                                 */
                                wd->status = RDMA_FAILED;
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        if (wd->cv_sig == 1) {
                                /*
                                 * Notify poster
                                 */
                                cv_signal(&wd->wait_cv);
                                mutex_exit(&wd->sendwait_lock);
                        } else {
                                /*
                                 * Poster not waiting for notification.
                                 * Free the send buffers and send_wid
                                 */
                                for (i = 0; i < wd->nsbufs; i++) {
                                        rib_rbuf_free(qptoc(wd->qp),
                                            SEND_BUFFER,
                                            (void *)(uintptr_t)wd->sbufaddr[i]);
                                }

                                /* decrement the send ref count */
                                rib_send_rele(qp);

                                mutex_exit(&wd->sendwait_lock);
                                (void) rib_free_sendwait(wd);
                        }
                }
        }
}

/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct send_wid *wd;
        rib_qp_t        *qp;
        CONN            *conn;
        int             i;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /*
                 * Got a send completion
                 */
                if (wc.wc_id != RDMA_DUMMY_WRID) {
                        wd = (struct send_wid *)(uintptr_t)wc.wc_id;
                        qp = wd->qp;
                        conn = qptoc(qp);
                        mutex_enter(&wd->sendwait_lock);

                        switch (wc.wc_status) {
                        case IBT_WC_SUCCESS:
                                wd->status = RDMA_SUCCESS;
                                break;
                        default:
                                /*
                                 * Channel in error state. Set connection to
                                 * ERROR and cleanup will happen either from
                                 * conn_release or conn timeout.
                                 */
                                wd->status = RDMA_FAILED;
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        if (wd->cv_sig == 1) {
                                /*
                                 * Update completion status and notify poster
                                 */
                                cv_signal(&wd->wait_cv);
                                mutex_exit(&wd->sendwait_lock);
                        } else {
                                /*
                                 * Poster not waiting for notification.
                                 * Free the send buffers and send_wid
                                 */
                                for (i = 0; i < wd->nsbufs; i++) {
                                        rib_rbuf_free(qptoc(wd->qp),
                                            SEND_BUFFER,
                                            (void *)(uintptr_t)wd->sbufaddr[i]);
                                }

                                /* decrement the send ref count */
                                rib_send_rele(qp);

                                mutex_exit(&wd->sendwait_lock);
                                (void) rib_free_sendwait(wd);
                        }
                }
        }
}

/*
 * RCQ handler
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct recv_wid *rwid;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
                qp = rwid->qp;

                if (wc.wc_status == IBT_WC_SUCCESS) {
                        XDR     inxdrs, *xdrs;
                        uint_t  xid, vers, op, find_xid = 0;
                        struct reply    *r;
                        CONN *conn = qptoc(qp);
                        uint32_t rdma_credit = 0;

                        xdrs = &inxdrs;
                        xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                            wc.wc_bytes_xfer, XDR_DECODE);
                        /*
                         * Treat xid as opaque (xid is the first entity
                         * in the rpc rdma message).
                         */
                        xid = *(uint32_t *)(uintptr_t)rwid->addr;

                        /* Skip xid and set the xdr position accordingly. */
                        XDR_SETPOS(xdrs, sizeof (uint32_t));
                        (void) xdr_u_int(xdrs, &vers);
                        (void) xdr_u_int(xdrs, &rdma_credit);
                        (void) xdr_u_int(xdrs, &op);
                        XDR_DESTROY(xdrs);

                        if (vers != RPCRDMA_VERS) {
                                /*
                                 * Invalid RPC/RDMA version. Cannot
                                 * interoperate.  Set connection to
                                 * ERROR state and bail out.
                                 */
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)rwid->addr);
                                rib_free_wid(rwid);
                                rib_recv_rele(qp);
                                continue;
                        }

                        mutex_enter(&qp->replylist_lock);
                        for (r = qp->replylist; r != NULL; r = r->next) {
                                if (r->xid == xid) {
                                        find_xid = 1;
                                        switch (op) {
                                        case RDMA_MSG:
                                        case RDMA_NOMSG:
                                        case RDMA_MSGP:
                                                r->status = RDMA_SUCCESS;
                                                r->vaddr_cq = rwid->addr;
                                                r->bytes_xfer =
                                                    wc.wc_bytes_xfer;
                                                cv_signal(&r->wait_cv);
                                                break;
                                        default:
                                                rib_rbuf_free(qptoc(qp),
                                                    RECV_BUFFER,
                                                    (void *)(uintptr_t)
                                                    rwid->addr);
                                                break;
                                        }
                                        break;
                                }
                        }
                        mutex_exit(&qp->replylist_lock);
                        if (find_xid == 0) {
                                /* RPC caller not waiting for reply */

                                DTRACE_PROBE1(rpcib__i__nomatchxid1,
                                    int, xid);

                                rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                                    (void *)(uintptr_t)rwid->addr);
                        }
                } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
                        CONN *conn = qptoc(qp);

                        /*
                         * Connection being flushed. Just free
                         * the posted buffer
                         */
                        rib_rbuf_free(conn, RECV_BUFFER,
                            (void *)(uintptr_t)rwid->addr);
                } else {
                        CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code                Local state     Remote State
 *  ====================                ===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
                        /*
                         * Channel in error state. Set connection
                         * in ERROR state.
                         */
                        mutex_enter(&conn->c_lock);
                        if (conn->c_state != C_DISCONN_PEND)
                                conn->c_state = C_ERROR_CONN;
                        mutex_exit(&conn->c_lock);
                        rib_rbuf_free(conn, RECV_BUFFER,
                            (void *)(uintptr_t)rwid->addr);
                }
                rib_free_wid(rwid);
                rib_recv_rele(qp);
        }
}

/* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        rdma_recv_data_t *rdp;
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct svc_recv *s_recvp;
        CONN            *conn;
        mblk_t          *mp;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
                qp = s_recvp->qp;
                conn = qptoc(qp);

                if (wc.wc_status == IBT_WC_SUCCESS) {
                        XDR     inxdrs, *xdrs;
                        uint_t  xid, vers, op;
                        uint32_t rdma_credit;

                        xdrs = &inxdrs;
                        /* s_recvp->vaddr stores data */
                        xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                            wc.wc_bytes_xfer, XDR_DECODE);

                        /*
                         * Treat xid as opaque (xid is the first entity
                         * in the rpc rdma message).
                         */
                        xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
                        /* Skip xid and set the xdr position accordingly. */
                        XDR_SETPOS(xdrs, sizeof (uint32_t));
                        if (!xdr_u_int(xdrs, &vers) ||
                            !xdr_u_int(xdrs, &rdma_credit) ||
                            !xdr_u_int(xdrs, &op)) {
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                XDR_DESTROY(xdrs);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }
                        XDR_DESTROY(xdrs);

                        if (vers != RPCRDMA_VERS) {
                                /*
                                 * Invalid RPC/RDMA version.
                                 * Drop rpc rdma message.
                                 */
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }
                        /*
                         * Is this for RDMA_DONE?
                         */
                        if (op == RDMA_DONE) {
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                /*
                                 * Wake up the thread waiting on
                                 * a RDMA_DONE for xid
                                 */
                                mutex_enter(&qp->rdlist_lock);
                                rdma_done_notify(qp, xid);
                                mutex_exit(&qp->rdlist_lock);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }
1400                         mutex_enter(&plugin_state_lock);
1401                         mutex_enter(&conn->c_lock);
1402                         if ((plugin_state == ACCEPT) &&
1403                             (conn->c_state == C_CONNECTED)) {
1404                                 conn->c_ref++;
1405                                 mutex_exit(&conn->c_lock);
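                                /*
                                 * allocb() can fail transiently under
                                 * memory pressure; strwaitbuf() sleeps
                                 * until a buffer of the requested size
                                 * should be available, so retry until
                                 * the allocation succeeds.
                                 */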
1406                                 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1407                                     == NULL)
1408                                         (void) strwaitbuf(
1409                                             sizeof (*rdp), BPRI_LO);
1410                                 /*
1411                                  * Plugin is in accept state, hence the master
1412                                  * transport queue for this is still accepting
1413                                  * requests. Hence we can call svc_queuereq to
1414                                  * queue this recieved msg.
1415                                  */
1416                                 rdp = (rdma_recv_data_t *)mp->b_rptr;
1417                                 rdp->conn = conn;
1418                                 rdp->rpcmsg.addr =
1419                                     (caddr_t)(uintptr_t)s_recvp->vaddr;
1420                                 rdp->rpcmsg.type = RECV_BUFFER;
1421                                 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1422                                 rdp->status = wc.wc_status;
1423                                 mp->b_wptr += sizeof (*rdp);
1424                                 (void) svc_queuereq((queue_t *)rib_stat->q, mp,
1425                                     FALSE);
1426                                 mutex_exit(&plugin_state_lock);
1427                         } else {
1428                                 /*
1429                                  * The master transport for this is going
1430                                  * away and the queue is not accepting anymore
1431                                  * requests for krpc, so don't do anything, just
1432                                  * free the msg.
1433                                  */
1434                                 mutex_exit(&conn->c_lock);
1435                                 mutex_exit(&plugin_state_lock);
1436                                 rib_rbuf_free(conn, RECV_BUFFER,
1437                                     (void *)(uintptr_t)s_recvp->vaddr);
1438                         }
1439                 } else {
1440                         rib_rbuf_free(conn, RECV_BUFFER,
1441                             (void *)(uintptr_t)s_recvp->vaddr);
1442                 }
1443                 rib_recv_rele(qp);
1444                 (void) rib_free_svc_recv(s_recvp);
1445         }
1446 }
1447 
static void
rib_attach_hca(void)
{
1451         mutex_enter(&rib_stat->open_hca_lock);
1452         (void) rpcib_open_hcas(rib_stat);
1453         rib_listen(NULL);
1454         mutex_exit(&rib_stat->open_hca_lock);
1455 }
1456 
1457 /*
1458  * Handles DR event of IBT_HCA_DETACH_EVENT.
1459  */
1460 /* ARGSUSED */
1461 static void
1462 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1463         ibt_async_code_t code, ibt_async_event_t *event)
1464 {
1465         switch (code) {
1466         case IBT_HCA_ATTACH_EVENT:
1467                 rib_attach_hca();
1468                 break;
1469         case IBT_HCA_DETACH_EVENT:
1470                 rib_detach_hca(hca_hdl);
1471 #ifdef DEBUG
1472                 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1473 #endif
1474                 break;
1475         case IBT_EVENT_PORT_UP:
1476                 /*
1477                  * A port is up. We should call rib_listen() since there is
1478                  * a chance that rib_listen() may have failed during
1479                  * rib_attach_hca() because the port had not been up yet.
1480                  */
1481                 rib_listen(NULL);
1482 #ifdef DEBUG
1483                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1484 #endif
1485                 break;
1486 #ifdef DEBUG
1487         case IBT_EVENT_PATH_MIGRATED:
1488                 cmn_err(CE_NOTE, "rib_async_handler(): "
1489                     "IBT_EVENT_PATH_MIGRATED\n");
1490                 break;
1491         case IBT_EVENT_SQD:
1492                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1493                 break;
1494         case IBT_EVENT_COM_EST:
1495                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1496                 break;
1497         case IBT_ERROR_CATASTROPHIC_CHAN:
1498                 cmn_err(CE_NOTE, "rib_async_handler(): "
1499                     "IBT_ERROR_CATASTROPHIC_CHAN\n");
1500                 break;
1501         case IBT_ERROR_INVALID_REQUEST_CHAN:
1502                 cmn_err(CE_NOTE, "rib_async_handler(): "
1503                     "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1504                 break;
1505         case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1506                 cmn_err(CE_NOTE, "rib_async_handler(): "
1507                     "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1508                 break;
1509         case IBT_ERROR_PATH_MIGRATE_REQ:
1510                 cmn_err(CE_NOTE, "rib_async_handler(): "
1511                     "IBT_ERROR_PATH_MIGRATE_REQ\n");
1512                 break;
1513         case IBT_ERROR_CQ:
1514                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1515                 break;
1516         case IBT_ERROR_PORT_DOWN:
1517                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1518                 break;
1519         case IBT_ASYNC_OPAQUE1:
1520                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1521                 break;
1522         case IBT_ASYNC_OPAQUE2:
1523                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1524                 break;
1525         case IBT_ASYNC_OPAQUE3:
1526                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1527                 break;
1528         case IBT_ASYNC_OPAQUE4:
1529                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1530                 break;
1531 #endif
1532         default:
1533                 break;
1534         }
1535 }
1536 
1537 /*
1538  * Client's reachable function.
1539  */
1540 static rdma_stat
1541 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1542 {
1543         rdma_stat       status;
1544         rpcib_ping_t    rpt;
1545         struct netbuf   saddr;
1546         CONN            *conn;
1547 
1548         bzero(&saddr, sizeof (struct netbuf));
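        /*
         * saddr is left zeroed; rib_connect() resolves a suitable
         * local IB source address (and HCA) during path lookup.
         */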
1549         status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1550 
1551         if (status == RDMA_SUCCESS) {
1552                 *handle = (void *)rpt.hca;
1553                 /* release the reference */
1554                 (void) rib_conn_release(conn);
1555                 return (RDMA_SUCCESS);
1556         } else {
1557                 *handle = NULL;
1558                 DTRACE_PROBE(rpcib__i__pingfailed);
1559                 return (RDMA_FAILED);
1560         }
1561 }
1562 
/*
 * Client side qp creation. The RC channel itself is allocated
 * later, in rib_conn_to_srv(), when the connection is opened.
 */
1564 static rdma_stat
1565 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1566 {
1567         rib_qp_t        *kqp = NULL;
1568         CONN            *conn;
1569         rdma_clnt_cred_ctrl_t *cc_info;
1570 
1571         ASSERT(qp != NULL);
1572         *qp = NULL;
1573 
1574         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1575         conn = qptoc(kqp);
1576         kqp->hca = hca;
1577         kqp->rdmaconn.c_rdmamod = &rib_mod;
1578         kqp->rdmaconn.c_private = (caddr_t)kqp;
1579 
1580         kqp->mode = RIB_CLIENT;
1581         kqp->chan_flags = IBT_BLOCKING;
1582         conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1583         bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1584         conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1585         /*
1586          * Initialize
1587          */
1588         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1589         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591         cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592         mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1595         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1597         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1598         /*
1599          * Initialize the client credit control
1600          * portion of the rdmaconn struct.
1601          */
1602         kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1603         cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1604         cc_info->clnt_cc_granted_ops = 0;
1605         cc_info->clnt_cc_in_flight_ops = 0;
1606         cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1607 
1608         *qp = kqp;
1609         return (RDMA_SUCCESS);
1610 }
1611 
1612 /* Server side qp creation */
1613 static rdma_stat
1614 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1615 {
1616         rib_qp_t        *kqp = NULL;
1617         ibt_chan_sizes_t        chan_sizes;
1618         ibt_rc_chan_alloc_args_t        qp_attr;
1619         ibt_status_t            ibt_status;
1620         rdma_srv_cred_ctrl_t *cc_info;
1621 
1622         *qp = NULL;
1623 
1624         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1625         kqp->hca = hca;
1626         kqp->port_num = port;
1627         kqp->rdmaconn.c_rdmamod = &rib_mod;
1628         kqp->rdmaconn.c_private = (caddr_t)kqp;
1629 
1630         /*
1631          * Create the qp handle
1632          */
1633         bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1634         qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1635         qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1636         qp_attr.rc_pd = hca->pd_hdl;
1637         qp_attr.rc_hca_port_num = port;
1638         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1639         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1640         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1641         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1642         qp_attr.rc_clone_chan = NULL;
1643         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1644         qp_attr.rc_flags = IBT_WR_SIGNALED;
1645 
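        /*
         * Check the HCA state under state_lock so that a concurrent
         * HCA detach cannot tear the HCA down while the channel is
         * being allocated.
         */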
1646         rw_enter(&hca->state_lock, RW_READER);
1647         if (hca->state != HCA_DETACHED) {
1648                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1649                     IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1650                     &chan_sizes);
1651         } else {
1652                 rw_exit(&hca->state_lock);
1653                 goto fail;
1654         }
1655         rw_exit(&hca->state_lock);
1656 
1657         if (ibt_status != IBT_SUCCESS) {
1658                 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1659                     int, ibt_status);
1660                 goto fail;
1661         }
1662 
1663         kqp->mode = RIB_SERVER;
1664         kqp->chan_flags = IBT_BLOCKING;
1665         kqp->q = q;  /* server ONLY */
1666 
1667         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1668         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671         cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1672         mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1674         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1676         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1677         /*
1678          * Set the private data area to qp to be used in callbacks
1679          */
1680         ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1681         kqp->rdmaconn.c_state = C_CONNECTED;
1682 
1683         /*
1684          * Initialize the server credit control
1685          * portion of the rdmaconn struct.
1686          */
1687         kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1688         cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1689         cc_info->srv_cc_buffers_granted = preposted_rbufs;
1690         cc_info->srv_cc_cur_buffers_used = 0;
1691         cc_info->srv_cc_posted = preposted_rbufs;
1692 
1693         *qp = kqp;
1694 
1695         return (RDMA_SUCCESS);
1696 fail:
1697         if (kqp)
1698                 kmem_free(kqp, sizeof (rib_qp_t));
1699 
1700         return (RDMA_FAILED);
1701 }
1702 
1703 /* ARGSUSED */
1704 ibt_cm_status_t
1705 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1706     ibt_cm_return_args_t *ret_args, void *priv_data,
1707     ibt_priv_data_len_t len)
1708 {
1709         rib_hca_t       *hca;
1710 
1711         hca = (rib_hca_t *)clnt_hdl;
1712 
1713         switch (event->cm_type) {
1714 
1715         /* got a connection close event */
1716         case IBT_CM_EVENT_CONN_CLOSED:
1717         {
1718                 CONN    *conn;
1719                 rib_qp_t *qp;
1720 
1721                 /* check reason why connection was closed */
1722                 switch (event->cm_event.closed) {
1723                 case IBT_CM_CLOSED_DREP_RCVD:
1724                 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1725                 case IBT_CM_CLOSED_DUP:
1726                 case IBT_CM_CLOSED_ABORT:
1727                 case IBT_CM_CLOSED_ALREADY:
1728                         /*
1729                          * These cases indicate the local end initiated
1730                          * the closing of the channel. Nothing to do here.
1731                          */
1732                         break;
                default:
                        /*
                         * The reason for the CONN_CLOSED event must be one
                         * of IBT_CM_CLOSED_DREQ_RCVD, IBT_CM_CLOSED_REJ_RCVD
                         * or IBT_CM_CLOSED_STALE. These indicate cases where
                         * the remote end is closing the channel. In these
                         * cases free the channel and transition to the
                         * error state.
                         */
1742                         qp = ibt_get_chan_private(event->cm_channel);
1743                         conn = qptoc(qp);
1744                         mutex_enter(&conn->c_lock);
1745                         if (conn->c_state == C_DISCONN_PEND) {
1746                                 mutex_exit(&conn->c_lock);
1747                                 break;
1748                         }
1749 
1750                         conn->c_state = C_ERROR_CONN;
1751 
1752                         /*
1753                          * Free the conn if c_ref is down to 0 already
1754                          */
1755                         if (conn->c_ref == 0) {
1756                                 /*
1757                                  * Remove from list and free conn
1758                                  */
1759                                 conn->c_state = C_DISCONN_PEND;
1760                                 mutex_exit(&conn->c_lock);
1761                                 rw_enter(&hca->state_lock, RW_READER);
1762                                 if (hca->state != HCA_DETACHED)
1763                                         (void) rib_disconnect_channel(conn,
1764                                             &hca->cl_conn_list);
1765                                 rw_exit(&hca->state_lock);
1766                         } else {
1767                                 /*
1768                                  * conn will be freed when c_ref goes to 0.
1769                                  * Indicate to cleaning thread not to close
1770                                  * the connection, but just free the channel.
1771                                  */
1772                                 conn->c_flags |= C_CLOSE_NOTNEEDED;
1773                                 mutex_exit(&conn->c_lock);
1774                         }
1775 #ifdef DEBUG
1776                         if (rib_debug)
1777                                 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1778                                     "(CONN_CLOSED) channel disconnected");
1779 #endif
1780                         break;
1781                 }
1782                 break;
1783         }
1784         default:
1785                 break;
1786         }
1787         return (IBT_CM_ACCEPT);
1788 }
1789 
1790 /*
1791  * Connect to the server.
1792  */
1793 rdma_stat
1794 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1795 {
1796         ibt_chan_open_args_t    chan_args;      /* channel args */
1797         ibt_chan_sizes_t        chan_sizes;
1798         ibt_rc_chan_alloc_args_t        qp_attr;
1799         ibt_status_t            ibt_status;
1800         ibt_rc_returns_t        ret_args;       /* conn reject info */
1801         int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1802         ibt_ip_cm_info_t        ipcm_info;
        uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];

1806         (void) bzero(&chan_args, sizeof (chan_args));
1807         (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1808         (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1809 
1810         ipcm_info.src_addr.family = rptp->srcip.family;
1811         switch (ipcm_info.src_addr.family) {
1812         case AF_INET:
1813                 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1814                 break;
1815         case AF_INET6:
1816                 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1817                 break;
1818         }
1819 
        ipcm_info.dst_addr.family = rptp->dstip.family;
1821         switch (ipcm_info.dst_addr.family) {
1822         case AF_INET:
1823                 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1824                 break;
1825         case AF_INET6:
1826                 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1827                 break;
1828         }
1829 
1830         ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1831 
1832         ibt_status = ibt_format_ip_private_data(&ipcm_info,
1833             IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1834 
        if (ibt_status != IBT_SUCCESS) {
                cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
                return (RDMA_FAILED);
        }
1839 
1840         qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1841         /* Alloc a RC channel */
1842         qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1843         qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1844         qp_attr.rc_pd = hca->pd_hdl;
1845         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1846         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1847         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1848         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1849         qp_attr.rc_clone_chan = NULL;
1850         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1851         qp_attr.rc_flags = IBT_WR_SIGNALED;
1852 
1853         rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1854         chan_args.oc_path = &rptp->path;
1855 
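        /*
         * CM parameters for the connection: rib_clnt_cm_handler fields
         * connection events, up to 4 RDMA reads may be outstanding in
         * each direction, and the IP-encoded private data identifies
         * the endpoints to the peer's RDMA IP CM service.
         */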
1856         chan_args.oc_cm_handler = rib_clnt_cm_handler;
1857         chan_args.oc_cm_clnt_private = (void *)hca;
1858         chan_args.oc_rdma_ra_out = 4;
1859         chan_args.oc_rdma_ra_in = 4;
1860         chan_args.oc_path_retry_cnt = 2;
1861         chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1862         chan_args.oc_priv_data = cmp_ip_pvt;
1863         chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1864 
1865 refresh:
1866         rw_enter(&hca->state_lock, RW_READER);
1867         if (hca->state != HCA_DETACHED) {
1868                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1869                     IBT_ACHAN_NO_FLAGS,
1870                     &qp_attr, &qp->qp_hdl,
1871                     &chan_sizes);
1872         } else {
1873                 rw_exit(&hca->state_lock);
1874                 return (RDMA_FAILED);
1875         }
1876         rw_exit(&hca->state_lock);
1877 
1878         if (ibt_status != IBT_SUCCESS) {
1879                 DTRACE_PROBE1(rpcib__i_conntosrv,
1880                     int, ibt_status);
1881                 return (RDMA_FAILED);
1882         }
1883 
1884         /* Connect to the Server */
1885         (void) bzero(&ret_args, sizeof (ret_args));
1886         mutex_enter(&qp->cb_lock);
1887         ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1888             IBT_BLOCKING, &chan_args, &ret_args);
1889         if (ibt_status != IBT_SUCCESS) {
1890                 DTRACE_PROBE2(rpcib__i_openrctosrv,
1891                     int, ibt_status, int, ret_args.rc_status);
1892 
1893                 (void) ibt_free_channel(qp->qp_hdl);
1894                 qp->qp_hdl = NULL;
1895                 mutex_exit(&qp->cb_lock);
1896                 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1897                     ret_args.rc_status == IBT_CM_CONN_STALE) {
1898                         /*
1899                          * Got IBT_CM_CONN_STALE probably because of stale
1900                          * data on the passive end of a channel that existed
1901                          * prior to reboot. Retry establishing a channel
1902                          * REFRESH_ATTEMPTS times, during which time the
1903                          * stale conditions on the server might clear up.
1904                          */
1905                         goto refresh;
1906                 }
1907                 return (RDMA_FAILED);
1908         }
1909         mutex_exit(&qp->cb_lock);
1910         /*
1911          * Set the private data area to qp to be used in callbacks
1912          */
1913         ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1914         return (RDMA_SUCCESS);
1915 }
1916 
1917 rdma_stat
1918 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1919 {
1920         uint_t                  i, addr_count;
1921         ibt_status_t            ibt_status;
1922         uint8_t                 num_paths_p;
1923         ibt_ip_path_attr_t      ipattr;
1924         ibt_path_ip_src_t       srcip;
1925         rpcib_ipaddrs_t         addrs4;
1926         rpcib_ipaddrs_t         addrs6;
1927         struct sockaddr_in      *sinp;
1928         struct sockaddr_in6     *sin6p;
1929         rdma_stat               retval = RDMA_FAILED;
1930         rib_hca_t *hca;
1931 
1932         if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1933                 return (RDMA_INVAL);
1934         ASSERT(raddr->buf != NULL);
1935 
1936         bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1937 
1938         if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1939             (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1940                 retval = RDMA_FAILED;
1941                 goto done2;
1942         }
1943 
1944         if (addr_type == AF_INET) {
1945                 addr_count = addrs4.ri_count;
1946                 sinp = (struct sockaddr_in *)raddr->buf;
1947                 rptp->dstip.family = AF_INET;
1948                 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1949                 sinp = addrs4.ri_list;
1950         } else {
1951                 addr_count = addrs6.ri_count;
1952                 sin6p = (struct sockaddr_in6 *)raddr->buf;
1953                 rptp->dstip.family = AF_INET6;
1954                 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1955                 sin6p = addrs6.ri_list;
1956         }
1957 
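        /*
         * For each attached HCA, try every local IB address as the
         * source until ibt_get_ip_paths() returns a path that resides
         * on that HCA (pi_hca_guid matches).
         */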
1958         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1959         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1960                 rw_enter(&hca->state_lock, RW_READER);
1961                 if (hca->state == HCA_DETACHED) {
1962                         rw_exit(&hca->state_lock);
1963                         continue;
1964                 }
1965 
1966                 ipattr.ipa_dst_ip       = &rptp->dstip;
1967                 ipattr.ipa_hca_guid     = hca->hca_guid;
1968                 ipattr.ipa_ndst         = 1;
1969                 ipattr.ipa_max_paths    = 1;
1970                 ipattr.ipa_src_ip.family = rptp->dstip.family;
1971                 for (i = 0; i < addr_count; i++) {
1972                         num_paths_p = 0;
1973                         if (addr_type == AF_INET) {
1974                                 ipattr.ipa_src_ip.un.ip4addr =
1975                                     sinp[i].sin_addr.s_addr;
1976                         } else {
1977                                 ipattr.ipa_src_ip.un.ip6addr =
1978                                     sin6p[i].sin6_addr;
1979                         }
1980                         bzero(&srcip, sizeof (ibt_path_ip_src_t));
1981 
1982                         ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1983                             IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1984                             &num_paths_p, &srcip);
1985                         if (ibt_status == IBT_SUCCESS &&
1986                             num_paths_p != 0 &&
1987                             rptp->path.pi_hca_guid == hca->hca_guid) {
1988                                 rptp->hca = hca;
1989                                 rw_exit(&hca->state_lock);
1990                                 if (addr_type == AF_INET) {
1991                                         rptp->srcip.family = AF_INET;
1992                                         rptp->srcip.un.ip4addr =
1993                                             srcip.ip_primary.un.ip4addr;
1994                                 } else {
1995                                         rptp->srcip.family = AF_INET6;
1996                                         rptp->srcip.un.ip6addr =
                                            srcip.ip_primary.un.ip6addr;
                                }
2000                                 retval = RDMA_SUCCESS;
2001                                 goto done1;
2002                         }
2003                 }
2004                 rw_exit(&hca->state_lock);
2005         }
2006 done1:
2007         rw_exit(&rib_stat->hcas_list_lock);
2008 done2:
2009         if (addrs4.ri_size > 0)
2010                 kmem_free(addrs4.ri_list, addrs4.ri_size);
2011         if (addrs6.ri_size > 0)
2012                 kmem_free(addrs6.ri_list, addrs6.ri_size);
2013         return (retval);
2014 }
2015 
2016 /*
2017  * Close channel, remove from connection list and
2018  * free up resources allocated for that channel.
2019  */
2020 rdma_stat
2021 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2022 {
2023         rib_qp_t        *qp = ctoqp(conn);
2024         rib_hca_t       *hca;
2025 
2026         mutex_enter(&conn->c_lock);
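        /*
         * Drop c_lock across untimeout(); untimeout() can wait for a
         * running timeout callback to finish, and that callback may
         * itself need c_lock.
         */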
2027         if (conn->c_timeout != NULL) {
2028                 mutex_exit(&conn->c_lock);
2029                 (void) untimeout(conn->c_timeout);
2030                 mutex_enter(&conn->c_lock);
2031         }
2032 
2033         while (conn->c_flags & C_CLOSE_PENDING) {
2034                 cv_wait(&conn->c_cv, &conn->c_lock);
2035         }
2036         mutex_exit(&conn->c_lock);
2037 
2038         /*
2039          * c_ref == 0 and connection is in C_DISCONN_PEND
2040          */
2041         hca = qp->hca;
2042         if (conn_list != NULL)
2043                 (void) rib_rm_conn(conn, conn_list);
2044 
2045         /*
2046          * There is only one case where we get here with
2047          * qp_hdl = NULL, which is during connection setup on
2048          * the client. In such a case there are no posted
2049          * send/recv buffers.
2050          */
2051         if (qp->qp_hdl != NULL) {
2052                 mutex_enter(&qp->posted_rbufs_lock);
2053                 while (qp->n_posted_rbufs)
2054                         cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2055                 mutex_exit(&qp->posted_rbufs_lock);
2056 
                mutex_enter(&qp->send_rbufs_lock);
                while (qp->n_send_rbufs)
                        cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
                mutex_exit(&qp->send_rbufs_lock);

                (void) ibt_free_channel(qp->qp_hdl);
                qp->qp_hdl = NULL;
2064         }
2065 
2066         ASSERT(qp->rdlist == NULL);
2067 
2068         if (qp->replylist != NULL) {
2069                 (void) rib_rem_replylist(qp);
2070         }
2071 
2072         cv_destroy(&qp->cb_conn_cv);
2073         cv_destroy(&qp->posted_rbufs_cv);
2074         cv_destroy(&qp->send_rbufs_cv);
2075         mutex_destroy(&qp->cb_lock);
2076         mutex_destroy(&qp->replylist_lock);
2077         mutex_destroy(&qp->posted_rbufs_lock);
2078         mutex_destroy(&qp->send_rbufs_lock);
2079         mutex_destroy(&qp->rdlist_lock);
2080 
2081         cv_destroy(&conn->c_cv);
2082         mutex_destroy(&conn->c_lock);
2083 
2084         if (conn->c_raddr.buf != NULL) {
2085                 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2086         }
2087         if (conn->c_laddr.buf != NULL) {
2088                 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2089         }
2090         if (conn->c_netid != NULL) {
2091                 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2092         }
2093         if (conn->c_addrmask.buf != NULL) {
2094                 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2095         }
2096 
2097         /*
2098          * Credit control cleanup.
2099          */
2100         if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2101                 rdma_clnt_cred_ctrl_t *cc_info;
2102                 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2103                 cv_destroy(&cc_info->clnt_cc_cv);
2104         }
2105 
2106         kmem_free(qp, sizeof (rib_qp_t));
2107 
2108         /*
2109          * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2110          * then the hca is no longer being used.
2111          */
2112         if (conn_list != NULL) {
2113                 rw_enter(&hca->state_lock, RW_READER);
2114                 if (hca->state == HCA_DETACHED) {
2115                         rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2116                         if (hca->srv_conn_list.conn_hd == NULL) {
2117                                 rw_enter(&hca->cl_conn_list.conn_lock,
2118                                     RW_READER);
2119 
2120                                 if (hca->cl_conn_list.conn_hd == NULL) {
2121                                         mutex_enter(&hca->inuse_lock);
2122                                         hca->inuse = FALSE;
2123                                         cv_signal(&hca->cb_cv);
2124                                         mutex_exit(&hca->inuse_lock);
2125                                 }
2126                                 rw_exit(&hca->cl_conn_list.conn_lock);
2127                         }
2128                         rw_exit(&hca->srv_conn_list.conn_lock);
2129                 }
2130                 rw_exit(&hca->state_lock);
2131         }
2132 
2133         return (RDMA_SUCCESS);
2134 }
2135 
2136 /*
2137  * All sends are done under the protection of
2138  * the wdesc->sendwait_lock. n_send_rbufs count
2139  * is protected using the send_rbufs_lock.
2140  * lock ordering is:
2141  * sendwait_lock -> send_rbufs_lock
2142  */
2143 
2144 void
2145 rib_send_hold(rib_qp_t *qp)
2146 {
2147         mutex_enter(&qp->send_rbufs_lock);
2148         qp->n_send_rbufs++;
2149         mutex_exit(&qp->send_rbufs_lock);
2150 }
2151 
2152 void
2153 rib_send_rele(rib_qp_t *qp)
2154 {
2155         mutex_enter(&qp->send_rbufs_lock);
2156         qp->n_send_rbufs--;
2157         if (qp->n_send_rbufs == 0)
2158                 cv_signal(&qp->send_rbufs_cv);
2159         mutex_exit(&qp->send_rbufs_lock);
2160 }
2161 
2162 void
2163 rib_recv_rele(rib_qp_t *qp)
2164 {
2165         mutex_enter(&qp->posted_rbufs_lock);
2166         qp->n_posted_rbufs--;
2167         if (qp->n_posted_rbufs == 0)
2168                 cv_signal(&qp->posted_rbufs_cv);
2169         mutex_exit(&qp->posted_rbufs_lock);
2170 }
2171 
2172 /*
2173  * Wait for send completion notification. Only on receiving a
2174  * notification be it a successful or error completion, free the
2175  * send_wid.
2176  */
2177 static rdma_stat
2178 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2179 {
2180         clock_t timout, cv_wait_ret;
2181         rdma_stat error = RDMA_SUCCESS;
2182         int     i;
2183 
2184         /*
2185          * Wait for send to complete
2186          */
2187         ASSERT(wd != NULL);
2188         mutex_enter(&wd->sendwait_lock);
2189         if (wd->status == (uint_t)SEND_WAIT) {
2190                 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2191                     ddi_get_lbolt();
2192 
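                /*
                 * cv_timedwait() may wake up spuriously, so wd->status
                 * is re-tested and the wait resumed until the completion
                 * handler changes it or the timeout expires.
                 */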
2193                 if (qp->mode == RIB_SERVER) {
2194                         while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2195                             &wd->sendwait_lock, timout)) > 0 &&
2196                             wd->status == (uint_t)SEND_WAIT)
2197                                 ;
2198                         switch (cv_wait_ret) {
2199                         case -1:        /* timeout */
2200                                 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2201 
2202                                 wd->cv_sig = 0;              /* no signal needed */
2203                                 error = RDMA_TIMEDOUT;
2204                                 break;
2205                         default:        /* got send completion */
2206                                 break;
2207                         }
2208                 } else {
2209                         while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2210                             &wd->sendwait_lock, timout)) > 0 &&
2211                             wd->status == (uint_t)SEND_WAIT)
2212                                 ;
2213                         switch (cv_wait_ret) {
2214                         case -1:        /* timeout */
2215                                 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2216 
2217                                 wd->cv_sig = 0;              /* no signal needed */
2218                                 error = RDMA_TIMEDOUT;
2219                                 break;
2220                         case 0:         /* interrupted */
2221                                 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2222 
2223                                 wd->cv_sig = 0;              /* no signal needed */
2224                                 error = RDMA_INTR;
2225                                 break;
2226                         default:        /* got send completion */
2227                                 break;
2228                         }
2229                 }
2230         }
2231 
2232         if (wd->status != (uint_t)SEND_WAIT) {
2233                 /* got send completion */
2234                 if (wd->status != RDMA_SUCCESS) {
2235                         switch (wd->status) {
2236                         case RDMA_CONNLOST:
2237                                 error = RDMA_CONNLOST;
2238                                 break;
2239                         default:
2240                                 error = RDMA_FAILED;
2241                                 break;
2242                         }
2243                 }
2244                 for (i = 0; i < wd->nsbufs; i++) {
2245                         rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2246                             (void *)(uintptr_t)wd->sbufaddr[i]);
2247                 }
2248 
2249                 rib_send_rele(qp);
2250 
2251                 mutex_exit(&wd->sendwait_lock);
2252                 (void) rib_free_sendwait(wd);
2253 
2254         } else {
2255                 mutex_exit(&wd->sendwait_lock);
2256         }
2257         return (error);
2258 }
2259 
2260 static struct send_wid *
2261 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2262 {
2263         struct send_wid *wd;
2264 
2265         wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2266         wd->xid = xid;
2267         wd->cv_sig = cv_sig;
2268         wd->qp = qp;
2269         cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2270         mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2271         wd->status = (uint_t)SEND_WAIT;
2272 
2273         return (wd);
2274 }
2275 
2276 static int
2277 rib_free_sendwait(struct send_wid *wdesc)
2278 {
2279         cv_destroy(&wdesc->wait_cv);
2280         mutex_destroy(&wdesc->sendwait_lock);
2281         kmem_free(wdesc, sizeof (*wdesc));
2282 
2283         return (0);
2284 }
2285 
2286 static rdma_stat
2287 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2288 {
2289         mutex_enter(&qp->replylist_lock);
2290         if (rep != NULL) {
2291                 (void) rib_remreply(qp, rep);
2292                 mutex_exit(&qp->replylist_lock);
2293                 return (RDMA_SUCCESS);
2294         }
2295         mutex_exit(&qp->replylist_lock);
2296         return (RDMA_FAILED);
2297 }
2298 
2299 /*
2300  * Send buffers are freed here only in case of error in posting
2301  * on QP. If the post succeeded, the send buffers are freed upon
2302  * send completion in rib_sendwait() or in the scq_handler.
2303  */
2304 rdma_stat
2305 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2306         int send_sig, int cv_sig, caddr_t *swid)
2307 {
2308         struct send_wid *wdesc;
2309         struct clist    *clp;
2310         ibt_status_t    ibt_status = IBT_SUCCESS;
2311         rdma_stat       ret = RDMA_SUCCESS;
2312         ibt_send_wr_t   tx_wr;
2313         int             i, nds;
2314         ibt_wr_ds_t     sgl[DSEG_MAX];
2315         uint_t          total_msg_size;
2316         rib_qp_t        *qp;
2317 
2318         qp = ctoqp(conn);
2319 
2320         ASSERT(cl != NULL);
2321 
2322         bzero(&tx_wr, sizeof (ibt_send_wr_t));
2323 
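        /*
         * Build the send gather list, one descriptor per clist element;
         * a single send may span at most DSEG_MAX segments.
         */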
2324         nds = 0;
2325         total_msg_size = 0;
2326         clp = cl;
2327         while (clp != NULL) {
2328                 if (nds >= DSEG_MAX) {
2329                         DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2330                         return (RDMA_FAILED);
2331                 }
2332                 sgl[nds].ds_va = clp->w.c_saddr;
2333                 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2334                 sgl[nds].ds_len = clp->c_len;
2335                 total_msg_size += clp->c_len;
2336                 clp = clp->c_next;
2337                 nds++;
2338         }
2339 
2340         if (send_sig) {
2341                 /* Set SEND_SIGNAL flag. */
2342                 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2343                 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2344                 *swid = (caddr_t)wdesc;
2345                 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2346                 mutex_enter(&wdesc->sendwait_lock);
2347                 wdesc->nsbufs = nds;
2348                 for (i = 0; i < nds; i++) {
2349                         wdesc->sbufaddr[i] = sgl[i].ds_va;
2350                 }
2351         } else {
2352                 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2353                 *swid = NULL;
2354                 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2355         }
2356 
2357         tx_wr.wr_opcode = IBT_WRC_SEND;
2358         tx_wr.wr_trans = IBT_RC_SRV;
2359         tx_wr.wr_nds = nds;
2360         tx_wr.wr_sgl = sgl;
2361 
2362         mutex_enter(&conn->c_lock);
2363         if (conn->c_state == C_CONNECTED) {
2364                 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2365         }
2366         if (conn->c_state != C_CONNECTED ||
2367             ibt_status != IBT_SUCCESS) {
2368                 if (conn->c_state != C_DISCONN_PEND)
2369                         conn->c_state = C_ERROR_CONN;
2370                 mutex_exit(&conn->c_lock);
2371                 if (send_sig) {
2372                         for (i = 0; i < nds; i++) {
2373                                 rib_rbuf_free(conn, SEND_BUFFER,
2374                                     (void *)(uintptr_t)wdesc->sbufaddr[i]);
2375                         }
2376                         mutex_exit(&wdesc->sendwait_lock);
2377                         (void) rib_free_sendwait(wdesc);
2378                 }
2379                 return (RDMA_CONNLOST);
2380         }
2381 
2382         mutex_exit(&conn->c_lock);
2383 
2384         if (send_sig) {
2385                 rib_send_hold(qp);
2386                 mutex_exit(&wdesc->sendwait_lock);
2387                 if (cv_sig) {
2388                         /*
2389                          * cv_wait for send to complete.
2390                          * We can fail due to a timeout or signal or
2391                          * unsuccessful send.
2392                          */
2393                         ret = rib_sendwait(qp, wdesc);
2394 
2395                         return (ret);
2396                 }
2397         }
2398 
2399         return (RDMA_SUCCESS);
2400 }
2401 
2402 
2403 rdma_stat
2404 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2405 {
2406         rdma_stat       ret;
2407         caddr_t         wd;
2408 
2409         /* send-wait & cv_signal */
2410         ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2411         return (ret);
2412 }
2413 
2414 /*
2415  * Deprecated/obsolete interface not used currently
2416  * but earlier used for READ-READ protocol.
2417  * Send RPC reply and wait for RDMA_DONE.
2418  */
2419 rdma_stat
2420 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2421 {
2422         rdma_stat ret = RDMA_SUCCESS;
2423         struct rdma_done_list *rd;
2424         clock_t cv_wait_ret;
        caddr_t wid = NULL;
2426         rib_qp_t *qp = ctoqp(conn);
2427 
2428         mutex_enter(&qp->rdlist_lock);
2429         rd = rdma_done_add(qp, msgid);
2430 
2431         /* No cv_signal (whether send-wait or no-send-wait) */
        ret = rib_send_and_wait(conn, cl, msgid, 1, 0, &wid);
2433 
2434         if (ret != RDMA_SUCCESS) {
2435                 rdma_done_rm(qp, rd);
2436         } else {
2437                 /*
2438                  * Wait for RDMA_DONE from remote end
2439                  */
2440                 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2441                     &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2442                     TR_CLOCK_TICK);
2443 
2444                 rdma_done_rm(qp, rd);
2445 
2446                 if (cv_wait_ret < 0) {
2447                         ret = RDMA_TIMEDOUT;
2448                 }
2449         }
2450 
2451         mutex_exit(&qp->rdlist_lock);
2452         return (ret);
2453 }
2454 
2455 static struct recv_wid *
2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2457 {
2458         struct recv_wid *rwid;
2459 
2460         rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461         rwid->xid = msgid;
2462         rwid->addr = sgl->ds_va;
2463         rwid->qp = qp;
2464 
2465         return (rwid);
2466 }
2467 
2468 static void
2469 rib_free_wid(struct recv_wid *rwid)
2470 {
2471         kmem_free(rwid, sizeof (struct recv_wid));
2472 }
2473 
2474 rdma_stat
2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2476 {
2477         rib_qp_t        *qp = ctoqp(conn);
2478         struct clist    *clp = cl;
2479         struct reply    *rep;
2480         struct recv_wid *rwid;
2481         int             nds;
2482         ibt_wr_ds_t     sgl[DSEG_MAX];
2483         ibt_recv_wr_t   recv_wr;
2484         rdma_stat       ret;
2485         ibt_status_t    ibt_status;
2486 
2487         /*
2488          * rdma_clnt_postrecv uses RECV_BUFFER.
2489          */
2490 
2491         nds = 0;
2492         while (cl != NULL) {
2493                 if (nds >= DSEG_MAX) {
2494                         ret = RDMA_FAILED;
2495                         goto done;
2496                 }
2497                 sgl[nds].ds_va = cl->w.c_saddr;
2498                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499                 sgl[nds].ds_len = cl->c_len;
2500                 cl = cl->c_next;
2501                 nds++;
2502         }
2503 
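        /*
         * A posted reply buffer is a single registered RECV_BUFFER,
         * so the clist must reduce to exactly one descriptor.
         */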
2504         if (nds != 1) {
2505                 ret = RDMA_FAILED;
2506                 goto done;
2507         }
2508 
2509         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510         recv_wr.wr_nds = nds;
2511         recv_wr.wr_sgl = sgl;
2512 
2513         rwid = rib_create_wid(qp, &sgl[0], msgid);
2514         if (rwid) {
2515                 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516         } else {
2517                 ret = RDMA_NORESOURCE;
2518                 goto done;
2519         }
2520         rep = rib_addreplylist(qp, msgid);
2521         if (!rep) {
2522                 rib_free_wid(rwid);
2523                 ret = RDMA_NORESOURCE;
2524                 goto done;
2525         }
2526 
2527         mutex_enter(&conn->c_lock);
2528 
2529         if (conn->c_state == C_CONNECTED) {
2530                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2531         }
2532 
2533         if (conn->c_state != C_CONNECTED ||
2534             ibt_status != IBT_SUCCESS) {
2535                 if (conn->c_state != C_DISCONN_PEND)
2536                         conn->c_state = C_ERROR_CONN;
2537                 mutex_exit(&conn->c_lock);
2538                 rib_free_wid(rwid);
2539                 (void) rib_rem_rep(qp, rep);
2540                 ret = RDMA_CONNLOST;
2541                 goto done;
2542         }
2543 
2544         mutex_enter(&qp->posted_rbufs_lock);
2545         qp->n_posted_rbufs++;
2546         mutex_exit(&qp->posted_rbufs_lock);
2547 
2548         mutex_exit(&conn->c_lock);
2549         return (RDMA_SUCCESS);
2550 
2551 done:
2552         while (clp != NULL) {
2553                 rib_rbuf_free(conn, RECV_BUFFER,
2554                     (void *)(uintptr_t)clp->w.c_saddr3);
2555                 clp = clp->c_next;
2556         }
2557         return (ret);
2558 }
2559 
2560 rdma_stat
2561 rib_svc_post(CONN* conn, struct clist *cl)
2562 {
2563         rib_qp_t        *qp = ctoqp(conn);
2564         struct svc_recv *s_recvp;
2565         int             nds;
2566         ibt_wr_ds_t     sgl[DSEG_MAX];
2567         ibt_recv_wr_t   recv_wr;
2568         ibt_status_t    ibt_status;
2569 
2570         nds = 0;
2571         while (cl != NULL) {
2572                 if (nds >= DSEG_MAX) {
2573                         return (RDMA_FAILED);
2574                 }
2575                 sgl[nds].ds_va = cl->w.c_saddr;
2576                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2577                 sgl[nds].ds_len = cl->c_len;
2578                 cl = cl->c_next;
2579                 nds++;
2580         }
2581 
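        /*
         * As on the client side, a receive buffer must map to exactly
         * one scatter/gather descriptor.
         */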
2582         if (nds != 1) {
2583                 rib_rbuf_free(conn, RECV_BUFFER,
2584                     (caddr_t)(uintptr_t)sgl[0].ds_va);
2585 
2586                 return (RDMA_FAILED);
2587         }
2588 
2589         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2590         recv_wr.wr_nds = nds;
2591         recv_wr.wr_sgl = sgl;
2592 
2593         s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2594         /* Use s_recvp's addr as wr id */
2595         recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2596         mutex_enter(&conn->c_lock);
2597         if (conn->c_state == C_CONNECTED) {
2598                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2599         }
2600         if (conn->c_state != C_CONNECTED ||
2601             ibt_status != IBT_SUCCESS) {
2602                 if (conn->c_state != C_DISCONN_PEND)
2603                         conn->c_state = C_ERROR_CONN;
2604                 mutex_exit(&conn->c_lock);
2605                 rib_rbuf_free(conn, RECV_BUFFER,
2606                     (caddr_t)(uintptr_t)sgl[0].ds_va);
2607                 (void) rib_free_svc_recv(s_recvp);
2608 
2609                 return (RDMA_CONNLOST);
2610         }
2611         mutex_exit(&conn->c_lock);
2612 
2613         return (RDMA_SUCCESS);
2614 }
2615 
2616 /* Client */
2617 rdma_stat
2618 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2619 {
2620         return (rib_clnt_post(conn, cl, msgid));
2621 }
2622 
2623 /* Client */
2624 rdma_stat
2625 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2626 {
2627         rib_qp_t        *qp = ctoqp(conn);
2628         struct reply    *rep;
2629 
2630         mutex_enter(&qp->replylist_lock);
2631         for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2632                 if (rep->xid == msgid) {
2633                         if (rep->vaddr_cq) {
2634                                 rib_rbuf_free(conn, RECV_BUFFER,
2635                                     (caddr_t)(uintptr_t)rep->vaddr_cq);
2636                         }
2637                         (void) rib_remreply(qp, rep);
2638                         break;
2639                 }
2640         }
2641         mutex_exit(&qp->replylist_lock);
2642 
2643         return (RDMA_SUCCESS);
2644 }
2645 
2646 /* Server */
2647 rdma_stat
2648 rib_post_recv(CONN *conn, struct clist *cl)
2649 {
2650         rib_qp_t        *qp = ctoqp(conn);
2651 
2652         if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2653                 mutex_enter(&qp->posted_rbufs_lock);
2654                 qp->n_posted_rbufs++;
2655                 mutex_exit(&qp->posted_rbufs_lock);
2656                 return (RDMA_SUCCESS);
2657         }
2658         return (RDMA_FAILED);
2659 }
2660 
2661 /*
2662  * Client side only interface to "recv" the rpc reply buf
2663  * posted earlier by rib_post_resp(conn, cl, msgid).
2664  */
2665 rdma_stat
2666 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2667 {
2668         struct reply *rep = NULL;
2669         clock_t timout, cv_wait_ret;
2670         rdma_stat ret = RDMA_SUCCESS;
2671         rib_qp_t *qp = ctoqp(conn);
2672 
2673         /*
2674          * Find the reply structure for this msgid
2675          */
2676         mutex_enter(&qp->replylist_lock);
2677 
2678         for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2679                 if (rep->xid == msgid)
2680                         break;
2681         }
2682 
2683         if (rep != NULL) {
2684                 /*
2685                  * If message not yet received, wait.
2686                  */
2687                 if (rep->status == (uint_t)REPLY_WAIT) {
2688                         timout = ddi_get_lbolt() +
2689                             drv_usectohz(REPLY_WAIT_TIME * 1000000);
2690 
2691                         while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2692                             &qp->replylist_lock, timout)) > 0 &&
2693                             rep->status == (uint_t)REPLY_WAIT)
2694                                 ;
2695 
2696                         switch (cv_wait_ret) {
2697                         case -1:        /* timeout */
2698                                 ret = RDMA_TIMEDOUT;
2699                                 break;
2700                         case 0:
2701                                 ret = RDMA_INTR;
2702                                 break;
2703                         default:
2704                                 break;
2705                         }
2706                 }
2707 
2708                 if (rep->status == RDMA_SUCCESS) {
2709                         struct clist *cl = NULL;
2710 
2711                         /*
2712                          * Got message successfully
2713                          */
2714                         clist_add(&cl, 0, rep->bytes_xfer, NULL,
2715                             (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2716                         *clp = cl;
2717                 } else {
2718                         if (rep->status != (uint_t)REPLY_WAIT) {
2719                                 /*
2720                                  * Got error in reply message. Free
2721                                  * recv buffer here.
2722                                  */
2723                                 ret = rep->status;
2724                                 rib_rbuf_free(conn, RECV_BUFFER,
2725                                     (caddr_t)(uintptr_t)rep->vaddr_cq);
2726                         }
2727                 }
2728                 (void) rib_remreply(qp, rep);
2729         } else {
2730                 /*
2731                  * No matching reply structure found for given msgid on the
2732                  * reply wait list.
2733                  */
2734                 ret = RDMA_INVAL;
2735                 DTRACE_PROBE(rpcib__i__nomatchxid2);
2736         }
2737 
2738         /*
2739          * Done.
2740          */
2741         mutex_exit(&qp->replylist_lock);
2742         return (ret);
2743 }
2744 
2745 /*
2746  * RDMA write a buffer to the remote address.
2747  */
2748 rdma_stat
2749 rib_write(CONN *conn, struct clist *cl, int wait)
2750 {
2751         ibt_send_wr_t   tx_wr;
2752         int             cv_sig;
2753         ibt_wr_ds_t     sgl[DSEG_MAX];
2754         struct send_wid *wdesc;
2755         ibt_status_t    ibt_status;
2756         rdma_stat       ret = RDMA_SUCCESS;
2757         rib_qp_t        *qp = ctoqp(conn);
2758         uint64_t        n_writes = 0;
2759 
2760         if (cl == NULL) {
2761                 return (RDMA_FAILED);
2762         }
2763 
2764         while ((cl != NULL)) {
2765                 if (cl->c_len > 0) {
2766                         bzero(&tx_wr, sizeof (ibt_send_wr_t));
2767                         tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2768                         tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2769                             cl->c_dmemhandle.mrc_rmr; /* rkey */
2770                         sgl[0].ds_va = cl->w.c_saddr;
2771                         sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2772                         sgl[0].ds_len = cl->c_len;
2773 
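                        /*
                         * Unless the caller asked to wait, post the
                         * writes unsignaled, but still request a
                         * completion every max_unsignaled_rws writes so
                         * that the send queue does not fill up with
                         * unreaped work requests.
                         */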
2774                         if (wait) {
2775                                 cv_sig = 1;
2776                         } else {
2777                                 if (n_writes > max_unsignaled_rws) {
2778                                         n_writes = 0;
2779                                         cv_sig = 1;
2780                                 } else {
2781                                         cv_sig = 0;
2782                                 }
2783                         }
2784 
2785                         if (cv_sig) {
2786                                 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2787                                 wdesc = rib_init_sendwait(0, cv_sig, qp);
2788                                 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2789                                 mutex_enter(&wdesc->sendwait_lock);
2790                         } else {
2791                                 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2792                                 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2793                         }
2794                         tx_wr.wr_opcode = IBT_WRC_RDMAW;
2795                         tx_wr.wr_trans = IBT_RC_SRV;
2796                         tx_wr.wr_nds = 1;
2797                         tx_wr.wr_sgl = sgl;
2798 
2799                         mutex_enter(&conn->c_lock);
2800                         if (conn->c_state == C_CONNECTED) {
2801                                 ibt_status =
2802                                     ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2803                         }
2804                         if (conn->c_state != C_CONNECTED ||
2805                             ibt_status != IBT_SUCCESS) {
2806                                 if (conn->c_state != C_DISCONN_PEND)
2807                                         conn->c_state = C_ERROR_CONN;
2808                                 mutex_exit(&conn->c_lock);
2809                                 if (cv_sig) {
2810                                         mutex_exit(&wdesc->sendwait_lock);
2811                                         (void) rib_free_sendwait(wdesc);
2812                                 }
2813                                 return (RDMA_CONNLOST);
2814                         }
2815 
2816                         mutex_exit(&conn->c_lock);
2817 
2818                         /*
2819                          * Wait for send to complete
2820                          */
2821                         if (cv_sig) {
2822 
2823                                 rib_send_hold(qp);
2824                                 mutex_exit(&wdesc->sendwait_lock);
2825 
2826                                 ret = rib_sendwait(qp, wdesc);
2827                                 if (ret != 0)
2828                                         return (ret);
2829                         }
2830                         n_writes++;
2831                 }
2832                 cl = cl->c_next;
2833         }
2834         return (RDMA_SUCCESS);
2835 }
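
/*
 * Editor's illustrative sketch (guarded out of the build, not part of the
 * driver): chaining two clist entries and RDMA-writing them with a single
 * rib_write() call.  With wait set, every chunk is posted signaled and
 * waited on; with wait clear, a completion is requested (and waited on)
 * only after max_unsignaled_rws unsignaled posts.  The addresses and
 * memory handles in c1/c2 are assumed to have been set up by a prior
 * registration; the "example_" name is hypothetical.
 */
#ifdef RPCIB_EXAMPLE
static rdma_stat
example_write_two_chunks(CONN *conn, struct clist *c1, struct clist *c2)
{
        /* rib_write() walks the chain through c_next. */
        c1->c_next = c2;
        c2->c_next = NULL;

        /* wait == 1: signal and wait for each chunk's completion. */
        return (rib_write(conn, c1, 1));
}
#endif  /* RPCIB_EXAMPLE */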
2836 
2837 /*
2838  * RDMA Read a buffer from the remote address.
2839  */
2840 rdma_stat
2841 rib_read(CONN *conn, struct clist *cl, int wait)
2842 {
2843         ibt_send_wr_t   rx_wr;
2844         int             cv_sig = 0;
2845         ibt_wr_ds_t     sgl;
2846         struct send_wid *wdesc;
2847         ibt_status_t    ibt_status = IBT_SUCCESS;
2848         rdma_stat       ret = RDMA_SUCCESS;
2849         rib_qp_t        *qp = ctoqp(conn);
2850 
2851         if (cl == NULL) {
2852                 return (RDMA_FAILED);
2853         }
2854 
2855         while (cl != NULL) {
2856                 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2857                 /*
2858                  * The remote address is taken from the current chunk in the list.
2859                  */
2860                 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2861                 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2862 
2863                 sgl.ds_va = cl->u.c_daddr;
2864                 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2865                 sgl.ds_len = cl->c_len;
2866 
2867                 /*
2868                  * If there are multiple chunks to be read, and
2869                  * wait is set, ask for signal only for the last chunk
2870                  * and wait only on the last chunk. The completion of
2871                  * RDMA_READ on last chunk ensures that reads on all
2872                  * previous chunks are also completed.
2873                  */
2874                 if (wait && (cl->c_next == NULL)) {
2875                         cv_sig = 1;
2876                         wdesc = rib_init_sendwait(0, cv_sig, qp);
2877                         rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2878                         rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2879                         mutex_enter(&wdesc->sendwait_lock);
2880                 } else {
2881                         rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2882                         rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2883                 }
2884                 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2885                 rx_wr.wr_trans = IBT_RC_SRV;
2886                 rx_wr.wr_nds = 1;
2887                 rx_wr.wr_sgl = &sgl;
2888 
2889                 mutex_enter(&conn->c_lock);
2890                 if (conn->c_state == C_CONNECTED) {
2891                         ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2892                 }
2893                 if (conn->c_state != C_CONNECTED ||
2894                     ibt_status != IBT_SUCCESS) {
2895                         if (conn->c_state != C_DISCONN_PEND)
2896                                 conn->c_state = C_ERROR_CONN;
2897                         mutex_exit(&conn->c_lock);
2898                         if (wait && (cl->c_next == NULL)) {
2899                                 mutex_exit(&wdesc->sendwait_lock);
2900                                 (void) rib_free_sendwait(wdesc);
2901                         }
2902                         return (RDMA_CONNLOST);
2903                 }
2904 
2905                 mutex_exit(&conn->c_lock);
2906 
2907                 /*
2908                  * Wait for send to complete if this is the
2909                  * last item in the list.
2910                  */
2911                 if (wait && cl->c_next == NULL) {
2912                         rib_send_hold(qp);
2913                         mutex_exit(&wdesc->sendwait_lock);
2914 
2915                         ret = rib_sendwait(qp, wdesc);
2916 
2917                         if (ret != 0)
2918                                 return (ret);
2919                 }
2920                 cl = cl->c_next;
2921         }
2922         return (RDMA_SUCCESS);
2923 }
2924 
2925 /*
2926  * rib_srv_cm_handler()
2927  *    Connection Manager callback to handle RC connection requests.
2928  */
2929 /* ARGSUSED */
2930 static ibt_cm_status_t
2931 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2932         ibt_cm_return_args_t *ret_args, void *priv_data,
2933         ibt_priv_data_len_t len)
2934 {
2935         queue_t         *q;
2936         rib_qp_t        *qp;
2937         rib_hca_t       *hca;
2938         rdma_stat       status = RDMA_SUCCESS;
2939         int             i;
2940         struct clist    cl;
2941         rdma_buf_t      rdbuf = {0};
2942         void            *buf = NULL;
2943         CONN            *conn;
2944         ibt_ip_cm_info_t        ipinfo;
2945         struct sockaddr_in *s;
2946         struct sockaddr_in6 *s6;
2947         int sin_size = sizeof (struct sockaddr_in);
2948         int in_size = sizeof (struct in_addr);
2949         int sin6_size = sizeof (struct sockaddr_in6);
2950 
2951         ASSERT(any != NULL);
2952         ASSERT(event != NULL);
2953 
2954         hca = (rib_hca_t *)any;
2955 
2956         /* got a connection request */
2957         switch (event->cm_type) {
2958         case IBT_CM_EVENT_REQ_RCV:
2959                 /*
2960                  * If the plugin is in the NO_ACCEPT state, bail out.
2961                  */
2962                 mutex_enter(&plugin_state_lock);
2963                 if (plugin_state == NO_ACCEPT) {
2964                         mutex_exit(&plugin_state_lock);
2965                         return (IBT_CM_REJECT);
2966                 }
2967                 mutex_exit(&plugin_state_lock);
2968 
2969                 /*
2970                  * Need to send an MRA MAD to the CM so that it
2971                  * does not time out on us.
2972                  */
2973                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2974                     event->cm_event.req.req_timeout * 8, NULL, 0);
2975 
2976                 mutex_enter(&rib_stat->open_hca_lock);
2977                 q = rib_stat->q;
2978                 mutex_exit(&rib_stat->open_hca_lock);
2979 
2980                 status = rib_svc_create_chan(hca, (caddr_t)q,
2981                     event->cm_event.req.req_prim_hca_port, &qp);
2982 
2983                 if (status) {
2984                         return (IBT_CM_REJECT);
2985                 }
2986 
2987                 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2988                 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2989                 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2990                 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2991 
2992                 /*
2993                  * Pre-post RECV buffers.
2994                  */
2995                 conn = qptoc(qp);
2996                 for (i = 0; i < preposted_rbufs; i++) {
2997                         bzero(&rdbuf, sizeof (rdbuf));
2998                         rdbuf.type = RECV_BUFFER;
2999                         buf = rib_rbuf_alloc(conn, &rdbuf);
3000                         if (buf == NULL) {
3001                                 /*
3002                                  * A connection is not established yet.
3003                                  * Just flush the channel. Buffers
3004                                  * posted till now will error out with
3005                                  * IBT_WC_WR_FLUSHED_ERR.
3006                                  */
3007                                 (void) ibt_flush_channel(qp->qp_hdl);
3008                                 (void) rib_disconnect_channel(conn, NULL);
3009                                 return (IBT_CM_REJECT);
3010                         }
3011 
3012                         bzero(&cl, sizeof (cl));
3013                         cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3014                         cl.c_len = rdbuf.len;
3015                         cl.c_smemhandle.mrc_lmr =
3016                             rdbuf.handle.mrc_lmr; /* lkey */
3017                         cl.c_next = NULL;
3018                         status = rib_post_recv(conn, &cl);
3019                         if (status != RDMA_SUCCESS) {
3020                                 /*
3021                                  * A connection is not established yet.
3022                                  * Just flush the channel. Buffers
3023                                  * posted till now will error out with
3024                                  * IBT_WC_WR_FLUSHED_ERR.
3025                                  */
3026                                 (void) ibt_flush_channel(qp->qp_hdl);
3027                                 (void) rib_disconnect_channel(conn, NULL);
3028                                 return (IBT_CM_REJECT);
3029                         }
3030                 }
3031                 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3032 
3033                 /*
3034                  * Get the address translation
3035                  */
3036                 rw_enter(&hca->state_lock, RW_READER);
3037                 if (hca->state == HCA_DETACHED) {
3038                         rw_exit(&hca->state_lock);
3039                         return (IBT_CM_REJECT);
3040                 }
3041                 rw_exit(&hca->state_lock);
3042 
3043                 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3044 
3045                 if (ibt_get_ip_data(event->cm_priv_data_len,
3046                     event->cm_priv_data,
3047                     &ipinfo) != IBT_SUCCESS) {
3048 
3049                         return (IBT_CM_REJECT);
3050                 }
3051 
3052                 switch (ipinfo.src_addr.family) {
3053                 case AF_INET:
3054 
3055                         conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3056                             KM_SLEEP);
3057                         (void) strcpy(conn->c_netid, RIBNETID_TCP);
3058 
3059                         conn->c_raddr.maxlen =
3060                             conn->c_raddr.len = sin_size;
3061                         conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3062 
3063                         s = (struct sockaddr_in *)conn->c_raddr.buf;
3064                         s->sin_family = AF_INET;
3065                         bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3066                             &s->sin_addr, in_size);
3067 
3068                         conn->c_laddr.maxlen =
3069                             conn->c_laddr.len = sin_size;
3070                         conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3071 
3072                         s = (struct sockaddr_in *)conn->c_laddr.buf;
3073                         s->sin_family = AF_INET;
3074                         bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3075                             &s->sin_addr, in_size);
3076 
3077                         conn->c_addrmask.maxlen = conn->c_addrmask.len =
3078                             sizeof (struct sockaddr_in);
3079                         conn->c_addrmask.buf =
3080                             kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3081                         ((struct sockaddr_in *)
3082                             conn->c_addrmask.buf)->sin_addr.s_addr =
3083                             (uint32_t)~0;
3084                         ((struct sockaddr_in *)
3085                             conn->c_addrmask.buf)->sin_family =
3086                             (sa_family_t)~0;
3087                         break;
3088 
3089                 case AF_INET6:
3090 
3091                         conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3092                             KM_SLEEP);
3093                         (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3094 
3095                         conn->c_raddr.maxlen =
3096                             conn->c_raddr.len = sin6_size;
3097                         conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3098 
3099                         s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3100                         s6->sin6_family = AF_INET6;
3101                         bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3102                             &s6->sin6_addr,
3103                             sizeof (struct in6_addr));
3104 
3105                         conn->c_laddr.maxlen =
3106                             conn->c_laddr.len = sin6_size;
3107                         conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3108 
3109                         s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3110                         s6->sin6_family = AF_INET6;
3111                         bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3112                             &s6->sin6_addr,
3113                             sizeof (struct in6_addr));
3114 
3115                         conn->c_addrmask.maxlen = conn->c_addrmask.len =
3116                             sizeof (struct sockaddr_in6);
3117                         conn->c_addrmask.buf =
3118                             kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3119                         (void) memset(&((struct sockaddr_in6 *)
3120                             conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3121                             sizeof (struct in6_addr));
3122                         ((struct sockaddr_in6 *)
3123                             conn->c_addrmask.buf)->sin6_family =
3124                             (sa_family_t)~0;
3125                         break;
3126 
3127                 default:
3128                         return (IBT_CM_REJECT);
3129                 }
3130 
3131                 break;
3132 
3133         case IBT_CM_EVENT_CONN_CLOSED:
3134         {
3135                 CONN            *conn;
3136                 rib_qp_t        *qp;
3137 
3138                 switch (event->cm_event.closed) {
3139                 case IBT_CM_CLOSED_DREP_RCVD:
3140                 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3141                 case IBT_CM_CLOSED_DUP:
3142                 case IBT_CM_CLOSED_ABORT:
3143                 case IBT_CM_CLOSED_ALREADY:
3144                         /*
3145                          * These cases indicate the local end initiated
3146                          * the closing of the channel. Nothing to do here.
3147                          */
3148                         break;
3149                 default:
3150                         /*
3151                          * The reason for the CONN_CLOSED event must be
3152                          * one of IBT_CM_CLOSED_DREQ_RCVD,
3153                          * IBT_CM_CLOSED_REJ_RCVD or IBT_CM_CLOSED_STALE.
3154                          * These indicate cases where the remote end is
3155                          * closing the channel. In these cases free the
3156                          * channel and transition to the error state.
3157                          */
3158                         qp = ibt_get_chan_private(event->cm_channel);
3159                         conn = qptoc(qp);
3160                         mutex_enter(&conn->c_lock);
3161                         if (conn->c_state == C_DISCONN_PEND) {
3162                                 mutex_exit(&conn->c_lock);
3163                                 break;
3164                         }
3165                         conn->c_state = C_ERROR_CONN;
3166 
3167                         /*
3168                          * Free the conn if c_ref goes down to 0
3169                          */
3170                         if (conn->c_ref == 0) {
3171                                 /*
3172                                  * Remove from list and free conn
3173                                  */
3174                                 conn->c_state = C_DISCONN_PEND;
3175                                 mutex_exit(&conn->c_lock);
3176                                 (void) rib_disconnect_channel(conn,
3177                                     &hca->srv_conn_list);
3178                         } else {
3179                                 /*
3180                                  * conn will be freed when c_ref goes to 0.
3181                                  * Indicate to cleaning thread not to close
3182                                  * the connection, but just free the channel.
3183                                  */
3184                                 conn->c_flags |= C_CLOSE_NOTNEEDED;
3185                                 mutex_exit(&conn->c_lock);
3186                         }
3187                         DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3188                         break;
3189                 }
3190                 break;
3191         }
3192         case IBT_CM_EVENT_CONN_EST:
3193                 /*
3194                  * RTU received, hence connection established.
3195                  */
3196                 if (rib_debug > 1)
3197                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3198                             "(CONN_EST) channel established");
3199                 break;
3200 
3201         default:
3202                 if (rib_debug > 2) {
3203                         /* Let CM handle the following events. */
3204                         if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3205                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206                                     "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3207                         } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3208                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209                                     "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3210                         } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3211                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212                                     "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3213                         } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3214                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215                                     "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3216                         } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3217                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3218                                     "server recv'ed IBT_CM_EVENT_FAILURE\n");
3219                         }
3220                 }
3221                 return (IBT_CM_DEFAULT);
3222         }
3223 
3224         /* accept all other CM messages (i.e. let the CM handle them) */
3225         return (IBT_CM_ACCEPT);
3226 }
3227 
3228 static rdma_stat
3229 rib_register_service(rib_hca_t *hca, int service_type,
3230         uint8_t protocol_num, in_port_t dst_port)
3231 {
3232         ibt_srv_desc_t          sdesc;
3233         ibt_hca_portinfo_t      *port_infop;
3234         ib_svc_id_t             srv_id;
3235         ibt_srv_hdl_t           srv_hdl;
3236         uint_t                  port_size;
3237         uint_t                  pki, i, num_ports, nbinds;
3238         ibt_status_t            ibt_status;
3239         rib_service_t           *service;
3240         ib_pkey_t               pkey;
3241 
3242         /*
3243          * Query all ports for the given HCA
3244          */
3245         rw_enter(&hca->state_lock, RW_READER);
3246         if (hca->state != HCA_DETACHED) {
3247                 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3248                     &num_ports, &port_size);
3249                 rw_exit(&hca->state_lock);
3250         } else {
3251                 rw_exit(&hca->state_lock);
3252                 return (RDMA_FAILED);
3253         }
3254         if (ibt_status != IBT_SUCCESS) {
3255                 return (RDMA_FAILED);
3256         }
3257 
3258         DTRACE_PROBE1(rpcib__i__regservice_numports,
3259             int, num_ports);
3260 
3261         for (i = 0; i < num_ports; i++) {
3262                 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3263                         DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3264                             int, i+1);
3265                 } else {
3266                         DTRACE_PROBE1(rpcib__i__regservice__portactive,
3267                             int, i+1);
3268                 }
3269         }
3270 
3271         /*
3272          * Get all the IP addresses on this system to register the
3273          * given "service type" on all DNS-recognized IP addrs.
3274          * Each service type such as NFS will have all the system's
3275          * IP addresses as its different names. For now the only
3276          * type of service we support in RPCIB is NFS.
3277          */
3278         rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3279         /*
3280          * Start registering and binding the service on the
3281          * active ports of this HCA.
3282          */
3283         nbinds = 0;
3284         for (service = rib_stat->service_list;
3285             service && (service->srv_type != service_type);
3286             service = service->next)
3287                 ;
3288 
3289         if (service == NULL) {
3290                 /*
3291                  * We use IP addresses as the service names for
3292                  * service registration.  Register each of them
3293                  * with CM to obtain a svc_id and svc_hdl.  We do not
3294                  * register the service with the machine's loopback address.
3295                  */
3296                 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3297                 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3298                 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3299                 sdesc.sd_handler = rib_srv_cm_handler;
3300                 sdesc.sd_flags = 0;
3301                 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3302                     &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3303                     1, &srv_hdl, &srv_id);
3304                 if ((ibt_status != IBT_SUCCESS) &&
3305                     (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3306                         rw_exit(&rib_stat->service_list_lock);
3307                         DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3308                             int, ibt_status);
3309                         ibt_free_portinfo(port_infop, port_size);
3310                         return (RDMA_FAILED);
3311                 }
3312 
3313                 /*
3314                  * Allocate and prepare a service entry
3315                  */
3316                 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3317 
3318                 service->srv_type = service_type;
3319                 service->srv_hdl = srv_hdl;
3320                 service->srv_id = srv_id;
3321 
3322                 service->next = rib_stat->service_list;
3323                 rib_stat->service_list = service;
3324                 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3325                     int, service->srv_type);
3326         } else {
3327                 srv_hdl = service->srv_hdl;
3328                 srv_id = service->srv_id;
3329                 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3330                     int, service->srv_type);
3331         }
3332 
3333         for (i = 0; i < num_ports; i++) {
3334                 ibt_sbind_hdl_t         sbp;
3335                 rib_hca_service_t       *hca_srv;
3336                 ib_gid_t                gid;
3337 
3338                 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3339                         continue;
3340 
3341                 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3342                         pkey = port_infop[i].p_pkey_tbl[pki];
3343 
3344                         rw_enter(&hca->bound_services_lock, RW_READER);
3345                         gid = port_infop[i].p_sgid_tbl[0];
3346                         for (hca_srv = hca->bound_services; hca_srv;
3347                             hca_srv = hca_srv->next) {
3348                                 if ((hca_srv->srv_id == service->srv_id) &&
3349                                     (hca_srv->gid.gid_prefix ==
3350                                     gid.gid_prefix) &&
3351                                     (hca_srv->gid.gid_guid == gid.gid_guid))
3352                                         break;
3353                         }
3354                         rw_exit(&hca->bound_services_lock);
3355                         if (hca_srv != NULL) {
3356                                 /*
3357                                  * port is already bound to the service
3358                                  */
3359                                 DTRACE_PROBE1(
3360                                     rpcib__i__regservice__already__bound,
3361                                     int, i+1);
3362                                 nbinds++;
3363                                 continue;
3364                         }
3365 
3366                         if ((pkey & IBSRM_HB) &&
3367                             (pkey != IB_PKEY_INVALID_FULL)) {
3368 
3369                                 sbp = NULL;
3370                                 ibt_status = ibt_bind_service(srv_hdl,
3371                                     gid, NULL, hca, &sbp);
3372 
3373                                 if (ibt_status == IBT_SUCCESS) {
3374                                         hca_srv = kmem_zalloc(
3375                                             sizeof (rib_hca_service_t),
3376                                             KM_SLEEP);
3377                                         hca_srv->srv_id = srv_id;
3378                                         hca_srv->gid = gid;
3379                                         hca_srv->sbind_hdl = sbp;
3380 
3381                                         rw_enter(&hca->bound_services_lock,
3382                                             RW_WRITER);
3383                                         hca_srv->next = hca->bound_services;
3384                                         hca->bound_services = hca_srv;
3385                                         rw_exit(&hca->bound_services_lock);
3386                                         nbinds++;
3387                                 }
3388 
3389                                 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3390                                     int, ibt_status);
3391                         }
3392                 }
3393         }
3394         rw_exit(&rib_stat->service_list_lock);
3395 
3396         ibt_free_portinfo(port_infop, port_size);
3397 
3398         if (nbinds == 0) {
3399                 return (RDMA_FAILED);
3400         } else {
3401                 /*
3402                  * Put this plugin into the accept state, since at
3403                  * least one registration was successful.
3404                  */
3405                 mutex_enter(&plugin_state_lock);
3406                 plugin_state = ACCEPT;
3407                 mutex_exit(&plugin_state_lock);
3408                 return (RDMA_SUCCESS);
3409         }
3410 }
3411 
3412 void
3413 rib_listen(struct rdma_svc_data *rd)
3414 {
3415         rdma_stat status;
3416         int n_listening = 0;
3417         rib_hca_t *hca;
3418 
3419         mutex_enter(&rib_stat->listen_lock);
3420         /*
3421          * If the rd parameter is NULL, it means that rib_stat->q was
3422          * already initialized by a call from RDMA and we just want to
3423          * bring a newly attached HCA into the same listening state as
3424          * the other HCAs.
3425          */
3426         if (rd == NULL) {
3427                 if (rib_stat->q == NULL) {
3428                         mutex_exit(&rib_stat->listen_lock);
3429                         return;
3430                 }
3431         } else {
3432                 rib_stat->q = &rd->q;
3433         }
3434         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3435         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3436                 /*
3437                  * First check if the HCA is still attached.
3438                  */
3439                 rw_enter(&hca->state_lock, RW_READER);
3440                 if (hca->state != HCA_INITED) {
3441                         rw_exit(&hca->state_lock);
3442                         continue;
3443                 }
3444                 rw_exit(&hca->state_lock);
3445 
3446                 /*
3447                  * Right now the only service type is NFS, hence
3448                  * the hard-coded value. Ideally, the service type
3449                  * should be passed down to us in
3450                  * rdma_svc_data.
3451                  */
3452                 status = rib_register_service(hca, NFS,
3453                     IPPROTO_TCP, nfs_rdma_port);
3454                 if (status == RDMA_SUCCESS)
3455                         n_listening++;
3456         }
3457         rw_exit(&rib_stat->hcas_list_lock);
3458 
3459         /*
3460          * Report whether the service went active on at least one
3461          * HCA; rd->err_code carries a more detailed error on failure.
3462          */
3463         if (rd) {
3464                 if (n_listening > 0) {
3465                         rd->active = 1;
3466                         rd->err_code = RDMA_SUCCESS;
3467                 } else {
3468                         rd->active = 0;
3469                         rd->err_code = RDMA_FAILED;
3470                 }
3471         }
3472         mutex_exit(&rib_stat->listen_lock);
3473 }
3474 
3475 /* XXXX */
3476 /* ARGSUSED */
3477 static void
3478 rib_listen_stop(struct rdma_svc_data *svcdata)
3479 {
3480         rib_hca_t               *hca;
3481 
3482         mutex_enter(&rib_stat->listen_lock);
3483         /*
3484          * KRPC called the RDMATF to stop the listeners; this means
3485          * we stop sending incoming or received requests to the KRPC
3486          * master transport handle for RDMA-IB. It also means that
3487          * the master transport handle responsible for us is going away.
3488          */
3489         mutex_enter(&plugin_state_lock);
3490         plugin_state = NO_ACCEPT;
3491         if (svcdata != NULL)
3492                 svcdata->active = 0;
3493         mutex_exit(&plugin_state_lock);
3494 
3495         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3496         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3497                 /*
3498                  * First check if the HCA is still attached.
3499                  */
3500                 rw_enter(&hca->state_lock, RW_READER);
3501                 if (hca->state == HCA_DETACHED) {
3502                         rw_exit(&hca->state_lock);
3503                         continue;
3504                 }
3505                 rib_close_channels(&hca->srv_conn_list);
3506                 rib_stop_services(hca);
3507                 rw_exit(&hca->state_lock);
3508         }
3509         rw_exit(&rib_stat->hcas_list_lock);
3510 
3511         /*
3512          * Avoid rib_listen() using the stale q field.
3513          * This could happen if a port goes up after all services
3514          * are already unregistered.
3515          */
3516         rib_stat->q = NULL;
3517         mutex_exit(&rib_stat->listen_lock);
3518 }
3519 
3520 /*
3521  * Traverse the HCA's service list to unbind and deregister services.
3522  * For each bound service of HCA to be removed, first find the corresponding
3523  * service handle (srv_hdl) and then unbind the service by calling
3524  * ibt_unbind_service().
3525  */
3526 static void
3527 rib_stop_services(rib_hca_t *hca)
3528 {
3529         rib_hca_service_t *srv_list, *to_remove;
3530 
3531         /*
3532          * Unbind and deregister the services for this service type.
3533          * Right now there is only one service type; in the future it
3534          * will be passed down to this function.
3535          */
3536         rw_enter(&hca->bound_services_lock, RW_READER);
3537         srv_list = hca->bound_services;
3538         hca->bound_services = NULL;
3539         rw_exit(&hca->bound_services_lock);
3540 
3541         while (srv_list != NULL) {
3542                 rib_service_t *sc;
3543 
3544                 to_remove = srv_list;
3545                 srv_list = to_remove->next;
3546                 rw_enter(&rib_stat->service_list_lock, RW_READER);
3547                 for (sc = rib_stat->service_list;
3548                     sc && (sc->srv_id != to_remove->srv_id);
3549                     sc = sc->next)
3550                         ;
3551                 /*
3552                  * If sc is NULL, the service no longer exists; it was
3553                  * probably just removed completely through rib_stat.
3554                  */
3555                 if (sc != NULL)
3556                         (void) ibt_unbind_service(sc->srv_hdl,
3557                             to_remove->sbind_hdl);
3558                 rw_exit(&rib_stat->service_list_lock);
3559                 kmem_free(to_remove, sizeof (rib_hca_service_t));
3560         }
3561 }
3562 
3563 static struct svc_recv *
3564 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3565 {
3566         struct svc_recv *recvp;
3567 
3568         recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3569         recvp->vaddr = sgl->ds_va;
3570         recvp->qp = qp;
3571         recvp->bytes_xfer = 0;
3572         return (recvp);
3573 }
3574 
3575 static int
3576 rib_free_svc_recv(struct svc_recv *recvp)
3577 {
3578         kmem_free(recvp, sizeof (*recvp));
3579 
3580         return (0);
3581 }
3582 
3583 static struct reply *
3584 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3585 {
3586         struct reply    *rep;
3587 
3588 
3589         rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3590         if (rep == NULL) {
3591                 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3592                 return (NULL);
3593         }
3594         rep->xid = msgid;
3595         rep->vaddr_cq = NULL;
3596         rep->bytes_xfer = 0;
3597         rep->status = (uint_t)REPLY_WAIT;
3598         rep->prev = NULL;
3599         cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3600 
3601         mutex_enter(&qp->replylist_lock);
3602         if (qp->replylist) {
3603                 rep->next = qp->replylist;
3604                 qp->replylist->prev = rep;
3605         }
3606         qp->rep_list_size++;
3607 
3608         DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3609             int, qp->rep_list_size);
3610 
3611         qp->replylist = rep;
3612         mutex_exit(&qp->replylist_lock);
3613 
3614         return (rep);
3615 }
3616 
3617 static rdma_stat
3618 rib_rem_replylist(rib_qp_t *qp)
3619 {
3620         struct reply    *r, *n;
3621 
3622         mutex_enter(&qp->replylist_lock);
3623         for (r = qp->replylist; r != NULL; r = n) {
3624                 n = r->next;
3625                 (void) rib_remreply(qp, r);
3626         }
3627         mutex_exit(&qp->replylist_lock);
3628 
3629         return (RDMA_SUCCESS);
3630 }
3631 
3632 static int
3633 rib_remreply(rib_qp_t *qp, struct reply *rep)
3634 {
3635 
3636         ASSERT(MUTEX_HELD(&qp->replylist_lock));
3637         if (rep->prev) {
3638                 rep->prev->next = rep->next;
3639         }
3640         if (rep->next) {
3641                 rep->next->prev = rep->prev;
3642         }
3643         if (qp->replylist == rep)
3644                 qp->replylist = rep->next;
3645 
3646         cv_destroy(&rep->wait_cv);
3647         qp->rep_list_size--;
3648 
3649         DTRACE_PROBE1(rpcib__i__remreply__listsize,
3650             int, qp->rep_list_size);
3651 
3652         kmem_free(rep, sizeof (*rep));
3653 
3654         return (0);
3655 }
3656 
3657 rdma_stat
3658 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3659         struct mrc *buf_handle)
3660 {
3661         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
3662         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3663         rdma_stat       status;
3664         rib_hca_t       *hca = (ctoqp(conn))->hca;
3665 
3666         /*
3667          * Note: ALL buffer pools use the same memory type RDMARW.
3668          */
3669         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3670         if (status == RDMA_SUCCESS) {
3671                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3672                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3673                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3674         } else {
3675                 buf_handle->mrc_linfo = NULL;
3676                 buf_handle->mrc_lmr = 0;
3677                 buf_handle->mrc_rmr = 0;
3678         }
3679         return (status);
3680 }
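
/*
 * Editor's illustrative sketch (guarded out of the build, not part of the
 * driver): registering a caller-supplied buffer for RDMA and tearing the
 * registration down again.  rib_registermem() and rib_deregistermem() are
 * the functions defined in this file; the "example_" name is hypothetical.
 */
#ifdef RPCIB_EXAMPLE
static rdma_stat
example_register_buffer(CONN *conn, caddr_t buf, uint_t buflen)
{
        struct mrc handle;
        rdma_stat status;

        /* NULL adsp: register kernel memory (no user address space). */
        status = rib_registermem(conn, NULL, buf, buflen, &handle);
        if (status != RDMA_SUCCESS)
                return (status);

        /*
         * handle.mrc_lmr is the lkey used in local work requests and
         * handle.mrc_rmr is the rkey advertised to the peer.
         */

        return (rib_deregistermem(conn, buf, handle));
}
#endif  /* RPCIB_EXAMPLE */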
3681 
3682 static rdma_stat
3683 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3684         ibt_mr_flags_t spec,
3685         ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3686 {
3687         ibt_mr_attr_t   mem_attr;
3688         ibt_status_t    ibt_status;
3689         mem_attr.mr_vaddr = (uintptr_t)buf;
3690         mem_attr.mr_len = (ib_msglen_t)size;
3691         mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3692         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3693             IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3694             IBT_MR_ENABLE_WINDOW_BIND | spec;
3695 
3696         rw_enter(&hca->state_lock, RW_READER);
3697         if (hca->state != HCA_DETACHED) {
3698                 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3699                     &mem_attr, mr_hdlp, mr_descp);
3700                 rw_exit(&hca->state_lock);
3701         } else {
3702                 rw_exit(&hca->state_lock);
3703                 return (RDMA_FAILED);
3704         }
3705 
3706         if (ibt_status != IBT_SUCCESS) {
3707                 return (RDMA_FAILED);
3708         }
3709         return (RDMA_SUCCESS);
3710 }
3711 
3712 rdma_stat
3713 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3714         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3715 {
3716         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
3717         rib_lrc_entry_t *l;
3718         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3719         rdma_stat       status;
3720         rib_hca_t       *hca = (ctoqp(conn))->hca;
3721 
3722         /*
3723          * Non-coherent memory registration.
3724          */
3725         l = (rib_lrc_entry_t *)lrc;
3726         if (l) {
3727                 if (l->registered) {
3728                         buf_handle->mrc_linfo =
3729                             (uintptr_t)l->lrc_mhandle.mrc_linfo;
3730                         buf_handle->mrc_lmr =
3731                             (uint32_t)l->lrc_mhandle.mrc_lmr;
3732                         buf_handle->mrc_rmr =
3733                             (uint32_t)l->lrc_mhandle.mrc_rmr;
3734                         *sync_handle = (RIB_SYNCMEM_HANDLE)
3735                             (uintptr_t)l->lrc_mhandle.mrc_linfo;
3736                         return (RDMA_SUCCESS);
3737                 } else {
3738                         /* Always register the whole buffer */
3739                         buf = (caddr_t)l->lrc_buf;
3740                         buflen = l->lrc_len;
3741                 }
3742         }
3743         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3744 
3745         if (status == RDMA_SUCCESS) {
3746                 if (l) {
3747                         l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3748                         l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3749                         l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3750                         l->registered                 = TRUE;
3751                 }
3752                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3753                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3754                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3755                 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3756         } else {
3757                 buf_handle->mrc_linfo = NULL;
3758                 buf_handle->mrc_lmr = 0;
3759                 buf_handle->mrc_rmr = 0;
3760         }
3761         return (status);
3762 }
3763 
3764 /* ARGSUSED */
3765 rdma_stat
3766 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3767 {
3768         rib_hca_t *hca = (ctoqp(conn))->hca;
3769         /*
3770          * Allow memory deregistration even if HCA is
3771          * getting detached. Need all outstanding
3772          * memory registrations to be deregistered
3773          * before HCA_DETACH_EVENT can be accepted.
3774          */
3775         (void) ibt_deregister_mr(hca->hca_hdl,
3776             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3777         return (RDMA_SUCCESS);
3778 }
3779 
3780 /* ARGSUSED */
3781 rdma_stat
3782 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3783                 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3784 {
3785         rib_lrc_entry_t *l;
3786         l = (rib_lrc_entry_t *)lrc;
3787         if (l)
3788                 if (l->registered)
3789                         return (RDMA_SUCCESS);
3790 
3791         (void) rib_deregistermem(conn, buf, buf_handle);
3792 
3793         return (RDMA_SUCCESS);
3794 }
3795 
3796 /* ARGSUSED */
3797 rdma_stat
3798 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3799                 int len, int cpu)
3800 {
3801         ibt_status_t    status;
3802         rib_hca_t *hca = (ctoqp(conn))->hca;
3803         ibt_mr_sync_t   mr_segment;
3804 
3805         mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3806         mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3807         mr_segment.ms_len = (ib_memlen_t)len;
3808         if (cpu) {
3809                 /* make incoming data visible to memory */
3810                 mr_segment.ms_flags = IBT_SYNC_WRITE;
3811         } else {
3812                 /* make memory changes visible to IO */
3813                 mr_segment.ms_flags = IBT_SYNC_READ;
3814         }
3815         rw_enter(&hca->state_lock, RW_READER);
3816         if (hca->state != HCA_DETACHED) {
3817                 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3818                 rw_exit(&hca->state_lock);
3819         } else {
3820                 rw_exit(&hca->state_lock);
3821                 return (RDMA_FAILED);
3822         }
3823 
3824         if (status == IBT_SUCCESS)
3825                 return (RDMA_SUCCESS);
3826         else {
3827                 return (RDMA_FAILED);
3828         }
3829 }
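
/*
 * Editor's illustrative sketch (guarded out of the build, not part of the
 * driver): making data that just arrived via RDMA visible to the CPU on a
 * non-coherent platform.  shandle is assumed to be the RIB_SYNCMEM_HANDLE
 * returned by rib_registermemsync(); the "example_" name is hypothetical.
 */
#ifdef RPCIB_EXAMPLE
static rdma_stat
example_sync_after_receive(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    caddr_t buf, int len)
{
        /* cpu == 1 selects IBT_SYNC_WRITE: incoming data -> CPU view. */
        return (rib_syncmem(conn, shandle, buf, len, 1));
}
#endif  /* RPCIB_EXAMPLE */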
3830 
3831 /*
3832  * XXXX ????
3833  */
3834 static rdma_stat
3835 rib_getinfo(rdma_info_t *info)
3836 {
3837         /*
3838          * XXXX Hack!
3839          */
3840         info->addrlen = 16;
3841         info->mts = 1000000;
3842         info->mtu = 1000000;
3843 
3844         return (RDMA_SUCCESS);
3845 }
3846 
3847 rib_bufpool_t *
3848 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3849 {
3850         rib_bufpool_t   *rbp = NULL;
3851         bufpool_t       *bp = NULL;
3852         caddr_t         buf;
3853         ibt_mr_attr_t   mem_attr;
3854         ibt_status_t    ibt_status;
3855         int             i, j;
3856 
3857         rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3858 
3859         bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3860             num * sizeof (void *), KM_SLEEP);
3861 
3862         mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3863         bp->numelems = num;
3864 
3865 
3866         switch (ptype) {
3867         case SEND_BUFFER:
3868                 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3869                 bp->rsize = RPC_MSG_SZ;
3870                 break;
3871         case RECV_BUFFER:
3872                 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3873                 bp->rsize = RPC_BUF_SIZE;
3874                 break;
3875         default:
3876                 goto fail;
3877         }
3878 
3879         /*
3880          * Register the pool.
3881          */
3882         bp->bufsize = num * bp->rsize;
3883         bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3884         rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3885             sizeof (ibt_mr_hdl_t), KM_SLEEP);
3886         rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3887             sizeof (ibt_mr_desc_t), KM_SLEEP);
3888         rw_enter(&hca->state_lock, RW_READER);
3889 
3890         if (hca->state == HCA_DETACHED) {
3891                 rw_exit(&hca->state_lock);
3892                 goto fail;
3893         }
3894 
3895         for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3896                 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3897                 mem_attr.mr_vaddr = (uintptr_t)buf;
3898                 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3899                 mem_attr.mr_as = NULL;
3900                 ibt_status = ibt_register_mr(hca->hca_hdl,
3901                     hca->pd_hdl, &mem_attr,
3902                     &rbp->mr_hdl[i],
3903                     &rbp->mr_desc[i]);
3904                 if (ibt_status != IBT_SUCCESS) {
3905                         for (j = 0; j < i; j++) {
3906                                 (void) ibt_deregister_mr(hca->hca_hdl,
3907                                     rbp->mr_hdl[j]);
3908                         }
3909                         rw_exit(&hca->state_lock);
3910                         goto fail;
3911                 }
3912         }
3913         rw_exit(&hca->state_lock);
3914         buf = (caddr_t)bp->buf;
3915         for (i = 0; i < num; i++, buf += bp->rsize) {
3916                 bp->buflist[i] = (void *)buf;
3917         }
3918         bp->buffree = num - 1;       /* index of the last free buffer */
3919         rbp->bpool = bp;
3920 
3921         return (rbp);
3922 fail:
3923         if (bp) {
3924                 if (bp->buf)
3925                         kmem_free(bp->buf, bp->bufsize);
3926                 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3927         }
3928         if (rbp) {
3929                 if (rbp->mr_hdl)
3930                         kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3931                 if (rbp->mr_desc)
3932                         kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3933                 kmem_free(rbp, sizeof (rib_bufpool_t));
3934         }
3935         return (NULL);
3936 }
3937 
3938 static void
3939 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3940 {
3941         int i;
3942         rib_bufpool_t *rbp = NULL;
3943         bufpool_t *bp;
3944 
3945         /*
3946          * Obtain pool address based on type of pool
3947          */
3948         switch (ptype) {
3949                 case SEND_BUFFER:
3950                         rbp = hca->send_pool;
3951                         break;
3952                 case RECV_BUFFER:
3953                         rbp = hca->recv_pool;
3954                         break;
3955                 default:
3956                         return;
3957         }
3958         if (rbp == NULL)
3959                 return;
3960 
3961         bp = rbp->bpool;
3962 
3963         /*
3964          * Deregister the pool memory and free it.
3965          */
3966         for (i = 0; i < bp->numelems; i++) {
3967                 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3968         }
3969 }
3970 
3971 static void
3972 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3973 {
3974 
3975         rib_bufpool_t *rbp = NULL;
3976         bufpool_t *bp;
3977 
3978         /*
3979          * Obtain pool address based on type of pool
3980          */
3981         switch (ptype) {
3982                 case SEND_BUFFER:
3983                         rbp = hca->send_pool;
3984                         break;
3985                 case RECV_BUFFER:
3986                         rbp = hca->recv_pool;
3987                         break;
3988                 default:
3989                         return;
3990         }
3991         if (rbp == NULL)
3992                 return;
3993 
3994         bp = rbp->bpool;
3995 
3996         /*
3997          * Free the pool memory.
3998          */
3999         if (rbp->mr_hdl)
4000                 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4001 
4002         if (rbp->mr_desc)
4003                 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4004         if (bp->buf)
4005                 kmem_free(bp->buf, bp->bufsize);
4006         mutex_destroy(&bp->buflock);
4007         kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4008         kmem_free(rbp, sizeof (rib_bufpool_t));
4009 }
4010 
4011 void
4012 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4013 {
4014         /*
4015          * Deregister the pool memory and free it.
4016          */
4017         rib_rbufpool_deregister(hca, ptype);
4018         rib_rbufpool_free(hca, ptype);
4019 }
4020 
4021 /*
4022  * Fetch a buffer from the pool of type specified in rdbuf->type.
4023  */
4024 static rdma_stat
4025 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4026 {
4027         rib_lrc_entry_t *rlep;
4028 
4029         if (rdbuf->type == RDMA_LONG_BUFFER) {
4030                 rlep = rib_get_cache_buf(conn, rdbuf->len);
4031                 rdbuf->rb_private = (caddr_t)rlep;
4032                 rdbuf->addr = rlep->lrc_buf;
4033                 rdbuf->handle = rlep->lrc_mhandle;
4034                 return (RDMA_SUCCESS);
4035         }
4036 
4037         rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4038         if (rdbuf->addr) {
4039                 switch (rdbuf->type) {
4040                 case SEND_BUFFER:
4041                         rdbuf->len = RPC_MSG_SZ;     /* 1K */
4042                         break;
4043                 case RECV_BUFFER:
4044                         rdbuf->len = RPC_BUF_SIZE; /* 2K */
4045                         break;
4046                 default:
4047                         rdbuf->len = 0;
4048                 }
4049                 return (RDMA_SUCCESS);
4050         } else
4051                 return (RDMA_FAILED);
4052 }
4053 
4054 /*
4055  * Fetch a buffer of specified type.
4056  * Note that rdbuf->handle is mw's rkey.
4057  */
4058 static void *
4059 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4060 {
4061         rib_qp_t        *qp = ctoqp(conn);
4062         rib_hca_t       *hca = qp->hca;
4063         rdma_btype      ptype = rdbuf->type;
4064         void            *buf;
4065         rib_bufpool_t   *rbp = NULL;
4066         bufpool_t       *bp;
4067         int             i;
4068 
4069         /*
4070          * Obtain pool address based on type of pool
4071          */
4072         switch (ptype) {
4073         case SEND_BUFFER:
4074                 rbp = hca->send_pool;
4075                 break;
4076         case RECV_BUFFER:
4077                 rbp = hca->recv_pool;
4078                 break;
4079         default:
4080                 return (NULL);
4081         }
4082         if (rbp == NULL)
4083                 return (NULL);
4084 
4085         bp = rbp->bpool;
4086 
4087         mutex_enter(&bp->buflock);
4088         if (bp->buffree < 0) {
4089                 mutex_exit(&bp->buflock);
4090                 return (NULL);
4091         }
4092 
4093         /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4094         buf = bp->buflist[bp->buffree];
4095         rdbuf->addr = buf;
4096         rdbuf->len = bp->rsize;
4097         for (i = bp->numelems - 1; i >= 0; i--) {
4098                 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4099                         rdbuf->handle.mrc_rmr =
4100                             (uint32_t)rbp->mr_desc[i].md_rkey;
4101                         rdbuf->handle.mrc_linfo =
4102                             (uintptr_t)rbp->mr_hdl[i];
4103                         rdbuf->handle.mrc_lmr =
4104                             (uint32_t)rbp->mr_desc[i].md_lkey;
4105                         bp->buffree--;
4106 
4107                         mutex_exit(&bp->buflock);
4108 
4109                         return (buf);
4110                 }
4111         }
4112 
4113         mutex_exit(&bp->buflock);
4114 
4115         return (NULL);
4116 }
4117 
4118 static void
4119 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4120 {
4121 
4122         if (rdbuf->type == RDMA_LONG_BUFFER) {
4123                 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4124                 rdbuf->rb_private = NULL;
4125                 return;
4126         }
4127         rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4128 }
4129 
4130 static void
4131 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4132 {
4133         rib_qp_t *qp = ctoqp(conn);
4134         rib_hca_t *hca = qp->hca;
4135         rib_bufpool_t *rbp = NULL;
4136         bufpool_t *bp;
4137 
4138         /*
4139          * Obtain pool address based on type of pool
4140          */
4141         switch (ptype) {
4142         case SEND_BUFFER:
4143                 rbp = hca->send_pool;
4144                 break;
4145         case RECV_BUFFER:
4146                 rbp = hca->recv_pool;
4147                 break;
4148         default:
4149                 return;
4150         }
4151         if (rbp == NULL)
4152                 return;
4153 
4154         bp = rbp->bpool;
4155 
4156         mutex_enter(&bp->buflock);
4157         if (++bp->buffree >= bp->numelems) {
4158                 /*
4159                  * Should never happen
4160                  */
4161                 bp->buffree--;
4162         } else {
4163                 bp->buflist[bp->buffree] = buf;
4164         }
4165         mutex_exit(&bp->buflock);
4166 }
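
/*
 * Editor's illustrative sketch (guarded out of the build, not part of the
 * driver): the life cycle of a pre-registered buffer pool.  bp->buffree
 * acts as a top-of-stack index into buflist[], so allocation pops and
 * freeing pushes.  It is assumed that conn is a connection on this same
 * HCA; the "example_" name is hypothetical.
 */
#ifdef RPCIB_EXAMPLE
static void
example_sendbuf_cycle(rib_hca_t *hca, CONN *conn)
{
        rdma_buf_t rdbuf;
        void *buf;

        /* Create and register a pool of 32 SEND buffers on this HCA. */
        hca->send_pool = rib_rbufpool_create(hca, SEND_BUFFER, 32);
        if (hca->send_pool == NULL)
                return;

        /* Pop one buffer; rdbuf.handle is filled with its lkey/rkey. */
        bzero(&rdbuf, sizeof (rdbuf));
        rdbuf.type = SEND_BUFFER;
        buf = rib_rbuf_alloc(conn, &rdbuf);
        if (buf != NULL)
                rib_rbuf_free(conn, SEND_BUFFER, buf);

        /* Deregister the pool memory and free it. */
        rib_rbufpool_destroy(hca, SEND_BUFFER);
}
#endif  /* RPCIB_EXAMPLE */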
4167 
4168 static rdma_stat
4169 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4170 {
4171         rw_enter(&connlist->conn_lock, RW_WRITER);
4172         if (connlist->conn_hd) {
4173                 cn->c_next = connlist->conn_hd;
4174                 connlist->conn_hd->c_prev = cn;
4175         }
4176         connlist->conn_hd = cn;
4177         rw_exit(&connlist->conn_lock);
4178 
4179         return (RDMA_SUCCESS);
4180 }
4181 
4182 static rdma_stat
4183 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4184 {
4185         rw_enter(&connlist->conn_lock, RW_WRITER);
4186         if (cn->c_prev) {
4187                 cn->c_prev->c_next = cn->c_next;
4188         }
4189         if (cn->c_next) {
4190                 cn->c_next->c_prev = cn->c_prev;
4191         }
4192         if (connlist->conn_hd == cn)
4193                 connlist->conn_hd = cn->c_next;
4194         rw_exit(&connlist->conn_lock);
4195 
4196         return (RDMA_SUCCESS);
4197 }
4198 
4199 /* ARGSUSED */
4200 static rdma_stat
4201 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4202     int addr_type, void *handle, CONN **conn)
4203 {
4204         rdma_stat status;
4205         rpcib_ping_t rpt;
4206 
4207         status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4208         return (status);
4209 }
4210 
4211 /*
4212  * rib_find_hca_connection
4213  *
4214  * if there is an existing connection to the specified address then
4215  * it will be returned in conn, otherwise conn will be set to NULL.
4216  * Also cleans up any connection that is in error state.
4217  */
4218 static int
4219 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4220     struct netbuf *d_svcaddr, CONN **conn)
4221 {
4222         CONN *cn;
4223         clock_t cv_stat, timout;
4224 
4225         *conn = NULL;
4226 again:
4227         rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4228         cn = hca->cl_conn_list.conn_hd;
4229         while (cn != NULL) {
4230                 /*
4231                  * First, clean up any connection in the ERROR state
4232                  */
4233                 mutex_enter(&cn->c_lock);
4234                 if (cn->c_state == C_ERROR_CONN) {
4235                         if (cn->c_ref == 0) {
4236                                 /*
4237                                  * Remove connection from list and destroy it.
4238                                  */
4239                                 cn->c_state = C_DISCONN_PEND;
4240                                 mutex_exit(&cn->c_lock);
4241                                 rw_exit(&hca->cl_conn_list.conn_lock);
4242                                 rib_conn_close((void *)cn);
4243                                 goto again;
4244                         }
4245                         mutex_exit(&cn->c_lock);
4246                         cn = cn->c_next;
4247                         continue;
4248                 }
4249                 if (cn->c_state == C_DISCONN_PEND) {
4250                         mutex_exit(&cn->c_lock);
4251                         cn = cn->c_next;
4252                         continue;
4253                 }
4254 
4255                 /*
4256                  * The source address is checked only if one is specified,
4257                  * which is the case for retries.
4258                  */
4259                 if ((cn->c_raddr.len == d_svcaddr->len) &&
4260                     (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4261                     d_svcaddr->len) == 0) &&
4262                     ((s_svcaddr->len == 0) ||
4263                     ((cn->c_laddr.len == s_svcaddr->len) &&
4264                     (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4265                     s_svcaddr->len) == 0)))) {
4266                         /*
4267                          * Our connection. Give up conn list lock
4268                          * as we are done traversing the list.
4269                          */
4270                         rw_exit(&hca->cl_conn_list.conn_lock);
4271                         if (cn->c_state == C_CONNECTED) {
4272                                 cn->c_ref++; /* sharing a conn */
4273                                 mutex_exit(&cn->c_lock);
4274                                 *conn = cn;
4275                                 return (RDMA_SUCCESS);
4276                         }
4277                         if (cn->c_state == C_CONN_PEND) {
4278                                 /*
4279                                  * Hold a reference to this conn before
4280                                  * we give up the lock.
4281                                  */
4282                                 cn->c_ref++;
4283                                 timout = ddi_get_lbolt() +
4284                                     drv_usectohz(CONN_WAIT_TIME * 1000000);
4285                                 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4286                                     &cn->c_lock, timout)) > 0 &&
4287                                     cn->c_state == C_CONN_PEND)
4288                                         ;
4289                                 if (cv_stat == 0) {
4290                                         (void) rib_conn_release_locked(cn);
4291                                         return (RDMA_INTR);
4292                                 }
4293                                 if (cv_stat < 0) {
4294                                         (void) rib_conn_release_locked(cn);
4295                                         return (RDMA_TIMEDOUT);
4296                                 }
4297                                 if (cn->c_state == C_CONNECTED) {
4298                                         *conn = cn;
4299                                         mutex_exit(&cn->c_lock);
4300                                         return (RDMA_SUCCESS);
4301                                 } else {
4302                                         (void) rib_conn_release_locked(cn);
4303                                         return (RDMA_TIMEDOUT);
4304                                 }
4305                         }
4306                 }
4307                 mutex_exit(&cn->c_lock);
4308                 cn = cn->c_next;
4309         }
4310         rw_exit(&hca->cl_conn_list.conn_lock);
4311         *conn = NULL;
4312         return (RDMA_FAILED);
4313 }
4314 
4315 /*
4316  * Connection management.
4317  * IBTF does not support recycling of channels, so a connection is
4318  * always in one of four states - C_CONN_PEND, C_CONNECTED,
4319  * C_ERROR_CONN or C_DISCONN_PEND. There is no C_IDLE state.
4320  * C_CONN_PEND state: Connection establishment in progress to the server.
4321  * C_CONNECTED state: The connection is fully established and has an RC
4322  * channel associated with it. ibt_post_send/recv are allowed only in
4323  * this state.
4324  * C_ERROR_CONN state: A connection transitions to this state when WRs
4325  * on the channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED
4326  * event happens on the channel, or an IBT_HCA_DETACH_EVENT occurs on
4327  * the HCA.
4328  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and
4329  * c_ref drops to 0 (i.e. RPC holds no more references to it), the
4330  * connection moves into this state while it is being destroyed.
4331  */
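/*
 * A sketch of the resulting state machine (derived from the comment
 * above and the code below):
 *
 *   rib_connect() --> C_CONN_PEND --success--> C_CONNECTED
 *                          |                        |
 *                       failure        WR error / CONN_CLOSED /
 *                          |                  HCA detach
 *                          v                        |
 *                     C_ERROR_CONN <----------------+
 *                          |
 *                  (c_ref drops to 0)
 *                          v
 *                   C_DISCONN_PEND (connection being destroyed)
 */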
4332 /* ARGSUSED */
4333 static rdma_stat
4334 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4335     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4336 {
4337         CONN *cn;
4338         int status;
4339         rib_hca_t *hca;
4340         rib_qp_t *qp;
4341         int s_addr_len;
4342         char *s_addr_buf;
4343 
4344         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4345         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4346                 rw_enter(&hca->state_lock, RW_READER);
4347                 if (hca->state != HCA_DETACHED) {
4348                         status = rib_find_hca_connection(hca, s_svcaddr,
4349                             d_svcaddr, conn);
4350                         rw_exit(&hca->state_lock);
4351                         if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4352                                 rw_exit(&rib_stat->hcas_list_lock);
4353                                 return (status);
4354                         }
4355                 } else
4356                         rw_exit(&hca->state_lock);
4357         }
4358         rw_exit(&rib_stat->hcas_list_lock);
4359 
4360         /*
4361          * No existing connection found, establish a new connection.
4362          */
4363         bzero(rpt, sizeof (rpcib_ping_t));
4364 
4365         status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4366         if (status != RDMA_SUCCESS) {
4367                 return (RDMA_FAILED);
4368         }
4369         hca = rpt->hca;
4370 
4371         if (rpt->srcip.family == AF_INET) {
4372                 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4373                 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4374         } else if (rpt->srcip.family == AF_INET6) {
4375                 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4376                 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4377         } else {
4378                 return (RDMA_FAILED);
4379         }
4380 
4381         /*
4382          * Channel to server doesn't exist yet, create one.
4383          */
4384         if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4385                 return (RDMA_FAILED);
4386         }
4387         cn = qptoc(qp);
4388         cn->c_state = C_CONN_PEND;
4389         cn->c_ref = 1;
4390 
4391         cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4392         bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4393         cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4394 
4395         if (rpt->srcip.family == AF_INET) {
4396                 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4397                 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4398 
4399                 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4400                     sizeof (struct sockaddr_in);
4401                 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4402 
4403                 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4404                     (uint32_t)~0;
4405                 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4406                     (ushort_t)~0;
4407 
4408         } else {
4409                 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4410                 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4411 
4412                 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4413                     sizeof (struct sockaddr_in6);
4414                 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4415 
4416                 (void) memset(
4417                     &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4418                     (uchar_t)~0, sizeof (struct in6_addr));
4419                 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4420                     (sa_family_t)~0;
4421         }
4422 
4423         /*
4424          * Add to conn list.
4425          * We gave up the READER lock earlier, so in the meantime
4426          * another thread might have created the connection we are
4427          * trying to create here. For now that is quite all right -
4428          * there might be two connections between a pair of hosts
4429          * instead of one. If we really wanted to close that window,
4430          * we would need to recheck the list after acquiring the
4431          * WRITER lock.
4432          */
4433         (void) rib_add_connlist(cn, &hca->cl_conn_list);
4434         status = rib_conn_to_srv(hca, qp, rpt);
4435         mutex_enter(&cn->c_lock);
4436 
4437         if (cn->c_flags & C_CLOSE_PENDING) {
4438                 /*
4439                  * This handles the case where the module or
4440                  * HCA detached while the connection was being
4441                  * established. In such a case close the
4442                  * connection immediately if this is the
4443                  * only reference.
4444                  */
4445                 if (cn->c_ref == 1) {
4446                         cn->c_ref--;
4447                         cn->c_state = C_DISCONN_PEND;
4448                         mutex_exit(&cn->c_lock);
4449                         rib_conn_close((void *)cn);
4450                         return (RDMA_FAILED);
4451                 }
4452 
4453                 /*
4454                  * Connection will be closed later, when c_ref drops to 0
4455                  */
4456                 status = RDMA_FAILED;
4457         }
4458 
4459         if (status == RDMA_SUCCESS) {
4460                 cn->c_state = C_CONNECTED;
4461                 *conn = cn;
4462         } else {
4463                 cn->c_state = C_ERROR_CONN;
4464                 cn->c_ref--;
4465         }
4466         cv_signal(&cn->c_cv);
4467         mutex_exit(&cn->c_lock);
4468         return (status);
4469 }
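
/*
 * Illustrative caller sequence for the interfaces above. This is a
 * hypothetical sketch - srcaddr and dstaddr stand in for netbufs the
 * consumer already holds - not code from an actual consumer:
 *
 *      CONN *conn;
 *
 *      if (rib_conn_get(&srcaddr, &dstaddr, AF_INET, NULL,
 *          &conn) == RDMA_SUCCESS) {
 *              ... post sends/receives on conn ...
 *              (void) rib_conn_release(conn);
 *      }
 *
 * rib_conn_get() returns with a reference (c_ref) held, on either a
 * shared existing connection or a newly created one;
 * rib_conn_release() drops that reference and, once c_ref reaches 0,
 * arms the idle timeout that eventually reclaims the connection.
 */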
4470 
4471 static void
4472 rib_conn_close(void *rarg)
4473 {
4474         CONN *conn = (CONN *)rarg;
4475         rib_qp_t *qp = ctoqp(conn);
4476 
4477         mutex_enter(&conn->c_lock);
4478         if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4479 
4480                 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4481 
4482                 /*
4483                  * Live connection in CONNECTED state.
4484                  */
4485                 if (conn->c_state == C_CONNECTED) {
4486                         conn->c_state = C_ERROR_CONN;
4487                 }
4488                 mutex_exit(&conn->c_lock);
4489 
4490                 rib_close_a_channel(conn);
4491 
4492                 mutex_enter(&conn->c_lock);
4493                 conn->c_flags &= ~C_CLOSE_PENDING;
4494         }
4495 
4496         mutex_exit(&conn->c_lock);
4497 
4498         if (qp->mode == RIB_SERVER)
4499                 (void) rib_disconnect_channel(conn,
4500                     &qp->hca->srv_conn_list);
4501         else
4502                 (void) rib_disconnect_channel(conn,
4503                     &qp->hca->cl_conn_list);
4504 }
4505 
4506 static void
4507 rib_conn_timeout_call(void *carg)
4508 {
4509         time_t idle_time;
4510         CONN *conn = (CONN *)carg;
4511         rib_hca_t *hca = ctoqp(conn)->hca;
4512         int error;
4513 
4514         mutex_enter(&conn->c_lock);
4515         if ((conn->c_ref > 0) ||
4516             (conn->c_state == C_DISCONN_PEND)) {
4517                 conn->c_timeout = NULL;
4518                 mutex_exit(&conn->c_lock);
4519                 return;
4520         }
4521 
4522         idle_time = (gethrestime_sec() - conn->c_last_used);
4523 
4524         if ((idle_time <= rib_conn_timeout) &&
4525             (conn->c_state != C_ERROR_CONN)) {
4526                 /*
4527                  * There was activity after the last timeout.
4528                  * Extend the conn's life, unless the conn is
4529                  * already in an error state.
4530                  */
4531                 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4532                     SEC_TO_TICK(rib_conn_timeout - idle_time));
4533                 mutex_exit(&conn->c_lock);
4534                 return;
4535         }
4536 
4537         error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4538             (void *)conn, DDI_NOSLEEP);
4539 
4540         /*
4541          * If the taskq dispatch above fails, reset the timeout
4542          * to try again after 10 secs.
4543          */
4544 
4545         if (error != DDI_SUCCESS) {
4546                 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4547                     SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4548                 mutex_exit(&conn->c_lock);
4549                 return;
4550         }
4551 
4552         conn->c_state = C_DISCONN_PEND;
4553         mutex_exit(&conn->c_lock);
4554 }
4555 
4556 static rdma_stat
4557 rib_conn_release(CONN *conn)
4558 {
4559         mutex_enter(&conn->c_lock);
4560         return (rib_conn_release_locked(conn));
4561 }
4562 
4563 /*
4564  * Expects conn->c_lock to be held on entry.
4565  * c_lock is released on return.
4566  */
4567 static rdma_stat
4568 rib_conn_release_locked(CONN *conn)
4569 {
4570         conn->c_ref--;
4571 
4572         conn->c_last_used = gethrestime_sec();
4573         if (conn->c_ref > 0) {
4574                 mutex_exit(&conn->c_lock);
4575                 return (RDMA_SUCCESS);
4576         }
4577 
4578         /*
4579          * If a conn is C_ERROR_CONN, close the channel.
4580          */
4581         if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4582                 conn->c_state = C_DISCONN_PEND;
4583                 mutex_exit(&conn->c_lock);
4584                 rib_conn_close((void *)conn);
4585                 return (RDMA_SUCCESS);
4586         }
4587 
4588         /*
4589          * c_ref == 0, set a timeout for conn release
4590          */
4591 
4592         if (conn->c_timeout == NULL) {
4593                 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4594                     SEC_TO_TICK(rib_conn_timeout));
4595         }
4596 
4597         mutex_exit(&conn->c_lock);
4598         return (RDMA_SUCCESS);
4599 }
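
/*
 * Reference counting and reaping, in summary: rib_conn_release_locked()
 * stamps c_last_used on every release and, once c_ref reaches 0, arms a
 * timeout of rib_conn_timeout seconds. rib_conn_timeout_call() then
 * either returns quietly (the conn was referenced again or is already
 * being torn down), extends the timer (there was activity since the
 * last timeout and the conn is not in error), or dispatches
 * rib_conn_close() to the cleanup taskq and marks the conn
 * C_DISCONN_PEND, retrying after RDMA_CONN_REAP_RETRY seconds if the
 * dispatch fails.
 */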
4600 
4601 /*
4602  * Add at front of list
4603  */
4604 static struct rdma_done_list *
4605 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4606 {
4607         struct rdma_done_list *rd;
4608 
4609         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4610 
4611         rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4612         rd->xid = xid;
4613         cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4614 
4615         rd->prev = NULL;
4616         rd->next = qp->rdlist;
4617         if (qp->rdlist != NULL)
4618                 qp->rdlist->prev = rd;
4619         qp->rdlist = rd;
4620 
4621         return (rd);
4622 }
4623 
4624 static void
4625 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4626 {
4627         struct rdma_done_list *r;
4628 
4629         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4630 
4631         r = rd->next;
4632         if (r != NULL) {
4633                 r->prev = rd->prev;
4634         }
4635 
4636         r = rd->prev;
4637         if (r != NULL) {
4638                 r->next = rd->next;
4639         } else {
4640                 qp->rdlist = rd->next;
4641         }
4642 
4643         cv_destroy(&rd->rdma_done_cv);
4644         kmem_free(rd, sizeof (*rd));
4645 }
4646 
4647 static void
4648 rdma_done_rem_list(rib_qp_t *qp)
4649 {
4650         struct rdma_done_list   *r, *n;
4651 
4652         mutex_enter(&qp->rdlist_lock);
4653         for (r = qp->rdlist; r != NULL; r = n) {
4654                 n = r->next;
4655                 rdma_done_rm(qp, r);
4656         }
4657         mutex_exit(&qp->rdlist_lock);
4658 }
4659 
4660 static void
4661 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4662 {
4663         struct rdma_done_list *r = qp->rdlist;
4664 
4665         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4666 
4667         while (r) {
4668                 if (r->xid == xid) {
4669                         cv_signal(&r->rdma_done_cv);
4670                         return;
4671                 } else {
4672                         r = r->next;
4673                 }
4674         }
4675         DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4676             int, xid);
4677 }
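
/*
 * Sketch of the intended wait/notify handshake around this list (the
 * waiting side lives elsewhere in this file): with qp->rdlist_lock
 * held, a waiter calls rdma_done_add() to register its RPC XID and
 * then cv_wait()s on rd->rdma_done_cv; when the peer signals RDMA
 * completion for that XID, rdma_done_notify() - also called with
 * rdlist_lock held, as the ASSERT above requires - wakes the waiter,
 * which then removes its entry with rdma_done_rm().
 */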
4678 
4679 /*
4680  * Callers set C_CLOSE_PENDING and drop conn->c_lock before calling this.
4681  */
4682 
4683 static void
4684 rib_close_a_channel(CONN *conn)
4685 {
4686         rib_qp_t        *qp;
4687         qp = ctoqp(conn);
4688 
4689         if (qp->qp_hdl == NULL) {
4690                 /* channel already freed */
4691                 return;
4692         }
4693 
4694         /*
4695          * Call ibt_close_rc_channel in blocking mode
4696          * with no callbacks.
4697          */
4698         (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4699             NULL, 0, NULL, NULL, 0);
4700 }
4701 
4702 /*
4703  * Goes through all connections and closes the channel.
4704  * This will cause all the WRs on those channels to be
4705  * flushed.
4706  */
4707 static void
4708 rib_close_channels(rib_conn_list_t *connlist)
4709 {
4710         CONN            *conn, *tmp;
4711 
4712         rw_enter(&connlist->conn_lock, RW_READER);
4713         conn = connlist->conn_hd;
4714         while (conn != NULL) {
4715                 mutex_enter(&conn->c_lock);
4716                 tmp = conn->c_next;
4717                 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4718 
4719                         if (conn->c_state == C_CONN_PEND) {
4720                                 conn->c_flags |= C_CLOSE_PENDING;
4721                                 goto next;
4722                         }
4723 
4724                         conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4725 
4726                         /*
4727                          * Live connection in CONNECTED state.
4728                          */
4729                         if (conn->c_state == C_CONNECTED)
4730                                 conn->c_state = C_ERROR_CONN;
4731                         mutex_exit(&conn->c_lock);
4732 
4733                         rib_close_a_channel(conn);
4734 
4735                         mutex_enter(&conn->c_lock);
4736                         conn->c_flags &= ~C_CLOSE_PENDING;
4737                         /* Signal a pending rib_disconnect_channel() */
4738                         cv_signal(&conn->c_cv);
4739                 }
4740 next:
4741                 mutex_exit(&conn->c_lock);
4742                 conn = tmp;
4743         }
4744         rw_exit(&connlist->conn_lock);
4745 }
4746 
4747 /*
4748  * Frees up all connections that are no longer being referenced
4749  */
4750 static void
4751 rib_purge_connlist(rib_conn_list_t *connlist)
4752 {
4753         CONN            *conn;
4754 
4755 top:
4756         rw_enter(&connlist->conn_lock, RW_READER);
4757         conn = connlist->conn_hd;
4758         while (conn != NULL) {
4759                 mutex_enter(&conn->c_lock);
4760 
4761                 /*
4762                  * At this point the connection is either in ERROR
4763                  * or DISCONN_PEND state. If in DISCONN_PEND state
4764                  * then some other thread is culling that connection.
4765                  * If not and if c_ref is 0, then destroy the connection.
4766                  */
4767                 if (conn->c_ref == 0 &&
4768                     conn->c_state != C_DISCONN_PEND) {
4769                         /*
4770                          * Cull the connection
4771                          */
4772                         conn->c_state = C_DISCONN_PEND;
4773                         mutex_exit(&conn->c_lock);
4774                         rw_exit(&connlist->conn_lock);
4775                         (void) rib_disconnect_channel(conn, connlist);
4776                         goto top;
4777                 } else {
4778                         /*
4779                          * conn disconnect already scheduled or will
4780                          * happen from conn_release when c_ref drops to 0.
4781                          */
4782                         mutex_exit(&conn->c_lock);
4783                 }
4784                 conn = conn->c_next;
4785         }
4786         rw_exit(&connlist->conn_lock);
4787 
4788         /*
4789          * At this point, only connections with c_ref != 0, or those
4790          * already being culled, remain on the list
4790          */
4791 }
4792 
4793 /*
4794  * Free all the HCA resources and close
4795  * the hca.
4796  */
4797 
4798 static void
4799 rib_free_hca(rib_hca_t *hca)
4800 {
4801         (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4802         (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4803         (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4804         (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4805 
4806         kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4807         kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4808         kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4809         kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4810 
4811         rib_rbufpool_destroy(hca, RECV_BUFFER);
4812         rib_rbufpool_destroy(hca, SEND_BUFFER);
4813         rib_destroy_cache(hca);
4814         if (rib_mod.rdma_count == 0)
4815                 (void) rdma_unregister_mod(&rib_mod);
4816         (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4817         (void) ibt_close_hca(hca->hca_hdl);
4818         hca->hca_hdl = NULL;
4819 }
4820 
4821 
4822 static void
4823 rib_stop_hca_services(rib_hca_t *hca)
4824 {
4825         rib_stop_services(hca);
4826         rib_close_channels(&hca->cl_conn_list);
4827         rib_close_channels(&hca->srv_conn_list);
4828 
4829         rib_purge_connlist(&hca->cl_conn_list);
4830         rib_purge_connlist(&hca->srv_conn_list);
4831 
4832         if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4833                 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4834                     GLOBAL_ZONEID);
4835                 stats_enabled = FALSE;
4836         }
4837 
4838         rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4839         rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4840         if (hca->srv_conn_list.conn_hd == NULL &&
4841             hca->cl_conn_list.conn_hd == NULL) {
4842                 /*
4843                  * conn_lists are NULL, so destroy
4844                  * buffers, close hca and be done.
4845                  */
4846                 rib_free_hca(hca);
4847         }
4848         rw_exit(&hca->cl_conn_list.conn_lock);
4849         rw_exit(&hca->srv_conn_list.conn_lock);
4850 
4851         if (hca->hca_hdl != NULL) {
4852                 mutex_enter(&hca->inuse_lock);
4853                 while (hca->inuse)
4854                         cv_wait(&hca->cb_cv, &hca->inuse_lock);
4855                 mutex_exit(&hca->inuse_lock);
4856 
4857                 rib_free_hca(hca);
4858         }
4859         rw_destroy(&hca->bound_services_lock);
4860 
4861         if (hca->cleanup_helper != NULL) {
4862                 ddi_taskq_destroy(hca->cleanup_helper);
4863                 hca->cleanup_helper = NULL;
4864         }
4865 }
4866 
4867 /*
4868  * Cleans and closes up all uses of the HCA
4869  */
4870 static void
4871 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4872 {
4873         rib_hca_t *hca = NULL;
4874         rib_hca_t **hcap;
4875 
4876         rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4877         for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4878                 hca = *hcap;
4879                 rw_enter(&hca->state_lock, RW_WRITER);
4880                 if (hca->hca_hdl == hca_hdl) {
4881                         /*
4882                          * Mark as detached and remove from
4883                          * hca list.
4884                          */
4885                         hca->state = HCA_DETACHED;
4886                         *hcap = hca->next;
4887                         rib_stat->nhca_inited--;
4888                         rib_mod.rdma_count--;
4889                         rw_exit(&hca->state_lock);
4890                         break;
4891                 }
4892                 rw_exit(&hca->state_lock);
4893         }
4894         rw_exit(&rib_stat->hcas_list_lock);
4895 
4896         if (hca == NULL)
4897                 return;
4898         ASSERT(hca->hca_hdl == hca_hdl);
4899 
4900         /*
4901          * Stop all services on the HCA
4902          * Go through cl_conn_list and close all rc_channels
4903          * Go through srv_conn_list and close all rc_channels
4904          * Free connections whose c_ref has dropped to 0
4905          * Destroy all CQs
4906          * Deregister and release all buffer pool memory after all
4907          * connections are destroyed
4908          * Free the protection domain
4909          * ibt_close_hca()
4910          */
4911         rib_stop_hca_services(hca);
4912 
4913         kmem_free(hca, sizeof (*hca));
4914 }
4915 
4916 static void
4917 rib_server_side_cache_reclaim(void *argp)
4918 {
4919         cache_avl_struct_t    *rcas;
4920         rib_lrc_entry_t         *rb;
4921         rib_hca_t *hca = (rib_hca_t *)argp;
4922 
4923         rw_enter(&hca->avl_rw_lock, RW_WRITER);
4924         rcas = avl_first(&hca->avl_tree);
4925         if (rcas != NULL)
4926                 avl_remove(&hca->avl_tree, rcas);
4927 
4928         while (rcas != NULL) {
4929                 while (rcas->r.forw != &rcas->r) {
4930                         rcas->elements--;
4931                         rb = rcas->r.forw;
4932                         remque(rb);
4933                         if (rb->registered)
4934                                 (void) rib_deregistermem_via_hca(hca,
4935                                     rb->lrc_buf, rb->lrc_mhandle);
4936 
4937                         hca->cache_allocation -= rb->lrc_len;
4938                         kmem_free(rb->lrc_buf, rb->lrc_len);
4939                         kmem_free(rb, sizeof (rib_lrc_entry_t));
4940                 }
4941                 mutex_destroy(&rcas->node_lock);
4942                 kmem_cache_free(hca->server_side_cache, rcas);
4943                 rcas = avl_first(&hca->avl_tree);
4944                 if (rcas != NULL)
4945                         avl_remove(&hca->avl_tree, rcas);
4946         }
4947         rw_exit(&hca->avl_rw_lock);
4948 }
4949 
4950 static void
4951 rib_server_side_cache_cleanup(void *argp)
4952 {
4953         cache_avl_struct_t    *rcas;
4954         rib_lrc_entry_t         *rb;
4955         rib_hca_t *hca = (rib_hca_t *)argp;
4956 
4957         mutex_enter(&hca->cache_allocation_lock);
4958         if (hca->cache_allocation < cache_limit) {
4959                 mutex_exit(&hca->cache_allocation_lock);
4960                 return;
4961         }
4962         mutex_exit(&hca->cache_allocation_lock);
4963 
4964         rw_enter(&hca->avl_rw_lock, RW_WRITER);
4965         rcas = avl_last(&hca->avl_tree);
4966         if (rcas != NULL)
4967                 avl_remove(&hca->avl_tree, rcas);
4968 
4969         while (rcas != NULL) {
4970                 while (rcas->r.forw != &rcas->r) {
4971                         rcas->elements--;
4972                         rb = rcas->r.forw;
4973                         remque(rb);
4974                         if (rb->registered)
4975                                 (void) rib_deregistermem_via_hca(hca,
4976                                     rb->lrc_buf, rb->lrc_mhandle);
4977 
4978                         hca->cache_allocation -= rb->lrc_len;
4979 
4980                         kmem_free(rb->lrc_buf, rb->lrc_len);
4981                         kmem_free(rb, sizeof (rib_lrc_entry_t));
4982                 }
4983                 mutex_destroy(&rcas->node_lock);
4984                 if (hca->server_side_cache) {
4985                         kmem_cache_free(hca->server_side_cache, rcas);
4986                 }
4987 
4988                 if (hca->cache_allocation < cache_limit) {
4989                         rw_exit(&hca->avl_rw_lock);
4990                         return;
4991                 }
4992 
4993                 rcas = avl_last(&hca->avl_tree);
4994                 if (rcas != NULL)
4995                         avl_remove(&hca->avl_tree, rcas);
4996         }
4997         rw_exit(&hca->avl_rw_lock);
4998 }
4999 
5000 static int
5001 avl_compare(const void *t1, const void *t2)
5002 {
5003         if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5004                 return (0);
5005 
5006         if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5007                 return (-1);
5008 
5009         return (1);
5010 }
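
/*
 * The cache AVL tree is ordered purely by buffer length, so a lookup
 * with only cas.len filled in (as rib_get_cache_buf() and
 * rib_free_cache_buf() below do) locates the per-size bucket, and the
 * "where" index from a failed avl_find() can be passed straight to
 * avl_insert().
 */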
5011 
5012 static void
5013 rib_destroy_cache(rib_hca_t *hca)
5014 {
5015         if (hca->avl_init) {
5016                 rib_server_side_cache_reclaim((void *)hca);
5017                 if (hca->server_side_cache) {
5018                         kmem_cache_destroy(hca->server_side_cache);
5019                         hca->server_side_cache = NULL;
5020                 }
5021                 avl_destroy(&hca->avl_tree);
5022                 mutex_destroy(&hca->cache_allocation_lock);
5023                 rw_destroy(&hca->avl_rw_lock);
5024         }
5025         hca->avl_init = FALSE;
5026 }
5027 
5028 static void
5029 rib_force_cleanup(void *hca)
5030 {
5031         if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5032                 (void) ddi_taskq_dispatch(
5033                     ((rib_hca_t *)hca)->cleanup_helper,
5034                     rib_server_side_cache_cleanup,
5035                     (void *)hca, DDI_NOSLEEP);
5036 }
5037 
5038 static rib_lrc_entry_t *
5039 rib_get_cache_buf(CONN *conn, uint32_t len)
5040 {
5041         cache_avl_struct_t      cas, *rcas;
5042         rib_hca_t       *hca = (ctoqp(conn))->hca;
5043         rib_lrc_entry_t *reply_buf;
5044         avl_index_t where = NULL;
5045         uint64_t c_alloc = 0;
5046 
5047         if (!hca->avl_init)
5048                 goto  error_alloc;
5049 
5050         cas.len = len;
5051 
5052         rw_enter(&hca->avl_rw_lock, RW_READER);
5053 
5054         mutex_enter(&hca->cache_allocation_lock);
5055         c_alloc = hca->cache_allocation;
5056         mutex_exit(&hca->cache_allocation_lock);
5057 
5058         if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5059             &where)) == NULL) {
5060                 /* Am I above the cache limit? */
5061                 if ((c_alloc + len) >= cache_limit) {
5062                         rib_force_cleanup((void *)hca);
5063                         rw_exit(&hca->avl_rw_lock);
5064                         mutex_enter(&hca->cache_allocation_lock);
5065                         hca->cache_misses_above_the_limit++;
5066                         mutex_exit(&hca->cache_allocation_lock);
5067 
5068                         /* Allocate and register the buffer directly */
5069                         goto error_alloc;
5070                 }
5071 
5072                 rw_exit(&hca->avl_rw_lock);
5073                 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5074 
5075                 /* Recheck to make sure no other thread added the entry in */
5076                 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5077                     &cas, &where)) == NULL) {
5078                         /* Allocate an avl tree entry */
5079                         rcas = (cache_avl_struct_t *)
5080                             kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5081 
5082                         bzero(rcas, sizeof (cache_avl_struct_t));
5083                         rcas->elements = 0;
5084                         rcas->r.forw = &rcas->r;
5085                         rcas->r.back = &rcas->r;
5086                         rcas->len = len;
5087                         mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5088                         avl_insert(&hca->avl_tree, rcas, where);
5089                 }
5090         }
5091 
5092         mutex_enter(&rcas->node_lock);
5093 
5094         if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5095                 reply_buf = rcas->r.forw;
5096                 remque(reply_buf);
5097                 rcas->elements--;
5098                 mutex_exit(&rcas->node_lock);
5099                 rw_exit(&hca->avl_rw_lock);
5100 
5101                 mutex_enter(&hca->cache_allocation_lock);
5102                 hca->cache_hits++;
5103                 hca->cache_allocation -= len;
5104                 mutex_exit(&hca->cache_allocation_lock);
5105         } else {
5106                 /* Am I above the cache limit? */
5107                 mutex_exit(&rcas->node_lock);
5108                 if ((c_alloc + len) >= cache_limit) {
5109                         rib_force_cleanup((void *)hca);
5110                         rw_exit(&hca->avl_rw_lock);
5111 
5112                         mutex_enter(&hca->cache_allocation_lock);
5113                         hca->cache_misses_above_the_limit++;
5114                         mutex_exit(&hca->cache_allocation_lock);
5115                         /* Allocate and register the buffer directly */
5116                         goto error_alloc;
5117                 }
5118                 rw_exit(&hca->avl_rw_lock);
5119                 mutex_enter(&hca->cache_allocation_lock);
5120                 hca->cache_misses++;
5121                 mutex_exit(&hca->cache_allocation_lock);
5122                 /* Allocate a reply_buf entry */
5123                 reply_buf = (rib_lrc_entry_t *)
5124                     kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5126                 reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5127                 reply_buf->lrc_len  = len;
5128                 reply_buf->registered = FALSE;
5129                 reply_buf->avl_node = (void *)rcas;
5130         }
5131 
5132         return (reply_buf);
5133 
5134 error_alloc:
5135         reply_buf = (rib_lrc_entry_t *)
5136             kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5138         reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139         reply_buf->lrc_len = len;
5140         reply_buf->registered = FALSE;
5141         reply_buf->avl_node = NULL;
5142 
5143         return (reply_buf);
5144 }
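
/*
 * In short: a cache hit dequeues a pre-registered buffer from the
 * per-size bucket and debits hca->cache_allocation; a miss falls back
 * to a plain kmem allocation (registered stays FALSE, so the buffer is
 * registered later, outside this function); and when the cache would
 * grow past cache_limit, rib_force_cleanup() is kicked off and the
 * allocation bypasses the cache entirely.
 */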
5145 
5146 /*
5147  * Return a pre-registered buffer back to the cache (without
5148  * unregistering the buffer).
5149  */
5150 
5151 static void
5152 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5153 {
5154         cache_avl_struct_t    cas, *rcas;
5155         avl_index_t where = NULL;
5156         rib_hca_t       *hca = (ctoqp(conn))->hca;
5157 
5158         if (!hca->avl_init)
5159                 goto  error_free;
5160 
5161         cas.len = reg_buf->lrc_len;
5162         rw_enter(&hca->avl_rw_lock, RW_READER);
5163         if ((rcas = (cache_avl_struct_t *)
5164             avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165                 rw_exit(&hca->avl_rw_lock);
5166                 goto error_free;
5167         } else {
5169                 mutex_enter(&rcas->node_lock);
5170                 insque(reg_buf, &rcas->r);
5171                 rcas->elements++;
5172                 mutex_exit(&rcas->node_lock);
5173                 rw_exit(&hca->avl_rw_lock);
5174                 mutex_enter(&hca->cache_allocation_lock);
5175                 hca->cache_allocation += cas.len;
5176                 mutex_exit(&hca->cache_allocation_lock);
5177         }
5178 
5179         return;
5180 
5181 error_free:
5182 
5183         if (reg_buf->registered)
5184                 (void) rib_deregistermem_via_hca(hca,
5185                     reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186         kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187         kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5188 }
5189 
5190 static rdma_stat
5191 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192         uint_t buflen, struct mrc *buf_handle)
5193 {
5194         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
5195         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
5196         rdma_stat       status;
5197 
5198 
5199         /*
5200          * Note: ALL buffer pools use the same memory type RDMARW.
5201          */
5202         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5203         if (status == RDMA_SUCCESS) {
5204                 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5205                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5206                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5207         } else {
5208                 buf_handle->mrc_linfo = NULL;
5209                 buf_handle->mrc_lmr = 0;
5210                 buf_handle->mrc_rmr = 0;
5211         }
5212         return (status);
5213 }
5214 
5215 /* ARGSUSED */
5216 static rdma_stat
5217 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5218     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5219 {
5220 
5221         (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5222         return (RDMA_SUCCESS);
5223 }
5224 
5225 /* ARGSUSED */
5226 static rdma_stat
5227 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5228 {
5229 
5230         (void) ibt_deregister_mr(hca->hca_hdl,
5231             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5232         return (RDMA_SUCCESS);
5233 }
5234 
5235 /*
5236  * Check if the IP interface named by `lifrp' is RDMA-capable.
5237  */
5238 static boolean_t
5239 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5240 {
5241         char ifname[LIFNAMSIZ];
5242         char *cp;
5243 
5244         if (lifrp->lifr_type == IFT_IB)
5245                 return (B_TRUE);
5246 
5247         /*
5248          * Strip off the logical interface portion before getting
5249          * intimate with the name.
5250          */
5251         (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5252         if ((cp = strchr(ifname, ':')) != NULL)
5253                 *cp = '\0';
5254 
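        /* The loopback interface is also treated as RDMA-capable. */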
5255         return (strcmp("lo0", ifname) == 0);
5256 }
5257 
5258 static int
5259 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5260 {
5261         vnode_t *kkvp, *vp;
5262         TIUSER  *tiptr;
5263         struct  strioctl iocb;
5264         k_sigset_t smask;
5265         int     err = 0;
5266 
5267         if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5268                 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5269                     &tiptr, CRED()) == 0) {
5270                         vp = tiptr->fp->f_vnode;
5271                 } else {
5272                         VN_RELE(kkvp);
5273                         return (EPROTO);
5274                 }
5275         } else {
5276                 return (EPROTO);
5277         }
5278 
5279         iocb.ic_cmd = cmd;
5280         iocb.ic_timout = 0;
5281         iocb.ic_len = len;
5282         iocb.ic_dp = (caddr_t)arg;
5283         sigintr(&smask, 0);
5284         err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5285         sigunintr(&smask);
5286         (void) t_kclose(tiptr, 0);
5287         VN_RELE(kkvp);
5288         return (err);
5289 }
5290 
5291 /*
5292  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5293  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
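 * On success, the caller owns lifcp->lifc_buf and must free it with
 * kmem_free(lifcp->lifc_buf, *bufsizep).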
5294  */
5295 static int
5296 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5297 {
5298         int err;
5299         struct lifnum lifn;
5300 
5301         bzero(&lifn, sizeof (struct lifnum));
5302         lifn.lifn_family = AF_UNSPEC;
5303 
5304         err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5305         if (err != 0)
5306                 return (err);
5307 
5308         /*
5309          * Pad the interface count to account for additional interfaces that
5310          * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5311          */
5312         lifn.lifn_count += 4;
5313 
5314         bzero(lifcp, sizeof (struct lifconf));
5315         lifcp->lifc_family = AF_UNSPEC;
5316         lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5317         lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5318 
5319         err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5320         if (err != 0) {
5321                 kmem_free(lifcp->lifc_buf, *bufsizep);
5322                 return (err);
5323         }
5324         return (0);
5325 }
5326 
5327 static boolean_t
5328 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5329 {
5330         uint_t i, nifs;
5331         uint_t bufsize;
5332         struct lifconf lifc;
5333         struct lifreq *lifrp;
5334         struct sockaddr_in *sinp;
5335         struct sockaddr_in6 *sin6p;
5336 
5337         bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5338         bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5339 
5340         if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5341                 return (B_FALSE);
5342 
5343         if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5344                 kmem_free(lifc.lifc_buf, bufsize);
5345                 return (B_FALSE);
5346         }
5347 
5348         /*
5349          * Worst case is that all of the addresses are IB-capable and have
5350          * the same address family, so size our buffers accordingly.
5351          */
5352         addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5353         addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5354         addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5355         addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5356 
5357         for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5358                 if (!rpcib_rdma_capable_interface(lifrp))
5359                         continue;
5360 
5361                 if (lifrp->lifr_addr.ss_family == AF_INET) {
5362                         sinp = addrs4->ri_list;
5363                         bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5364                             sizeof (struct sockaddr_in));
5365                 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5366                         sin6p = addrs6->ri_list;
5367                         bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5368                             sizeof (struct sockaddr_in6));
5369                 }
5370         }
5371 
5372         kmem_free(lifc.lifc_buf, bufsize);
5373         return (B_TRUE);
5374 }
5375 
5376 /* ARGSUSED */
5377 static int
5378 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5379 {
5380         rib_hca_t *hca;
5381 
5382         if (rw == KSTAT_WRITE) {
5383                 return (EACCES);
5384         }
5385 
5386         rpcib_kstat.cache_limit.value.ui64 =
5387             (uint64_t)cache_limit;
5388         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5389         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5390                 rpcib_kstat.cache_allocation.value.ui64 +=
5391                     (uint64_t)hca->cache_allocation;
5392                 rpcib_kstat.cache_hits.value.ui64 +=
5393                     (uint64_t)hca->cache_hits;
5394                 rpcib_kstat.cache_misses.value.ui64 +=
5395                     (uint64_t)hca->cache_misses;
5396                 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5397                     (uint64_t)hca->cache_misses_above_the_limit;
5398         }
5399         rw_exit(&rib_stat->hcas_list_lock);
5400         return (0);
5401 }