Print this page
5255 uts shouldn't open-code ISP2
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/adapters/tavor/tavor_srq.c
+++ new/usr/src/uts/common/io/ib/adapters/tavor/tavor_srq.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
↓ open down ↓ |
24 lines elided |
↑ open up ↑ |
25 25 */
26 26
27 27 /*
28 28 * tavor_srq.c
29 29 * Tavor Shared Receive Queue Processing Routines
30 30 *
31 31 * Implements all the routines necessary for allocating, freeing, querying,
32 32 * modifying and posting shared receive queues.
33 33 */
34 34
35 +#include <sys/sysmacros.h>
35 36 #include <sys/types.h>
36 37 #include <sys/conf.h>
37 38 #include <sys/ddi.h>
38 39 #include <sys/sunddi.h>
39 40 #include <sys/modctl.h>
40 41 #include <sys/bitmap.h>
41 42
42 43 #include <sys/ib/adapters/tavor/tavor.h>
43 44
44 45 static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
45 46 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
46 47
47 48 /*
48 49 * tavor_srq_alloc()
49 50 * Context: Can be called only from user or kernel context.
50 51 */
51 52 int
52 53 tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
53 54 uint_t sleepflag, tavor_srq_options_t *op)
54 55 {
55 56 ibt_srq_hdl_t ibt_srqhdl;
56 57 tavor_pdhdl_t pd;
57 58 ibt_srq_sizes_t *sizes;
58 59 ibt_srq_sizes_t *real_sizes;
59 60 tavor_srqhdl_t *srqhdl;
60 61 ibt_srq_flags_t flags;
61 62 tavor_rsrc_t *srqc, *rsrc;
62 63 tavor_hw_srqc_t srqc_entry;
63 64 uint32_t *buf;
64 65 tavor_srqhdl_t srq;
65 66 tavor_umap_db_entry_t *umapdb;
66 67 ibt_mr_attr_t mr_attr;
67 68 tavor_mr_options_t mr_op;
68 69 tavor_mrhdl_t mr;
69 70 uint64_t addr;
70 71 uint64_t value, srq_desc_off;
71 72 uint32_t lkey;
72 73 uint32_t log_srq_size;
73 74 uint32_t uarpg;
74 75 uint_t wq_location, dma_xfer_mode, srq_is_umap;
75 76 int flag, status;
76 77 char *errormsg;
77 78 uint_t max_sgl;
78 79 uint_t wqesz;
79 80
80 81 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
81 82
82 83 TAVOR_TNF_ENTER(tavor_srq_alloc);
83 84
84 85 /*
85 86 * Check the "options" flag. Currently this flag tells the driver
 86  87 	 * whether or not the SRQ's work queues should come from normal
87 88 * system memory or whether they should be allocated from DDR memory.
88 89 */
89 90 if (op == NULL) {
90 91 wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
91 92 } else {
92 93 wq_location = op->srqo_wq_loc;
93 94 }
94 95
95 96 /*
96 97 * Extract the necessary info from the tavor_srq_info_t structure
97 98 */
98 99 real_sizes = srqinfo->srqi_real_sizes;
99 100 sizes = srqinfo->srqi_sizes;
100 101 pd = srqinfo->srqi_pd;
101 102 ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
102 103 flags = srqinfo->srqi_flags;
103 104 srqhdl = srqinfo->srqi_srqhdl;
104 105
105 106 /*
106 107 * Determine whether SRQ is being allocated for userland access or
107 108 * whether it is being allocated for kernel access. If the SRQ is
108 109 * being allocated for userland access, then lookup the UAR doorbell
109 110 * page number for the current process. Note: If this is not found
110 111 * (e.g. if the process has not previously open()'d the Tavor driver),
111 112 * then an error is returned.
112 113 */
113 114 srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
114 115 if (srq_is_umap) {
115 116 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
116 117 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
117 118 if (status != DDI_SUCCESS) {
118 119 /* Set "status" and "errormsg" and goto failure */
119 120 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
120 121 goto srqalloc_fail3;
121 122 }
122 123 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
123 124 }
124 125
125 126 /* Increase PD refcnt */
126 127 tavor_pd_refcnt_inc(pd);
127 128
128 129 /* Allocate an SRQ context entry */
129 130 status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
130 131 if (status != DDI_SUCCESS) {
131 132 /* Set "status" and "errormsg" and goto failure */
132 133 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
133 134 goto srqalloc_fail1;
134 135 }
135 136
136 137 /* Allocate the SRQ Handle entry */
137 138 status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
138 139 if (status != DDI_SUCCESS) {
139 140 /* Set "status" and "errormsg" and goto failure */
140 141 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
141 142 goto srqalloc_fail2;
142 143 }
143 144
144 145 srq = (tavor_srqhdl_t)rsrc->tr_addr;
145 146 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
146 147
147 148 srq->srq_srqnum = srqc->tr_indx; /* just use index */
148 149
149 150 /*
150 151 * If this will be a user-mappable SRQ, then allocate an entry for
151 152 * the "userland resources database". This will later be added to
152 153 * the database (after all further SRQ operations are successful).
153 154 * If we fail here, we must undo the reference counts and the
154 155 * previous resource allocation.
155 156 */
156 157 if (srq_is_umap) {
157 158 umapdb = tavor_umap_db_alloc(state->ts_instance,
158 159 srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
159 160 (uint64_t)(uintptr_t)rsrc);
160 161 if (umapdb == NULL) {
161 162 /* Set "status" and "errormsg" and goto failure */
162 163 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
163 164 goto srqalloc_fail3;
164 165 }
↓ open down ↓ |
120 lines elided |
↑ open up ↑ |
165 166 }
166 167
167 168 /*
168 169 * Calculate the appropriate size for the SRQ.
169 170 * Note: All Tavor SRQs must be a power-of-2 in size. Also
170 171 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
171 172 * is to round the requested size up to the next highest power-of-2
172 173 */
173 174 sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
174 175 log_srq_size = highbit(sizes->srq_wr_sz);
175 - if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
176 + if (ISP2(sizes->srq_wr_sz)) {
176 177 log_srq_size = log_srq_size - 1;
177 178 }
178 179
179 180 /*
180 181 * Next we verify that the rounded-up size is valid (i.e. consistent
181 182 * with the device limits and/or software-configured limits). If not,
182 183 * then obviously we have a lot of cleanup to do before returning.
183 184 */
184 185 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
185 186 /* Set "status" and "errormsg" and goto failure */
186 187 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
187 188 goto srqalloc_fail4;
188 189 }
189 190
190 191 /*
191 192 * Next we verify that the requested number of SGL is valid (i.e.
192 193 * consistent with the device limits and/or software-configured
193 194 * limits). If not, then obviously the same cleanup needs to be done.
194 195 */
195 196 max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
196 197 if (sizes->srq_sgl_sz > max_sgl) {
197 198 /* Set "status" and "errormsg" and goto failure */
198 199 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
199 200 goto srqalloc_fail4;
200 201 }
201 202
202 203 /*
203 204 * Determine the SRQ's WQE sizes. This depends on the requested
204 205 * number of SGLs. Note: This also has the side-effect of
205 206 * calculating the real number of SGLs (for the calculated WQE size)
206 207 */
207 208 tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
208 209 TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
209 210 &srq->srq_wq_sgl);
210 211
211 212 /*
212 213 * Allocate the memory for SRQ work queues. Note: The location from
213 214 * which we will allocate these work queues has been passed in through
214 215 * the tavor_qp_options_t structure. Since Tavor work queues are not
215 216 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
216 217 * queue memory is very important. We used to allocate work queues
217 218 * (the combined receive and send queues) so that they would be aligned
218 219 * on their combined size. That alignment guaranteed that they would
219 220 * never cross the 4GB boundary (Tavor work queues are on the order of
220 221 * MBs at maximum). Now we are able to relax this alignment constraint
221 222 * by ensuring that the IB address assigned to the queue memory (as a
222 223 * result of the tavor_mr_register() call) is offset from zero.
223 224 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
224 225 * guarantee the alignment, but when attempting to use IOMMU bypass
225 226 * mode we found that we were not allowed to specify any alignment that
226 227 * was more restrictive than the system page size. So we avoided this
227 228 * constraint by passing two alignment values, one for the memory
228 229 * allocation itself and the other for the DMA handle (for later bind).
229 230 * This used to cause more memory than necessary to be allocated (in
230 231 	 * order to guarantee the more restrictive alignment constraint). But
231 232 	 * by guaranteeing the zero-based IB virtual address for the queue, we
232 233 * are able to conserve this memory.
233 234 *
234 235 * Note: If SRQ is not user-mappable, then it may come from either
235 236 * kernel system memory or from HCA-attached local DDR memory.
236 237 *
237 238 * Note2: We align this queue on a pagesize boundary. This is required
238 239 * to make sure that all the resulting IB addresses will start at 0, for
239 240 * a zero-based queue. By making sure we are aligned on at least a
240 241 * page, any offset we use into our queue will be the same as when we
241 242 * perform tavor_srq_modify() operations later.
242 243 */
243 244 wqesz = (1 << srq->srq_wq_log_wqesz);
244 245 srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
245 246 srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
246 247 srq->srq_wqinfo.qa_bind_align = PAGESIZE;
247 248 if (srq_is_umap) {
248 249 srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
249 250 } else {
250 251 srq->srq_wqinfo.qa_location = wq_location;
251 252 }
252 253 status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
253 254 if (status != DDI_SUCCESS) {
254 255 /* Set "status" and "errormsg" and goto failure */
255 256 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
256 257 goto srqalloc_fail4;
257 258 }
258 259 buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
259 260 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
260 261
261 262 /*
262 263 * Register the memory for the SRQ work queues. The memory for the SRQ
263 264 * must be registered in the Tavor TPT tables. This gives us the LKey
264 265 * to specify in the SRQ context later. Note: If the work queue is to
265 266 * be allocated from DDR memory, then only a "bypass" mapping is
266 267 * appropriate. And if the SRQ memory is user-mappable, then we force
267 268 * DDI_DMA_CONSISTENT mapping. Also, in order to meet the alignment
268 269 * restriction, we pass the "mro_bind_override_addr" flag in the call
269 270 * to tavor_mr_register(). This guarantees that the resulting IB vaddr
270 271 * will be zero-based (modulo the offset into the first page). If we
271 272 * fail here, we still have the bunch of resource and reference count
272 273 * cleanup to do.
273 274 */
274 275 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
275 276 IBT_MR_NOSLEEP;
276 277 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
277 278 mr_attr.mr_len = srq->srq_wqinfo.qa_size;
278 279 mr_attr.mr_as = NULL;
279 280 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
280 281 if (srq_is_umap) {
281 282 mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
282 283 } else {
283 284 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
284 285 mr_op.mro_bind_type =
285 286 state->ts_cfg_profile->cp_iommu_bypass;
286 287 dma_xfer_mode =
287 288 state->ts_cfg_profile->cp_streaming_consistent;
288 289 if (dma_xfer_mode == DDI_DMA_STREAMING) {
289 290 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
290 291 }
291 292 } else {
292 293 mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
293 294 }
294 295 }
295 296 mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
296 297 mr_op.mro_bind_override_addr = 1;
297 298 status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
298 299 if (status != DDI_SUCCESS) {
299 300 /* Set "status" and "errormsg" and goto failure */
300 301 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
301 302 goto srqalloc_fail5;
302 303 }
303 304 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
304 305 addr = mr->mr_bindinfo.bi_addr;
305 306 lkey = mr->mr_lkey;
306 307
307 308 /*
308 309 * Calculate the offset between the kernel virtual address space
309 310 * and the IB virtual address space. This will be used when
310 311 * posting work requests to properly initialize each WQE.
311 312 */
312 313 srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
313 314 (uint64_t)mr->mr_bindinfo.bi_addr;
314 315
315 316 /*
316 317 * Create WQL and Wridlist for use by this SRQ
317 318 */
318 319 srq->srq_wrid_wql = tavor_wrid_wql_create(state);
319 320 if (srq->srq_wrid_wql == NULL) {
320 321 /* Set "status" and "errormsg" and goto failure */
321 322 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
322 323 goto srqalloc_fail6;
323 324 }
324 325 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))
325 326
326 327 srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
327 328 if (srq->srq_wridlist == NULL) {
328 329 /* Set "status" and "errormsg" and goto failure */
329 330 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
330 331 goto srqalloc_fail7;
331 332 }
332 333 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))
333 334
334 335 srq->srq_wridlist->wl_srq_en = 1;
335 336 srq->srq_wridlist->wl_free_list_indx = -1;
336 337
337 338 /*
338 339 * Fill in all the return arguments (if necessary). This includes
339 340 * real queue size and real SGLs.
340 341 */
341 342 if (real_sizes != NULL) {
342 343 real_sizes->srq_wr_sz = (1 << log_srq_size);
343 344 real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
344 345 }
345 346
346 347 /*
347 348 * Fill in the SRQC entry. This is the final step before passing
348 349 * ownership of the SRQC entry to the Tavor hardware. We use all of
349 350 * the information collected/calculated above to fill in the
350 351 * requisite portions of the SRQC. Note: If this SRQ is going to be
351 352 * used for userland access, then we need to set the UAR page number
352 353 * appropriately (otherwise it's a "don't care")
353 354 */
354 355 bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
355 356 srqc_entry.wqe_addr_h = (addr >> 32);
356 357 srqc_entry.next_wqe_addr_l = 0;
357 358 srqc_entry.ds = (wqesz >> 4);
358 359 srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER;
359 360 srqc_entry.pd = pd->pd_pdnum;
360 361 srqc_entry.lkey = lkey;
361 362 srqc_entry.wqe_cnt = 0;
362 363 if (srq_is_umap) {
363 364 srqc_entry.uar = uarpg;
364 365 } else {
365 366 srqc_entry.uar = 0;
366 367 }
367 368
368 369 /*
369 370 * Write the SRQC entry to hardware. Lastly, we pass ownership of
370 371 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
371 372 * command). Note: In general, this operation shouldn't fail. But
372 373 * if it does, we have to undo everything we've done above before
373 374 * returning error.
374 375 */
375 376 status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
376 377 sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
377 378 sleepflag);
378 379 if (status != TAVOR_CMD_SUCCESS) {
379 380 cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
380 381 status);
381 382 TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
382 383 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
383 384 /* Set "status" and "errormsg" and goto failure */
384 385 TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
385 386 goto srqalloc_fail8;
386 387 }
387 388
388 389 /*
389 390 * Fill in the rest of the Tavor SRQ handle. We can update
390 391 * the following fields for use in further operations on the SRQ.
391 392 */
392 393 srq->srq_srqcrsrcp = srqc;
393 394 srq->srq_rsrcp = rsrc;
394 395 srq->srq_mrhdl = mr;
395 396 srq->srq_refcnt = 0;
396 397 srq->srq_is_umap = srq_is_umap;
397 398 srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0;
398 399 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
399 400 srq->srq_pdhdl = pd;
400 401 srq->srq_wq_lastwqeindx = -1;
401 402 srq->srq_wq_bufsz = (1 << log_srq_size);
402 403 srq->srq_wq_buf = buf;
403 404 srq->srq_desc_off = srq_desc_off;
404 405 srq->srq_hdlrarg = (void *)ibt_srqhdl;
405 406 srq->srq_state = 0;
406 407 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
407 408 srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;
408 409
409 410 /* Determine if later ddi_dma_sync will be necessary */
410 411 srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
411 412
412 413 /*
413 414 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list. Then fill in the
414 415 * "srqhdl" and return success
415 416 */
416 417 ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
417 418 state->ts_srqhdl[srqc->tr_indx] = srq;
418 419
419 420 /*
420 421 * If this is a user-mappable SRQ, then we need to insert the
421 422 * previously allocated entry into the "userland resources database".
422 423 * This will allow for later lookup during devmap() (i.e. mmap())
423 424 * calls.
424 425 */
425 426 if (srq->srq_is_umap) {
426 427 tavor_umap_db_add(umapdb);
427 428 } else {
428 429 mutex_enter(&srq->srq_wrid_wql->wql_lock);
429 430 tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
430 431 mutex_exit(&srq->srq_wrid_wql->wql_lock);
431 432 }
432 433
433 434 *srqhdl = srq;
434 435
435 436 TAVOR_TNF_EXIT(tavor_srq_alloc);
436 437 return (status);
437 438
438 439 /*
439 440 * The following is cleanup for all possible failure cases in this routine
440 441 */
441 442 srqalloc_fail8:
442 443 kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
443 444 sizeof (tavor_wrid_entry_t));
444 445 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
445 446 srqalloc_fail7:
446 447 tavor_wql_refcnt_dec(srq->srq_wrid_wql);
447 448 srqalloc_fail6:
448 449 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
449 450 TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
450 451 TAVOR_WARNING(state, "failed to deregister SRQ memory");
451 452 }
452 453 srqalloc_fail5:
453 454 tavor_queue_free(state, &srq->srq_wqinfo);
454 455 srqalloc_fail4:
455 456 if (srq_is_umap) {
456 457 tavor_umap_db_free(umapdb);
457 458 }
458 459 srqalloc_fail3:
459 460 tavor_rsrc_free(state, &rsrc);
460 461 srqalloc_fail2:
461 462 tavor_rsrc_free(state, &srqc);
462 463 srqalloc_fail1:
463 464 tavor_pd_refcnt_dec(pd);
464 465 srqalloc_fail:
465 466 TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
466 467 tnf_string, msg, errormsg);
467 468 TAVOR_TNF_EXIT(tavor_srq_alloc);
468 469 return (status);
469 470 }
470 471
471 472
472 473 /*
473 474 * tavor_srq_free()
474 475 * Context: Can be called only from user or kernel context.
475 476 */
476 477 /* ARGSUSED */
477 478 int
478 479 tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
479 480 {
480 481 tavor_rsrc_t *srqc, *rsrc;
481 482 tavor_umap_db_entry_t *umapdb;
482 483 uint64_t value;
483 484 tavor_srqhdl_t srq;
484 485 tavor_mrhdl_t mr;
485 486 tavor_pdhdl_t pd;
486 487 tavor_hw_srqc_t srqc_entry;
487 488 uint32_t srqnum;
488 489 uint32_t size;
489 490 uint_t maxprot;
490 491 int status;
491 492
492 493 TAVOR_TNF_ENTER(tavor_srq_free);
493 494
494 495 /*
495 496 * Pull all the necessary information from the Tavor Shared Receive
496 497 * Queue handle. This is necessary here because the resource for the
497 498 * SRQ handle is going to be freed up as part of this operation.
498 499 */
499 500 srq = *srqhdl;
500 501 mutex_enter(&srq->srq_lock);
501 502 srqc = srq->srq_srqcrsrcp;
502 503 rsrc = srq->srq_rsrcp;
503 504 pd = srq->srq_pdhdl;
504 505 mr = srq->srq_mrhdl;
505 506 srqnum = srq->srq_srqnum;
506 507
507 508 /*
508 509 * If there are work queues still associated with the SRQ, then return
509 510 * an error. Otherwise, we will be holding the SRQ lock.
510 511 */
511 512 if (srq->srq_refcnt != 0) {
512 513 mutex_exit(&srq->srq_lock);
513 514 TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
514 515 tnf_int, refcnt, srq->srq_refcnt);
515 516 TAVOR_TNF_EXIT(tavor_srq_free);
516 517 return (IBT_SRQ_IN_USE);
517 518 }
518 519
519 520 /*
520 521 * If this was a user-mappable SRQ, then we need to remove its entry
521 522 * from the "userland resources database". If it is also currently
522 523 * mmap()'d out to a user process, then we need to call
523 524 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
524 525 * We also need to invalidate the SRQ tracking information for the
525 526 * user mapping.
526 527 */
527 528 if (srq->srq_is_umap) {
528 529 status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
529 530 MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
530 531 &umapdb);
531 532 if (status != DDI_SUCCESS) {
532 533 mutex_exit(&srq->srq_lock);
533 534 TAVOR_WARNING(state, "failed to find in database");
534 535 TAVOR_TNF_EXIT(tavor_srq_free);
535 536 return (ibc_get_ci_failure(0));
536 537 }
537 538 tavor_umap_db_free(umapdb);
538 539 if (srq->srq_umap_dhp != NULL) {
539 540 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
540 541 status = devmap_devmem_remap(srq->srq_umap_dhp,
541 542 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
542 543 maxprot, DEVMAP_MAPPING_INVALID, NULL);
543 544 if (status != DDI_SUCCESS) {
544 545 mutex_exit(&srq->srq_lock);
545 546 TAVOR_WARNING(state, "failed in SRQ memory "
546 547 "devmap_devmem_remap()");
547 548 TAVOR_TNF_EXIT(tavor_srq_free);
548 549 return (ibc_get_ci_failure(0));
549 550 }
550 551 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
551 552 }
552 553 }
553 554
554 555 /*
555 556 * Put NULL into the Tavor SRQNum-to-SRQHdl list. This will allow any
556 557 * in-progress events to detect that the SRQ corresponding to this
557 558 * number has been freed.
558 559 */
559 560 state->ts_srqhdl[srqc->tr_indx] = NULL;
560 561
561 562 mutex_exit(&srq->srq_lock);
562 563 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
563 564 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
564 565
565 566 /*
566 567 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
567 568 * firmware command). If the ownership transfer fails for any reason,
568 569 * then it is an indication that something (either in HW or SW) has
569 570 * gone seriously wrong.
570 571 */
571 572 status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
572 573 sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
573 574 if (status != TAVOR_CMD_SUCCESS) {
574 575 TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
575 576 cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
576 577 status);
577 578 TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
578 579 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
579 580 TAVOR_TNF_EXIT(tavor_srq_free);
580 581 return (IBT_FAILURE);
581 582 }
582 583
583 584 /*
584 585 * Deregister the memory for the Shared Receive Queue. If this fails
585 586 * for any reason, then it is an indication that something (either
586 587 * in HW or SW) has gone seriously wrong. So we print a warning
587 588 * message and return.
588 589 */
589 590 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
590 591 sleepflag);
591 592 if (status != DDI_SUCCESS) {
592 593 TAVOR_WARNING(state, "failed to deregister SRQ memory");
593 594 TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
594 595 TAVOR_TNF_EXIT(tavor_srq_free);
595 596 return (IBT_FAILURE);
596 597 }
597 598
598 599 /* Calculate the size and free the wridlist container */
599 600 if (srq->srq_wridlist != NULL) {
600 601 size = (srq->srq_wridlist->wl_size *
601 602 sizeof (tavor_wrid_entry_t));
602 603 kmem_free(srq->srq_wridlist->wl_wre, size);
603 604 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
604 605
605 606 /*
606 607 * Release reference to WQL; If this is the last reference,
607 608 * this call also has the side effect of freeing up the
608 609 * 'srq_wrid_wql' memory.
609 610 */
610 611 tavor_wql_refcnt_dec(srq->srq_wrid_wql);
611 612 }
612 613
613 614 /* Free the memory for the SRQ */
614 615 tavor_queue_free(state, &srq->srq_wqinfo);
615 616
616 617 /* Free the Tavor SRQ Handle */
617 618 tavor_rsrc_free(state, &rsrc);
618 619
619 620 /* Free the SRQC entry resource */
620 621 tavor_rsrc_free(state, &srqc);
621 622
622 623 /* Decrement the reference count on the protection domain (PD) */
623 624 tavor_pd_refcnt_dec(pd);
624 625
625 626 /* Set the srqhdl pointer to NULL and return success */
626 627 *srqhdl = NULL;
627 628
628 629 TAVOR_TNF_EXIT(tavor_srq_free);
629 630 return (DDI_SUCCESS);
630 631 }
631 632
632 633
633 634 /*
634 635 * tavor_srq_modify()
635 636 * Context: Can be called only from user or kernel context.
636 637 */
637 638 int
638 639 tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
639 640 uint_t *real_size, uint_t sleepflag)
640 641 {
641 642 tavor_qalloc_info_t new_srqinfo, old_srqinfo;
642 643 tavor_rsrc_t *mtt, *mpt, *old_mtt;
643 644 tavor_bind_info_t bind;
644 645 tavor_bind_info_t old_bind;
645 646 tavor_rsrc_pool_info_t *rsrc_pool;
646 647 tavor_mrhdl_t mr;
647 648 tavor_hw_mpt_t mpt_entry;
648 649 tavor_wrid_entry_t *wre_new, *wre_old;
649 650 uint64_t mtt_ddrbaseaddr, mtt_addr;
650 651 uint64_t srq_desc_off;
651 652 uint32_t *buf, srq_old_bufsz;
652 653 uint32_t wqesz;
653 654 uint_t max_srq_size;
654 655 uint_t dma_xfer_mode, mtt_pgsize_bits;
655 656 uint_t srq_sync, log_srq_size, maxprot;
656 657 uint_t wq_location;
657 658 int status;
658 659 char *errormsg;
659 660
660 661 TAVOR_TNF_ENTER(tavor_srq_modify);
661 662
662 663 /*
663 664 * Check the "inddr" flag. This flag tells the driver whether or not
664 665 	 * the SRQ's work queues should come from normal system memory or
665 666 * whether they should be allocated from DDR memory.
666 667 */
667 668 wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;
668 669
669 670 /*
670 671 * If size requested is larger than device capability, return
671 672 * Insufficient Resources
672 673 */
673 674 max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
674 675 if (size > max_srq_size) {
675 676 TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
676 677 TAVOR_TNF_ERROR, "");
677 678 TAVOR_TNF_EXIT(tavor_srq_modify);
678 679 return (IBT_HCA_WR_EXCEEDED);
↓ open down ↓ |
493 lines elided |
↑ open up ↑ |
679 680 }
680 681
681 682 /*
682 683 * Calculate the appropriate size for the SRQ.
683 684 * Note: All Tavor SRQs must be a power-of-2 in size. Also
684 685 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
685 686 * is to round the requested size up to the next highest power-of-2
686 687 */
687 688 size = max(size, TAVOR_SRQ_MIN_SIZE);
688 689 log_srq_size = highbit(size);
689 - if ((size & (size - 1)) == 0) {
690 + if (ISP2(size)) {
690 691 log_srq_size = log_srq_size - 1;
691 692 }
692 693
693 694 /*
694 695 * Next we verify that the rounded-up size is valid (i.e. consistent
695 696 * with the device limits and/or software-configured limits).
696 697 */
697 698 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
698 699 /* Set "status" and "errormsg" and goto failure */
699 700 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
700 701 goto srqmodify_fail;
701 702 }
702 703
703 704 /*
704 705 * Allocate the memory for newly resized Shared Receive Queue.
705 706 *
706 707 * Note: If SRQ is not user-mappable, then it may come from either
707 708 * kernel system memory or from HCA-attached local DDR memory.
708 709 *
709 710 * Note2: We align this queue on a pagesize boundary. This is required
710 711 * to make sure that all the resulting IB addresses will start at 0,
711 712 * for a zero-based queue. By making sure we are aligned on at least a
712 713 * page, any offset we use into our queue will be the same as it was
713 714 * when we allocated it at tavor_srq_alloc() time.
714 715 */
715 716 wqesz = (1 << srq->srq_wq_log_wqesz);
716 717 new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
717 718 new_srqinfo.qa_alloc_align = PAGESIZE;
718 719 new_srqinfo.qa_bind_align = PAGESIZE;
719 720 if (srq->srq_is_umap) {
720 721 new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
721 722 } else {
722 723 new_srqinfo.qa_location = wq_location;
723 724 }
724 725 status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
725 726 if (status != DDI_SUCCESS) {
726 727 /* Set "status" and "errormsg" and goto failure */
727 728 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
728 729 goto srqmodify_fail;
729 730 }
730 731 buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
731 732 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
732 733
733 734 /*
734 735 * Allocate the memory for the new WRE list. This will be used later
735 736 * when we resize the wridlist based on the new SRQ size.
736 737 */
737 738 wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
738 739 sizeof (tavor_wrid_entry_t), sleepflag);
739 740 if (wre_new == NULL) {
740 741 /* Set "status" and "errormsg" and goto failure */
741 742 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
742 743 "failed wre_new alloc");
743 744 goto srqmodify_fail;
744 745 }
745 746
746 747 /*
747 748 * Fill in the "bind" struct. This struct provides the majority
748 749 * of the information that will be used to distinguish between an
749 750 * "addr" binding (as is the case here) and a "buf" binding (see
750 751 * below). The "bind" struct is later passed to tavor_mr_mem_bind()
751 752 * which does most of the "heavy lifting" for the Tavor memory
752 753 * registration routines.
753 754 */
754 755 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
755 756 bzero(&bind, sizeof (tavor_bind_info_t));
756 757 bind.bi_type = TAVOR_BINDHDL_VADDR;
757 758 bind.bi_addr = (uint64_t)(uintptr_t)buf;
758 759 bind.bi_len = new_srqinfo.qa_size;
759 760 bind.bi_as = NULL;
760 761 bind.bi_flags = sleepflag == TAVOR_SLEEP ? IBT_MR_SLEEP :
761 762 IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
762 763 if (srq->srq_is_umap) {
763 764 bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
764 765 } else {
765 766 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
766 767 bind.bi_bypass =
767 768 state->ts_cfg_profile->cp_iommu_bypass;
768 769 dma_xfer_mode =
769 770 state->ts_cfg_profile->cp_streaming_consistent;
770 771 if (dma_xfer_mode == DDI_DMA_STREAMING) {
771 772 bind.bi_flags |= IBT_MR_NONCOHERENT;
772 773 }
773 774 } else {
774 775 bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
775 776 }
776 777 }
777 778 status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
778 779 &mtt_pgsize_bits);
779 780 if (status != DDI_SUCCESS) {
780 781 /* Set "status" and "errormsg" and goto failure */
781 782 TAVOR_TNF_FAIL(status, "failed mtt bind");
782 783 kmem_free(wre_new, srq->srq_wq_bufsz *
783 784 sizeof (tavor_wrid_entry_t));
784 785 tavor_queue_free(state, &new_srqinfo);
785 786 goto srqmodify_fail;
786 787 }
787 788
788 789 /*
789 790 * Calculate the offset between the kernel virtual address space
790 791 * and the IB virtual address space. This will be used when
791 792 * posting work requests to properly initialize each WQE.
792 793 *
793 794 * Note: bind addr is zero-based (from alloc) so we calculate the
794 795 * correct new offset here.
795 796 */
796 797 bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
797 798 srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
798 799 (uint64_t)bind.bi_addr;
799 800
800 801 /*
801 802 * Get the base address for the MTT table. This will be necessary
802 803 * below when we are modifying the MPT entry.
803 804 */
804 805 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
805 806 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
806 807
807 808 /*
808 809 * Fill in the MPT entry. This is the final step before passing
809 810 * ownership of the MPT entry to the Tavor hardware. We use all of
810 811 * the information collected/calculated above to fill in the
811 812 * requisite portions of the MPT.
812 813 */
813 814 bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
814 815 mpt_entry.reg_win_len = bind.bi_len;
815 816 mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
816 817 mpt_entry.mttseg_addr_h = mtt_addr >> 32;
817 818 mpt_entry.mttseg_addr_l = mtt_addr >> 6;
818 819
819 820 /*
820 821 * Now we grab the SRQ lock. Since we will be updating the actual
821 822 * SRQ location and the producer/consumer indexes, we should hold
822 823 * the lock.
823 824 *
824 825 * We do a TAVOR_NOSLEEP here (and below), though, because we are
825 826 * holding the "srq_lock" and if we got raised to interrupt level
826 827 * by priority inversion, we would not want to block in this routine
827 828 * waiting for success.
828 829 */
829 830 mutex_enter(&srq->srq_lock);
830 831
831 832 /*
832 833 * Copy old entries to new buffer
833 834 */
834 835 srq_old_bufsz = srq->srq_wq_bufsz;
835 836 bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
836 837
837 838 /* Determine if later ddi_dma_sync will be necessary */
838 839 srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
839 840
840 841 /* Sync entire "new" SRQ for use by hardware (if necessary) */
841 842 if (srq_sync) {
842 843 (void) ddi_dma_sync(bind.bi_dmahdl, 0,
843 844 new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
844 845 }
845 846
846 847 /*
847 848 * Setup MPT information for use in the MODIFY_MPT command
848 849 */
849 850 mr = srq->srq_mrhdl;
850 851 mutex_enter(&mr->mr_lock);
851 852 mpt = srq->srq_mrhdl->mr_mptrsrcp;
852 853
853 854 /*
854 855 * MODIFY_MPT
855 856 *
856 857 * If this fails for any reason, then it is an indication that
857 858 * something (either in HW or SW) has gone seriously wrong. So we
858 859 * print a warning message and return.
859 860 */
860 861 status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
861 862 TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
862 863 if (status != TAVOR_CMD_SUCCESS) {
863 864 cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
864 865 status);
865 866 TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
866 867 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
867 868 TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
868 869 (void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
869 870 srq->srq_mrhdl->mr_mttrsrcp);
870 871 kmem_free(wre_new, srq->srq_wq_bufsz *
871 872 sizeof (tavor_wrid_entry_t));
872 873 tavor_queue_free(state, &new_srqinfo);
873 874 mutex_exit(&mr->mr_lock);
874 875 mutex_exit(&srq->srq_lock);
875 876 return (ibc_get_ci_failure(0));
876 877 }
877 878
878 879 /*
879 880 * Update the Tavor Shared Receive Queue handle with all the new
880 881 * information. At the same time, save away all the necessary
881 882 * information for freeing up the old resources
882 883 */
883 884 old_srqinfo = srq->srq_wqinfo;
884 885 old_mtt = srq->srq_mrhdl->mr_mttrsrcp;
885 886 bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
886 887 sizeof (tavor_bind_info_t));
887 888
888 889 /* Now set the new info */
889 890 srq->srq_wqinfo = new_srqinfo;
890 891 srq->srq_wq_buf = buf;
891 892 srq->srq_wq_bufsz = (1 << log_srq_size);
892 893 bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
893 894 srq->srq_mrhdl->mr_mttrsrcp = mtt;
894 895 srq->srq_desc_off = srq_desc_off;
895 896 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
896 897
897 898 /* Update MR mtt pagesize */
898 899 mr->mr_logmttpgsz = mtt_pgsize_bits;
899 900 mutex_exit(&mr->mr_lock);
900 901
901 902 #ifdef __lock_lint
902 903 mutex_enter(&srq->srq_wrid_wql->wql_lock);
903 904 #else
904 905 if (srq->srq_wrid_wql != NULL) {
905 906 mutex_enter(&srq->srq_wrid_wql->wql_lock);
906 907 }
907 908 #endif
908 909
909 910 /*
910 911 * Initialize new wridlist, if needed.
911 912 *
912 913 * If a wridlist already is setup on an SRQ (the QP associated with an
913 914 * SRQ has moved "from_reset") then we must update this wridlist based
914 915 * on the new SRQ size. We allocate the new size of Work Request ID
915 916 * Entries, copy over the old entries to the new list, and
916 917 * re-initialize the srq wridlist in non-umap case
917 918 */
918 919 wre_old = NULL;
919 920 if (srq->srq_wridlist != NULL) {
920 921 wre_old = srq->srq_wridlist->wl_wre;
921 922
922 923 bcopy(wre_old, wre_new, srq_old_bufsz *
923 924 sizeof (tavor_wrid_entry_t));
924 925
925 926 /* Setup new sizes in wre */
926 927 srq->srq_wridlist->wl_wre = wre_new;
927 928 srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
928 929
929 930 if (!srq->srq_is_umap) {
930 931 tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
931 932 srq_old_bufsz);
932 933 }
933 934 }
934 935
935 936 #ifdef __lock_lint
936 937 mutex_exit(&srq->srq_wrid_wql->wql_lock);
937 938 #else
938 939 if (srq->srq_wrid_wql != NULL) {
939 940 mutex_exit(&srq->srq_wrid_wql->wql_lock);
940 941 }
941 942 #endif
942 943
943 944 /*
944 945 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
945 946 * to a user process, then we need to call devmap_devmem_remap() to
946 947 * invalidate the mapping to the SRQ memory. We also need to
947 948 * invalidate the SRQ tracking information for the user mapping.
948 949 *
949 950 * Note: On failure, the remap really shouldn't ever happen. So, if it
950 951 * does, it is an indication that something has gone seriously wrong.
951 952 * So we print a warning message and return error (knowing, of course,
952 953 * that the "old" SRQ memory will be leaked)
953 954 */
954 955 if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
955 956 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
956 957 status = devmap_devmem_remap(srq->srq_umap_dhp,
957 958 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
958 959 DEVMAP_MAPPING_INVALID, NULL);
959 960 if (status != DDI_SUCCESS) {
960 961 mutex_exit(&srq->srq_lock);
961 962 TAVOR_WARNING(state, "failed in SRQ memory "
962 963 "devmap_devmem_remap()");
963 964 /* We can, however, free the memory for old wre */
964 965 if (wre_old != NULL) {
965 966 kmem_free(wre_old, srq_old_bufsz *
966 967 sizeof (tavor_wrid_entry_t));
967 968 }
968 969 TAVOR_TNF_EXIT(tavor_srq_modify);
969 970 return (ibc_get_ci_failure(0));
970 971 }
971 972 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
972 973 }
973 974
974 975 /*
975 976 * Drop the SRQ lock now. The only thing left to do is to free up
976 977 * the old resources.
977 978 */
978 979 mutex_exit(&srq->srq_lock);
979 980
980 981 /*
981 982 * Unbind the MTT entries.
982 983 */
983 984 status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
984 985 if (status != DDI_SUCCESS) {
985 986 TAVOR_WARNING(state, "failed to unbind old SRQ memory");
986 987 /* Set "status" and "errormsg" and goto failure */
987 988 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
988 989 "failed to unbind (old)");
989 990 goto srqmodify_fail;
990 991 }
991 992
992 993 /* Free the memory for old wre */
993 994 if (wre_old != NULL) {
994 995 kmem_free(wre_old, srq_old_bufsz *
995 996 sizeof (tavor_wrid_entry_t));
996 997 }
997 998
998 999 /* Free the memory for the old SRQ */
999 1000 tavor_queue_free(state, &old_srqinfo);
1000 1001
1001 1002 /*
1002 1003 * Fill in the return arguments (if necessary). This includes the
1003 1004 * real new completion queue size.
1004 1005 */
1005 1006 if (real_size != NULL) {
1006 1007 *real_size = (1 << log_srq_size);
1007 1008 }
1008 1009
1009 1010 TAVOR_TNF_EXIT(tavor_srq_modify);
1010 1011 return (DDI_SUCCESS);
1011 1012
1012 1013 srqmodify_fail:
1013 1014 TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
1014 1015 tnf_string, msg, errormsg);
1015 1016 TAVOR_TNF_EXIT(tavor_srq_modify);
1016 1017 return (status);
1017 1018 }
1018 1019
1019 1020
1020 1021 /*
1021 1022 * tavor_srq_refcnt_inc()
1022 1023 * Context: Can be called from interrupt or base context.
1023 1024 */
1024 1025 void
1025 1026 tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
1026 1027 {
1027 1028 mutex_enter(&srq->srq_lock);
1028 1029 TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
1029 1030 tnf_uint, refcnt, srq->srq_refcnt);
1030 1031 srq->srq_refcnt++;
1031 1032 mutex_exit(&srq->srq_lock);
1032 1033 }
1033 1034
1034 1035
1035 1036 /*
1036 1037 * tavor_srq_refcnt_dec()
1037 1038 * Context: Can be called from interrupt or base context.
1038 1039 */
1039 1040 void
1040 1041 tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
1041 1042 {
1042 1043 mutex_enter(&srq->srq_lock);
1043 1044 srq->srq_refcnt--;
1044 1045 TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
1045 1046 tnf_uint, refcnt, srq->srq_refcnt);
1046 1047 mutex_exit(&srq->srq_lock);
1047 1048 }
1048 1049
1049 1050
1050 1051 /*
1051 1052 * tavor_srqhdl_from_srqnum()
1052 1053 * Context: Can be called from interrupt or base context.
1053 1054 *
1054 1055 * This routine is important because changing the unconstrained
1055 1056 * portion of the SRQ number is critical to the detection of a
1056 1057 * potential race condition in the SRQ handler code (i.e. the case
1057 1058 * where a SRQ is freed and alloc'd again before an event for the
1058 1059 * "old" SRQ can be handled).
1059 1060 *
1060 1061 * While this is not a perfect solution (not sure that one exists)
1061 1062 * it does help to mitigate the chance that this race condition will
1062 1063 * cause us to deliver a "stale" event to the new SRQ owner. Note:
1063 1064 * this solution does not scale well because the number of constrained
1064 1065 * bits increases (and, hence, the number of unconstrained bits
1065 1066 * decreases) as the number of supported SRQ grows. For small and
1066 1067 * intermediate values, it should hopefully provide sufficient
1067 1068 * protection.
1068 1069 */
1069 1070 tavor_srqhdl_t
1070 1071 tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
1071 1072 {
1072 1073 uint_t srqindx, srqmask;
1073 1074
1074 1075 /* Calculate the SRQ table index from the srqnum */
1075 1076 srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
1076 1077 srqindx = srqnum & srqmask;
1077 1078 return (state->ts_srqhdl[srqindx]);
1078 1079 }
1079 1080
1080 1081
1081 1082 /*
1082 1083 * tavor_srq_sgl_to_logwqesz()
1083 1084 * Context: Can be called from interrupt or base context.
1084 1085 */
1085 1086 static void
1086 1087 tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
1087 1088 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
1088 1089 {
1089 1090 uint_t max_size, log2, actual_sgl;
1090 1091
1091 1092 TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);
↓ open down ↓ |
392 lines elided |
↑ open up ↑ |
1092 1093
1093 1094 switch (wq_type) {
1094 1095 case TAVOR_QP_WQ_TYPE_RECVQ:
1095 1096 /*
1096 1097 * Use requested maximum SGL to calculate max descriptor size
1097 1098 * (while guaranteeing that the descriptor size is a
1098 1099 * power-of-2 cachelines).
1099 1100 */
1100 1101 max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
1101 1102 log2 = highbit(max_size);
1102 - if ((max_size & (max_size - 1)) == 0) {
1103 + if (ISP2(max_size)) {
1103 1104 log2 = log2 - 1;
1104 1105 }
1105 1106
1106 1107 /* Make sure descriptor is at least the minimum size */
1107 1108 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
1108 1109
1109 1110 /* Calculate actual number of SGL (given WQE size) */
1110 1111 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
1111 1112 break;
1112 1113
1113 1114 default:
1114 1115 TAVOR_WARNING(state, "unexpected work queue type");
1115 1116 TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
1116 1117 TAVOR_TNF_ERROR, "");
1117 1118 break;
1118 1119 }
1119 1120
1120 1121 /* Fill in the return values */
1121 1122 *logwqesz = log2;
1122 1123 *max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);
1123 1124
1124 1125 TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
1125 1126 }
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX