Print this page
5255 uts shouldn't open-code ISP2
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/adapters/tavor/tavor_srq.c
+++ new/usr/src/uts/common/io/ib/adapters/tavor/tavor_srq.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
↓ open down ↓ |
24 lines elided |
↑ open up ↑ |
25 25 */
26 26
27 27 /*
28 28 * tavor_srq.c
29 29 * Tavor Shared Receive Queue Processing Routines
30 30 *
31 31 * Implements all the routines necessary for allocating, freeing, querying,
32 32 * modifying and posting shared receive queues.
33 33 */
34 34
35 +#include <sys/sysmacros.h>
35 36 #include <sys/types.h>
36 37 #include <sys/conf.h>
37 38 #include <sys/ddi.h>
38 39 #include <sys/sunddi.h>
39 40 #include <sys/modctl.h>
40 41 #include <sys/bitmap.h>
41 42
42 43 #include <sys/ib/adapters/tavor/tavor.h>
43 44
44 45 static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
45 46 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
46 47
47 48 /*
48 49 * tavor_srq_alloc()
49 50 * Context: Can be called only from user or kernel context.
50 51 */
51 52 int
52 53 tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
53 54 uint_t sleepflag, tavor_srq_options_t *op)
54 55 {
55 56 ibt_srq_hdl_t ibt_srqhdl;
56 57 tavor_pdhdl_t pd;
57 58 ibt_srq_sizes_t *sizes;
58 59 ibt_srq_sizes_t *real_sizes;
59 60 tavor_srqhdl_t *srqhdl;
60 61 ibt_srq_flags_t flags;
61 62 tavor_rsrc_t *srqc, *rsrc;
62 63 tavor_hw_srqc_t srqc_entry;
63 64 uint32_t *buf;
64 65 tavor_srqhdl_t srq;
65 66 tavor_umap_db_entry_t *umapdb;
66 67 ibt_mr_attr_t mr_attr;
67 68 tavor_mr_options_t mr_op;
68 69 tavor_mrhdl_t mr;
69 70 uint64_t addr;
70 71 uint64_t value, srq_desc_off;
71 72 uint32_t lkey;
72 73 uint32_t log_srq_size;
73 74 uint32_t uarpg;
74 75 uint_t wq_location, dma_xfer_mode, srq_is_umap;
75 76 int flag, status;
76 77 char *errormsg;
77 78 uint_t max_sgl;
78 79 uint_t wqesz;
79 80
80 81 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
81 82
82 83 TAVOR_TNF_ENTER(tavor_srq_alloc);
83 84
84 85 /*
85 86 * Check the "options" flag. Currently this flag tells the driver
 86  87 	 * whether or not the SRQ's work queues should come from normal
87 88 * system memory or whether they should be allocated from DDR memory.
88 89 */
89 90 if (op == NULL) {
90 91 wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
91 92 } else {
92 93 wq_location = op->srqo_wq_loc;
93 94 }
94 95
95 96 /*
96 97 * Extract the necessary info from the tavor_srq_info_t structure
97 98 */
98 99 real_sizes = srqinfo->srqi_real_sizes;
99 100 sizes = srqinfo->srqi_sizes;
100 101 pd = srqinfo->srqi_pd;
101 102 ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
102 103 flags = srqinfo->srqi_flags;
103 104 srqhdl = srqinfo->srqi_srqhdl;
104 105
105 106 /*
106 107 * Determine whether SRQ is being allocated for userland access or
107 108 * whether it is being allocated for kernel access. If the SRQ is
108 109 * being allocated for userland access, then lookup the UAR doorbell
109 110 * page number for the current process. Note: If this is not found
110 111 * (e.g. if the process has not previously open()'d the Tavor driver),
111 112 * then an error is returned.
112 113 */
113 114 srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
114 115 if (srq_is_umap) {
115 116 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
116 117 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
117 118 if (status != DDI_SUCCESS) {
118 119 /* Set "status" and "errormsg" and goto failure */
119 120 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
120 121 goto srqalloc_fail3;
121 122 }
122 123 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
123 124 }
124 125
125 126 /* Increase PD refcnt */
126 127 tavor_pd_refcnt_inc(pd);
127 128
128 129 /* Allocate an SRQ context entry */
129 130 status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
130 131 if (status != DDI_SUCCESS) {
131 132 /* Set "status" and "errormsg" and goto failure */
132 133 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
133 134 goto srqalloc_fail1;
134 135 }
135 136
136 137 /* Allocate the SRQ Handle entry */
137 138 status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
138 139 if (status != DDI_SUCCESS) {
139 140 /* Set "status" and "errormsg" and goto failure */
140 141 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
141 142 goto srqalloc_fail2;
142 143 }
143 144
144 145 srq = (tavor_srqhdl_t)rsrc->tr_addr;
145 146 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
146 147
147 148 srq->srq_srqnum = srqc->tr_indx; /* just use index */
148 149
149 150 /*
150 151 * If this will be a user-mappable SRQ, then allocate an entry for
151 152 * the "userland resources database". This will later be added to
152 153 * the database (after all further SRQ operations are successful).
153 154 * If we fail here, we must undo the reference counts and the
154 155 * previous resource allocation.
155 156 */
156 157 if (srq_is_umap) {
157 158 umapdb = tavor_umap_db_alloc(state->ts_instance,
158 159 srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
159 160 (uint64_t)(uintptr_t)rsrc);
160 161 if (umapdb == NULL) {
161 162 /* Set "status" and "errormsg" and goto failure */
162 163 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
163 164 goto srqalloc_fail3;
164 165 }
↓ open down ↓ |
120 lines elided |
↑ open up ↑ |
165 166 }
166 167
167 168 /*
168 169 * Calculate the appropriate size for the SRQ.
169 170 * Note: All Tavor SRQs must be a power-of-2 in size. Also
170 171 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
171 172 * is to round the requested size up to the next highest power-of-2
172 173 */
173 174 sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
174 175 log_srq_size = highbit(sizes->srq_wr_sz);
175 - if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
176 + if (ISP2(sizes->srq_wr_sz)) {
176 177 log_srq_size = log_srq_size - 1;
177 178 }
178 179
179 180 /*
180 181 * Next we verify that the rounded-up size is valid (i.e. consistent
181 182 * with the device limits and/or software-configured limits). If not,
182 183 * then obviously we have a lot of cleanup to do before returning.
183 184 */
184 185 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
185 186 /* Set "status" and "errormsg" and goto failure */
186 187 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
187 188 goto srqalloc_fail4;
188 189 }
189 190
190 191 /*
191 192 * Next we verify that the requested number of SGL is valid (i.e.
192 193 * consistent with the device limits and/or software-configured
193 194 * limits). If not, then obviously the same cleanup needs to be done.
194 195 */
195 196 max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
196 197 if (sizes->srq_sgl_sz > max_sgl) {
197 198 /* Set "status" and "errormsg" and goto failure */
198 199 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
199 200 goto srqalloc_fail4;
200 201 }
201 202
202 203 /*
203 204 * Determine the SRQ's WQE sizes. This depends on the requested
204 205 * number of SGLs. Note: This also has the side-effect of
205 206 * calculating the real number of SGLs (for the calculated WQE size)
206 207 */
207 208 tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
208 209 TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
209 210 &srq->srq_wq_sgl);
210 211
211 212 /*
212 213 * Allocate the memory for SRQ work queues. Note: The location from
213 214 * which we will allocate these work queues has been passed in through
214 215 * the tavor_qp_options_t structure. Since Tavor work queues are not
215 216 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
216 217 * queue memory is very important. We used to allocate work queues
217 218 * (the combined receive and send queues) so that they would be aligned
218 219 * on their combined size. That alignment guaranteed that they would
219 220 * never cross the 4GB boundary (Tavor work queues are on the order of
220 221 * MBs at maximum). Now we are able to relax this alignment constraint
221 222 * by ensuring that the IB address assigned to the queue memory (as a
222 223 * result of the tavor_mr_register() call) is offset from zero.
223 224 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
224 225 * guarantee the alignment, but when attempting to use IOMMU bypass
225 226 * mode we found that we were not allowed to specify any alignment that
226 227 * was more restrictive than the system page size. So we avoided this
227 228 * constraint by passing two alignment values, one for the memory
228 229 * allocation itself and the other for the DMA handle (for later bind).
229 230 * This used to cause more memory than necessary to be allocated (in
230 231 	 * order to guarantee the more restrictive alignment constraint). But
231 232 	 * by guaranteeing the zero-based IB virtual address for the queue, we
232 233 * are able to conserve this memory.
233 234 *
234 235 * Note: If SRQ is not user-mappable, then it may come from either
235 236 * kernel system memory or from HCA-attached local DDR memory.
236 237 *
237 238 * Note2: We align this queue on a pagesize boundary. This is required
238 239 * to make sure that all the resulting IB addresses will start at 0, for
239 240 * a zero-based queue. By making sure we are aligned on at least a
240 241 * page, any offset we use into our queue will be the same as when we
241 242 * perform tavor_srq_modify() operations later.
242 243 */
243 244 wqesz = (1 << srq->srq_wq_log_wqesz);
244 245 srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
245 246 srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
246 247 srq->srq_wqinfo.qa_bind_align = PAGESIZE;
247 248 if (srq_is_umap) {
248 249 srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
249 250 } else {
250 251 srq->srq_wqinfo.qa_location = wq_location;
251 252 }
252 253 status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
253 254 if (status != DDI_SUCCESS) {
254 255 /* Set "status" and "errormsg" and goto failure */
255 256 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
256 257 goto srqalloc_fail4;
257 258 }
258 259 buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
259 260 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
260 261
261 262 /*
262 263 * Register the memory for the SRQ work queues. The memory for the SRQ
263 264 * must be registered in the Tavor TPT tables. This gives us the LKey
264 265 * to specify in the SRQ context later. Note: If the work queue is to
265 266 * be allocated from DDR memory, then only a "bypass" mapping is
266 267 * appropriate. And if the SRQ memory is user-mappable, then we force
267 268 * DDI_DMA_CONSISTENT mapping. Also, in order to meet the alignment
268 269 * restriction, we pass the "mro_bind_override_addr" flag in the call
269 270 * to tavor_mr_register(). This guarantees that the resulting IB vaddr
270 271 * will be zero-based (modulo the offset into the first page). If we
271 272 * fail here, we still have the bunch of resource and reference count
272 273 * cleanup to do.
273 274 */
274 275 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
275 276 IBT_MR_NOSLEEP;
276 277 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
277 278 mr_attr.mr_len = srq->srq_wqinfo.qa_size;
278 279 mr_attr.mr_as = NULL;
279 280 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
280 281 if (srq_is_umap) {
281 282 mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
282 283 } else {
283 284 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
284 285 mr_op.mro_bind_type =
285 286 state->ts_cfg_profile->cp_iommu_bypass;
286 287 dma_xfer_mode =
287 288 state->ts_cfg_profile->cp_streaming_consistent;
288 289 if (dma_xfer_mode == DDI_DMA_STREAMING) {
289 290 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
290 291 }
291 292 } else {
292 293 mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
293 294 }
294 295 }
295 296 mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
296 297 mr_op.mro_bind_override_addr = 1;
297 298 status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
298 299 if (status != DDI_SUCCESS) {
299 300 /* Set "status" and "errormsg" and goto failure */
300 301 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
301 302 goto srqalloc_fail5;
302 303 }
303 304 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
304 305 addr = mr->mr_bindinfo.bi_addr;
305 306 lkey = mr->mr_lkey;
306 307
307 308 /*
308 309 * Calculate the offset between the kernel virtual address space
309 310 * and the IB virtual address space. This will be used when
310 311 * posting work requests to properly initialize each WQE.
311 312 */
312 313 srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
313 314 (uint64_t)mr->mr_bindinfo.bi_addr;
314 315
315 316 /*
316 317 * Create WQL and Wridlist for use by this SRQ
317 318 */
318 319 srq->srq_wrid_wql = tavor_wrid_wql_create(state);
319 320 if (srq->srq_wrid_wql == NULL) {
320 321 /* Set "status" and "errormsg" and goto failure */
321 322 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
322 323 goto srqalloc_fail6;
323 324 }
324 325 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))
325 326
326 327 srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
327 328 if (srq->srq_wridlist == NULL) {
328 329 /* Set "status" and "errormsg" and goto failure */
329 330 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
330 331 goto srqalloc_fail7;
331 332 }
332 333 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))
333 334
334 335 srq->srq_wridlist->wl_srq_en = 1;
335 336 srq->srq_wridlist->wl_free_list_indx = -1;
336 337
337 338 /*
338 339 * Fill in all the return arguments (if necessary). This includes
339 340 * real queue size and real SGLs.
340 341 */
341 342 if (real_sizes != NULL) {
342 343 real_sizes->srq_wr_sz = (1 << log_srq_size);
343 344 real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
344 345 }
345 346
346 347 /*
347 348 * Fill in the SRQC entry. This is the final step before passing
348 349 * ownership of the SRQC entry to the Tavor hardware. We use all of
349 350 * the information collected/calculated above to fill in the
350 351 * requisite portions of the SRQC. Note: If this SRQ is going to be
351 352 * used for userland access, then we need to set the UAR page number
352 353 * appropriately (otherwise it's a "don't care")
353 354 */
354 355 bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
355 356 srqc_entry.wqe_addr_h = (addr >> 32);
356 357 srqc_entry.next_wqe_addr_l = 0;
357 358 srqc_entry.ds = (wqesz >> 4);
358 359 srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER;
359 360 srqc_entry.pd = pd->pd_pdnum;
360 361 srqc_entry.lkey = lkey;
361 362 srqc_entry.wqe_cnt = 0;
362 363 if (srq_is_umap) {
363 364 srqc_entry.uar = uarpg;
364 365 } else {
365 366 srqc_entry.uar = 0;
366 367 }
367 368
368 369 /*
369 370 * Write the SRQC entry to hardware. Lastly, we pass ownership of
370 371 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
371 372 * command). Note: In general, this operation shouldn't fail. But
372 373 * if it does, we have to undo everything we've done above before
373 374 * returning error.
374 375 */
375 376 status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
376 377 sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
377 378 sleepflag);
378 379 if (status != TAVOR_CMD_SUCCESS) {
379 380 cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
380 381 status);
381 382 TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
382 383 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
383 384 /* Set "status" and "errormsg" and goto failure */
384 385 TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
385 386 goto srqalloc_fail8;
386 387 }
387 388
388 389 /*
389 390 * Fill in the rest of the Tavor SRQ handle. We can update
390 391 * the following fields for use in further operations on the SRQ.
391 392 */
392 393 srq->srq_srqcrsrcp = srqc;
393 394 srq->srq_rsrcp = rsrc;
394 395 srq->srq_mrhdl = mr;
395 396 srq->srq_refcnt = 0;
396 397 srq->srq_is_umap = srq_is_umap;
397 398 srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0;
398 399 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
399 400 srq->srq_pdhdl = pd;
400 401 srq->srq_wq_lastwqeindx = -1;
401 402 srq->srq_wq_bufsz = (1 << log_srq_size);
402 403 srq->srq_wq_buf = buf;
403 404 srq->srq_desc_off = srq_desc_off;
404 405 srq->srq_hdlrarg = (void *)ibt_srqhdl;
405 406 srq->srq_state = 0;
406 407 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
407 408 srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;
408 409
409 410 /* Determine if later ddi_dma_sync will be necessary */
410 411 srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
411 412
412 413 /*
413 414 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list. Then fill in the
414 415 * "srqhdl" and return success
415 416 */
416 417 ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
417 418 state->ts_srqhdl[srqc->tr_indx] = srq;
418 419
419 420 /*
420 421 * If this is a user-mappable SRQ, then we need to insert the
421 422 * previously allocated entry into the "userland resources database".
422 423 * This will allow for later lookup during devmap() (i.e. mmap())
423 424 * calls.
424 425 */
425 426 if (srq->srq_is_umap) {
426 427 tavor_umap_db_add(umapdb);
427 428 } else {
428 429 mutex_enter(&srq->srq_wrid_wql->wql_lock);
429 430 tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
430 431 mutex_exit(&srq->srq_wrid_wql->wql_lock);
431 432 }
432 433
433 434 *srqhdl = srq;
434 435
435 436 TAVOR_TNF_EXIT(tavor_srq_alloc);
436 437 return (status);
437 438
438 439 /*
439 440 * The following is cleanup for all possible failure cases in this routine
440 441 */
441 442 srqalloc_fail8:
442 443 kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
443 444 sizeof (tavor_wrid_entry_t));
444 445 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
445 446 srqalloc_fail7:
446 447 tavor_wql_refcnt_dec(srq->srq_wrid_wql);
447 448 srqalloc_fail6:
448 449 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
449 450 TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
450 451 TAVOR_WARNING(state, "failed to deregister SRQ memory");
451 452 }
452 453 srqalloc_fail5:
453 454 tavor_queue_free(state, &srq->srq_wqinfo);
454 455 srqalloc_fail4:
455 456 if (srq_is_umap) {
456 457 tavor_umap_db_free(umapdb);
457 458 }
458 459 srqalloc_fail3:
459 460 tavor_rsrc_free(state, &rsrc);
460 461 srqalloc_fail2:
461 462 tavor_rsrc_free(state, &srqc);
462 463 srqalloc_fail1:
463 464 tavor_pd_refcnt_dec(pd);
464 465 srqalloc_fail:
465 466 TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
466 467 tnf_string, msg, errormsg);
467 468 TAVOR_TNF_EXIT(tavor_srq_alloc);
468 469 return (status);
469 470 }
470 471
471 472
472 473 /*
473 474 * tavor_srq_free()
474 475 * Context: Can be called only from user or kernel context.
475 476 */
476 477 /* ARGSUSED */
477 478 int
478 479 tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
479 480 {
480 481 tavor_rsrc_t *srqc, *rsrc;
481 482 tavor_umap_db_entry_t *umapdb;
482 483 uint64_t value;
483 484 tavor_srqhdl_t srq;
484 485 tavor_mrhdl_t mr;
485 486 tavor_pdhdl_t pd;
486 487 tavor_hw_srqc_t srqc_entry;
487 488 uint32_t srqnum;
488 489 uint32_t size;
489 490 uint_t maxprot;
490 491 int status;
491 492
492 493 TAVOR_TNF_ENTER(tavor_srq_free);
493 494
494 495 /*
495 496 * Pull all the necessary information from the Tavor Shared Receive
496 497 * Queue handle. This is necessary here because the resource for the
497 498 * SRQ handle is going to be freed up as part of this operation.
498 499 */
499 500 srq = *srqhdl;
500 501 mutex_enter(&srq->srq_lock);
501 502 srqc = srq->srq_srqcrsrcp;
502 503 rsrc = srq->srq_rsrcp;
503 504 pd = srq->srq_pdhdl;
504 505 mr = srq->srq_mrhdl;
505 506 srqnum = srq->srq_srqnum;
506 507
507 508 /*
508 509 * If there are work queues still associated with the SRQ, then return
509 510 * an error. Otherwise, we will be holding the SRQ lock.
510 511 */
511 512 if (srq->srq_refcnt != 0) {
512 513 mutex_exit(&srq->srq_lock);
513 514 TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
514 515 tnf_int, refcnt, srq->srq_refcnt);
515 516 TAVOR_TNF_EXIT(tavor_srq_free);
516 517 return (IBT_SRQ_IN_USE);
517 518 }
518 519
519 520 /*
520 521 * If this was a user-mappable SRQ, then we need to remove its entry
521 522 * from the "userland resources database". If it is also currently
522 523 * mmap()'d out to a user process, then we need to call
523 524 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
524 525 * We also need to invalidate the SRQ tracking information for the
525 526 * user mapping.
526 527 */
527 528 if (srq->srq_is_umap) {
528 529 status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
529 530 MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
530 531 &umapdb);
531 532 if (status != DDI_SUCCESS) {
532 533 mutex_exit(&srq->srq_lock);
533 534 TAVOR_WARNING(state, "failed to find in database");
534 535 TAVOR_TNF_EXIT(tavor_srq_free);
535 536 return (ibc_get_ci_failure(0));
536 537 }
537 538 tavor_umap_db_free(umapdb);
538 539 if (srq->srq_umap_dhp != NULL) {
539 540 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
540 541 status = devmap_devmem_remap(srq->srq_umap_dhp,
541 542 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
542 543 maxprot, DEVMAP_MAPPING_INVALID, NULL);
543 544 if (status != DDI_SUCCESS) {
544 545 mutex_exit(&srq->srq_lock);
545 546 TAVOR_WARNING(state, "failed in SRQ memory "
546 547 "devmap_devmem_remap()");
547 548 TAVOR_TNF_EXIT(tavor_srq_free);
548 549 return (ibc_get_ci_failure(0));
549 550 }
550 551 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
551 552 }
552 553 }
553 554
554 555 /*
555 556 * Put NULL into the Tavor SRQNum-to-SRQHdl list. This will allow any
556 557 * in-progress events to detect that the SRQ corresponding to this
557 558 * number has been freed.
558 559 */
559 560 state->ts_srqhdl[srqc->tr_indx] = NULL;
560 561
561 562 mutex_exit(&srq->srq_lock);
562 563 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
563 564 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
564 565
565 566 /*
566 567 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
567 568 * firmware command). If the ownership transfer fails for any reason,
568 569 * then it is an indication that something (either in HW or SW) has
569 570 * gone seriously wrong.
570 571 */
571 572 status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
572 573 sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
573 574 if (status != TAVOR_CMD_SUCCESS) {
574 575 TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
575 576 cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
576 577 status);
577 578 TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
578 579 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
579 580 TAVOR_TNF_EXIT(tavor_srq_free);
580 581 return (IBT_FAILURE);
581 582 }
582 583
583 584 /*
584 585 * Deregister the memory for the Shared Receive Queue. If this fails
585 586 * for any reason, then it is an indication that something (either
586 587 * in HW or SW) has gone seriously wrong. So we print a warning
587 588 * message and return.
588 589 */
589 590 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
590 591 sleepflag);
591 592 if (status != DDI_SUCCESS) {
592 593 TAVOR_WARNING(state, "failed to deregister SRQ memory");
593 594 TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
594 595 TAVOR_TNF_EXIT(tavor_srq_free);
595 596 return (IBT_FAILURE);
596 597 }
597 598
598 599 /* Calculate the size and free the wridlist container */
599 600 if (srq->srq_wridlist != NULL) {
600 601 size = (srq->srq_wridlist->wl_size *
601 602 sizeof (tavor_wrid_entry_t));
602 603 kmem_free(srq->srq_wridlist->wl_wre, size);
603 604 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
604 605
605 606 /*
606 607 * Release reference to WQL; If this is the last reference,
607 608 * this call also has the side effect of freeing up the
608 609 * 'srq_wrid_wql' memory.
609 610 */
610 611 tavor_wql_refcnt_dec(srq->srq_wrid_wql);
611 612 }
612 613
613 614 /* Free the memory for the SRQ */
614 615 tavor_queue_free(state, &srq->srq_wqinfo);
615 616
616 617 /* Free the Tavor SRQ Handle */
617 618 tavor_rsrc_free(state, &rsrc);
618 619
619 620 /* Free the SRQC entry resource */
620 621 tavor_rsrc_free(state, &srqc);
621 622
622 623 /* Decrement the reference count on the protection domain (PD) */
623 624 tavor_pd_refcnt_dec(pd);
624 625
625 626 /* Set the srqhdl pointer to NULL and return success */
626 627 *srqhdl = NULL;
627 628
628 629 TAVOR_TNF_EXIT(tavor_srq_free);
629 630 return (DDI_SUCCESS);
630 631 }
631 632
632 633
633 634 /*
634 635 * tavor_srq_modify()
635 636 * Context: Can be called only from user or kernel context.
636 637 */
637 638 int
638 639 tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
639 640 uint_t *real_size, uint_t sleepflag)
640 641 {
641 642 tavor_qalloc_info_t new_srqinfo, old_srqinfo;
642 643 tavor_rsrc_t *mtt, *mpt, *old_mtt;
643 644 tavor_bind_info_t bind;
644 645 tavor_bind_info_t old_bind;
645 646 tavor_rsrc_pool_info_t *rsrc_pool;
646 647 tavor_mrhdl_t mr;
647 648 tavor_hw_mpt_t mpt_entry;
648 649 tavor_wrid_entry_t *wre_new, *wre_old;
649 650 uint64_t mtt_ddrbaseaddr, mtt_addr;
650 651 uint64_t srq_desc_off;
651 652 uint32_t *buf, srq_old_bufsz;
652 653 uint32_t wqesz;
653 654 uint_t max_srq_size;
654 655 uint_t dma_xfer_mode, mtt_pgsize_bits;
655 656 uint_t srq_sync, log_srq_size, maxprot;
656 657 uint_t wq_location;
657 658 int status;
658 659 char *errormsg;
659 660
660 661 TAVOR_TNF_ENTER(tavor_srq_modify);
661 662
662 663 /*
663 664 * Check the "inddr" flag. This flag tells the driver whether or not
664 665 	 * the SRQ's work queues should come from normal system memory or
665 666 * whether they should be allocated from DDR memory.
666 667 */
667 668 wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;
668 669
669 670 /*
670 671 * If size requested is larger than device capability, return
671 672 * Insufficient Resources
672 673 */
673 674 max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
674 675 if (size > max_srq_size) {
675 676 TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
676 677 TAVOR_TNF_ERROR, "");
677 678 TAVOR_TNF_EXIT(tavor_srq_modify);
678 679 return (IBT_HCA_WR_EXCEEDED);
↓ open down ↓ |
493 lines elided |
↑ open up ↑ |
679 680 }
680 681
681 682 /*
682 683 * Calculate the appropriate size for the SRQ.
683 684 * Note: All Tavor SRQs must be a power-of-2 in size. Also
684 685 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
685 686 * is to round the requested size up to the next highest power-of-2
686 687 */
687 688 size = max(size, TAVOR_SRQ_MIN_SIZE);
688 689 log_srq_size = highbit(size);
689 - if ((size & (size - 1)) == 0) {
690 + if (ISP2(size)) {
690 691 log_srq_size = log_srq_size - 1;
691 692 }
692 693
693 694 /*
694 695 * Next we verify that the rounded-up size is valid (i.e. consistent
695 696 * with the device limits and/or software-configured limits).
696 697 */
697 698 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
698 699 /* Set "status" and "errormsg" and goto failure */
699 700 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
700 701 goto srqmodify_fail;
701 702 }
702 703
703 704 /*
704 705 * Allocate the memory for newly resized Shared Receive Queue.
705 706 *
706 707 * Note: If SRQ is not user-mappable, then it may come from either
707 708 * kernel system memory or from HCA-attached local DDR memory.
708 709 *
709 710 * Note2: We align this queue on a pagesize boundary. This is required
710 711 * to make sure that all the resulting IB addresses will start at 0,
711 712 * for a zero-based queue. By making sure we are aligned on at least a
712 713 * page, any offset we use into our queue will be the same as it was
713 714 * when we allocated it at tavor_srq_alloc() time.
714 715 */
715 716 wqesz = (1 << srq->srq_wq_log_wqesz);
716 717 new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
717 718 new_srqinfo.qa_alloc_align = PAGESIZE;
718 719 new_srqinfo.qa_bind_align = PAGESIZE;
719 720 if (srq->srq_is_umap) {
720 721 new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
721 722 } else {
722 723 new_srqinfo.qa_location = wq_location;
723 724 }
724 725 status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
725 726 if (status != DDI_SUCCESS) {
726 727 /* Set "status" and "errormsg" and goto failure */
727 728 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
728 729 goto srqmodify_fail;
729 730 }
730 731 buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
731 732 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
732 733
733 734 /*
734 735 * Allocate the memory for the new WRE list. This will be used later
735 736 * when we resize the wridlist based on the new SRQ size.
736 737 */
737 738 wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
738 739 sizeof (tavor_wrid_entry_t), sleepflag);
739 740 if (wre_new == NULL) {
740 741 /* Set "status" and "errormsg" and goto failure */
741 742 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
742 743 "failed wre_new alloc");
743 744 goto srqmodify_fail;
744 745 }
745 746
746 747 /*
747 748 * Fill in the "bind" struct. This struct provides the majority
748 749 * of the information that will be used to distinguish between an
749 750 * "addr" binding (as is the case here) and a "buf" binding (see
750 751 * below). The "bind" struct is later passed to tavor_mr_mem_bind()
751 752 * which does most of the "heavy lifting" for the Tavor memory
752 753 * registration routines.
753 754 */
754 755 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
755 756 bzero(&bind, sizeof (tavor_bind_info_t));
756 757 bind.bi_type = TAVOR_BINDHDL_VADDR;
757 758 bind.bi_addr = (uint64_t)(uintptr_t)buf;
758 759 bind.bi_len = new_srqinfo.qa_size;
759 760 bind.bi_as = NULL;
760 761 bind.bi_flags = sleepflag == TAVOR_SLEEP ? IBT_MR_SLEEP :
761 762 IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
762 763 if (srq->srq_is_umap) {
763 764 bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
764 765 } else {
765 766 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
766 767 bind.bi_bypass =
767 768 state->ts_cfg_profile->cp_iommu_bypass;
768 769 dma_xfer_mode =
769 770 state->ts_cfg_profile->cp_streaming_consistent;
770 771 if (dma_xfer_mode == DDI_DMA_STREAMING) {
771 772 bind.bi_flags |= IBT_MR_NONCOHERENT;
772 773 }
773 774 } else {
774 775 bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
775 776 }
776 777 }
777 778 status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
778 779 &mtt_pgsize_bits);
779 780 if (status != DDI_SUCCESS) {
780 781 /* Set "status" and "errormsg" and goto failure */
781 782 TAVOR_TNF_FAIL(status, "failed mtt bind");
782 783 kmem_free(wre_new, srq->srq_wq_bufsz *
783 784 sizeof (tavor_wrid_entry_t));
784 785 tavor_queue_free(state, &new_srqinfo);
785 786 goto srqmodify_fail;
786 787 }
787 788
788 789 /*
789 790 * Calculate the offset between the kernel virtual address space
790 791 * and the IB virtual address space. This will be used when
791 792 * posting work requests to properly initialize each WQE.
792 793 *
793 794 * Note: bind addr is zero-based (from alloc) so we calculate the
794 795 * correct new offset here.
795 796 */
796 797 bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
797 798 srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
798 799 (uint64_t)bind.bi_addr;
799 800
800 801 /*
801 802 * Get the base address for the MTT table. This will be necessary
802 803 * below when we are modifying the MPT entry.
803 804 */
804 805 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
805 806 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
806 807
807 808 /*
808 809 * Fill in the MPT entry. This is the final step before passing
809 810 * ownership of the MPT entry to the Tavor hardware. We use all of
810 811 * the information collected/calculated above to fill in the
811 812 * requisite portions of the MPT.
812 813 */
813 814 bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
814 815 mpt_entry.reg_win_len = bind.bi_len;
815 816 mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
816 817 mpt_entry.mttseg_addr_h = mtt_addr >> 32;
817 818 mpt_entry.mttseg_addr_l = mtt_addr >> 6;
818 819
819 820 /*
820 821 * Now we grab the SRQ lock. Since we will be updating the actual
821 822 * SRQ location and the producer/consumer indexes, we should hold
822 823 * the lock.
823 824 *
824 825 * We do a TAVOR_NOSLEEP here (and below), though, because we are
825 826 * holding the "srq_lock" and if we got raised to interrupt level
826 827 * by priority inversion, we would not want to block in this routine
827 828 * waiting for success.
828 829 */
829 830 mutex_enter(&srq->srq_lock);
830 831
831 832 /*
832 833 * Copy old entries to new buffer
833 834 */
834 835 srq_old_bufsz = srq->srq_wq_bufsz;
835 836 bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
836 837
837 838 /* Determine if later ddi_dma_sync will be necessary */
838 839 srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
839 840
840 841 /* Sync entire "new" SRQ for use by hardware (if necessary) */
841 842 if (srq_sync) {
842 843 (void) ddi_dma_sync(bind.bi_dmahdl, 0,
843 844 new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
844 845 }
845 846
846 847 /*
847 848 * Setup MPT information for use in the MODIFY_MPT command
848 849 */
849 850 mr = srq->srq_mrhdl;
850 851 mutex_enter(&mr->mr_lock);
851 852 mpt = srq->srq_mrhdl->mr_mptrsrcp;
852 853
853 854 /*
854 855 * MODIFY_MPT
855 856 *
856 857 * If this fails for any reason, then it is an indication that
857 858 * something (either in HW or SW) has gone seriously wrong. So we
858 859 * print a warning message and return.
859 860 */
860 861 status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
861 862 TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
862 863 if (status != TAVOR_CMD_SUCCESS) {
863 864 cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
864 865 status);
865 866 TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
866 867 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
867 868 TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
868 869 (void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
869 870 srq->srq_mrhdl->mr_mttrsrcp);
870 871 kmem_free(wre_new, srq->srq_wq_bufsz *
871 872 sizeof (tavor_wrid_entry_t));
872 873 tavor_queue_free(state, &new_srqinfo);
873 874 mutex_exit(&mr->mr_lock);
874 875 mutex_exit(&srq->srq_lock);
875 876 return (ibc_get_ci_failure(0));
876 877 }
877 878
878 879 /*
879 880 * Update the Tavor Shared Receive Queue handle with all the new
880 881 * information. At the same time, save away all the necessary
881 882 * information for freeing up the old resources
882 883 */
883 884 old_srqinfo = srq->srq_wqinfo;
884 885 old_mtt = srq->srq_mrhdl->mr_mttrsrcp;
885 886 bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
886 887 sizeof (tavor_bind_info_t));
887 888
888 889 /* Now set the new info */
889 890 srq->srq_wqinfo = new_srqinfo;
890 891 srq->srq_wq_buf = buf;
891 892 srq->srq_wq_bufsz = (1 << log_srq_size);
892 893 bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
893 894 srq->srq_mrhdl->mr_mttrsrcp = mtt;
894 895 srq->srq_desc_off = srq_desc_off;
895 896 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
896 897
897 898 /* Update MR mtt pagesize */
898 899 mr->mr_logmttpgsz = mtt_pgsize_bits;
899 900 mutex_exit(&mr->mr_lock);
900 901
901 902 #ifdef __lock_lint
902 903 mutex_enter(&srq->srq_wrid_wql->wql_lock);
903 904 #else
904 905 if (srq->srq_wrid_wql != NULL) {
905 906 mutex_enter(&srq->srq_wrid_wql->wql_lock);
906 907 }
907 908 #endif
908 909
909 910 /*
910 911 * Initialize new wridlist, if needed.
911 912 *
912 913 * If a wridlist already is setup on an SRQ (the QP associated with an
913 914 * SRQ has moved "from_reset") then we must update this wridlist based
914 915 * on the new SRQ size. We allocate the new size of Work Request ID
915 916 * Entries, copy over the old entries to the new list, and
916 917 * re-initialize the srq wridlist in non-umap case
917 918 */
918 919 wre_old = NULL;
919 920 if (srq->srq_wridlist != NULL) {
920 921 wre_old = srq->srq_wridlist->wl_wre;
921 922
922 923 bcopy(wre_old, wre_new, srq_old_bufsz *
923 924 sizeof (tavor_wrid_entry_t));
924 925
925 926 /* Setup new sizes in wre */
926 927 srq->srq_wridlist->wl_wre = wre_new;
927 928 srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
928 929
929 930 if (!srq->srq_is_umap) {
930 931 tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
931 932 srq_old_bufsz);
932 933 }
933 934 }
934 935
935 936 #ifdef __lock_lint
936 937 mutex_exit(&srq->srq_wrid_wql->wql_lock);
937 938 #else
938 939 if (srq->srq_wrid_wql != NULL) {
939 940 mutex_exit(&srq->srq_wrid_wql->wql_lock);
940 941 }
941 942 #endif
942 943
943 944 /*
944 945 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
945 946 * to a user process, then we need to call devmap_devmem_remap() to
946 947 * invalidate the mapping to the SRQ memory. We also need to
947 948 * invalidate the SRQ tracking information for the user mapping.
948 949 *
949 950 * Note: On failure, the remap really shouldn't ever happen. So, if it
950 951 * does, it is an indication that something has gone seriously wrong.
951 952 * So we print a warning message and return error (knowing, of course,
952 953 * that the "old" SRQ memory will be leaked)
953 954 */
954 955 if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
955 956 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
956 957 status = devmap_devmem_remap(srq->srq_umap_dhp,
957 958 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
958 959 DEVMAP_MAPPING_INVALID, NULL);
959 960 if (status != DDI_SUCCESS) {
960 961 mutex_exit(&srq->srq_lock);
961 962 TAVOR_WARNING(state, "failed in SRQ memory "
962 963 "devmap_devmem_remap()");
963 964 /* We can, however, free the memory for old wre */
964 965 if (wre_old != NULL) {
965 966 kmem_free(wre_old, srq_old_bufsz *
966 967 sizeof (tavor_wrid_entry_t));
967 968 }
968 969 TAVOR_TNF_EXIT(tavor_srq_modify);
969 970 return (ibc_get_ci_failure(0));
970 971 }
971 972 srq->srq_umap_dhp = (devmap_cookie_t)NULL;
972 973 }
973 974
974 975 /*
975 976 * Drop the SRQ lock now. The only thing left to do is to free up
976 977 * the old resources.
977 978 */
978 979 mutex_exit(&srq->srq_lock);
979 980
980 981 /*
981 982 * Unbind the MTT entries.
982 983 */
983 984 status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
984 985 if (status != DDI_SUCCESS) {
985 986 TAVOR_WARNING(state, "failed to unbind old SRQ memory");
986 987 /* Set "status" and "errormsg" and goto failure */
987 988 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
988 989 "failed to unbind (old)");
989 990 goto srqmodify_fail;
990 991 }
991 992
992 993 /* Free the memory for old wre */
993 994 if (wre_old != NULL) {
994 995 kmem_free(wre_old, srq_old_bufsz *
995 996 sizeof (tavor_wrid_entry_t));
996 997 }
997 998
998 999 /* Free the memory for the old SRQ */
999 1000 tavor_queue_free(state, &old_srqinfo);
1000 1001
1001 1002 /*
1002 1003 * Fill in the return arguments (if necessary). This includes the
1003 1004 * real new completion queue size.
1004 1005 */
1005 1006 if (real_size != NULL) {
1006 1007 *real_size = (1 << log_srq_size);
1007 1008 }
1008 1009
1009 1010 TAVOR_TNF_EXIT(tavor_srq_modify);
1010 1011 return (DDI_SUCCESS);
1011 1012
1012 1013 srqmodify_fail:
1013 1014 TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
1014 1015 tnf_string, msg, errormsg);
1015 1016 TAVOR_TNF_EXIT(tavor_srq_modify);
1016 1017 return (status);
1017 1018 }
1018 1019
1019 1020
1020 1021 /*
1021 1022 * tavor_srq_refcnt_inc()
1022 1023 * Context: Can be called from interrupt or base context.
1023 1024 */
1024 1025 void
1025 1026 tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
1026 1027 {
1027 1028 mutex_enter(&srq->srq_lock);
1028 1029 TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
1029 1030 tnf_uint, refcnt, srq->srq_refcnt);
1030 1031 srq->srq_refcnt++;
1031 1032 mutex_exit(&srq->srq_lock);
1032 1033 }
1033 1034
1034 1035
1035 1036 /*
1036 1037 * tavor_srq_refcnt_dec()
1037 1038 * Context: Can be called from interrupt or base context.
1038 1039 */
1039 1040 void
1040 1041 tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
1041 1042 {
1042 1043 mutex_enter(&srq->srq_lock);
1043 1044 srq->srq_refcnt--;
1044 1045 TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
1045 1046 tnf_uint, refcnt, srq->srq_refcnt);
1046 1047 mutex_exit(&srq->srq_lock);
1047 1048 }
1048 1049
1049 1050
1050 1051 /*
1051 1052 * tavor_srqhdl_from_srqnum()
1052 1053 * Context: Can be called from interrupt or base context.
1053 1054 *
1054 1055 * This routine is important because changing the unconstrained
1055 1056 * portion of the SRQ number is critical to the detection of a
1056 1057 * potential race condition in the SRQ handler code (i.e. the case
1057 1058 * where a SRQ is freed and alloc'd again before an event for the
1058 1059 * "old" SRQ can be handled).
1059 1060 *
1060 1061 * While this is not a perfect solution (not sure that one exists)
1061 1062 * it does help to mitigate the chance that this race condition will
1062 1063 * cause us to deliver a "stale" event to the new SRQ owner. Note:
1063 1064 * this solution does not scale well because the number of constrained
1064 1065 * bits increases (and, hence, the number of unconstrained bits
1065 1066 * decreases) as the number of supported SRQ grows. For small and
1066 1067 * intermediate values, it should hopefully provide sufficient
1067 1068 * protection.
1068 1069 */
1069 1070 tavor_srqhdl_t
1070 1071 tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
1071 1072 {
1072 1073 uint_t srqindx, srqmask;
1073 1074
1074 1075 /* Calculate the SRQ table index from the srqnum */
1075 1076 srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
1076 1077 srqindx = srqnum & srqmask;
1077 1078 return (state->ts_srqhdl[srqindx]);
1078 1079 }
1079 1080
1080 1081
1081 1082 /*
1082 1083 * tavor_srq_sgl_to_logwqesz()
1083 1084 * Context: Can be called from interrupt or base context.
1084 1085 */
1085 1086 static void
1086 1087 tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
1087 1088 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
1088 1089 {
1089 1090 uint_t max_size, log2, actual_sgl;
1090 1091
1091 1092 TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);
↓ open down ↓ |
392 lines elided |
↑ open up ↑ |
1092 1093
1093 1094 switch (wq_type) {
1094 1095 case TAVOR_QP_WQ_TYPE_RECVQ:
1095 1096 /*
1096 1097 * Use requested maximum SGL to calculate max descriptor size
1097 1098 * (while guaranteeing that the descriptor size is a
1098 1099 * power-of-2 cachelines).
1099 1100 */
1100 1101 max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
1101 1102 log2 = highbit(max_size);
1102 - if ((max_size & (max_size - 1)) == 0) {
1103 + if (ISP2(max_size)) {
1103 1104 log2 = log2 - 1;
1104 1105 }
1105 1106
1106 1107 /* Make sure descriptor is at least the minimum size */
1107 1108 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
1108 1109
1109 1110 /* Calculate actual number of SGL (given WQE size) */
1110 1111 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
1111 1112 break;
1112 1113
1113 1114 default:
1114 1115 TAVOR_WARNING(state, "unexpected work queue type");
1115 1116 TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
1116 1117 TAVOR_TNF_ERROR, "");
1117 1118 break;
1118 1119 }
1119 1120
1120 1121 /* Fill in the return values */
1121 1122 *logwqesz = log2;
1122 1123 *max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);
1123 1124
1124 1125 TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
1125 1126 }
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX