/*
 * Copyright (c) 2016 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ib_mr.h"

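/* Each FRWR memory region (struct rds_ib_frmr) moves through a small
 * state machine, tracked in fr_state and updated via cmpxchg() so that
 * concurrent completion handlers and callers cannot both claim the
 * same transition:
 *
 *	FRMR_IS_FREE  --(rds_ib_post_reg_frmr)--->  FRMR_IS_INUSE
 *	FRMR_IS_INUSE --(invalidation completes)->  FRMR_IS_FREE
 *	FRMR_IS_INUSE --(WR failure/error CQE)--->  FRMR_IS_STALE
 *
 * A stale MR is never reused; it is queued on the pool's drop_list and
 * eventually deregistered.
 */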
static inline void
rds_transition_frwr_state(struct rds_ib_mr *ibmr,
			  enum rds_ib_fr_state old_state,
			  enum rds_ib_fr_state new_state)
{
	if (cmpxchg(&ibmr->u.frmr.fr_state,
		    old_state, new_state) == old_state &&
	    old_state == FRMR_IS_INUSE) {
		/* enforce order of ibmr->u.frmr.fr_state update
		 * before decrementing i_fastreg_inuse_count
		 */
		smp_mb__before_atomic();
		atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
		if (waitqueue_active(&rds_ib_ring_empty_wait))
			wake_up(&rds_ib_ring_empty_wait);
	}
}

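/* Allocate a new FRWR MR backed by ib_alloc_mr().  The MR is drawn
 * from the 8K pool for small registrations and from the 1M pool
 * otherwise; an entry recycled via rds_ib_try_reuse_ibmr() is
 * preferred over a fresh allocation.
 */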
static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
					   int npages)
{
	struct rds_ib_mr_pool *pool;
	struct rds_ib_mr *ibmr = NULL;
	struct rds_ib_frmr *frmr;
	int err = 0;

	if (npages <= RDS_MR_8K_MSG_SIZE)
		pool = rds_ibdev->mr_8k_pool;
	else
		pool = rds_ibdev->mr_1m_pool;

	ibmr = rds_ib_try_reuse_ibmr(pool);
	if (ibmr)
		return ibmr;

	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
			    rdsibdev_to_node(rds_ibdev));
	if (!ibmr) {
		err = -ENOMEM;
		goto out_no_cigar;
	}

	frmr = &ibmr->u.frmr;
	frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
			       pool->max_pages);
	if (IS_ERR(frmr->mr)) {
		pr_warn("RDS/IB: %s failed to allocate MR\n", __func__);
		err = PTR_ERR(frmr->mr);
		goto out_no_cigar;
	}

	ibmr->pool = pool;
	if (pool->pool_type == RDS_IB_MR_8K_POOL)
		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
	else
		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);

	if (atomic_read(&pool->item_count) > pool->max_items_soft)
		pool->max_items_soft = pool->max_items;

	frmr->fr_state = FRMR_IS_FREE;
	init_waitqueue_head(&frmr->fr_inv_done);
	init_waitqueue_head(&frmr->fr_reg_done);
	return ibmr;

out_no_cigar:
	kfree(ibmr);
	atomic_dec(&pool->item_count);
	return ERR_PTR(err);
}

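/* Return an MR to its pool.  A dropped MR goes on the drop_list (it
 * must be invalidated or deregistered before reuse); otherwise it goes
 * on the free_list.  When too many pages are pinned or too many MRs
 * are dirty, the pool's flush worker is kicked to reclaim them.
 */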
static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
{
	struct rds_ib_mr_pool *pool = ibmr->pool;

	if (drop)
		llist_add(&ibmr->llnode, &pool->drop_list);
	else
		llist_add(&ibmr->llnode, &pool->free_list);
	atomic_add(ibmr->sg_len, &pool->free_pinned);
	atomic_inc(&pool->dirty_count);

	/* If we've pinned too many pages, request a flush */
	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
	    atomic_read(&pool->dirty_count) >= pool->max_items / 5)
		queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
}

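/* Post an IB_WR_REG_MR work request to register the MR's DMA-mapped
 * scatterlist with the HCA, then wait for the registration to
 * complete.  Returns 0 on success or a negative errno.
 */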
static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
{
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
	struct ib_reg_wr reg_wr;
	int ret, off = 0;

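	/* i_fastreg_wrs acts as a credit counter for outstanding
	 * fast-registration work requests; the completion handler gives
	 * the credit back.  Busy-wait (rather than sleep) until a slot
	 * is available, presumably so this path stays usable from
	 * contexts where blocking on a waitqueue is undesirable.
	 */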
	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		cpu_relax();
	}

	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len,
				&off, PAGE_SIZE);
	if (unlikely(ret != ibmr->sg_dma_len))
		return ret < 0 ? ret : -EINVAL;

	if (cmpxchg(&frmr->fr_state,
		    FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
		return -EBUSY;

	atomic_inc(&ibmr->ic->i_fastreg_inuse_count);

	/* Perform a WR for the fast_reg_mr. Each individual page
	 * in the sg list is added to the fast reg page list and placed
	 * inside the fast_reg_mr WR. The key used is a rolling 8-bit
	 * counter, which should guarantee uniqueness.
	 */
	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
	frmr->fr_reg = true;

	memset(&reg_wr, 0, sizeof(reg_wr));
	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
	reg_wr.wr.opcode = IB_WR_REG_MR;
	reg_wr.wr.num_sge = 0;
	reg_wr.mr = frmr->mr;
	reg_wr.key = frmr->mr->rkey;
	reg_wr.access = IB_ACCESS_LOCAL_WRITE |
			IB_ACCESS_REMOTE_READ |
			IB_ACCESS_REMOTE_WRITE;
	reg_wr.wr.send_flags = IB_SEND_SIGNALED;

	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
	if (unlikely(ret)) {
		/* Failure here can be because of -ENOMEM as well */
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);

		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		if (printk_ratelimit())
			pr_warn("RDS/IB: %s returned error(%d)\n",
				__func__, ret);
		goto out;
	}

	/* Wait for the registration to complete in order to prevent an invalid
	 * access error resulting from a race between the memory region already
	 * being accessed while registration is still pending.
	 */
	wait_event(frmr->fr_reg_done, !frmr->fr_reg);

out:
	return ret;
}

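/* DMA-map a scatterlist and register it through the FRWR MR.  The HCA
 * needs the mapping to look like one contiguous run of pages: only the
 * first entry may start at a non-page-aligned address and only the
 * last entry may end short of a page boundary.
 */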
static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
			   struct rds_ib_mr_pool *pool,
			   struct rds_ib_mr *ibmr,
			   struct scatterlist *sg, unsigned int sg_len)
{
	struct ib_device *dev = rds_ibdev->dev;
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
	int i;
	u32 len;
	int ret = 0;

	/* We want to tear down the old ibmr values here and fill it up
	 * with new sg values
	 */
	rds_ib_teardown_mr(ibmr);

	ibmr->sg = sg;
	ibmr->sg_len = sg_len;
	ibmr->sg_dma_len = 0;
	frmr->sg_byte_len = 0;
	WARN_ON(ibmr->sg_dma_len);
	ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
					 DMA_BIDIRECTIONAL);
	if (unlikely(!ibmr->sg_dma_len)) {
		pr_warn("RDS/IB: %s failed!\n", __func__);
		return -EBUSY;
	}

	frmr->sg_byte_len = 0;
	frmr->dma_npages = 0;
	len = 0;

	ret = -EINVAL;
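	/* Walk the DMA-mapped entries and reject any layout the device
	 * cannot express as a single virtually contiguous mapping: an
	 * unaligned start is tolerated only on the first entry, an
	 * unaligned end only on the last.  Whole pages are counted from
	 * the byte total after the loop.
	 */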
	for (i = 0; i < ibmr->sg_dma_len; ++i) {
		unsigned int dma_len = sg_dma_len(&ibmr->sg[i]);
		u64 dma_addr = sg_dma_address(&ibmr->sg[i]);

		frmr->sg_byte_len += dma_len;
		if (dma_addr & ~PAGE_MASK) {
			if (i > 0)
				goto out_unmap;
			else
				++frmr->dma_npages;
		}

		if ((dma_addr + dma_len) & ~PAGE_MASK) {
			if (i < ibmr->sg_dma_len - 1)
				goto out_unmap;
			else
				++frmr->dma_npages;
		}

		len += dma_len;
	}
	frmr->dma_npages += len >> PAGE_SHIFT;

	if (frmr->dma_npages > ibmr->pool->max_pages) {
		ret = -EMSGSIZE;
		goto out_unmap;
	}

	ret = rds_ib_post_reg_frmr(ibmr);
	if (ret)
		goto out_unmap;

	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
	else
		rds_ib_stats_inc(s_ib_rdma_mr_1m_used);

	return ret;

out_unmap:
	ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
			DMA_BIDIRECTIONAL);
	ibmr->sg_dma_len = 0;
	return ret;
}

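/* Post an IB_WR_LOCAL_INV work request to invalidate the MR's rkey,
 * then wait until the MR has left the FRMR_IS_INUSE state.  Only an
 * in-use MR on a live connection can be invalidated.
 */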
static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
{
	struct ib_send_wr *s_wr;
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
	struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
	int ret = -EINVAL;

	if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
		goto out;

	if (frmr->fr_state != FRMR_IS_INUSE)
		goto out;

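	/* Reserve a work-request credit, using the same busy-wait
	 * scheme as rds_ib_post_reg_frmr() above.
	 */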
	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		cpu_relax();
	}

	frmr->fr_inv = true;
	s_wr = &frmr->fr_wr;

	memset(s_wr, 0, sizeof(*s_wr));
	s_wr->wr_id = (unsigned long)(void *)ibmr;
	s_wr->opcode = IB_WR_LOCAL_INV;
	s_wr->ex.invalidate_rkey = frmr->mr->rkey;
	s_wr->send_flags = IB_SEND_SIGNALED;

	ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
	if (unlikely(ret)) {
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
		frmr->fr_inv = false;
		/* enforce order of frmr->fr_inv update
		 * before incrementing i_fastreg_wrs
		 */
		smp_mb__before_atomic();
		atomic_inc(&ibmr->ic->i_fastreg_wrs);
		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
		goto out;
	}

	/* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
	 * 1) avoid a silly bouncing between "clean_list" and "drop_list"
	 *    triggered by function "rds_ib_reg_frmr" as it releases frmr
	 *    regions whose state is not "FRMR_IS_FREE" right away.
	 * 2) prevent an invalid access error in a race
	 *    from a pending "IB_WR_LOCAL_INV" operation
	 *    with a teardown ("dma_unmap_sg", "put_page")
	 *    and de-registration ("ib_dereg_mr") of the corresponding
	 *    memory region.
	 */
	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);

out:
	return ret;
}

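/* Completion handler for both registration (IB_WR_REG_MR) and
 * invalidation (IB_WR_LOCAL_INV) work requests.  On an error CQE the
 * MR is marked stale and the connection is torn down; in all cases
 * the appropriate waiter is woken and the work-request credit is
 * returned.
 */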
void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
{
	struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;

	if (wc->status != IB_WC_SUCCESS) {
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
		if (rds_conn_up(ic->conn))
			rds_ib_conn_error(ic->conn,
					  "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
					  &ic->conn->c_laddr,
					  &ic->conn->c_faddr,
					  wc->status,
					  ib_wc_status_msg(wc->status),
					  wc->vendor_err);
	}

	if (frmr->fr_inv) {
		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
		frmr->fr_inv = false;
		wake_up(&frmr->fr_inv_done);
	}

	if (frmr->fr_reg) {
		frmr->fr_reg = false;
		wake_up(&frmr->fr_reg_done);
	}

	/* enforce order of frmr->{fr_reg,fr_inv} update
	 * before incrementing i_fastreg_wrs
	 */
	smp_mb__before_atomic();
	atomic_inc(&ic->i_fastreg_wrs);
}

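/* Flush a list of MRs gathered by the pool's flush worker: invalidate
 * each MR that still has a DMA mapping, tear the mappings down, then
 * deregister and free entries until the reclaim goal is met, along
 * with any MRs that have gone stale.  MRs still in use are skipped.
 */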
void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
		       unsigned long *unpinned, unsigned int goal)
{
	struct rds_ib_mr *ibmr, *next;
	struct rds_ib_frmr *frmr;
	int ret = 0, ret2;
	unsigned int freed = *nfreed;

	/* Post a LOCAL_INV work request for every MR that still has a
	 * DMA mapping; each call waits for its invalidation to finish.
	 */
	list_for_each_entry(ibmr, list, unmap_list) {
		if (ibmr->sg_dma_len) {
			ret2 = rds_ib_post_inv(ibmr);
			if (ret2 && !ret)
				ret = ret2;
		}
	}

	if (ret)
		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);

	/* Now we can destroy the DMA mapping and unpin any pages */
	list_for_each_entry_safe(ibmr, next, list, unmap_list) {
		*unpinned += ibmr->sg_len;
		frmr = &ibmr->u.frmr;
		__rds_ib_teardown_mr(ibmr);
		if (freed < goal || frmr->fr_state == FRMR_IS_STALE) {
			/* Don't de-allocate if the MR is not free yet */
			if (frmr->fr_state == FRMR_IS_INUSE)
				continue;

			if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
				rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
			else
				rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
			list_del(&ibmr->unmap_list);
			if (frmr->mr)
				ib_dereg_mr(frmr->mr);
			kfree(ibmr);
			freed++;
		}
	}
	*nfreed = freed;
}

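/* Top-level FRWR registration entry point.  Allocates (or recycles)
 * an MR, retrying until one in the FRMR_IS_FREE state is obtained,
 * then maps and registers the caller's scatterlist and returns the
 * rkey through *key.  MRs drawn in any other state are pushed to the
 * drop_list.
 */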
struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
				  struct rds_ib_connection *ic,
				  struct scatterlist *sg,
				  unsigned long nents, u32 *key)
{
	struct rds_ib_mr *ibmr = NULL;
	struct rds_ib_frmr *frmr;
	int ret;

	if (!ic) {
		/* TODO: Add FRWR support for RDS_GET_MR using proxy qp */
		return ERR_PTR(-EOPNOTSUPP);
	}

	do {
		if (ibmr)
			rds_ib_free_frmr(ibmr, true);
		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
		if (IS_ERR(ibmr))
			return ibmr;
		frmr = &ibmr->u.frmr;
	} while (frmr->fr_state != FRMR_IS_FREE);

	ibmr->ic = ic;
	ibmr->device = rds_ibdev;
	ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
	if (ret == 0) {
		*key = frmr->mr->rkey;
	} else {
		rds_ib_free_frmr(ibmr, false);
		ibmr = ERR_PTR(ret);
	}

	return ibmr;
}

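/* Release an MR back to its pool after use: stale MRs go on the
 * drop_list for deregistration, everything else on the free_list for
 * reuse.
 */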
void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
{
	struct rds_ib_mr_pool *pool = ibmr->pool;
	struct rds_ib_frmr *frmr = &ibmr->u.frmr;

	if (frmr->fr_state == FRMR_IS_STALE)
		llist_add(&ibmr->llnode, &pool->drop_list);
	else
		llist_add(&ibmr->llnode, &pool->free_list);
}