/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _RDS_IB_H
#define _RDS_IB_H

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include "rds.h"
#include "rdma_transport.h"

#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2

#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
#define RDS_IB_DEFAULT_FR_WR 512

#define RDS_IB_DEFAULT_RETRY_COUNT 1

#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */

#define RDS_IB_RECYCLE_BATCH_COUNT 32

#define RDS_IB_WC_MAX 32

extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
/*
 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
 * try to minimize the amount of memory tied up in both the device and
 * socket receive queues.
 */
struct rds_page_frag {
	struct list_head f_item;
	struct list_head f_cache_entry;
	struct scatterlist f_sg;
};

struct rds_ib_incoming {
	struct list_head ii_frags;
	struct list_head ii_cache_entry;
	struct rds_incoming ii_inc;
};

struct rds_ib_cache_head {
	struct list_head *first;
	unsigned long count;
};

struct rds_ib_refill_cache {
	struct rds_ib_cache_head __percpu *percpu;
	struct list_head *xfer;
	struct list_head *ready;
};
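
/*
 * A hedged sketch of the recycle flow (the details live in ib_recv.c):
 * freed frags/incs are pushed onto the per-cpu heads; once a head
 * collects RDS_IB_RECYCLE_BATCH_COUNT entries it is spliced onto
 * ->xfer, and the refill path later claims ->xfer as ->ready so it can
 * repost receives without fresh allocations.
 */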

/* This is the common structure for the IB private data exchange in setting up
 * an RDS connection. The exchange differs for IPv4 and IPv6 connections
 * because the address sizes differ and the exchanged addresses sit at the
 * beginning of the structure, so a single shared structure could not
 * interoperate across both families.
 */
struct rds_ib_conn_priv_cmn {
	u8 ricpc_protocol_major;
	u8 ricpc_protocol_minor;
	__be16 ricpc_protocol_minor_mask; /* bitmask */
	u8 ricpc_dp_toss;
	u8 ripc_reserved1;
	__be16 ripc_reserved2;
	__be64 ricpc_ack_seq;
	__be32 ricpc_credit; /* non-zero enables flow ctl */
};

struct rds_ib_connect_private {
	/* Add new fields at the end, and don't permute existing fields. */
	__be32 dp_saddr;
	__be32 dp_daddr;
	struct rds_ib_conn_priv_cmn dp_cmn;
};

struct rds6_ib_connect_private {
	/* Add new fields at the end, and don't permute existing fields. */
	struct in6_addr dp_saddr;
	struct in6_addr dp_daddr;
	struct rds_ib_conn_priv_cmn dp_cmn;
};

#define dp_protocol_major dp_cmn.ricpc_protocol_major
#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
#define dp_ack_seq dp_cmn.ricpc_ack_seq
#define dp_credit dp_cmn.ricpc_credit

union rds_ib_conn_priv {
	struct rds_ib_connect_private ricp_v4;
	struct rds6_ib_connect_private ricp_v6;
};
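
/*
 * Illustrative sketch (an assumption, not the actual code in ib_cm.c):
 * a CM handler receiving connection private data would pick the view
 * matching the address family, e.g.
 *
 *	const union rds_ib_conn_priv *dp = event->param.conn.private_data;
 *	u8 major = isv6 ? dp->ricp_v6.dp_protocol_major
 *			: dp->ricp_v4.dp_protocol_major;
 *
 * The dp_* macros above expand to dp_cmn members, so the same accessor
 * names work for both views.
 */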

struct rds_ib_send_work {
	void *s_op;
	union {
		struct ib_send_wr s_wr;
		struct ib_rdma_wr s_rdma_wr;
		struct ib_atomic_wr s_atomic_wr;
	};
	struct ib_sge s_sge[RDS_IB_MAX_SGE];
	unsigned long s_queued;
};

struct rds_ib_recv_work {
	struct rds_ib_incoming *r_ibinc;
	struct rds_page_frag *r_frag;
	struct ib_recv_wr r_wr;
	struct ib_sge r_sge[2];
};

struct rds_ib_work_ring {
	u32 w_nr;
	u32 w_alloc_ptr;
	u32 w_alloc_ctr;
	u32 w_free_ptr;
	atomic_t w_free_ctr;
};
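
/*
 * A hedged sketch of how the counters relate (the authoritative
 * arithmetic is in ib_ring.c): the number of in-flight entries is the
 * difference between allocations and completions, modulo u32 wrap:
 *
 *	u32 used = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
 *
 * so the ring is empty when used == 0 and full when used == ring->w_nr.
 */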

/* Rings are posted with all the allocations they'll need to queue the
 * incoming message to the receiving socket, so this can't fail.
 * All fragments start with a header, so we can make sure we're not receiving
 * garbage, and we can tell a small 8-byte fragment from an ACK frame.
 */
struct rds_ib_ack_state {
	u64 ack_next;
	u64 ack_recv;
	unsigned int ack_required:1;
	unsigned int ack_next_valid:1;
	unsigned int ack_recv_valid:1;
};
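
/*
 * Hedged sketch of how this state is consumed (the actual driver loop
 * is in ib_cm.c): the recv CQE handler fills a stack-allocated state,
 * which the recv tasklet then applies, e.g.
 *
 *	struct rds_ib_ack_state state = { 0 };
 *	rds_ib_recv_cqe_handler(ic, wc, &state);
 *	if (state.ack_next_valid)
 *		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
 */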


struct rds_ib_device;

struct rds_ib_connection {

	struct list_head ib_node;
	struct rds_ib_device *rds_ibdev;
	struct rds_connection *conn;

	/* alphabet soup, IBTA style */
	struct rdma_cm_id *i_cm_id;
	struct ib_pd *i_pd;
	struct ib_cq *i_send_cq;
	struct ib_cq *i_recv_cq;
	struct ib_wc i_send_wc[RDS_IB_WC_MAX];
	struct ib_wc i_recv_wc[RDS_IB_WC_MAX];

	/* To control the number of wrs from fastreg */
	atomic_t i_fastreg_wrs;
	atomic_t i_fastreg_inuse_count;

	/* interrupt handling */
	struct tasklet_struct i_send_tasklet;
	struct tasklet_struct i_recv_tasklet;

	/* tx */
	struct rds_ib_work_ring i_send_ring;
	struct rm_data_op *i_data_op;
	struct rds_header **i_send_hdrs;
	dma_addr_t *i_send_hdrs_dma;
	struct rds_ib_send_work *i_sends;
	atomic_t i_signaled_sends;

	/* rx */
	struct mutex i_recv_mutex;
	struct rds_ib_work_ring i_recv_ring;
	struct rds_ib_incoming *i_ibinc;
	u32 i_recv_data_rem;
	struct rds_header **i_recv_hdrs;
	dma_addr_t *i_recv_hdrs_dma;
	struct rds_ib_recv_work *i_recvs;
	u64 i_ack_recv; /* last ACK received */
	struct rds_ib_refill_cache i_cache_incs;
	struct rds_ib_refill_cache i_cache_frags;
	atomic_t i_cache_allocs;

	/* sending acks */
	unsigned long i_ack_flags;
#ifdef KERNEL_HAS_ATOMIC64
	atomic64_t i_ack_next; /* next ACK to send */
#else
	spinlock_t i_ack_lock; /* protect i_ack_next */
	u64 i_ack_next; /* next ACK to send */
#endif
	struct rds_header *i_ack;
	struct ib_send_wr i_ack_wr;
	struct ib_sge i_ack_sge;
	dma_addr_t i_ack_dma;
	unsigned long i_ack_queued;
	/* Flow control related information
	 *
	 * Our algorithm uses a pair of variables that we need to access
	 * atomically - one for the send credits, and one for the posted
	 * recv credits we need to transfer to the remote.
	 * Rather than protect them with a slow spinlock, we put both into
	 * a single atomic_t and update it using cmpxchg.
	 */
	atomic_t i_credits;

	/* Protocol version specific information */
	unsigned int i_flowctl:1; /* enable/disable flow ctl */

	/* Batched completions */
	unsigned int i_unsignaled_wrs;

	/* Endpoint role in connection */
	bool i_active_side;
	atomic_t i_cq_quiesce;

	/* Send/Recv vectors */
	int i_scq_vector;
	int i_rcq_vector;
	u8 i_sl;
};

/* This assumes that atomic_t is at least 32 bits */
#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_GET_POST_CREDITS(v) ((v) >> 16)
#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_SET_POST_CREDITS(v) ((v) << 16)
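
/*
 * Example (an illustrative sketch; the real logic lives in ib_send.c):
 * both counters are read in one atomic snapshot and updated with a
 * cmpxchg on the packed word, e.g. consuming one send credit:
 *
 *	u32 oldval = atomic_read(&ic->i_credits);
 *	u32 newval = IB_SET_SEND_CREDITS(IB_GET_SEND_CREDITS(oldval) - 1) |
 *		     IB_SET_POST_CREDITS(IB_GET_POST_CREDITS(oldval));
 *	atomic_cmpxchg(&ic->i_credits, oldval, newval);
 *
 * (the retry loop on cmpxchg failure is omitted here)
 */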

struct rds_ib_ipaddr {
	struct list_head list;
	__be32 ipaddr;
	struct rcu_head rcu;
};

enum {
	RDS_IB_MR_8K_POOL,
	RDS_IB_MR_1M_POOL,
};

struct rds_ib_device {
	struct list_head list;
	struct list_head ipaddr_list;
	struct list_head conn_list;
	struct ib_device *dev;
	struct ib_pd *pd;
	u8 odp_capable:1;

	unsigned int max_mrs;
	struct rds_ib_mr_pool *mr_1m_pool;
	struct rds_ib_mr_pool *mr_8k_pool;
	unsigned int max_8k_mrs;
	unsigned int max_1m_mrs;
	int max_sge;
	unsigned int max_wrs;
	unsigned int max_initiator_depth;
	unsigned int max_responder_resources;
	spinlock_t spinlock; /* protect the above */
	refcount_t refcount;
	struct work_struct free_work;
	int *vector_load;
};

#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)

/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1

/* Magic WR_ID for ACKs */
#define RDS_IB_ACK_WR_ID (~(u64) 0)
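
/*
 * Illustrative use of the i_ack_flags bits (a sketch; the ack state
 * machine is in ib_recv.c): they are manipulated with the standard
 * atomic bitops, e.g.
 *
 *	if (!test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags))
 *		post an ack WR whose wr_id is RDS_IB_ACK_WR_ID;
 */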

struct rds_ib_statistics {
	uint64_t s_ib_connect_raced;
	uint64_t s_ib_listen_closed_stale;
	uint64_t s_ib_evt_handler_call;
	uint64_t s_ib_tasklet_call;
	uint64_t s_ib_tx_cq_event;
	uint64_t s_ib_tx_ring_full;
	uint64_t s_ib_tx_throttle;
	uint64_t s_ib_tx_sg_mapping_failure;
	uint64_t s_ib_tx_stalled;
	uint64_t s_ib_tx_credit_updates;
	uint64_t s_ib_rx_cq_event;
	uint64_t s_ib_rx_ring_empty;
	uint64_t s_ib_rx_refill_from_cq;
	uint64_t s_ib_rx_refill_from_thread;
	uint64_t s_ib_rx_alloc_limit;
	uint64_t s_ib_rx_total_frags;
	uint64_t s_ib_rx_total_incs;
	uint64_t s_ib_rx_credit_updates;
	uint64_t s_ib_ack_sent;
	uint64_t s_ib_ack_send_failure;
	uint64_t s_ib_ack_send_delayed;
	uint64_t s_ib_ack_send_piggybacked;
	uint64_t s_ib_ack_received;
	uint64_t s_ib_rdma_mr_8k_alloc;
	uint64_t s_ib_rdma_mr_8k_free;
	uint64_t s_ib_rdma_mr_8k_used;
	uint64_t s_ib_rdma_mr_8k_pool_flush;
	uint64_t s_ib_rdma_mr_8k_pool_wait;
	uint64_t s_ib_rdma_mr_8k_pool_depleted;
	uint64_t s_ib_rdma_mr_1m_alloc;
	uint64_t s_ib_rdma_mr_1m_free;
	uint64_t s_ib_rdma_mr_1m_used;
	uint64_t s_ib_rdma_mr_1m_pool_flush;
	uint64_t s_ib_rdma_mr_1m_pool_wait;
	uint64_t s_ib_rdma_mr_1m_pool_depleted;
	uint64_t s_ib_rdma_mr_8k_reused;
	uint64_t s_ib_rdma_mr_1m_reused;
	uint64_t s_ib_atomic_cswp;
	uint64_t s_ib_atomic_fadd;
	uint64_t s_ib_recv_added_to_cache;
	uint64_t s_ib_recv_removed_from_cache;
};

extern struct workqueue_struct *rds_ib_wq;

/*
 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
 * doesn't define it.
 */
static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
					      struct scatterlist *sglist,
					      unsigned int sg_dma_len,
					      int direction)
{
	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(sglist, sg, sg_dma_len, i) {
		ib_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
					   sg_dma_len(sg), direction);
	}
}
#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu

static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
						 struct scatterlist *sglist,
						 unsigned int sg_dma_len,
						 int direction)
{
	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(sglist, sg, sg_dma_len, i) {
		ib_dma_sync_single_for_device(dev, sg_dma_address(sg),
					      sg_dma_len(sg), direction);
	}
}
#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device


/* ib.c */
extern struct rds_transport rds_ib_transport;
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;

extern unsigned int rds_ib_retry_count;

extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;

/* ib_cm.c */
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
void rds_ib_conn_free(void *arg);
int rds_ib_conn_path_connect(struct rds_conn_path *cp);
void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
void rds_ib_state_change(struct sock *sk);
int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
__printf(2, 3)
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
			     struct rdma_cm_event *event, bool isv6);
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
				struct rdma_cm_event *event);

#define rds_ib_conn_error(conn, fmt...) \
	__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
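
/*
 * Example usage (illustrative): the macro prepends the log level and
 * "RDS/IB: " prefix, so a caller passes only the message, e.g.
 *
 *	rds_ib_conn_error(conn, "failed to post send WR: %d\n", ret);
 */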

/* ib_rdma.c */
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
			 struct in6_addr *ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);

/* ib_recv.c */
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv_path(struct rds_conn_path *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc,
			     struct rds_ib_ack_state *state);
void rds_ib_recv_tasklet_fn(unsigned long data);
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
void rds_ib_attempt_ack(struct rds_ib_connection *ic);
void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required);

/* ib_ring.c */
void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
int rds_ib_ring_low(struct rds_ib_work_ring *ring);
u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;

/* ib_send.c */
void rds_ib_xmit_path_complete(struct rds_conn_path *cp);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
			     u32 *adv_credits, int need_posted, int max_posted);
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);

/* ib_stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
#define rds_ib_stats_add(member, count) \
	rds_stats_add_which(rds_ib_stats, member, count)
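
/*
 * Example (illustrative): counters are bumped by struct member name,
 * e.g.
 *	rds_ib_stats_inc(s_ib_tx_ring_full);
 *	rds_ib_stats_add(s_ib_rx_total_frags, nr_frags);
 */
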
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
				    unsigned int avail);

/* ib_sysctl.c */
int rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr;
extern unsigned long rds_ib_sysctl_max_unsig_wrs;
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;

#endif