// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
        .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
        .list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */

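/* move the QP into the INIT state: bind it to its port and pkey and
 * allow local and remote RDMA writes on it
 */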
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_INIT;
        qp_attr.pkey_index = 0;
        qp_attr.port_num = lnk->ibport;
        qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE |
                                  IB_ACCESS_REMOTE_WRITE;
        return ib_modify_qp(lnk->roce_qp, &qp_attr,
                            IB_QP_STATE | IB_QP_PKEY_INDEX |
                            IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

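/* move the QP from INIT to RTR (ready to receive): set up the address
 * vector from the peer's GID and MAC - for routed SMC-Rv2 traffic the
 * nexthop MAC and a real hop limit are used instead - plus the peer's
 * QP number and its initial receive packet sequence number
 */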
static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
        enum ib_qp_attr_mask qp_attr_mask =
                IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
                IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
        struct ib_qp_attr qp_attr;
        u8 hop_lim = 1;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RTR;
        qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
        qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
        rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
        if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
                hop_lim = IPV6_DEFAULT_HOPLIMIT;
        rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
        rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
        if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
                memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
                       sizeof(lnk->lgr->nexthop_mac));
        else
                memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
                       sizeof(lnk->peer_mac));
        qp_attr.dest_qp_num = lnk->peer_qpn;
        qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
        qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
                                         * requests
                                         */
        qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

        return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

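/* move the QP from RTR to RTS (ready to send): set the retry counts,
 * the local ack timeout and the initial send packet sequence number
 */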
int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RTS;
        qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
        qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
        qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
        qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
        qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
                                         * atomic ops allowed
                                         */
        return ib_modify_qp(lnk->roce_qp, &qp_attr,
                            IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
                            IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
                            IB_QP_MAX_QP_RD_ATOMIC);
}

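/* move the QP into the error state, flushing all posted work requests */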
int smc_ib_modify_qp_error(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_ERR;
        return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

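/* bring up the QP of a new link: INIT -> RTR, arm the receive completion
 * queue and post the initial receive work requests; the server side also
 * moves to RTS here, while a client does so only later through
 * smc_ib_modify_qp_rts() once the peer has confirmed the link
 */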
int smc_ib_ready_link(struct smc_link *lnk)
{
        struct smc_link_group *lgr = smc_get_lgr(lnk);
        int rc = 0;

        rc = smc_ib_modify_qp_init(lnk);
        if (rc)
                goto out;

        rc = smc_ib_modify_qp_rtr(lnk);
        if (rc)
                goto out;
        smc_wr_remember_qp_attr(lnk);
        rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
                              IB_CQ_SOLICITED_MASK);
        if (rc)
                goto out;
        rc = smc_wr_rx_post_init(lnk);
        if (rc)
                goto out;
        smc_wr_remember_qp_attr(lnk);

        if (lgr->role == SMC_SERV) {
                rc = smc_ib_modify_qp_rts(lnk);
                if (rc)
                        goto out;
                smc_wr_remember_qp_attr(lnk);
        }
out:
        return rc;
}

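/* read the MAC of an ibport from entry 0 of its GID table */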
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
        const struct ib_gid_attr *attr;
        int rc;

        attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
        if (IS_ERR(attr))
                return -ENODEV;

        rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
        rdma_put_gid_attr(attr);
        return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
                                                u8 ibport)
{
        memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
               sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
        return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
        get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
        return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

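/* resolve the IPv4 route towards an SMC-Rv2 peer; on success return the
 * MAC of the nexthop and whether the peer is reached through a gateway
 */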
int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr,
                      u8 nexthop_mac[], u8 *uses_gateway)
{
        struct neighbour *neigh = NULL;
        struct rtable *rt = NULL;
        struct flowi4 fl4 = {
                .saddr = saddr,
                .daddr = daddr
        };

        if (daddr == cpu_to_be32(INADDR_NONE))
                goto out;
        rt = ip_route_output_flow(net, &fl4, NULL);
        if (IS_ERR(rt))
                goto out;
        if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
                goto out;
        neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
        if (neigh) {
                memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
                *uses_gateway = rt->rt_uses_gateway;
                return 0;
        }
out:
        return -ENOENT;
}

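/* check whether one GID table entry is usable: SMC-Rv1 accepts any RoCE
 * v1 entry, SMC-Rv2 requires a RoCE v2 (UDP encap) entry whose IPv4
 * address shares a subnet with the local address and, if the peer is
 * already known, to which a route can be found
 */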
static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
                                    const struct ib_gid_attr *attr,
                                    u8 gid[], u8 *sgid_index,
                                    struct smc_init_info_smcrv2 *smcrv2)
{
        if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
                if (gid)
                        memcpy(gid, &attr->gid, SMC_GID_SIZE);
                if (sgid_index)
                        *sgid_index = attr->index;
                return 0;
        }
        if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
            smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
                struct in_device *in_dev = __in_dev_get_rcu(ndev);
                struct net *net = dev_net(ndev);
                const struct in_ifaddr *ifa;
                bool subnet_match = false;

                if (!in_dev)
                        goto out;
                in_dev_for_each_ifa_rcu(ifa, in_dev) {
                        if (!inet_ifa_match(smcrv2->saddr, ifa))
                                continue;
                        subnet_match = true;
                        break;
                }
                if (!subnet_match)
                        goto out;
                if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr,
                                                       smcrv2->daddr,
                                                       smcrv2->nexthop_mac,
                                                       &smcrv2->uses_gateway))
                        goto out;

                if (gid)
                        memcpy(gid, &attr->gid, SMC_GID_SIZE);
                if (sgid_index)
                        *sgid_index = attr->index;
                return 0;
        }
out:
        return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
                         unsigned short vlan_id, u8 gid[], u8 *sgid_index,
                         struct smc_init_info_smcrv2 *smcrv2)
{
        const struct ib_gid_attr *attr;
        const struct net_device *ndev;
        int i;

        for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
                attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
                if (IS_ERR(attr))
                        continue;

                rcu_read_lock();
                ndev = rdma_read_gid_attr_ndev_rcu(attr);
                if (!IS_ERR(ndev) &&
                    ((!vlan_id && !is_vlan_dev(ndev)) ||
                     (vlan_id && is_vlan_dev(ndev) &&
                      vlan_dev_vlan_id(ndev) == vlan_id))) {
                        if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
                                                      sgid_index, smcrv2)) {
                                rcu_read_unlock();
                                rdma_put_gid_attr(attr);
                                return 0;
                        }
                }
                rcu_read_unlock();
                rdma_put_gid_attr(attr);
        }
        return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
                                  struct smc_ib_device *smcibdev, u8 ibport)
{
        const struct ib_gid_attr *attr;
        bool rc = false;
        int i;

        for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
                attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
                if (IS_ERR(attr))
                        continue;

                rcu_read_lock();
                if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
                    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
                     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
                       & IPV6_ADDR_LINKLOCAL)))
                        if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
                                rc = true;
                rcu_read_unlock();
                rdma_put_gid_attr(attr);
        }
        return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
        struct smc_link_group *lgr;
        int i;

        spin_lock_bh(&smc_lgr_list.lock);
        list_for_each_entry(lgr, &smc_lgr_list.list, list) {
                if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
                            SMC_MAX_PNETID_LEN))
                        continue; /* lgr is not affected */
                if (list_empty(&lgr->list))
                        continue;
                for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
                        if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
                            lgr->lnk[i].smcibdev != smcibdev)
                                continue;
                        if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
                                                   lgr->smc_version == SMC_V2,
                                                   smcibdev, ibport))
                                smcr_port_err(smcibdev, ibport);
                }
        }
        spin_unlock_bh(&smc_lgr_list.lock);
}

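/* (re-)read the attributes and the MAC of an ibport; the first MAC read
 * from an active port also seeds the local system identifier
 */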
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
        int rc;

        memset(&smcibdev->pattr[ibport - 1], 0,
               sizeof(smcibdev->pattr[ibport - 1]));
        rc = ib_query_port(smcibdev->ibdev, ibport,
                           &smcibdev->pattr[ibport - 1]);
        if (rc)
                goto out;
        /* the SMC protocol requires specification of the RoCE MAC address */
        rc = smc_ib_fill_mac(smcibdev, ibport);
        if (rc)
                goto out;
        if (!smc_ib_is_valid_local_systemid() &&
            smc_ib_port_active(smcibdev, ibport))
                /* create unique system identifier */
                smc_ib_define_local_systemid(smcibdev, ibport);
out:
        return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
        struct smc_ib_device *smcibdev = container_of(
                work, struct smc_ib_device, port_event_work);
        u8 port_idx;

        for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
                smc_ib_remember_port_attr(smcibdev, port_idx + 1);
                clear_bit(port_idx, &smcibdev->port_event_mask);
                if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
                        set_bit(port_idx, smcibdev->ports_going_away);
                        smcr_port_err(smcibdev, port_idx + 1);
                } else {
                        clear_bit(port_idx, smcibdev->ports_going_away);
                        smcr_port_add(smcibdev, port_idx + 1);
                        smc_ib_gid_check(smcibdev, port_idx + 1);
                }
        }
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
                                        struct ib_event *ibevent)
{
        struct smc_ib_device *smcibdev;
        bool schedule = false;
        u8 port_idx;

        smcibdev = container_of(handler, struct smc_ib_device, event_handler);

        switch (ibevent->event) {
        case IB_EVENT_DEVICE_FATAL:
                /* terminate all ports on device */
                for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
                        set_bit(port_idx, &smcibdev->port_event_mask);
                        if (!test_and_set_bit(port_idx,
                                              smcibdev->ports_going_away))
                                schedule = true;
                }
                if (schedule)
                        schedule_work(&smcibdev->port_event_work);
                break;
        case IB_EVENT_PORT_ACTIVE:
                port_idx = ibevent->element.port_num - 1;
                if (port_idx >= SMC_MAX_PORTS)
                        break;
                set_bit(port_idx, &smcibdev->port_event_mask);
                if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
                        schedule_work(&smcibdev->port_event_work);
                break;
        case IB_EVENT_PORT_ERR:
                port_idx = ibevent->element.port_num - 1;
                if (port_idx >= SMC_MAX_PORTS)
                        break;
                set_bit(port_idx, &smcibdev->port_event_mask);
                if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
                        schedule_work(&smcibdev->port_event_work);
                break;
        case IB_EVENT_GID_CHANGE:
                port_idx = ibevent->element.port_num - 1;
                if (port_idx >= SMC_MAX_PORTS)
                        break;
                set_bit(port_idx, &smcibdev->port_event_mask);
                schedule_work(&smcibdev->port_event_work);
                break;
        default:
                break;
        }
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
        if (lnk->roce_pd)
                ib_dealloc_pd(lnk->roce_pd);
        lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
        int rc;

        lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
        rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
        if (IS_ERR(lnk->roce_pd))
                lnk->roce_pd = NULL;
        return rc;
}

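/* report a device as critical when a non-SMC-D link group uses one of
 * its links while being of type SINGLE or ASYMMETRIC_LOCAL, i.e. without
 * a fully redundant second path
 */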
static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
                                      struct smc_ib_device *smcibdev)
{
        struct smc_link_group *lgr;
        bool rc = false;
        int i;

        spin_lock_bh(&smc_lgr->lock);
        list_for_each_entry(lgr, &smc_lgr->list, list) {
                if (lgr->is_smcd)
                        continue;
                for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
                        if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
                            lgr->lnk[i].smcibdev != smcibdev)
                                continue;
                        if (lgr->type == SMC_LGR_SINGLE ||
                            lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
                                rc = true;
                                goto out;
                        }
                }
        }
out:
        spin_unlock_bh(&smc_lgr->lock);
        return rc;
}

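/* add one nested netlink attribute block per ibport: pnetid and its
 * origin, netdev ifindex, port state and number of links on the port
 */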
static int smc_nl_handle_dev_port(struct sk_buff *skb,
                                  struct ib_device *ibdev,
                                  struct smc_ib_device *smcibdev,
                                  int port)
{
        char smc_pnet[SMC_MAX_PNETID_LEN + 1];
        struct nlattr *port_attrs;
        unsigned char port_state;
        int lnk_count = 0;

        port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
        if (!port_attrs)
                goto errout;

        if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
                       smcibdev->pnetid_by_user[port]))
                goto errattr;
        memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
        smc_pnet[SMC_MAX_PNETID_LEN] = 0;
        if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
                goto errattr;
        if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
                        smcibdev->ndev_ifidx[port]))
                goto errattr;
        if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
                goto errattr;
        port_state = smc_ib_port_active(smcibdev, port + 1);
        if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
                goto errattr;
        lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
        if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
                goto errattr;
        nla_nest_end(skb, port_attrs);
        return 0;
errattr:
        nla_nest_cancel(skb, port_attrs);
errout:
        return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
                                     struct sk_buff *skb)
{
        if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
                return false;
        if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
                return false;
        if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
                return false;
        if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
                return false;
        if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
                return false;
        return true;
}

static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
                                  struct sk_buff *skb,
                                  struct netlink_callback *cb)
{
        char smc_ibname[IB_DEVICE_NAME_MAX];
        struct smc_pci_dev smc_pci_dev;
        struct pci_dev *pci_dev;
        unsigned char is_crit;
        struct nlattr *attrs;
        void *nlh;
        int i;

        nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                          &smc_gen_nl_family, NLM_F_MULTI,
                          SMC_NETLINK_GET_DEV_SMCR);
        if (!nlh)
                goto errmsg;
        attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
        if (!attrs)
                goto errout;
        is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
        if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
                goto errattr;
        if (smcibdev->ibdev->dev.parent) {
                memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
                pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
                smc_set_pci_values(pci_dev, &smc_pci_dev);
                if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
                        goto errattr;
        }
        snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
        if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
                goto errattr;
        for (i = 1; i <= SMC_MAX_PORTS; i++) {
                if (!rdma_is_port_valid(smcibdev->ibdev, i))
                        continue;
                if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
                                           smcibdev, i - 1))
                        goto errattr;
        }

        nla_nest_end(skb, attrs);
        genlmsg_end(skb, nlh);
        return 0;

errattr:
        nla_nest_cancel(skb, attrs);
errout:
        genlmsg_cancel(skb, nlh);
errmsg:
        return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
                                 struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
        struct smc_ib_device *smcibdev;
        int snum = cb_ctx->pos[0];
        int num = 0;

        mutex_lock(&dev_list->mutex);
        list_for_each_entry(smcibdev, &dev_list->list, list) {
                if (num < snum)
                        goto next;
                if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
                        goto errout;
next:
                num++;
        }
errout:
        mutex_unlock(&dev_list->mutex);
        cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
        smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
        return skb->len;
}

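/* QP event handler: on fatal and access errors mark the QP's port as
 * going away and let the port event worker terminate the affected links
 */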
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
        struct smc_link *lnk = (struct smc_link *)priv;
        struct smc_ib_device *smcibdev = lnk->smcibdev;
        u8 port_idx;

        switch (ibevent->event) {
        case IB_EVENT_QP_FATAL:
        case IB_EVENT_QP_ACCESS_ERR:
                port_idx = ibevent->element.qp->port - 1;
                if (port_idx >= SMC_MAX_PORTS)
                        break;
                set_bit(port_idx, &smcibdev->port_event_mask);
                if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
                        schedule_work(&smcibdev->port_event_work);
                break;
        default:
                break;
        }
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
        if (lnk->roce_qp)
                ib_destroy_qp(lnk->roce_qp);
        lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
        int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
        struct ib_qp_init_attr qp_attr = {
                .event_handler = smc_ib_qp_event_handler,
                .qp_context = lnk,
                .send_cq = lnk->smcibdev->roce_cq_send,
                .recv_cq = lnk->smcibdev->roce_cq_recv,
                .srq = NULL,
                .cap = {
                                /* include unsolicited rdma_writes as well,
                                 * there are max. 2 RDMA_WRITE per 1 WR_SEND
                                 */
                        .max_send_wr = SMC_WR_BUF_CNT * 3,
                        .max_recv_wr = SMC_WR_BUF_CNT * 3,
                        .max_send_sge = SMC_IB_MAX_SEND_SGE,
                        .max_recv_sge = sges_per_buf,
                        .max_inline_data = 0,
                },
                .sq_sig_type = IB_SIGNAL_REQ_WR,
                .qp_type = IB_QPT_RC,
        };
        int rc;

        lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
        rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
        if (IS_ERR(lnk->roce_qp))
                lnk->roce_qp = NULL;
        else
                smc_wr_remember_qp_attr(lnk);
        return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
        ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
        unsigned int offset = 0;
        int sg_num;

        /* map the largest prefix of a dma mapped SG list */
        sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
                              buf_slot->sgt[link_idx].sgl,
                              buf_slot->sgt[link_idx].orig_nents,
                              &offset, PAGE_SIZE);

        return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
                             struct smc_buf_desc *buf_slot, u8 link_idx)
{
        if (buf_slot->mr[link_idx])
                return 0; /* already done */

        buf_slot->mr[link_idx] =
                ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
        if (IS_ERR(buf_slot->mr[link_idx])) {
                int rc;

                rc = PTR_ERR(buf_slot->mr[link_idx]);
                buf_slot->mr[link_idx] = NULL;
                return rc;
        }

        if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
            buf_slot->sgt[link_idx].orig_nents)
                return -EINVAL;

        return 0;
}

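/* check whether any DMA address in the SG list of buf_slot requires
 * explicit cache syncing; callers cache the per-link result in
 * buf_slot->is_dma_need_sync, which the sync helpers below test
 */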
bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
                            struct smc_buf_desc *buf_slot)
{
        struct scatterlist *sg;
        unsigned int i;
        bool ret = false;

        /* for now there is just one DMA address */
        for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
                    buf_slot->sgt[lnk->link_idx].nents, i) {
                if (!sg_dma_len(sg))
                        break;
                if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
                                  sg_dma_address(sg))) {
                        ret = true;
                        goto out;
                }
        }

out:
        return ret;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
                            struct smc_buf_desc *buf_slot,
                            enum dma_data_direction data_direction)
{
        struct scatterlist *sg;
        unsigned int i;

        if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
                return;

        /* for now there is just one DMA address */
        for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
                    buf_slot->sgt[lnk->link_idx].nents, i) {
                if (!sg_dma_len(sg))
                        break;
                ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
                                           sg_dma_address(sg),
                                           sg_dma_len(sg),
                                           data_direction);
        }
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
                               struct smc_buf_desc *buf_slot,
                               enum dma_data_direction data_direction)
{
        struct scatterlist *sg;
        unsigned int i;

        if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
                return;

        /* for now there is just one DMA address */
        for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
                    buf_slot->sgt[lnk->link_idx].nents, i) {
                if (!sg_dma_len(sg))
                        break;
                ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
                                              sg_dma_address(sg),
                                              sg_dma_len(sg),
                                              data_direction);
        }
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
                      struct smc_buf_desc *buf_slot,
                      enum dma_data_direction data_direction)
{
        int mapped_nents;

        mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
                                     buf_slot->sgt[lnk->link_idx].sgl,
                                     buf_slot->sgt[lnk->link_idx].orig_nents,
                                     data_direction);
        if (!mapped_nents)
                return -ENOMEM;

        return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
                         struct smc_buf_desc *buf_slot,
                         enum dma_data_direction data_direction)
{
        if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
                return; /* already unmapped */

        ib_dma_unmap_sg(lnk->smcibdev->ibdev,
                        buf_slot->sgt[lnk->link_idx].sgl,
                        buf_slot->sgt[lnk->link_idx].orig_nents,
                        data_direction);
        buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

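/* one-time setup per IB device: create the send and receive completion
 * queues and register the device with the work request layer
 */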
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
        struct ib_cq_init_attr cqattr = {
                .cqe = SMC_MAX_CQE, .comp_vector = 0 };
        int cqe_size_order, smc_order;
        long rc;

        mutex_lock(&smcibdev->mutex);
        rc = 0;
        if (smcibdev->initialized)
                goto out;
        /* the calculated number of cq entries fits the mlx5 cq allocation */
        cqe_size_order = cache_line_size() == 128 ? 7 : 6;
        smc_order = MAX_ORDER - cqe_size_order;
        if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
                cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
        smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
                                              smc_wr_tx_cq_handler, NULL,
                                              smcibdev, &cqattr);
        rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
        if (IS_ERR(smcibdev->roce_cq_send)) {
                smcibdev->roce_cq_send = NULL;
                goto out;
        }
        smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
                                              smc_wr_rx_cq_handler, NULL,
                                              smcibdev, &cqattr);
        rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
        if (IS_ERR(smcibdev->roce_cq_recv)) {
                smcibdev->roce_cq_recv = NULL;
                goto err;
        }
        smc_wr_add_dev(smcibdev);
        smcibdev->initialized = 1;
        goto out;

err:
        ib_destroy_cq(smcibdev->roce_cq_send);
out:
        mutex_unlock(&smcibdev->mutex);
        return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
        mutex_lock(&smcibdev->mutex);
        if (!smcibdev->initialized)
                goto out;
        smcibdev->initialized = 0;
        ib_destroy_cq(smcibdev->roce_cq_recv);
        ib_destroy_cq(smcibdev->roce_cq_send);
        smc_wr_remove_dev(smcibdev);
out:
        mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

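/* cache the ifindex of the net_device bound to an ibport, if any */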
static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
        struct ib_device *ibdev = smcibdev->ibdev;
        struct net_device *ndev;

        if (!ibdev->ops.get_netdev)
                return;
        ndev = ibdev->ops.get_netdev(ibdev, port + 1);
        if (ndev) {
                smcibdev->ndev_ifidx[port] = ndev->ifindex;
                dev_put(ndev);
        }
}

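/* netdev (un)registration hook: keep the cached ifindex of every ibport
 * in sync with the net_device currently bound to it
 */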
void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
        struct smc_ib_device *smcibdev;
        struct ib_device *libdev;
        struct net_device *lndev;
        u8 port_cnt;
        int i;

        mutex_lock(&smc_ib_devices.mutex);
        list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
                port_cnt = smcibdev->ibdev->phys_port_cnt;
                for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
                        libdev = smcibdev->ibdev;
                        if (!libdev->ops.get_netdev)
                                continue;
                        lndev = libdev->ops.get_netdev(libdev, i + 1);
                        dev_put(lndev);
                        if (lndev != ndev)
                                continue;
                        if (event == NETDEV_REGISTER)
                                smcibdev->ndev_ifidx[i] = ndev->ifindex;
                        if (event == NETDEV_UNREGISTER)
                                smcibdev->ndev_ifidx[i] = 0;
                }
        }
        mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
        struct smc_ib_device *smcibdev;
        u8 port_cnt;
        int i;

        if (ibdev->node_type != RDMA_NODE_IB_CA)
                return -EOPNOTSUPP;

        smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
        if (!smcibdev)
                return -ENOMEM;

        smcibdev->ibdev = ibdev;
        INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
        atomic_set(&smcibdev->lnk_cnt, 0);
        init_waitqueue_head(&smcibdev->lnks_deleted);
        mutex_init(&smcibdev->mutex);
        mutex_lock(&smc_ib_devices.mutex);
        list_add_tail(&smcibdev->list, &smc_ib_devices.list);
        mutex_unlock(&smc_ib_devices.mutex);
        ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
        INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
                              smc_ib_global_event_handler);
        ib_register_event_handler(&smcibdev->event_handler);

        /* trigger reading of the port attributes */
        port_cnt = smcibdev->ibdev->phys_port_cnt;
        pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
                            smcibdev->ibdev->name, port_cnt);
        for (i = 0;
             i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
             i++) {
                set_bit(i, &smcibdev->port_event_mask);
                /* determine pnetids of the port */
                if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
                                           smcibdev->pnetid[i]))
                        smc_pnetid_by_table_ib(smcibdev, i + 1);
                smc_copy_netdev_ifindex(smcibdev, i);
                pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
                                    "%.16s%s\n",
                                    smcibdev->ibdev->name, i + 1,
                                    smcibdev->pnetid[i],
                                    smcibdev->pnetid_by_user[i] ?
                                    " (user defined)" :
                                    "");
        }
        schedule_work(&smcibdev->port_event_work);
        return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
        struct smc_ib_device *smcibdev = client_data;

        mutex_lock(&smc_ib_devices.mutex);
        list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
        mutex_unlock(&smc_ib_devices.mutex);
        pr_warn_ratelimited("smc: removing ib device %s\n",
                            smcibdev->ibdev->name);
        smc_smcr_terminate_all(smcibdev);
        smc_ib_cleanup_per_ibdev(smcibdev);
        ib_unregister_event_handler(&smcibdev->event_handler);
        cancel_work_sync(&smcibdev->port_event_work);
        kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
        .name	= "smc_ib",
        .add	= smc_ib_add_dev,
        .remove	= smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
        smc_ib_init_local_systemid();
        return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
        ib_unregister_client(&smc_ib_client);
}