/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/delay.h>
#include <linux/moduleparam.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>

#include <linux/ip.h>
#include <linux/tcp.h>
#include <rdma/ib_cache.h>

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

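/*
 * Wrap an ib_ah in a reference-counted ipoib_ah.  On failure the
 * ERR_PTR returned by rdma_create_ah() is propagated to the caller by
 * casting it to struct ipoib_ah *, so callers must check with IS_ERR().
 */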
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
				 struct ib_pd *pd, struct rdma_ah_attr *attr)
{
	struct ipoib_ah *ah;
	struct ib_ah *vah;

	ah = kmalloc(sizeof(*ah), GFP_KERNEL);
	if (!ah)
		return ERR_PTR(-ENOMEM);

	ah->dev       = dev;
	ah->last_send = 0;
	kref_init(&ah->ref);

	vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
	if (IS_ERR(vah)) {
		kfree(ah);
		ah = (struct ipoib_ah *)vah;
	} else {
		ah->ah = vah;
		ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah);
	}

	return ah;
}

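/*
 * kref release callback for an ipoib_ah.  The AH is not destroyed
 * immediately: sends that were already posted may still reference it,
 * so it is queued on priv->dead_ahs and freed by the reaper once
 * priv->tx_tail has passed ah->last_send (see ipoib_reap_dead_ahs()).
 */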
void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ipoib_priv(ah->dev);

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
				  u64 mapping[IPOIB_UD_RX_SG])
{
	ib_dma_unmap_single(priv->ca, mapping[0],
			    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
			    DMA_FROM_DEVICE);
}

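/*
 * Post one receive work request.  The ring index is encoded in the
 * wr_id together with the IPOIB_OP_RECV flag, which is how the
 * completion handlers tell receive completions apart from send ones.
 */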
static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	priv->rx_wr.wr_id    = id | IPOIB_OP_RECV;
	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];

	ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
		dev_kfree_skb_any(priv->rx_ring[id].skb);
		priv->rx_ring[id].skb = NULL;
	}

	return ret;
}

static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sk_buff *skb;
	int buf_size;
	u64 *mapping;

	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);

	skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
	if (unlikely(!skb))
		return NULL;

	/*
	 * The IP header will be at IPOIB_HARD_LEN + IB_GRH_BYTES, which
	 * is 64-byte aligned.
	 */
	skb_reserve(skb, sizeof(struct ipoib_pseudo_header));

	mapping = priv->rx_ring[id].mapping;
	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
		goto error;

	priv->rx_ring[id].skb = skb;
	return skb;
error:
	dev_kfree_skb_any(skb);
	return NULL;
}

static int ipoib_ib_post_receives(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_skb(dev, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct sk_buff *skb;
	u64 mapping[IPOIB_UD_RX_SG];
	union ib_gid *dgid;
	union ib_gid *sgid;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	skb = priv->rx_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			ipoib_warn(priv,
				   "failed recv event (status=%d, wrid=%d vend_err %#x)\n",
				   wc->status, wr_id, wc->vendor_err);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
		dev_kfree_skb_any(skb);
		priv->rx_ring[wr_id].skb = NULL;
		return;
	}

	memcpy(mapping, priv->rx_ring[wr_id].mapping,
	       IPOIB_UD_RX_SG * sizeof(*mapping));

	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_ud_dma_unmap_rx(priv, mapping);

	skb_put(skb, wc->byte_len);

	/* First byte of dgid signals multicast when 0xff */
	dgid = &((struct ib_grh *)skb->data)->dgid;

	if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
		skb->pkt_type = PACKET_HOST;
	else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
		skb->pkt_type = PACKET_BROADCAST;
	else
		skb->pkt_type = PACKET_MULTICAST;

	sgid = &((struct ib_grh *)skb->data)->sgid;

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) {
		int need_repost = 1;

		if ((wc->wc_flags & IB_WC_GRH) &&
		    sgid->global.interface_id != priv->local_gid.global.interface_id)
			need_repost = 0;

		if (need_repost) {
			dev_kfree_skb_any(skb);
			goto repost;
		}
	}

	skb_pull(skb, IB_GRH_BYTES);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_add_pseudo_hdr(skb);

	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;
	if (skb->pkt_type == PACKET_MULTICAST)
		dev->stats.multicast++;

	skb->dev = dev;
	if ((dev->features & NETIF_F_RXCSUM) &&
	    likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	napi_gro_receive(&priv->recv_napi, skb);

repost:
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n",
			   wr_id);
}

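/*
 * DMA-map an skb for sending.  mapping[0] holds the address of the
 * linear header (when skb_headlen() is non-zero) and the page frags
 * follow at mapping[off], where 'off' accounts for the optional head.
 */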
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
					       DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
			return -EIO;

		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		mapping[i + off] = ib_dma_map_page(ca,
						   skb_frag_page(frag),
						   skb_frag_off(frag),
						   skb_frag_size(frag),
						   DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
			goto partial_error;
	}
	return 0;

partial_error:
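	/* Frag j was mapped at mapping[j + off], so frag (i - 1) lives at
	 * mapping[i - 1 + off], i.e. mapping[i - !off]. */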
	for (; i > 0; --i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
	}

	if (off)
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);

	return -EIO;
}

void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
			struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
				    DMA_TO_DEVICE);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		ib_dma_unmap_page(priv->ca, mapping[i + off],
				  skb_frag_size(frag), DMA_TO_DEVICE);
	}
}

/*
 * As a result of a completion error the QP can be transitioned to the
 * SQE state.  This function checks whether the (send) QP is in the SQE
 * state and, if so, moves it back to RTS so that it is functional again.
 */
static void ipoib_qp_state_validate_work(struct work_struct *work)
{
	struct ipoib_qp_state_validate *qp_work =
		container_of(work, struct ipoib_qp_state_validate, work);

	struct ipoib_dev_priv *priv = qp_work->priv;
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
			   __func__, ret);
		goto free_res;
	}
	pr_info("%s: QP: 0x%x is in state: %d\n",
		__func__, priv->qp->qp_num, qp_attr.qp_state);

	/* currently we only support the SQE->RTS transition */
	if (qp_attr.qp_state == IB_QPS_SQE) {
		qp_attr.qp_state = IB_QPS_RTS;

		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
		if (ret) {
			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
				ret, priv->qp->qp_num);
			goto free_res;
		}
		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
			__func__, priv->qp->qp_num);
	} else {
		pr_warn("QP (%d) will stay in state: %d\n",
			priv->qp->qp_num, qp_attr.qp_state);
	}

free_res:
	kfree(qp_work);
}

static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv, tx_req);

	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	++priv->tx_tail;
	++priv->global_tx_tail;

	if (unlikely(netif_queue_stopped(dev) &&
		     ((priv->global_tx_head - priv->global_tx_tail) <=
		      ipoib_sendq_size >> 1) &&
		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
		netif_wake_queue(dev);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_qp_state_validate *qp_work;
		ipoib_warn(priv,
			   "failed send event (status=%d, wrid=%d vend_err %#x)\n",
			   wc->status, wr_id, wc->vendor_err);
		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
		if (!qp_work)
			return;

		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
		qp_work->priv = priv;
		queue_work(priv->wq, &qp_work->work);
	}
}

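/*
 * Drain up to MAX_SEND_CQE send completions.  Returns non-zero when the
 * CQ was drained to capacity and may therefore still hold more entries,
 * so the caller should poll again.
 */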
static int poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;
	struct ib_wc *wc;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i);
		else
			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
	}
	return n == MAX_SEND_CQE;
}

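/*
 * NAPI receive poll.  When the budget is not exhausted, the CQ is
 * re-armed with IB_CQ_REPORT_MISSED_EVENTS: a non-zero return from
 * ib_req_notify_cq() means completions arrived between the last poll
 * and the re-arm, so polling resumes instead of missing them.
 */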
int ipoib_rx_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv =
		container_of(napi, struct ipoib_dev_priv, recv_napi);
	struct net_device *dev = priv->dev;
	int done;
	int t;
	int n, i;

	done = 0;

poll_more:
	while (done < budget) {
		int max = (budget - done);

		t = min(IPOIB_NUM_WC, max);
		n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if (wc->wr_id & IPOIB_OP_RECV) {
				++done;
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, wc);
				else
					ipoib_ib_handle_rx_wc(dev, wc);
			} else {
				pr_warn("%s: Got unexpected wqe id\n", __func__);
			}
		}

		if (n != t)
			break;
	}

	if (done < budget) {
		napi_complete(napi);
		if (unlikely(ib_req_notify_cq(priv->recv_cq,
					      IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    napi_schedule(napi))
			goto poll_more;
	}

	return done;
}

int ipoib_tx_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv,
						   send_napi);
	struct net_device *dev = priv->dev;
	int n, i;
	struct ib_wc *wc;

poll_more:
	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);

	for (i = 0; i < n; i++) {
		wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(dev, wc);
		else
			ipoib_ib_handle_tx_wc(dev, wc);
	}

	if (n < budget) {
		napi_complete(napi);
		if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    napi_schedule(napi))
			goto poll_more;
	}
	return n < 0 ? 0 : n;
}

void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr)
{
	struct ipoib_dev_priv *priv = ctx_ptr;

	napi_schedule(&priv->recv_napi);
}

/* Keep retrying until napi_schedule() succeeds */
void ipoib_napi_schedule_work(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, reschedule_napi_work);
	bool ret;

	do {
		ret = napi_schedule(&priv->send_napi);
		if (!ret)
			msleep(3);
	} while (!ret && netif_queue_stopped(priv->dev) &&
		 test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags));
}

void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr)
{
	struct ipoib_dev_priv *priv = ctx_ptr;
	bool ret;

	ret = napi_schedule(&priv->send_napi);
	/*
	 * If the queue is stopped, the driver must be able to schedule
	 * NAPI; otherwise the queue could stay stopped forever, since
	 * there are no new packets to send and the NAPI callback might
	 * not get a new event after re-arming the CQ.
	 */
	if (!ret && netif_queue_stopped(priv->dev))
		schedule_work(&priv->reschedule_napi_work);
}

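/*
 * Post one send work request.  For GSO skbs the caller passes the
 * protocol headers separately in 'head'; the WR is then posted as
 * IB_WR_LSO so the HCA performs the segmentation.  Otherwise a plain
 * IB_WR_SEND is used.
 */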
static inline int post_send(struct ipoib_dev_priv *priv,
			    unsigned int wr_id,
			    struct ib_ah *address, u32 dqpn,
			    struct ipoib_tx_buf *tx_req,
			    void *head, int hlen)
{
	struct sk_buff *skb = tx_req->skb;

	ipoib_build_sge(priv, tx_req);

	priv->tx_wr.wr.wr_id   = wr_id;
	priv->tx_wr.remote_qpn = dqpn;
	priv->tx_wr.ah	       = address;

	if (head) {
		priv->tx_wr.mss	      = skb_shinfo(skb)->gso_size;
		priv->tx_wr.header    = head;
		priv->tx_wr.hlen      = hlen;
		priv->tx_wr.wr.opcode = IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode = IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL);
}

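/*
 * The TX ring is indexed with tx_head & (ipoib_sendq_size - 1), which
 * relies on ipoib_sendq_size being a power of two (the module init code
 * rounds it up).  tx_head only ever advances; the completion handler
 * advances tx_tail, and the difference between the global head and tail
 * counters drives the netif queue state.
 */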
int ipoib_send(struct net_device *dev, struct sk_buff *skb,
	       struct ib_ah *address, u32 dqpn)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int hlen, rc;
	void *phead;
	unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb);

	if (skb_is_gso(skb)) {
		hlen = skb_tcp_all_headers(skb);
		phead = skb->data;
		if (unlikely(!skb_pull(skb, hlen))) {
			ipoib_warn(priv, "linear data too small\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	} else {
		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
			return -1;
		}
		phead = NULL;
		hlen = 0;
	}
	if (skb_shinfo(skb)->nr_frags > usable_sge) {
		if (skb_linearize(skb) < 0) {
			ipoib_warn(priv, "skb could not be linearized\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
		/* Does skb_linearize return ok without reducing nr_frags? */
		if (skb_shinfo(skb)->nr_frags > usable_sge) {
			ipoib_warn(priv, "too many frags after skb linearize\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	}

	ipoib_dbg_data(priv,
		       "sending packet, length=%d address=%p dqpn=0x%06x\n",
		       skb->len, address, dqpn);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return -1;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
	/* increase the tx_head after send success, but use it for queue state */
	if ((priv->global_tx_head - priv->global_tx_tail) ==
	    ipoib_sendq_size - 1) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		netif_stop_queue(dev);
	}

	skb_orphan(skb);
	skb_dst_drop(skb);

	if (netif_queue_stopped(dev))
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
				     IB_CQ_REPORT_MISSED_EVENTS) < 0)
			ipoib_warn(priv, "request notify on send CQ failed\n");

	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
		       address, dqpn, tx_req, phead, hlen);
	if (unlikely(rc)) {
		ipoib_warn(priv, "post_send failed, error %d\n", rc);
		++dev->stats.tx_errors;
		ipoib_dma_unmap_tx(priv, tx_req);
		dev_kfree_skb_any(skb);
		if (netif_queue_stopped(dev))
			netif_wake_queue(dev);
		rc = 0;
	} else {
		netif_trans_update(dev);

		rc = priv->tx_head;
		++priv->tx_head;
		++priv->global_tx_head;
	}
	return rc;
}

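/*
 * Free dead AHs whose last_send has been overtaken by tx_tail, i.e.
 * every send posted while the AH was live has completed.  The signed
 * subtraction keeps the comparison correct across counter wraparound.
 */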
static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv)
{
	struct ipoib_ah *ah, *tah;
	unsigned long flags;

	netif_tx_lock_bh(priv->dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			rdma_destroy_ah(ah->ah, 0);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(priv->dev);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);

	ipoib_reap_dead_ahs(priv);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(priv->wq, &priv->ah_reap_task,
				   round_jiffies_relative(HZ));
}

static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv)
{
	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(priv->wq, &priv->ah_reap_task,
			   round_jiffies_relative(HZ));
}

static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
{
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	/*
	 * After ipoib_stop_ah_reaper() we always go through
	 * ipoib_reap_dead_ahs() which ensures the work is really stopped and
	 * does a final flush out of the dead_ah's list
	 */
}

static int recvs_pending(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].skb)
			++pending;

	return pending;
}

static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
					struct ib_qp *qp,
					enum ib_qp_state new_state)
{
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP\n", __func__);
		return;
	}
	/* print according to the new state and the previous state */
	if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET)
		ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n");
	else
		ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n",
			   new_state, qp_attr.qp_state);
}

static void ipoib_napi_enable(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	napi_enable(&priv->recv_napi);
	napi_enable(&priv->send_napi);
}

static void ipoib_napi_disable(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	napi_disable(&priv->recv_napi);
	napi_disable(&priv->send_napi);
}

int ipoib_ib_dev_stop_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		ipoib_napi_disable(dev);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize it when
	 * all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv,
				   "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail,
				   recvs_pending(dev));

			/*
			 * assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int)priv->tx_tail - (int)priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv, tx_req);
				dev_kfree_skb_any(tx_req->skb);
				++priv->tx_tail;
				++priv->global_tx_tail;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->skb)
					continue;
				ipoib_ud_dma_unmap_rx(priv,
						      priv->rx_ring[i].mapping);
				dev_kfree_skb_any(rx_req->skb);
				rx_req->skb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(dev);

		usleep_range(1000, 2000);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_open_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	ret = ipoib_init_qp(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		goto out;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		goto out;
	}

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		ipoib_napi_enable(dev);

	return 0;
out:
	return -1;
}

int ipoib_ib_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
			   (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
		return -1;
	}

	ipoib_start_ah_reaper(priv);
	if (priv->rn_ops->ndo_open(dev)) {
		pr_warn("%s: Failed to open dev\n", dev->name);
		goto dev_stop;
	}

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;

dev_stop:
	ipoib_stop_ah_reaper(priv);
	return -1;
}

void ipoib_ib_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	priv->rn_ops->ndo_stop(dev);

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
	ipoib_stop_ah_reaper(priv);
}

void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);

	if (!(priv->pkey & 0x7fff) ||
	    ib_find_pkey(priv->ca, priv->port, priv->pkey,
			 &priv->pkey_index)) {
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	} else {
		if (rn->set_id)
			rn->set_id(dev, priv->pkey_index);
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	}
}

void ipoib_ib_dev_up(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	ipoib_mcast_start_thread(dev);
}

void ipoib_ib_dev_down(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	netif_carrier_off(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	ipoib_flush_paths(dev);
}

void ipoib_drain_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i, n;

	/*
	 * We call completion handling routines that expect to be
	 * called from the BH-disabled NAPI poll context, so disable
	 * BHs here too.
	 */
	local_bh_disable();

	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
			} else {
				pr_warn("%s: Got unexpected wqe id\n", __func__);
			}
		}
	} while (n == IPOIB_NUM_WC);

	while (poll_tx(priv))
		; /* nothing */

	local_bh_enable();
}

/*
 * Takes whatever value is in pkey index 0 and updates priv->pkey.
 * Returns 0 if the pkey value was changed.
 */
static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
{
	int result;
	u16 prev_pkey;

	prev_pkey = priv->pkey;
	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
	if (result) {
		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
			   priv->port, result);
		return result;
	}

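	/* 0x8000 is the full-membership bit of a P_Key */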
	priv->pkey |= 0x8000;

	if (prev_pkey != priv->pkey) {
		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
			  prev_pkey, priv->pkey);
		/*
		 * Update the pkey in the broadcast address, while making sure to set
		 * the full membership bit, so that we join the right broadcast group.
		 */
		priv->dev->broadcast[8] = priv->pkey >> 8;
		priv->dev->broadcast[9] = priv->pkey & 0xff;
		return 0;
	}

	return 1;
}
/*
 * returns 0 if pkey value was found in a different slot.
 */
static inline int update_child_pkey(struct ipoib_dev_priv *priv)
{
	u16 old_index = priv->pkey_index;

	priv->pkey_index = 0;
	ipoib_pkey_dev_check_presence(priv->dev);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
	    (old_index == priv->pkey_index))
		return 1;
	return 0;
}

/*
 * Returns true if the device address of the ipoib interface has changed
 * and the new address is a valid one (i.e. it is in the gid table),
 * returns false otherwise.
 */
static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
{
	union ib_gid search_gid;
	union ib_gid gid0;
	int err;
	u16 index;
	u32 port;
	bool ret = false;

	if (rdma_query_gid(priv->ca, priv->port, 0, &gid0))
		return false;

	netif_addr_lock_bh(priv->dev);

	/* The subnet prefix may have changed, update it now so we won't have
	 * to do it later
	 */
	priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
	dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix,
		     sizeof(gid0.global.subnet_prefix));
	search_gid.global.subnet_prefix = gid0.global.subnet_prefix;

	search_gid.global.interface_id = priv->local_gid.global.interface_id;

	netif_addr_unlock_bh(priv->dev);

	err = ib_find_gid(priv->ca, &search_gid, &port, &index);

	netif_addr_lock_bh(priv->dev);

	if (search_gid.global.interface_id !=
	    priv->local_gid.global.interface_id)
		/* There was a change while we were looking up the gid, bail
		 * here and let the next work sort this out
		 */
		goto out;

	/* The next section of code needs some background:
	 * Per IB spec the port GUID can't change if the HCA is powered on.
	 * The port GUID is the basis for the GID at index 0, which is the
	 * basis for the default device address of an ipoib interface.
	 *
	 * so it seems the flow should be:
	 * if user_changed_dev_addr && gid in gid tbl
	 *	set bit dev_addr_set
	 *	return true
	 * else
	 *	return false
	 *
	 * The issue is that there are devices that don't follow the spec,
	 * they change the port GUID when the HCA is powered, so in order
	 * not to break userspace applications, we need to check if the
	 * user wanted to control the device address.  We assume that if
	 * the user sets the device address back to be based on GID index 0,
	 * they no longer wish to control it.
	 *
	 * If the user doesn't control the device address and
	 * IPOIB_FLAG_DEV_ADDR_SET is set, then ib_find_gid() failing means
	 * the port GUID has changed and the GID at index 0 has changed,
	 * so we need to change priv->local_gid and priv->dev->dev_addr
	 * to reflect the new GID.
	 */
	if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
		if (!err && port == priv->port) {
			set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
			if (index == 0)
				clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
					  &priv->flags);
			else
				set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
			ret = true;
		} else {
			ret = false;
		}
	} else {
		if (!err && port == priv->port) {
			ret = true;
		} else {
			if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
				memcpy(&priv->local_gid, &gid0,
				       sizeof(priv->local_gid));
				dev_addr_mod(priv->dev, 4, (u8 *)&gid0,
					     sizeof(priv->local_gid));
				ret = true;
			}
		}
	}

out:
	netif_addr_unlock_bh(priv->dev);

	return ret;
}

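/*
 * Core of the flush machinery.  IPOIB_FLUSH_LIGHT revalidates the
 * device address and multicast state, IPOIB_FLUSH_NORMAL additionally
 * brings the IB side of the device down, and IPOIB_FLUSH_HEAVY (used
 * e.g. on a P_Key change) fully stops and reopens the device.  Child
 * interfaces are flushed recursively first.
 */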
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				 enum ipoib_flush_level level,
				 int nesting)
{
	struct ipoib_dev_priv *cpriv;
	struct net_device *dev = priv->dev;
	int result;

	down_read_nested(&priv->vlan_rwsem, nesting);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level, nesting + 1);

	up_read(&priv->vlan_rwsem);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
	    level != IPOIB_FLUSH_HEAVY) {
		/* Make sure the dev_addr is set even if not flushing */
		if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		/* interface is down. update pkey and leave. */
		if (level == IPOIB_FLUSH_HEAVY) {
			if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
				update_parent_pkey(priv);
			else
				update_child_pkey(priv);
		} else if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		/* child devices chase their origin pkey value, while non-child
		 * (parent) devices should always take what is present in
		 * pkey index 0
		 */
		if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
			result = update_child_pkey(priv);
			if (result) {
				/* restart QP only if P_Key index is changed */
				ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
				return;
			}

		} else {
			result = update_parent_pkey(priv);
			/* restart QP only if P_Key value changed */
			if (result) {
				ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
				return;
			}
		}
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		int oper_up;
		ipoib_mark_paths_invalid(dev);
		/* Set IPoIB operation as down to prevent races between:
		 * the flush flow which leaves MCG and on the fly joins
		 * which can happen during that time. mcast restart task
		 * should deal with join requests we missed.
		 */
		oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_mcast_dev_flush(dev);
		if (oper_up)
			set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_reap_dead_ahs(priv);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(dev);

	if (level == IPOIB_FLUSH_HEAVY) {
		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
			ipoib_ib_dev_stop(dev);

		if (ipoib_ib_dev_open(dev))
			return;

		if (netif_queue_stopped(dev))
			netif_start_queue(dev);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here, don't bring it back up if it's not configured up
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(dev);
		if (ipoib_dev_addr_changed_valid(priv))
			ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	rtnl_lock();
	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
	rtnl_unlock();
}

void ipoib_ib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "cleaning up ib_dev\n");
	/*
	 * We must make sure there are no more (path) completions
	 * that may wish to touch priv fields that are no longer valid
	 */
	ipoib_flush_paths(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	/*
	 * All of our ah references aren't free until after
	 * ipoib_mcast_dev_flush(), ipoib_flush_paths, and
	 * the neighbor garbage collection is stopped and reaped.
	 * That should all be done now, so make a final ah flush.
	 */
	ipoib_reap_dead_ahs(priv);

	clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	priv->rn_ops->ndo_uninit(dev);

	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}
}