// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 * Copyright(c) 2021 Cornelis Networks.
 */

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/xarray.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
#include <linux/numa.h>
#include <rdma/rdma_vt.h>

#include "hfi.h"
#include "device.h"
#include "common.h"
#include "trace.h"
#include "mad.h"
#include "sdma.h"
#include "debugfs.h"
#include "verbs.h"
#include "aspm.h"
#include "affinity.h"
#include "vnic.h"
#include "exp_rcv.h"
#include "netdev.h"

#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt

/*
 * min buffers we want to have per context, after driver
 */
#define HFI1_MIN_USER_CTXT_BUFCNT 7

#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */

#define NUM_IB_PORTS 1
/*
 * Number of user receive contexts we are configured to use (to allow for more
 * pio buffers per ctxt, etc.) Zero means use one user context per CPU.
 */
int num_user_contexts = -1;
module_param_named(num_user_contexts, num_user_contexts, int, 0444);
MODULE_PARM_DESC(
        num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");

uint krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");

/* computed based on above array */
unsigned long n_krcvqs;

static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");

static uint eager_buffer_size = (8 << 20); /* 8MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");

static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");

static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");

unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");

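/*
 * Map of unit number -> devdata. XA_FLAGS_ALLOC enables ID allocation
 * via xa_alloc_irq(), and XA_FLAGS_LOCK_IRQ makes the internal lock
 * IRQ-safe for the irqsave users below.
 */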
DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

static int hfi1_create_kctxt(struct hfi1_devdata *dd,
                             struct hfi1_pportdata *ppd)
{
        struct hfi1_ctxtdata *rcd;
        int ret;

        /* Control context has to be always 0 */
        BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);

        ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
        if (ret < 0) {
                dd_dev_err(dd, "Kernel receive context allocation failed\n");
                return ret;
        }

        /*
         * Set up the kernel context flags here and now because they use
         * default values for all receive side memories. User contexts will
         * be handled as they are created.
         */
        rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
                HFI1_CAP_KGET(NODROP_RHQ_FULL) |
                HFI1_CAP_KGET(NODROP_EGR_FULL) |
                HFI1_CAP_KGET(DMA_RTAIL);

        /* Control context must use DMA_RTAIL */
        if (rcd->ctxt == HFI1_CTRL_CTXT)
                rcd->flags |= HFI1_CAP_DMA_RTAIL;
        rcd->fast_handler = get_dma_rtail_setting(rcd) ?
                                handle_receive_interrupt_dma_rtail :
                                handle_receive_interrupt_nodma_rtail;

        hfi1_set_seq_cnt(rcd, 1);

        rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
        if (!rcd->sc) {
                dd_dev_err(dd, "Kernel send context allocation failed\n");
                return -ENOMEM;
        }
        hfi1_init_ctxt(rcd->sc);

        return 0;
}

/*
 * Create the receive context array and one or more kernel contexts
 */
int hfi1_create_kctxts(struct hfi1_devdata *dd)
{
        u16 i;
        int ret;

        dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
                               GFP_KERNEL, dd->node);
        if (!dd->rcd)
                return -ENOMEM;

        for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
                ret = hfi1_create_kctxt(dd, dd->pport);
                if (ret)
                        goto bail;
        }

        return 0;
bail:
        for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
                hfi1_free_ctxt(dd->rcd[i]);

        /* All the contexts should be freed, free the array */
        kfree(dd->rcd);
        dd->rcd = NULL;
        return ret;
}

/*
 * Helper routines for the receive context reference count (rcd and uctxt).
 */
static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
{
        kref_init(&rcd->kref);
}

/**
 * hfi1_rcd_free - When reference is zero clean up.
 * @kref: pointer to an initialized rcd data structure
 *
 */
static void hfi1_rcd_free(struct kref *kref)
{
        unsigned long flags;
        struct hfi1_ctxtdata *rcd =
                container_of(kref, struct hfi1_ctxtdata, kref);

        spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
        rcd->dd->rcd[rcd->ctxt] = NULL;
        spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);

        hfi1_free_ctxtdata(rcd->dd, rcd);

        kfree(rcd);
}

/**
 * hfi1_rcd_put - decrement reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to put a reference after the init.
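 *
 * Return: the result of kref_put(): 1 if this was the last reference and
 * the context was freed, otherwise 0.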
 */
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
{
        if (rcd)
                return kref_put(&rcd->kref, hfi1_rcd_free);

        return 0;
}

/**
 * hfi1_rcd_get - increment reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to get a reference after the init.
 *
 * Return: reflects kref_get_unless_zero(), which returns non-zero on
 * increment, otherwise 0.
 */
int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
        return kref_get_unless_zero(&rcd->kref);
}

/**
 * allocate_rcd_index - allocate an rcd index from the rcd array
 * @dd: pointer to a valid devdata structure
 * @rcd: rcd data structure to assign
 * @index: pointer to index that is allocated
 *
 * Find an empty index in the rcd array, and assign the given rcd to it.
 * If the array is full, return -EBUSY.
 *
 */
static int allocate_rcd_index(struct hfi1_devdata *dd,
                              struct hfi1_ctxtdata *rcd, u16 *index)
{
        unsigned long flags;
        u16 ctxt;

        spin_lock_irqsave(&dd->uctxt_lock, flags);
        for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
                if (!dd->rcd[ctxt])
                        break;

        if (ctxt < dd->num_rcv_contexts) {
                rcd->ctxt = ctxt;
                dd->rcd[ctxt] = rcd;
                hfi1_rcd_init(rcd);
        }
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);

        if (ctxt >= dd->num_rcv_contexts)
                return -EBUSY;

        *index = ctxt;

        return 0;
}

/**
 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
 * array
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
 * ctxt index is valid.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
                                                 u16 ctxt)
{
        if (ctxt < dd->num_rcv_contexts)
                return hfi1_rcd_get_by_index(dd, ctxt);

        return NULL;
}

/**
 * hfi1_rcd_get_by_index - get by index
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * We need to protect access to the rcd array. If access is needed to
 * one or more index, get the protecting spinlock and then increment the
 * kref.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
        unsigned long flags;
        struct hfi1_ctxtdata *rcd = NULL;

        spin_lock_irqsave(&dd->uctxt_lock, flags);
        if (dd->rcd[ctxt]) {
                rcd = dd->rcd[ctxt];
                if (!hfi1_rcd_get(rcd))
                        rcd = NULL;
        }
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);

        return rcd;
}

/*
 * Common code for user and kernel context create and setup.
 * NOTE: the initial kref is done here (hfi1_rcd_init()).
 */
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
                         struct hfi1_ctxtdata **context)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct hfi1_ctxtdata *rcd;
        unsigned kctxt_ngroups = 0;
        u32 base;

        if (dd->rcv_entries.nctxt_extra >
            dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
                kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
                                 (dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
        rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
        if (rcd) {
                u32 rcvtids, max_entries;
                u16 ctxt;
                int ret;

                ret = allocate_rcd_index(dd, rcd, &ctxt);
                if (ret) {
                        *context = NULL;
                        kfree(rcd);
                        return ret;
                }

                INIT_LIST_HEAD(&rcd->qp_wait_list);
                hfi1_exp_tid_group_init(rcd);
                rcd->ppd = ppd;
                rcd->dd = dd;
                rcd->numa_id = numa;
                rcd->rcv_array_groups = dd->rcv_entries.ngroups;
                rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
                rcd->slow_handler = handle_receive_interrupt;
                rcd->do_interrupt = rcd->slow_handler;
                rcd->msix_intr = CCE_NUM_MSIX_VECTORS;

                mutex_init(&rcd->exp_mutex);
                spin_lock_init(&rcd->exp_lock);
                INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
                INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);

                hfi1_cdbg(PROC, "setting up context %u", rcd->ctxt);

                /*
                 * Calculate the context's RcvArray entry starting point.
                 * We do this here because we have to take into account all
                 * the RcvArray entries that previous contexts have taken
                 * and we have to account for any extra groups assigned
                 * to the static (kernel) or dynamic (vnic/user) contexts.
                 */
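		/*
		 * Illustration (shorthand names only): with G = ngroups
		 * groups per context and E = kctxt_ngroups extra groups,
		 * each of the first E kernel contexts gets G + 1 groups, so
		 * kernel context c < E starts at group c * (G + 1) while
		 * context c >= E starts at E + c * G. Dynamically allocated
		 * contexts start past all n_krcv_queues kernel groups and
		 * follow the same pattern using nctxt_extra.
		 */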
                if (ctxt < dd->first_dyn_alloc_ctxt) {
                        if (ctxt < kctxt_ngroups) {
                                base = ctxt * (dd->rcv_entries.ngroups + 1);
                                rcd->rcv_array_groups++;
                        } else {
                                base = kctxt_ngroups +
                                        (ctxt * dd->rcv_entries.ngroups);
                        }
                } else {
                        u16 ct = ctxt - dd->first_dyn_alloc_ctxt;

                        base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
                                kctxt_ngroups);
                        if (ct < dd->rcv_entries.nctxt_extra) {
                                base += ct * (dd->rcv_entries.ngroups + 1);
                                rcd->rcv_array_groups++;
                        } else {
                                base += dd->rcv_entries.nctxt_extra +
                                        (ct * dd->rcv_entries.ngroups);
                        }
                }
                rcd->eager_base = base * dd->rcv_entries.group_size;

                rcd->rcvhdrq_cnt = rcvhdrcnt;
                rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
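		/*
		 * rcvhdrqentsize is in 32-bit words and sizeof(u64) /
		 * sizeof(u32) == 2, so rhf_offset points at the last two
		 * dwords of each entry, where the 8-byte receive header
		 * flags (RHF) live.
		 */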
                rcd->rhf_offset =
                        rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
                /*
                 * Simple Eager buffer allocation: we have already pre-allocated
                 * the number of RcvArray entry groups. Each ctxtdata structure
                 * holds the number of groups for that context.
                 *
                 * To follow CSR requirements and maintain cacheline alignment,
                 * make sure all sizes and bases are multiples of group_size.
                 *
                 * The expected entry count is what is left after assigning
                 * eager.
                 */
                max_entries = rcd->rcv_array_groups *
                        dd->rcv_entries.group_size;
                rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
                rcd->egrbufs.count = round_down(rcvtids,
                                                dd->rcv_entries.group_size);
                if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
                        dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
                                   rcd->ctxt);
                        rcd->egrbufs.count = MAX_EAGER_ENTRIES;
                }
                hfi1_cdbg(PROC,
                          "ctxt%u: max Eager buffer RcvArray entries: %u",
                          rcd->ctxt, rcd->egrbufs.count);

                /*
                 * Allocate array that will hold the eager buffer accounting
                 * data.
                 * This will allocate the maximum possible buffer count based
                 * on the value of the RcvArray split parameter.
                 * The resulting value will be rounded down to the closest
                 * multiple of dd->rcv_entries.group_size.
                 */
                rcd->egrbufs.buffers =
                        kcalloc_node(rcd->egrbufs.count,
                                     sizeof(*rcd->egrbufs.buffers),
                                     GFP_KERNEL, numa);
                if (!rcd->egrbufs.buffers)
                        goto bail;
                rcd->egrbufs.rcvtids =
                        kcalloc_node(rcd->egrbufs.count,
                                     sizeof(*rcd->egrbufs.rcvtids),
                                     GFP_KERNEL, numa);
                if (!rcd->egrbufs.rcvtids)
                        goto bail;
                rcd->egrbufs.size = eager_buffer_size;
                /*
                 * The size of the buffers programmed into the RcvArray
                 * entries needs to be big enough to handle the highest
                 * MTU supported.
                 */
                if (rcd->egrbufs.size < hfi1_max_mtu) {
                        rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
                        hfi1_cdbg(PROC,
                                  "ctxt%u: eager bufs size too small. Adjusting to %u",
                                  rcd->ctxt, rcd->egrbufs.size);
                }
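		/*
		 * Start at the maximum per-buffer size; the eager buffer
		 * setup path is expected to trim this down to the size of
		 * the buffers it actually allocates.
		 */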
                rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

                /* Applicable only for statically created kernel contexts */
                if (ctxt < dd->first_dyn_alloc_ctxt) {
                        rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
                                                    GFP_KERNEL, numa);
                        if (!rcd->opstats)
                                goto bail;

                        /* Initialize TID flow generations for the context */
                        hfi1_kern_init_ctxt_generations(rcd);
                }

                *context = rcd;
                return 0;
        }

bail:
        *context = NULL;
        hfi1_free_ctxt(rcd);
        return -ENOMEM;
}

/**
 * hfi1_free_ctxt - free context
 * @rcd: pointer to an initialized rcd data structure
 *
 * This wrapper is the free function that matches hfi1_create_ctxtdata().
 * When a context is done being used (kernel or user), this function is called
 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
 * Other users of the context do a get/put sequence to make sure that the
 * structure isn't removed while in use.
 */
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
        hfi1_rcd_put(rcd);
}

/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * called with cca_timer_lock held (to protect access to cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
        struct hfi1_devdata *dd = ppd->dd;
        struct cc_state *cc_state;
        int i;
        u16 cce, ccti_limit, max_ccti = 0;
        u16 shift, mult;
        u64 src;
        u32 current_egress_rate; /* Mbits /sec */
        u64 max_pkt_time;
        /*
         * max_pkt_time is the maximum packet egress time in units
         * of the fabric clock period 1/(805 MHz).
         */

        cc_state = get_cc_state(ppd);

        if (!cc_state)
                /*
                 * This should _never_ happen - rcu_read_lock() is held,
                 * and set_link_ipg() should not be called if cc_state
                 * is NULL.
                 */
                return;

        for (i = 0; i < OPA_MAX_SLS; i++) {
                u16 ccti = ppd->cca_timer[i].ccti;

                if (ccti > max_ccti)
                        max_ccti = ccti;
        }

        ccti_limit = cc_state->cct.ccti_limit;
        if (max_ccti > ccti_limit)
                max_ccti = ccti_limit;

        cce = cc_state->cct.entries[max_ccti].entry;
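	/*
	 * Each congestion control table entry packs a 2-bit shift in the
	 * top bits and a 14-bit multiplier in the low bits; the
	 * inter-packet gap below becomes (max_pkt_time >> shift) * mult.
	 */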
        shift = (cce & 0xc000) >> 14;
        mult = (cce & 0x3fff);

        current_egress_rate = active_egress_rate(ppd);

        max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);

        src = (max_pkt_time >> shift) * mult;

        src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
        src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;

        write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}

static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
        struct cca_timer *cca_timer;
        struct hfi1_pportdata *ppd;
        int sl;
        u16 ccti_timer, ccti_min;
        struct cc_state *cc_state;
        unsigned long flags;
        enum hrtimer_restart ret = HRTIMER_NORESTART;

        cca_timer = container_of(t, struct cca_timer, hrtimer);
        ppd = cca_timer->ppd;
        sl = cca_timer->sl;

        rcu_read_lock();

        cc_state = get_cc_state(ppd);

        if (!cc_state) {
                rcu_read_unlock();
                return HRTIMER_NORESTART;
        }

        /*
         * 1) decrement ccti for SL
         * 2) calculate IPG for link (set_link_ipg())
         * 3) restart timer, unless ccti is at min value
         */

        ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
        ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;

        spin_lock_irqsave(&ppd->cca_timer_lock, flags);

        if (cca_timer->ccti > ccti_min) {
                cca_timer->ccti--;
                set_link_ipg(ppd);
        }

        if (cca_timer->ccti > ccti_min) {
                unsigned long nsec = 1024 * ccti_timer;
                /* ccti_timer is in units of 1.024 usec */
                hrtimer_forward_now(t, ns_to_ktime(nsec));
                ret = HRTIMER_RESTART;
        }

        spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
        rcu_read_unlock();
        return ret;
}

/*
 * Common code for initializing the physical port structure.
 */
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
                         struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
{
        int i;
        uint default_pkey_idx;
        struct cc_state *cc_state;

        ppd->dd = dd;
        ppd->hw_pidx = hw_pidx;
        ppd->port = port; /* IB port number, not index */
        ppd->prev_link_width = LINK_WIDTH_DEFAULT;
        /*
         * There are C_VL_COUNT number of PortVLXmitWait counters.
         * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
         */
        for (i = 0; i < C_VL_COUNT + 1; i++) {
                ppd->port_vl_xmit_wait_last[i] = 0;
                ppd->vl_xmit_flit_cnt[i] = 0;
        }

        default_pkey_idx = 1;

        ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
        ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
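	/* 0x8001: partition key 0x0001 with the high (full-membership) bit set */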
        ppd->pkeys[0] = 0x8001;

        INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
        INIT_WORK(&ppd->link_up_work, handle_link_up);
        INIT_WORK(&ppd->link_down_work, handle_link_down);
        INIT_WORK(&ppd->freeze_work, handle_freeze);
        INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
        INIT_WORK(&ppd->sma_message_work, handle_sma_message);
        INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
        INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
        INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
        INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);

        mutex_init(&ppd->hls_lock);
        spin_lock_init(&ppd->qsfp_info.qsfp_lock);

        ppd->qsfp_info.ppd = ppd;
        ppd->sm_trap_qp = 0x0;
        ppd->sa_qp = 0x1;

        ppd->hfi1_wq = NULL;

        spin_lock_init(&ppd->cca_timer_lock);

        for (i = 0; i < OPA_MAX_SLS; i++) {
                hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL);
                ppd->cca_timer[i].ppd = ppd;
                ppd->cca_timer[i].sl = i;
                ppd->cca_timer[i].ccti = 0;
                ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
        }

        ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;

        spin_lock_init(&ppd->cc_state_lock);
        spin_lock_init(&ppd->cc_log_lock);
        cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
        RCU_INIT_POINTER(ppd->cc_state, cc_state);
        if (!cc_state)
                goto bail;
        return;

bail:
        dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
        return 0;
}

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the hfi1_ib device
 *
 * Sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case the reset
 * failed).
 */
static int init_after_reset(struct hfi1_devdata *dd)
{
        int i;
        struct hfi1_ctxtdata *rcd;
        /*
         * Ensure chip does no sends or receives, tail updates, or
         * pioavail updates while we re-initialize. This is mostly
         * for the driver data structures, not chip registers.
         */
        for (i = 0; i < dd->num_rcv_contexts; i++) {
                rcd = hfi1_rcd_get_by_index(dd, i);
                hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
                             HFI1_RCVCTRL_INTRAVAIL_DIS |
                             HFI1_RCVCTRL_TAILUPD_DIS, rcd);
                hfi1_rcd_put(rcd);
        }
        pio_send_control(dd, PSC_GLOBAL_DISABLE);
        for (i = 0; i < dd->num_send_contexts; i++)
                sc_disable(dd->send_contexts[i].sc);

        return 0;
}

static void enable_chip(struct hfi1_devdata *dd)
{
        struct hfi1_ctxtdata *rcd;
        u32 rcvmask;
        u16 i;

        /* enable PIO send */
        pio_send_control(dd, PSC_GLOBAL_ENABLE);

        /*
         * Enable kernel ctxts' receive and receive interrupt.
         * Other ctxts done as user opens and initializes them.
         */
        for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
                rcd = hfi1_rcd_get_by_index(dd, i);
                if (!rcd)
                        continue;
                rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
                rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
                        HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
                if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
                        rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
                if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
                        rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
                if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
                        rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
                if (HFI1_CAP_IS_KSET(TID_RDMA))
                        rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
                hfi1_rcvctrl(dd, rcvmask, rcd);
                sc_enable(rcd->sc);
                hfi1_rcd_put(rcd);
        }
}

/**
 * create_workqueues - create per port workqueues
 * @dd: the hfi1_ib device
 */
static int create_workqueues(struct hfi1_devdata *dd)
{
        int pidx;
        struct hfi1_pportdata *ppd;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (!ppd->hfi1_wq) {
                        ppd->hfi1_wq =
                                alloc_workqueue(
                                        "hfi%d_%d",
                                        WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
                                        WQ_MEM_RECLAIM,
                                        HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
                                        dd->unit, pidx);
                        if (!ppd->hfi1_wq)
                                goto wq_error;
                }
                if (!ppd->link_wq) {
                        /*
                         * Make the link workqueue single-threaded to enforce
                         * serialization.
                         */
                        ppd->link_wq =
                                alloc_workqueue(
                                        "hfi_link_%d_%d",
                                        WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
                                        1, /* max_active */
                                        dd->unit, pidx);
                        if (!ppd->link_wq)
                                goto wq_error;
                }
        }
        return 0;
wq_error:
        pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (ppd->hfi1_wq) {
                        destroy_workqueue(ppd->hfi1_wq);
                        ppd->hfi1_wq = NULL;
                }
                if (ppd->link_wq) {
                        destroy_workqueue(ppd->link_wq);
                        ppd->link_wq = NULL;
                }
        }
        return -ENOMEM;
}

/**
 * destroy_workqueues - destroy per port workqueues
 * @dd: the hfi1_ib device
 */
static void destroy_workqueues(struct hfi1_devdata *dd)
{
        int pidx;
        struct hfi1_pportdata *ppd;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                if (ppd->hfi1_wq) {
                        destroy_workqueue(ppd->hfi1_wq);
                        ppd->hfi1_wq = NULL;
                }
                if (ppd->link_wq) {
                        destroy_workqueue(ppd->link_wq);
                        ppd->link_wq = NULL;
                }
        }
}

/**
 * enable_general_intr() - Enable the IRQs that will be handled by the
 * general interrupt handler.
 * @dd: valid devdata
 *
 */
static void enable_general_intr(struct hfi1_devdata *dd)
{
        set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
        set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
        set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
        set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
        set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
        set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
        set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
}

/**
 * hfi1_init - do the actual initialization sequence on the chip
 * @dd: the hfi1_ib device
 * @reinit: re-initializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip. This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0). We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers
 * TIDs, etc. after the reset or enable has completed.
 */
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
        int ret = 0, pidx, lastfail = 0;
        unsigned long len;
        u16 i;
        struct hfi1_ctxtdata *rcd;
        struct hfi1_pportdata *ppd;

        /* Set up send low level handlers */
        dd->process_pio_send = hfi1_verbs_send_pio;
        dd->process_dma_send = hfi1_verbs_send_dma;
        dd->pio_inline_send = pio_copy;
        dd->process_vnic_dma_send = hfi1_vnic_send_dma;

        if (is_ax(dd)) {
                atomic_set(&dd->drop_packet, DROP_PACKET_ON);
                dd->do_drop = true;
        } else {
                atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
                dd->do_drop = false;
        }

        /* make sure the link is not "up" */
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                ppd->linkup = 0;
        }

        if (reinit)
                ret = init_after_reset(dd);
        else
                ret = loadtime_init(dd);
        if (ret)
                goto done;

        /* dd->rcd can be NULL if early initialization failed */
        for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
                /*
                 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing
                 * re-init, the simplest way to handle this is to free
                 * existing, and re-allocate.
                 * Need to re-create rest of ctxt 0 ctxtdata as well.
                 */
                rcd = hfi1_rcd_get_by_index(dd, i);
                if (!rcd)
                        continue;

                lastfail = hfi1_create_rcvhdrq(dd, rcd);
                if (!lastfail)
                        lastfail = hfi1_setup_eagerbufs(rcd);
                if (!lastfail)
                        lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
                if (lastfail) {
                        dd_dev_err(dd,
                                   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
                        ret = lastfail;
                }
                /* enable IRQ */
                hfi1_rcd_put(rcd);
        }

        /* Allocate enough memory for user event notification. */
        len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
                         sizeof(*dd->events));
        dd->events = vmalloc_user(len);
        if (!dd->events)
                dd_dev_err(dd, "Failed to allocate user events page\n");
        /*
         * Allocate a page for device and port status.
         * Page will be shared amongst all user processes.
         */
        dd->status = vmalloc_user(PAGE_SIZE);
        if (!dd->status)
                dd_dev_err(dd, "Failed to allocate dev status page\n");
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (dd->status)
                        /* Currently, we only have one port */
                        ppd->statusp = &dd->status->port;

                set_mtu(ppd);
        }

        /* enable chip even if we have an error, so we can debug cause */
        enable_chip(dd);

done:
        /*
         * Set status even if port serdes is not initialized
         * so that diags will work.
         */
        if (dd->status)
                dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
                        HFI1_STATUS_INITTED;
        if (!ret) {
                /* enable all interrupts from the chip */
                enable_general_intr(dd);
                init_qsfp_int(dd);

                /* chip is OK for user apps; mark it as initialized */
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                        ppd = dd->pport + pidx;

                        /*
                         * start the serdes - must be after interrupts are
                         * enabled so we are notified when the link goes up
                         */
                        lastfail = bringup_serdes(ppd);
                        if (lastfail)
                                dd_dev_info(dd,
                                            "Failed to bring up port %u\n",
                                            ppd->port);

                        /*
                         * Set status even if port serdes is not initialized
                         * so that diags will work.
                         */
                        if (ppd->statusp)
                                *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
                                        HFI1_STATUS_INITTED;
                        if (!ppd->link_speed_enabled)
                                continue;
                }
        }

        /* if ret is non-zero, we probably should do some cleanup here... */
        return ret;
}

struct hfi1_devdata *hfi1_lookup(int unit)
{
        return xa_load(&hfi1_dev_table, unit);
}

/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void stop_timers(struct hfi1_devdata *dd)
{
        struct hfi1_pportdata *ppd;
        int pidx;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (ppd->led_override_timer.function) {
                        del_timer_sync(&ppd->led_override_timer);
                        atomic_set(&ppd->led_override_timer_active, 0);
                }
        }
}

/**
 * shutdown_device - shut down a device
 * @dd: the hfi1_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled. It does not free any data structures.
 * Everything it does has to be setup again by hfi1_init(dd, 1)
 */
static void shutdown_device(struct hfi1_devdata *dd)
{
        struct hfi1_pportdata *ppd;
        struct hfi1_ctxtdata *rcd;
        unsigned pidx;
        int i;

        if (dd->flags & HFI1_SHUTDOWN)
                return;
        dd->flags |= HFI1_SHUTDOWN;

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                ppd->linkup = 0;
                if (ppd->statusp)
                        *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
                                           HFI1_STATUS_IB_READY);
        }
        dd->flags &= ~HFI1_INITTED;

        /* mask and clean up interrupts */
        set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
        msix_clean_up_interrupts(dd);

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                for (i = 0; i < dd->num_rcv_contexts; i++) {
                        rcd = hfi1_rcd_get_by_index(dd, i);
                        hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
                                     HFI1_RCVCTRL_CTXT_DIS |
                                     HFI1_RCVCTRL_INTRAVAIL_DIS |
                                     HFI1_RCVCTRL_PKEY_DIS |
                                     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
                        hfi1_rcd_put(rcd);
                }
                /*
                 * Gracefully stop all sends allowing any in progress to
                 * trickle out first.
                 */
                for (i = 0; i < dd->num_send_contexts; i++)
                        sc_flush(dd->send_contexts[i].sc);
        }

        /*
         * Enough for anything that's going to trickle out to have actually
         * done so.
         */
        udelay(20);

        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;

                /* disable all contexts */
                for (i = 0; i < dd->num_send_contexts; i++)
                        sc_disable(dd->send_contexts[i].sc);
                /* disable the send device */
                pio_send_control(dd, PSC_GLOBAL_DISABLE);

                shutdown_led_override(ppd);

                /*
                 * Clear SerdesEnable.
                 * We can't count on interrupts since we are stopping.
                 */
                hfi1_quiet_serdes(ppd);
                if (ppd->hfi1_wq)
                        flush_workqueue(ppd->hfi1_wq);
                if (ppd->link_wq)
                        flush_workqueue(ppd->link_wq);
        }
        sdma_exit(dd);
}

/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * It should never change any chip state, or global driver state.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
        u32 e;

        if (!rcd)
                return;

        if (rcd->rcvhdrq) {
                dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
                                  rcd->rcvhdrq, rcd->rcvhdrq_dma);
                rcd->rcvhdrq = NULL;
                if (hfi1_rcvhdrtail_kvaddr(rcd)) {
                        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                                          (void *)hfi1_rcvhdrtail_kvaddr(rcd),
                                          rcd->rcvhdrqtailaddr_dma);
                        rcd->rcvhdrtail_kvaddr = NULL;
                }
        }

        /* all the RcvArray entries should have been cleared by now */
        kfree(rcd->egrbufs.rcvtids);
        rcd->egrbufs.rcvtids = NULL;

        for (e = 0; e < rcd->egrbufs.alloced; e++) {
                if (rcd->egrbufs.buffers[e].addr)
                        dma_free_coherent(&dd->pcidev->dev,
                                          rcd->egrbufs.buffers[e].len,
                                          rcd->egrbufs.buffers[e].addr,
                                          rcd->egrbufs.buffers[e].dma);
        }
        kfree(rcd->egrbufs.buffers);
        rcd->egrbufs.alloced = 0;
        rcd->egrbufs.buffers = NULL;

        sc_free(rcd->sc);
        rcd->sc = NULL;

        vfree(rcd->subctxt_uregbase);
        vfree(rcd->subctxt_rcvegrbuf);
        vfree(rcd->subctxt_rcvhdr_base);
        kfree(rcd->opstats);

        rcd->subctxt_uregbase = NULL;
        rcd->subctxt_rcvegrbuf = NULL;
        rcd->subctxt_rcvhdr_base = NULL;
        rcd->opstats = NULL;
}

/*
 * Release our hold on the shared asic data. If we are the last one,
 * return the structure to be finalized outside the lock. Must be
 * holding hfi1_dev_table lock.
 */
static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
{
        struct hfi1_asic_data *ad;
        int other;

        if (!dd->asic_data)
                return NULL;
        dd->asic_data->dds[dd->hfi1_id] = NULL;
        other = dd->hfi1_id ? 0 : 1;
        ad = dd->asic_data;
        dd->asic_data = NULL;
        /* return NULL if the other dd still has a link */
        return ad->dds[other] ? NULL : ad;
}

static void finalize_asic_data(struct hfi1_devdata *dd,
                               struct hfi1_asic_data *ad)
{
        clean_up_i2c(dd, ad);
        kfree(ad);
}

/**
 * hfi1_free_devdata - cleans up and frees per-unit data structure
 * @dd: pointer to a valid devdata structure
 *
 * It cleans up and frees all data structures set up by
 * hfi1_alloc_devdata().
 */
void hfi1_free_devdata(struct hfi1_devdata *dd)
{
        struct hfi1_asic_data *ad;
        unsigned long flags;

        xa_lock_irqsave(&hfi1_dev_table, flags);
        __xa_erase(&hfi1_dev_table, dd->unit);
        ad = release_asic_data(dd);
        xa_unlock_irqrestore(&hfi1_dev_table, flags);

        finalize_asic_data(dd, ad);
        free_platform_config(dd);
        rcu_barrier(); /* wait for rcu callbacks to complete */
        free_percpu(dd->int_counter);
        free_percpu(dd->rcv_limit);
        free_percpu(dd->send_schedule);
        free_percpu(dd->tx_opstats);
        dd->int_counter = NULL;
        dd->rcv_limit = NULL;
        dd->send_schedule = NULL;
        dd->tx_opstats = NULL;
        kfree(dd->comp_vect);
        dd->comp_vect = NULL;
        if (dd->rcvhdrtail_dummy_kvaddr)
                dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
                                  (void *)dd->rcvhdrtail_dummy_kvaddr,
                                  dd->rcvhdrtail_dummy_dma);
        dd->rcvhdrtail_dummy_kvaddr = NULL;
        sdma_clean(dd, dd->num_sdma);
        rvt_dealloc_device(&dd->verbs_dev.rdi);
}

/**
 * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
 * @pdev: Valid PCI device
 * @extra: How many bytes to alloc past the default
 *
 * Must be done via verbs allocator, because the verbs cleanup process
 * both does cleanup and free of the data structure.
 * "extra" is for chip-specific data.
 */
static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
                                               size_t extra)
{
        struct hfi1_devdata *dd;
        int ret, nports;

        /* extra is sizeof(struct hfi1_pportdata) * number of ports */
        nports = extra / sizeof(struct hfi1_pportdata);

        dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
                                                     nports);
        if (!dd)
                return ERR_PTR(-ENOMEM);
        dd->num_pports = nports;
        dd->pport = (struct hfi1_pportdata *)(dd + 1);
        dd->pcidev = pdev;
        pci_set_drvdata(pdev, dd);

        ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
                           GFP_KERNEL);
        if (ret < 0) {
                dev_err(&pdev->dev,
                        "Could not allocate unit ID: error %d\n", -ret);
                goto bail;
        }
        rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
        /*
         * If the BIOS does not have the NUMA node information set, select
         * NUMA 0 so we get consistent performance.
         */
        dd->node = pcibus_to_node(pdev->bus);
        if (dd->node == NUMA_NO_NODE) {
                dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
                dd->node = 0;
        }

        /*
         * Initialize all locks for the device. This needs to be as early as
         * possible so locks are usable.
         */
        spin_lock_init(&dd->sc_lock);
        spin_lock_init(&dd->sendctrl_lock);
        spin_lock_init(&dd->rcvctrl_lock);
        spin_lock_init(&dd->uctxt_lock);
        spin_lock_init(&dd->hfi1_diag_trans_lock);
        spin_lock_init(&dd->sc_init_lock);
        spin_lock_init(&dd->dc8051_memlock);
        seqlock_init(&dd->sc2vl_lock);
        spin_lock_init(&dd->sde_map_lock);
        spin_lock_init(&dd->pio_map_lock);
        mutex_init(&dd->dc8051_lock);
        init_waitqueue_head(&dd->event_queue);
        spin_lock_init(&dd->irq_src_lock);

        dd->int_counter = alloc_percpu(u64);
        if (!dd->int_counter) {
                ret = -ENOMEM;
                goto bail;
        }

        dd->rcv_limit = alloc_percpu(u64);
        if (!dd->rcv_limit) {
                ret = -ENOMEM;
                goto bail;
        }

        dd->send_schedule = alloc_percpu(u64);
        if (!dd->send_schedule) {
                ret = -ENOMEM;
                goto bail;
        }

        dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
        if (!dd->tx_opstats) {
                ret = -ENOMEM;
                goto bail;
        }

        dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
        if (!dd->comp_vect) {
                ret = -ENOMEM;
                goto bail;
        }

        /* allocate dummy tail memory for all receive contexts */
        dd->rcvhdrtail_dummy_kvaddr =
                dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64),
                                   &dd->rcvhdrtail_dummy_dma, GFP_KERNEL);
        if (!dd->rcvhdrtail_dummy_kvaddr) {
                ret = -ENOMEM;
                goto bail;
        }

        atomic_set(&dd->ipoib_rsm_usr_num, 0);
        return dd;

bail:
        hfi1_free_devdata(dd);
        return ERR_PTR(ret);
}

/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code. Should be paranoid about state of
 * system and data structures.
 */
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
        if (dd->flags & HFI1_INITTED) {
                u32 pidx;

                dd->flags &= ~HFI1_INITTED;
                if (dd->pport)
                        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                                struct hfi1_pportdata *ppd;

                                ppd = dd->pport + pidx;
                                if (dd->flags & HFI1_PRESENT)
                                        set_link_state(ppd, HLS_DN_DISABLE);

                                if (ppd->statusp)
                                        *ppd->statusp &= ~HFI1_STATUS_IB_READY;
                        }
        }

        /*
         * Mark as having had an error for driver, and also
         * for /sys and status word mapped to user programs.
         * This marks unit as not usable, until reset.
         */
        if (dd->status)
                dd->status->dev |= HFI1_STATUS_HWERROR;
}

static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);
static void shutdown_one(struct pci_dev *);

#define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "

const struct pci_device_id hfi1_pci_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
        { 0, }
};

MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);

static struct pci_driver hfi1_pci_driver = {
        .name = DRIVER_NAME,
        .probe = init_one,
        .remove = remove_one,
        .shutdown = shutdown_one,
        .id_table = hfi1_pci_tbl,
        .err_handler = &hfi1_pci_err_handler,
};

static void __init compute_krcvqs(void)
{
        int i;

        for (i = 0; i < krcvqsset; i++)
                n_krcvqs += krcvqs[i];
}

/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init hfi1_mod_init(void)
{
        int ret;

        ret = dev_init();
        if (ret)
                goto bail;

        ret = node_affinity_init();
        if (ret)
                goto bail;

        /* validate max MTU before any devices start */
        if (!valid_opa_max_mtu(hfi1_max_mtu)) {
                pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
                       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
                hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
        }
        /* valid CUs run from 1-128 in powers of 2 */
        if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
                hfi1_cu = 1;
        /* valid credit return threshold is 0-100, variable is unsigned */
        if (user_credit_return_threshold > 100)
                user_credit_return_threshold = 100;

        compute_krcvqs();
        /*
         * sanitize receive interrupt count, time must wait until after
         * the hardware type is known
         */
        if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
                rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
        /* reject invalid combinations */
        if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
                pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
                rcv_intr_count = 1;
        }
        if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
                /*
                 * Avoid indefinite packet delivery by requiring a timeout
                 * if count is > 1.
                 */
                pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
                rcv_intr_timeout = 1;
        }
        if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
                /*
                 * The dynamic algorithm expects a non-zero timeout
                 * and a count > 1.
                 */
                pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
                rcv_intr_dynamic = 0;
        }

        /* sanitize link CRC options */
        link_crc_mask &= SUPPORTED_CRCS;

        ret = opfn_init();
        if (ret < 0) {
                pr_err("Failed to allocate opfn_wq");
                goto bail_dev;
        }

        /*
         * These must be called before the driver is registered with
         * the PCI subsystem.
         */
        hfi1_dbg_init();
        ret = pci_register_driver(&hfi1_pci_driver);
        if (ret < 0) {
                pr_err("Unable to register driver: error %d\n", -ret);
                goto bail_dev;
        }
        goto bail; /* all OK */

bail_dev:
        hfi1_dbg_exit();
        dev_cleanup();
bail:
        return ret;
}

module_init(hfi1_mod_init);

/*
 * Do the non-unit driver cleanup, memory free, etc. at unload.
 */
static void __exit hfi1_mod_cleanup(void)
{
        pci_unregister_driver(&hfi1_pci_driver);
        opfn_exit();
        node_affinity_destroy_all();
        hfi1_dbg_exit();

        WARN_ON(!xa_empty(&hfi1_dev_table));
        dispose_firmware(); /* asymmetric with obtain_firmware() */
        dev_cleanup();
}

module_exit(hfi1_mod_cleanup);

/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
        int ctxt;
        int pidx;

        /* users can't do anything more with chip */
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                struct hfi1_pportdata *ppd = &dd->pport[pidx];
                struct cc_state *cc_state;
                int i;

                if (ppd->statusp)
                        *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;

                for (i = 0; i < OPA_MAX_SLS; i++)
                        hrtimer_cancel(&ppd->cca_timer[i].hrtimer);

                spin_lock(&ppd->cc_state_lock);
                cc_state = get_cc_state_protected(ppd);
                RCU_INIT_POINTER(ppd->cc_state, NULL);
                spin_unlock(&ppd->cc_state_lock);

                if (cc_state)
                        kfree_rcu(cc_state, rcu);
        }

        free_credit_return(dd);

        /*
         * Free any resources still in use (usually just kernel contexts)
         * at unload; we iterate over num_rcv_contexts because that is
         * what we allocate.
         */
        for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
                struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];

                if (rcd) {
                        hfi1_free_ctxt_rcv_groups(rcd);
                        hfi1_free_ctxt(rcd);
                }
        }

        kfree(dd->rcd);
        dd->rcd = NULL;

        free_pio_map(dd);
        /* must follow rcv context free - need to remove rcv's hooks */
        for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
                sc_free(dd->send_contexts[ctxt].sc);
        dd->num_send_contexts = 0;
        kfree(dd->send_contexts);
        dd->send_contexts = NULL;
        kfree(dd->hw_to_sw);
        dd->hw_to_sw = NULL;
        kfree(dd->boardname);
        vfree(dd->events);
        vfree(dd->status);
}

/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void postinit_cleanup(struct hfi1_devdata *dd)
{
        hfi1_start_cleanup(dd);
        hfi1_comp_vectors_clean_up(dd);
        hfi1_dev_affinity_clean_up(dd);

        hfi1_pcie_ddcleanup(dd);
        hfi1_pcie_cleanup(dd->pcidev);

        cleanup_device_data(dd);

        hfi1_free_devdata(dd);
}

static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
        int ret = 0, j, pidx, initfail;
        struct hfi1_devdata *dd;
        struct hfi1_pportdata *ppd;

        /* First, lock the non-writable module parameters */
        HFI1_CAP_LOCK();

        /* Validate dev ids */
        if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
              ent->device == PCI_DEVICE_ID_INTEL1)) {
                dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
                        ent->device);
                ret = -ENODEV;
                goto bail;
        }

        /* Allocate the dd so we can get to work */
        dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
                                sizeof(struct hfi1_pportdata));
        if (IS_ERR(dd)) {
                ret = PTR_ERR(dd);
                goto bail;
        }

        /* Validate some global module parameters */
        ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt);
        if (ret)
                goto bail;

        /* use the encoding function as a sanitization check */
        if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
                dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
                           hfi1_hdrq_entsize);
                ret = -EINVAL;
                goto bail;
        }

        /* The receive eager buffer size must be set before the receive
         * contexts are created.
         *
         * Set the eager buffer size. Validate that it falls in a range
         * allowed by the hardware - all powers of 2 between the min and
         * max. The maximum valid MTU is within the eager buffer range
         * so we do not need to cap the max_mtu by an eager buffer size
         * setting.
         */
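	/*
	 * For example, a requested size of 5MB is first rounded up to the
	 * next power of two (8MB) and then clamped into
	 * [MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL].
	 */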
        if (eager_buffer_size) {
                if (!is_power_of_2(eager_buffer_size))
                        eager_buffer_size =
                                roundup_pow_of_two(eager_buffer_size);
                eager_buffer_size =
                        clamp_val(eager_buffer_size,
                                  MIN_EAGER_BUFFER * 8,
                                  MAX_EAGER_BUFFER_TOTAL);
                dd_dev_info(dd, "Eager buffer size %u\n",
                            eager_buffer_size);
        } else {
                dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
                ret = -EINVAL;
                goto bail;
        }

        /* restrict value of hfi1_rcvarr_split */
        hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);

        ret = hfi1_pcie_init(dd);
        if (ret)
                goto bail;

        /*
         * Do device-specific initialization, function table setup, dd
         * allocation, etc.
         */
        ret = hfi1_init_dd(dd);
        if (ret)
                goto clean_bail; /* error already printed */

        ret = create_workqueues(dd);
        if (ret)
                goto clean_bail;

        /* do the generic initialization */
        initfail = hfi1_init(dd, 0);

        ret = hfi1_register_ib_device(dd);

        /*
         * Now ready for use. this should be cleared whenever we
         * detect a reset, or initiate one. If earlier failure,
         * we still create devices, so diags, etc. can be used
         * to determine cause of problem.
         */
        if (!initfail && !ret) {
                dd->flags |= HFI1_INITTED;
                /* create debugfs files after init and ib register */
                hfi1_dbg_ibdev_init(&dd->verbs_dev);
        }

        j = hfi1_device_create(dd);
        if (j)
                dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

        if (initfail || ret) {
                msix_clean_up_interrupts(dd);
                stop_timers(dd);
                flush_workqueue(ib_wq);
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                        hfi1_quiet_serdes(dd->pport + pidx);
                        ppd = dd->pport + pidx;
                        if (ppd->hfi1_wq) {
                                destroy_workqueue(ppd->hfi1_wq);
                                ppd->hfi1_wq = NULL;
                        }
                        if (ppd->link_wq) {
                                destroy_workqueue(ppd->link_wq);
                                ppd->link_wq = NULL;
                        }
                }
                if (!j)
                        hfi1_device_remove(dd);
                if (!ret)
                        hfi1_unregister_ib_device(dd);
                postinit_cleanup(dd);
                if (initfail)
                        ret = initfail;
                goto bail; /* everything already cleaned */
        }

        sdma_start(dd);

        return 0;

clean_bail:
        hfi1_pcie_cleanup(pdev);
bail:
        return ret;
}
1695 | |
1696 | static void wait_for_clients(struct hfi1_devdata *dd) |
{
	/*
	 * Drop the device-init reference; complete immediately if there
	 * are no active clients, otherwise wait for all active clients
	 * to finish.
	 */
	if (refcount_dec_and_test(&dd->user_refcount))
		complete(&dd->user_comp);

	wait_for_completion(&dd->user_comp);
}

static void remove_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	/* close debugfs files before ib unregister */
	hfi1_dbg_ibdev_exit(&dd->verbs_dev);

	/* remove the /dev hfi1 interface */
	hfi1_device_remove(dd);

	/* wait for existing user space clients to finish */
	wait_for_clients(dd);

	/* unregister from IB core */
	hfi1_unregister_ib_device(dd);

	/* free netdev data */
	hfi1_free_rx(dd);

	/*
	 * Disable the IB link, disable interrupts on the device,
	 * clear dma engines, etc.
	 */
	shutdown_device(dd);
	destroy_workqueues(dd);

	stop_timers(dd);

	/* wait until all of our (qsfp) queue_work() calls complete */
	flush_workqueue(ib_wq);

	postinit_cleanup(dd);
}

static void shutdown_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	shutdown_device(dd);
}

/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an I/O perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned int amt;

	if (!rcd->rcvhdrq) {
		amt = rcvhdrq_size(rcd);

		rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
						  &rcd->rcvhdrq_dma,
						  GFP_KERNEL);

		if (!rcd->rcvhdrq) {
			dd_dev_err(dd,
				   "attempt to allocate %u bytes for ctxt %u rcvhdrq failed\n",
				   amt, rcd->ctxt);
			goto bail;
		}
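
		/*
		 * With DMA_RTAIL enabled, the chip DMAs its current hdrq
		 * tail index into this host page so software can poll
		 * memory instead of reading a CSR.
		 */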
		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
								    PAGE_SIZE,
								    &rcd->rcvhdrqtailaddr_dma,
								    GFP_KERNEL);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
		}
	}
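
	/* program the chip with this context's hdrq entry size and count */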
	set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
		      rcd->rcvhdrq_cnt);

	return 0;

bail_free:
	dd_dev_err(dd,
		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		   rcd->ctxt);
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_dma);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}

/**
 * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user
 *			  contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous; we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 max_entries, egrtop, alloced_bytes = 0;
	u16 order, idx = 0;
	int ret = 0;
	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

	/*
	 * The minimum size of the eager buffers is a group of MTU-sized
	 * buffers.
	 * The global eager_buffer_size parameter is checked against the
	 * theoretical lower limit of the value.  Here, we check against
	 * the MTU.
	 */
	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
	/*
	 * If using one-pkt-per-egr-buffer, lower the eager buffer
	 * size to the max MTU rounded up to a power of 2.
	 */
	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
		rcd->egrbufs.rcvtid_size = round_mtu;

	/*
	 * Eager buffer sizes of 1MB or less require smaller TID sizes
	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
	 */
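	/*
	 * e.g. with a 1MB context this caps rcvtid_size at 128KB
	 * (1MB / 8 rounded down to a power of 2), unless the rounded
	 * MTU is larger.
	 */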
	if (rcd->egrbufs.size <= (1 << 20))
		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
			rounddown_pow_of_two(rcd->egrbufs.size / 8));

	while (alloced_bytes < rcd->egrbufs.size &&
	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
		rcd->egrbufs.buffers[idx].addr =
			dma_alloc_coherent(&dd->pcidev->dev,
					   rcd->egrbufs.rcvtid_size,
					   &rcd->egrbufs.buffers[idx].dma,
					   GFP_KERNEL);
		if (rcd->egrbufs.buffers[idx].addr) {
			rcd->egrbufs.buffers[idx].len =
				rcd->egrbufs.rcvtid_size;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
				rcd->egrbufs.buffers[idx].addr;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
				rcd->egrbufs.buffers[idx].dma;
			rcd->egrbufs.alloced++;
			alloced_bytes += rcd->egrbufs.rcvtid_size;
			idx++;
		} else {
			u32 new_size, i, j;
			u64 offset = 0;

			/*
			 * Fail the eager buffer allocation if:
			 * - we are already using the lowest acceptable size
			 * - we are using one-pkt-per-egr-buffer (this implies
			 *   that we are accepting only one size)
			 */
			if (rcd->egrbufs.rcvtid_size == round_mtu ||
			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
					   rcd->ctxt);
				ret = -ENOMEM;
				goto bail_rcvegrbuf_phys;
			}

			new_size = rcd->egrbufs.rcvtid_size / 2;

			/*
			 * If the first attempt to allocate memory failed,
			 * don't fail everything but continue with the next
			 * lower size.
			 */
			if (idx == 0) {
				rcd->egrbufs.rcvtid_size = new_size;
				continue;
			}

			/*
			 * Re-partition already allocated buffers to a smaller
			 * size.
			 */
			rcd->egrbufs.alloced = 0;
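			/*
			 * j walks the already-allocated physical buffers
			 * and offset walks within the current one; each
			 * new_size chunk becomes its own rcvtid entry.
			 */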
			for (i = 0, j = 0, offset = 0; j < idx; i++) {
				if (i >= rcd->egrbufs.count)
					break;
				rcd->egrbufs.rcvtids[i].dma =
					rcd->egrbufs.buffers[j].dma + offset;
				rcd->egrbufs.rcvtids[i].addr =
					rcd->egrbufs.buffers[j].addr + offset;
				rcd->egrbufs.alloced++;
				if ((rcd->egrbufs.buffers[j].dma + offset +
				     new_size) ==
				    (rcd->egrbufs.buffers[j].dma +
				     rcd->egrbufs.buffers[j].len)) {
					j++;
					offset = 0;
				} else {
					offset += new_size;
				}
			}
			rcd->egrbufs.rcvtid_size = new_size;
		}
	}
	rcd->egrbufs.numbufs = idx;
	rcd->egrbufs.size = alloced_bytes;

	hfi1_cdbg(PROC,
		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB",
		  rcd->ctxt, rcd->egrbufs.alloced,
		  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);

	/*
	 * Set the context's rcv array head update threshold to the closest
	 * power of 2 (so we can use a mask instead of modulo) below half
	 * the allocated entries.
	 */
	rcd->egrbufs.threshold =
		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
	/*
	 * Compute the expected (TID) RcvArray entry base.  This is done
	 * after allocating the eager buffers in order to maximize the
	 * expected RcvArray entries for the context.
	 */
	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
	rcd->expected_count = max_entries - egrtop;
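	/* expected entries are managed in pairs, hence the 2x cap */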
	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

	rcd->expected_base = rcd->eager_base + egrtop;
	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u",
		  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
		  rcd->eager_base, rcd->expected_base);

	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
		hfi1_cdbg(PROC,
			  "ctxt%u: current Eager buffer size is invalid %u",
			  rcd->ctxt, rcd->egrbufs.rcvtid_size);
		ret = -EINVAL;
		goto bail_rcvegrbuf_phys;
	}
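
	/* program each eager buffer's DMA address into the chip's RcvArray */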
	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
			     rcd->egrbufs.rcvtids[idx].dma, order);
		cond_resched();
	}

	return 0;

bail_rcvegrbuf_phys:
	for (idx = 0; idx < rcd->egrbufs.alloced &&
	     rcd->egrbufs.buffers[idx].addr;
	     idx++) {
		dma_free_coherent(&dd->pcidev->dev,
				  rcd->egrbufs.buffers[idx].len,
				  rcd->egrbufs.buffers[idx].addr,
				  rcd->egrbufs.buffers[idx].dma);
		rcd->egrbufs.buffers[idx].addr = NULL;
		rcd->egrbufs.buffers[idx].dma = 0;
		rcd->egrbufs.buffers[idx].len = 0;
	}

	return ret;
}