init.c source code [linux/drivers/infiniband/hw/hfi1/init.c]

1	// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2	/*
3	* Copyright(c) 2015 - 2020 Intel Corporation.
4	* Copyright(c) 2021 Cornelis Networks.
5	*/
6
7	#include <linux/pci.h>
8	#include <linux/netdevice.h>
9	#include <linux/vmalloc.h>
10	#include <linux/delay.h>
11	#include <linux/xarray.h>
12	#include <linux/module.h>
13	#include <linux/printk.h>
14	#include <linux/hrtimer.h>
15	#include <linux/bitmap.h>
16	#include <linux/numa.h>
17	#include <rdma/rdma_vt.h>
18
19	#include "hfi.h"
20	#include "device.h"
21	#include "common.h"
22	#include "trace.h"
23	#include "mad.h"
24	#include "sdma.h"
25	#include "debugfs.h"
26	#include "verbs.h"
27	#include "aspm.h"
28	#include "affinity.h"
29	#include "vnic.h"
30	#include "exp_rcv.h"
31	#include "netdev.h"
32
33	#undef pr_fmt
34	#define pr_fmt(fmt) DRIVER_NAME ": " fmt
35
36	/*
37	* min buffers we want to have per context, after driver
38	*/
39	#define HFI1_MIN_USER_CTXT_BUFCNT 7
40
41	#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
42	#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
43
44	#define NUM_IB_PORTS 1
45
46	/*
47	* Number of user receive contexts we are configured to use (to allow for more
48	* pio buffers per ctxt, etc.) Zero means use one user context per CPU.
49	*/
50	int num_user_contexts = -`1`;
51	module_param_named(num_user_contexts, num_user_contexts, int, `0444`);
52	MODULE_PARM_DESC(
53	num_user_contexts, "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");
54
55	uint krcvqs[RXE_NUM_DATA_VL];
56	int krcvqsset;
57	module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
58	MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
59
60	/ computed based on above array /
61	unsigned long n_krcvqs;
62
63	static unsigned hfi1_rcvarr_split = `25`;
64	module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
65	MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
66
67	static uint eager_buffer_size = (`8` << `20`); / 8MB /
68	module_param(eager_buffer_size, uint, S_IRUGO);
69	MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");
70
71	static uint rcvhdrcnt = `2048`; / 2x the max eager buffer count /
72	module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
73	MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
74
75	static uint hfi1_hdrq_entsize = `32`;
76	module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, `0444`);
77	MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
78
79	unsigned int user_credit_return_threshold = `33`; / default is 33% /
80	module_param(user_credit_return_threshold, uint, S_IRUGO);
81	MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
82
83	DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC \| XA_FLAGS_LOCK_IRQ);
84
85	static int hfi1_create_kctxt(struct hfi1_devdata *dd,
86	struct hfi1_pportdata *ppd)
87	{
88	struct hfi1_ctxtdata *rcd;
89	int ret;
90
91	/ Control context has to be always 0 /
92	BUILD_BUG_ON(HFI1_CTRL_CTXT != `0`);
93
94	ret = hfi1_create_ctxtdata(ppd, numa: dd->node, rcd: &rcd);
95	if (ret < `0`) {
96	dd_dev_err(dd, "Kernel receive context allocation failed\n");
97	return ret;
98	}
99
100	/*
101	* Set up the kernel context flags here and now because they use
102	* default values for all receive side memories. User contexts will
103	* be handled as they are created.
104	*/
105	rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) \|
106	HFI1_CAP_KGET(NODROP_RHQ_FULL) \|
107	HFI1_CAP_KGET(NODROP_EGR_FULL) \|
108	HFI1_CAP_KGET(DMA_RTAIL);
109
110	/ Control context must use DMA_RTAIL /
111	if (rcd->ctxt == HFI1_CTRL_CTXT)
112	rcd->flags \|= HFI1_CAP_DMA_RTAIL;
113	rcd->fast_handler = get_dma_rtail_setting(rcd) ?
114	handle_receive_interrupt_dma_rtail :
115	handle_receive_interrupt_nodma_rtail;
116
117	hfi1_set_seq_cnt(rcd, cnt: `1`);
118
119	rcd->sc = sc_alloc(dd, SC_ACK, hdrqentsize: rcd->rcvhdrqentsize, numa: dd->node);
120	if (!rcd->sc) {
121	dd_dev_err(dd, "Kernel send context allocation failed\n");
122	return -ENOMEM;
123	}
124	hfi1_init_ctxt(sc: rcd->sc);
125
126	return `0`;
127	}
128
129	/*
130	* Create the receive context array and one or more kernel contexts
131	*/
132	int hfi1_create_kctxts(struct hfi1_devdata *dd)
133	{
134	u16 i;
135	int ret;
136
137	dd->rcd = kcalloc_node(n: dd->num_rcv_contexts, size: sizeof(*dd->rcd),
138	GFP_KERNEL, node: dd->node);
139	if (!dd->rcd)
140	return -ENOMEM;
141
142	for (i = `0`; i < dd->first_dyn_alloc_ctxt; ++i) {
143	ret = hfi1_create_kctxt(dd, ppd: dd->pport);
144	if (ret)
145	goto bail;
146	}
147
148	return `0`;
149	bail:
150	for (i = `0`; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
151	hfi1_free_ctxt(rcd: dd->rcd[i]);
152
153	/ All the contexts should be freed, free the array /
154	kfree(objp: dd->rcd);
155	dd->rcd = NULL;
156	return ret;
157	}
158
159	/*
160	* Helper routines for the receive context reference count (rcd and uctxt).
161	*/
162	static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
163	{
164	kref_init(kref: &rcd->kref);
165	}
166
167	/**
168	* hfi1_rcd_free - When reference is zero clean up.
169	* @kref: pointer to an initialized rcd data structure
170	*
171	*/
172	static void hfi1_rcd_free(struct kref *kref)
173	{
174	unsigned long flags;
175	struct hfi1_ctxtdata *rcd =
176	container_of(kref, struct hfi1_ctxtdata, kref);
177
178	spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
179	rcd->dd->rcd[rcd->ctxt] = NULL;
180	spin_unlock_irqrestore(lock: &rcd->dd->uctxt_lock, flags);
181
182	hfi1_free_ctxtdata(dd: rcd->dd, rcd);
183
184	kfree(objp: rcd);
185	}
186
187	/**
188	* hfi1_rcd_put - decrement reference for rcd
189	* @rcd: pointer to an initialized rcd data structure
190	*
191	* Use this to put a reference after the init.
192	*/
193	int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
194	{
195	if (rcd)
196	return kref_put(kref: &rcd->kref, release: hfi1_rcd_free);
197
198	return `0`;
199	}
200
201	/**
202	* hfi1_rcd_get - increment reference for rcd
203	* @rcd: pointer to an initialized rcd data structure
204	*
205	* Use this to get a reference after the init.
206	*
207	* Return : reflect kref_get_unless_zero(), which returns non-zero on
208	* increment, otherwise 0.
209	*/
210	int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
211	{
212	return kref_get_unless_zero(kref: &rcd->kref);
213	}
214
215	/**
216	* allocate_rcd_index - allocate an rcd index from the rcd array
217	* @dd: pointer to a valid devdata structure
218	* @rcd: rcd data structure to assign
219	* @index: pointer to index that is allocated
220	*
221	* Find an empty index in the rcd array, and assign the given rcd to it.
222	* If the array is full, we are EBUSY.
223	*
224	*/
225	static int allocate_rcd_index(struct hfi1_devdata *dd,
226	struct hfi1_ctxtdata rcd, u16 index)
227	{
228	unsigned long flags;
229	u16 ctxt;
230
231	spin_lock_irqsave(&dd->uctxt_lock, flags);
232	for (ctxt = `0`; ctxt < dd->num_rcv_contexts; ctxt++)
233	if (!dd->rcd[ctxt])
234	break;
235
236	if (ctxt < dd->num_rcv_contexts) {
237	rcd->ctxt = ctxt;
238	dd->rcd[ctxt] = rcd;
239	hfi1_rcd_init(rcd);
240	}
241	spin_unlock_irqrestore(lock: &dd->uctxt_lock, flags);
242
243	if (ctxt >= dd->num_rcv_contexts)
244	return -EBUSY;
245
246	*index = ctxt;
247
248	return `0`;
249	}
250
251	/**
252	* hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
253	* array
254	* @dd: pointer to a valid devdata structure
255	* @ctxt: the index of an possilbe rcd
256	*
257	* This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
258	* ctxt index is valid.
259	*
260	* The caller is responsible for making the _put().
261	*
262	*/
263	struct hfi1_ctxtdata hfi1_rcd_get_by_index_safe(struct* hfi1_devdata *dd,
264	u16 ctxt)
265	{
266	if (ctxt < dd->num_rcv_contexts)
267	return hfi1_rcd_get_by_index(dd, ctxt);
268
269	return NULL;
270	}
271
272	/**
273	* hfi1_rcd_get_by_index - get by index
274	* @dd: pointer to a valid devdata structure
275	* @ctxt: the index of an possilbe rcd
276	*
277	* We need to protect access to the rcd array. If access is needed to
278	* one or more index, get the protecting spinlock and then increment the
279	* kref.
280	*
281	* The caller is responsible for making the _put().
282	*
283	*/
284	struct hfi1_ctxtdata hfi1_rcd_get_by_index(struct* hfi1_devdata *dd, u16 ctxt)
285	{
286	unsigned long flags;
287	struct hfi1_ctxtdata *rcd = NULL;
288
289	spin_lock_irqsave(&dd->uctxt_lock, flags);
290	if (dd->rcd[ctxt]) {
291	rcd = dd->rcd[ctxt];
292	if (!hfi1_rcd_get(rcd))
293	rcd = NULL;
294	}
295	spin_unlock_irqrestore(lock: &dd->uctxt_lock, flags);
296
297	return rcd;
298	}
299
300	/*
301	* Common code for user and kernel context create and setup.
302	* NOTE: the initial kref is done here (hf1_rcd_init()).
303	*/
304	int hfi1_create_ctxtdata(struct hfi1_pportdata ppd, int* numa,
305	struct hfi1_ctxtdata **context)
306	{
307	struct hfi1_devdata *dd = ppd->dd;
308	struct hfi1_ctxtdata *rcd;
309	unsigned kctxt_ngroups = `0`;
310	u32 base;
311
312	if (dd->rcv_entries.nctxt_extra >
313	dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
314	kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
315	(dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
316	rcd = kzalloc_node(size: sizeof(*rcd), GFP_KERNEL, node: numa);
317	if (rcd) {
318	u32 rcvtids, max_entries;
319	u16 ctxt;
320	int ret;
321
322	ret = allocate_rcd_index(dd, rcd, index: &ctxt);
323	if (ret) {
324	*context = NULL;
325	kfree(objp: rcd);
326	return ret;
327	}
328
329	INIT_LIST_HEAD(list: &rcd->qp_wait_list);
330	hfi1_exp_tid_group_init(rcd);
331	rcd->ppd = ppd;
332	rcd->dd = dd;
333	rcd->numa_id = numa;
334	rcd->rcv_array_groups = dd->rcv_entries.ngroups;
335	rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
336	rcd->slow_handler = handle_receive_interrupt;
337	rcd->do_interrupt = rcd->slow_handler;
338	rcd->msix_intr = CCE_NUM_MSIX_VECTORS;
339
340	mutex_init(&rcd->exp_mutex);
341	spin_lock_init(&rcd->exp_lock);
342	INIT_LIST_HEAD(list: &rcd->flow_queue.queue_head);
343	INIT_LIST_HEAD(list: &rcd->rarr_queue.queue_head);
344
345	hfi1_cdbg(PROC, "setting up context %u", rcd->ctxt);
346
347	/*
348	* Calculate the context's RcvArray entry starting point.
349	* We do this here because we have to take into account all
350	* the RcvArray entries that previous context would have
351	* taken and we have to account for any extra groups assigned
352	* to the static (kernel) or dynamic (vnic/user) contexts.
353	*/
354	if (ctxt < dd->first_dyn_alloc_ctxt) {
355	if (ctxt < kctxt_ngroups) {
356	base = ctxt * (dd->rcv_entries.ngroups + `1`);
357	rcd->rcv_array_groups++;
358	} else {
359	base = kctxt_ngroups +
360	(ctxt * dd->rcv_entries.ngroups);
361	}
362	} else {
363	u16 ct = ctxt - dd->first_dyn_alloc_ctxt;
364
365	base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
366	kctxt_ngroups);
367	if (ct < dd->rcv_entries.nctxt_extra) {
368	base += ct * (dd->rcv_entries.ngroups + `1`);
369	rcd->rcv_array_groups++;
370	} else {
371	base += dd->rcv_entries.nctxt_extra +
372	(ct * dd->rcv_entries.ngroups);
373	}
374	}
375	rcd->eager_base = base * dd->rcv_entries.group_size;
376
377	rcd->rcvhdrq_cnt = rcvhdrcnt;
378	rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
379	rcd->rhf_offset =
380	rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
381	/*
382	* Simple Eager buffer allocation: we have already pre-allocated
383	* the number of RcvArray entry groups. Each ctxtdata structure
384	* holds the number of groups for that context.
385	*
386	* To follow CSR requirements and maintain cacheline alignment,
387	* make sure all sizes and bases are multiples of group_size.
388	*
389	* The expected entry count is what is left after assigning
390	* eager.
391	*/
392	max_entries = rcd->rcv_array_groups *
393	dd->rcv_entries.group_size;
394	rcvtids = ((max_entries * hfi1_rcvarr_split) / `100`);
395	rcd->egrbufs.count = round_down(rcvtids,
396	dd->rcv_entries.group_size);
397	if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
398	dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
399	rcd->ctxt);
400	rcd->egrbufs.count = MAX_EAGER_ENTRIES;
401	}
402	hfi1_cdbg(PROC,
403	"ctxt%u: max Eager buffer RcvArray entries: %u",
404	rcd->ctxt, rcd->egrbufs.count);
405
406	/*
407	* Allocate array that will hold the eager buffer accounting
408	* data.
409	* This will allocate the maximum possible buffer count based
410	* on the value of the RcvArray split parameter.
411	* The resulting value will be rounded down to the closest
412	* multiple of dd->rcv_entries.group_size.
413	*/
414	rcd->egrbufs.buffers =
415	kcalloc_node(n: rcd->egrbufs.count,
416	size: sizeof(*rcd->egrbufs.buffers),
417	GFP_KERNEL, node: numa);
418	if (!rcd->egrbufs.buffers)
419	goto bail;
420	rcd->egrbufs.rcvtids =
421	kcalloc_node(n: rcd->egrbufs.count,
422	size: sizeof(*rcd->egrbufs.rcvtids),
423	GFP_KERNEL, node: numa);
424	if (!rcd->egrbufs.rcvtids)
425	goto bail;
426	rcd->egrbufs.size = eager_buffer_size;
427	/*
428	* The size of the buffers programmed into the RcvArray
429	* entries needs to be big enough to handle the highest
430	* MTU supported.
431	*/
432	if (rcd->egrbufs.size < hfi1_max_mtu) {
433	rcd->egrbufs.size = __roundup_pow_of_two(n: hfi1_max_mtu);
434	hfi1_cdbg(PROC,
435	"ctxt%u: eager bufs size too small. Adjusting to %u",
436	rcd->ctxt, rcd->egrbufs.size);
437	}
438	rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
439
440	/ Applicable only for statically created kernel contexts /
441	if (ctxt < dd->first_dyn_alloc_ctxt) {
442	rcd->opstats = kzalloc_node(size: sizeof(*rcd->opstats),
443	GFP_KERNEL, node: numa);
444	if (!rcd->opstats)
445	goto bail;
446
447	/ Initialize TID flow generations for the context /
448	hfi1_kern_init_ctxt_generations(rcd);
449	}
450
451	*context = rcd;
452	return `0`;
453	}
454
455	bail:
456	*context = NULL;
457	hfi1_free_ctxt(rcd);
458	return -ENOMEM;
459	}
460
/**
 * hfi1_free_ctxt - free context
 * @rcd: pointer to an initialized rcd data structure, may be NULL
 *
 * This wrapper is the free function that matches hfi1_create_ctxtdata().
 * When a context is done being used (kernel or user), this function is called
 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
 * Other users of the context do a get/put sequence to make sure that the
 * structure isn't removed while in use.
 */
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
	hfi1_rcd_put(rcd);
}
475
476	/*
477	* Select the largest ccti value over all SLs to determine the intra-
478	* packet gap for the link.
479	*
480	* called with cca_timer_lock held (to protect access to cca_timer
481	* array), and rcu_read_lock() (to protect access to cc_state).
482	*/
483	void set_link_ipg(struct hfi1_pportdata *ppd)
484	{
485	struct hfi1_devdata *dd = ppd->dd;
486	struct cc_state *cc_state;
487	int i;
488	u16 cce, ccti_limit, max_ccti = `0`;
489	u16 shift, mult;
490	u64 src;
491	u32 current_egress_rate; / Mbits /sec /
492	u64 max_pkt_time;
493	/*
494	* max_pkt_time is the maximum packet egress time in units
495	* of the fabric clock period 1/(805 MHz).
496	*/
497
498	cc_state = get_cc_state(ppd);
499
500	if (!cc_state)
501	/*
502	* This should _never_ happen - rcu_read_lock() is held,
503	* and set_link_ipg() should not be called if cc_state
504	* is NULL.
505	*/
506	return;
507
508	for (i = `0`; i < OPA_MAX_SLS; i++) {
509	u16 ccti = ppd->cca_timer[i].ccti;
510
511	if (ccti > max_ccti)
512	max_ccti = ccti;
513	}
514
515	ccti_limit = cc_state->cct.ccti_limit;
516	if (max_ccti > ccti_limit)
517	max_ccti = ccti_limit;
518
519	cce = cc_state->cct.entries[max_ccti].entry;
520	shift = (cce & `0xc000`) >> `14`;
521	mult = (cce & `0x3fff`);
522
523	current_egress_rate = active_egress_rate(ppd);
524
525	max_pkt_time = egress_cycles(len: ppd->ibmaxlen, rate: current_egress_rate);
526
527	src = (max_pkt_time >> shift) * mult;
528
529	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
530	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
531
532	write_csr(dd, SEND_STATIC_RATE_CONTROL, value: src);
533	}
534
535	static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
536	{
537	struct cca_timer *cca_timer;
538	struct hfi1_pportdata *ppd;
539	int sl;
540	u16 ccti_timer, ccti_min;
541	struct cc_state *cc_state;
542	unsigned long flags;
543	enum hrtimer_restart ret = HRTIMER_NORESTART;
544
545	cca_timer = container_of(t, struct cca_timer, hrtimer);
546	ppd = cca_timer->ppd;
547	sl = cca_timer->sl;
548
549	rcu_read_lock();
550
551	cc_state = get_cc_state(ppd);
552
553	if (!cc_state) {
554	rcu_read_unlock();
555	return HRTIMER_NORESTART;
556	}
557
558	/*
559	* 1) decrement ccti for SL
560	* 2) calculate IPG for link (set_link_ipg())
561	* 3) restart timer, unless ccti is at min value
562	*/
563
564	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
565	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
566
567	spin_lock_irqsave(&ppd->cca_timer_lock, flags);
568
569	if (cca_timer->ccti > ccti_min) {
570	cca_timer->ccti--;
571	set_link_ipg(ppd);
572	}
573
574	if (cca_timer->ccti > ccti_min) {
575	unsigned long nsec = `1024` * ccti_timer;
576	/ ccti_timer is in units of 1.024 usec /
577	hrtimer_forward_now(timer: t, interval: ns_to_ktime(ns: nsec));
578	ret = HRTIMER_RESTART;
579	}
580
581	spin_unlock_irqrestore(lock: &ppd->cca_timer_lock, flags);
582	rcu_read_unlock();
583	return ret;
584	}
585
586	/*
587	* Common code for initializing the physical port structure.
588	*/
589	void hfi1_init_pportdata(struct pci_dev pdev, struct* hfi1_pportdata *ppd,
590	struct hfi1_devdata *dd, u8 hw_pidx, u32 port)
591	{
592	int i;
593	uint default_pkey_idx;
594	struct cc_state *cc_state;
595
596	ppd->dd = dd;
597	ppd->hw_pidx = hw_pidx;
598	ppd->port = port; / IB port number, not index /
599	ppd->prev_link_width = LINK_WIDTH_DEFAULT;
600	/*
601	* There are C_VL_COUNT number of PortVLXmitWait counters.
602	* Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
603	*/
604	for (i = `0`; i < C_VL_COUNT + `1`; i++) {
605	ppd->port_vl_xmit_wait_last[i] = `0`;
606	ppd->vl_xmit_flit_cnt[i] = `0`;
607	}
608
609	default_pkey_idx = `1`;
610
611	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
612	ppd->part_enforce \|= HFI1_PART_ENFORCE_IN;
613	ppd->pkeys[`0`] = `0x8001`;
614
615	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
616	INIT_WORK(&ppd->link_up_work, handle_link_up);
617	INIT_WORK(&ppd->link_down_work, handle_link_down);
618	INIT_WORK(&ppd->freeze_work, handle_freeze);
619	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
620	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
621	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
622	INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
623	INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
624	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
625
626	mutex_init(&ppd->hls_lock);
627	spin_lock_init(&ppd->qsfp_info.qsfp_lock);
628
629	ppd->qsfp_info.ppd = ppd;
630	ppd->sm_trap_qp = `0x0`;
631	ppd->sa_qp = `0x1`;
632
633	ppd->hfi1_wq = NULL;
634
635	spin_lock_init(&ppd->cca_timer_lock);
636
637	for (i = `0`; i < OPA_MAX_SLS; i++) {
638	hrtimer_init(timer: &ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
639	mode: HRTIMER_MODE_REL);
640	ppd->cca_timer[i].ppd = ppd;
641	ppd->cca_timer[i].sl = i;
642	ppd->cca_timer[i].ccti = `0`;
643	ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
644	}
645
646	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
647
648	spin_lock_init(&ppd->cc_state_lock);
649	spin_lock_init(&ppd->cc_log_lock);
650	cc_state = kzalloc(size: sizeof(*cc_state), GFP_KERNEL);
651	RCU_INIT_POINTER(ppd->cc_state, cc_state);
652	if (!cc_state)
653	goto bail;
654	return;
655
656	bail:
657	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
658	}
659
/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 *
 * Currently there is nothing to do; always returns 0.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
	return 0;
}
668
669	/**
670	* init_after_reset - re-initialize after a reset
671	* @dd: the hfi1_ib device
672	*
673	* sanity check at least some of the values after reset, and
674	* ensure no receive or transmit (explicitly, in case reset
675	* failed
676	*/
677	static int init_after_reset(struct hfi1_devdata *dd)
678	{
679	int i;
680	struct hfi1_ctxtdata *rcd;
681	/*
682	* Ensure chip does no sends or receives, tail updates, or
683	* pioavail updates while we re-initialize. This is mostly
684	* for the driver data structures, not chip registers.
685	*/
686	for (i = `0`; i < dd->num_rcv_contexts; i++) {
687	rcd = hfi1_rcd_get_by_index(dd, ctxt: i);
688	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS \|
689	HFI1_RCVCTRL_INTRAVAIL_DIS \|
690	HFI1_RCVCTRL_TAILUPD_DIS, rcd);
691	hfi1_rcd_put(rcd);
692	}
693	pio_send_control(dd, PSC_GLOBAL_DISABLE);
694	for (i = `0`; i < dd->num_send_contexts; i++)
695	sc_disable(sc: dd->send_contexts[i].sc);
696
697	return `0`;
698	}
699
700	static void enable_chip(struct hfi1_devdata *dd)
701	{
702	struct hfi1_ctxtdata *rcd;
703	u32 rcvmask;
704	u16 i;
705
706	/ enable PIO send /
707	pio_send_control(dd, PSC_GLOBAL_ENABLE);
708
709	/*
710	* Enable kernel ctxts' receive and receive interrupt.
711	* Other ctxts done as user opens and initializes them.
712	*/
713	for (i = `0`; i < dd->first_dyn_alloc_ctxt; ++i) {
714	rcd = hfi1_rcd_get_by_index(dd, ctxt: i);
715	if (!rcd)
716	continue;
717	rcvmask = HFI1_RCVCTRL_CTXT_ENB \| HFI1_RCVCTRL_INTRAVAIL_ENB;
718	rcvmask \|= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
719	HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
720	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
721	rcvmask \|= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
722	if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
723	rcvmask \|= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
724	if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
725	rcvmask \|= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
726	if (HFI1_CAP_IS_KSET(TID_RDMA))
727	rcvmask \|= HFI1_RCVCTRL_TIDFLOW_ENB;
728	hfi1_rcvctrl(dd, op: rcvmask, rcd);
729	sc_enable(sc: rcd->sc);
730	hfi1_rcd_put(rcd);
731	}
732	}
733
734	/**
735	* create_workqueues - create per port workqueues
736	* @dd: the hfi1_ib device
737	*/
738	static int create_workqueues(struct hfi1_devdata *dd)
739	{
740	int pidx;
741	struct hfi1_pportdata *ppd;
742
743	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
744	ppd = dd->pport + pidx;
745	if (!ppd->hfi1_wq) {
746	ppd->hfi1_wq =
747	alloc_workqueue(
748	fmt: "hfi%d_%d",
749	flags: WQ_SYSFS \| WQ_HIGHPRI \| WQ_CPU_INTENSIVE \|
750	WQ_MEM_RECLAIM,
751	HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
752	dd->unit, pidx);
753	if (!ppd->hfi1_wq)
754	goto wq_error;
755	}
756	if (!ppd->link_wq) {
757	/*
758	* Make the link workqueue single-threaded to enforce
759	* serialization.
760	*/
761	ppd->link_wq =
762	alloc_workqueue(
763	fmt: "hfi_link_%d_%d",
764	flags: WQ_SYSFS \| WQ_MEM_RECLAIM \| WQ_UNBOUND,
765	max_active: `1`, / max_active /
766	dd->unit, pidx);
767	if (!ppd->link_wq)
768	goto wq_error;
769	}
770	}
771	return `0`;
772	wq_error:
773	pr_err("alloc_workqueue failed for port %d\n", pidx + `1`);
774	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
775	ppd = dd->pport + pidx;
776	if (ppd->hfi1_wq) {
777	destroy_workqueue(wq: ppd->hfi1_wq);
778	ppd->hfi1_wq = NULL;
779	}
780	if (ppd->link_wq) {
781	destroy_workqueue(wq: ppd->link_wq);
782	ppd->link_wq = NULL;
783	}
784	}
785	return -ENOMEM;
786	}
787
788	/**
789	* destroy_workqueues - destroy per port workqueues
790	* @dd: the hfi1_ib device
791	*/
792	static void destroy_workqueues(struct hfi1_devdata *dd)
793	{
794	int pidx;
795	struct hfi1_pportdata *ppd;
796
797	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
798	ppd = dd->pport + pidx;
799
800	if (ppd->hfi1_wq) {
801	destroy_workqueue(wq: ppd->hfi1_wq);
802	ppd->hfi1_wq = NULL;
803	}
804	if (ppd->link_wq) {
805	destroy_workqueue(wq: ppd->link_wq);
806	ppd->link_wq = NULL;
807	}
808	}
809	}
810
811	/**
812	* enable_general_intr() - Enable the IRQs that will be handled by the
813	* general interrupt handler.
814	* @dd: valid devdata
815	*
816	*/
817	static void enable_general_intr(struct hfi1_devdata *dd)
818	{
819	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, set: true);
820	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, set: true);
821	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, set: true);
822	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, set: true);
823	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, set: true);
824	set_intr_bits(dd, IS_DC_START, IS_DC_END, set: true);
825	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, set: true);
826	}
827
828	/**
829	* hfi1_init - do the actual initialization sequence on the chip
830	* @dd: the hfi1_ib device
831	* @reinit: re-initializing, so don't allocate new memory
832	*
833	* Do the actual initialization sequence on the chip. This is done
834	* both from the init routine called from the PCI infrastructure, and
835	* when we reset the chip, or detect that it was reset internally,
836	* or it's administratively re-enabled.
837	*
838	* Memory allocation here and in called routines is only done in
839	* the first case (reinit == 0). We have to be careful, because even
840	* without memory allocation, we need to re-write all the chip registers
841	* TIDs, etc. after the reset or enable has completed.
842	*/
843	int hfi1_init(struct hfi1_devdata dd, int* reinit)
844	{
845	int ret = `0`, pidx, lastfail = `0`;
846	unsigned long len;
847	u16 i;
848	struct hfi1_ctxtdata *rcd;
849	struct hfi1_pportdata *ppd;
850
851	/ Set up send low level handlers /
852	dd->process_pio_send = hfi1_verbs_send_pio;
853	dd->process_dma_send = hfi1_verbs_send_dma;
854	dd->pio_inline_send = pio_copy;
855	dd->process_vnic_dma_send = hfi1_vnic_send_dma;
856
857	if (is_ax(dd)) {
858	atomic_set(v: &dd->drop_packet, DROP_PACKET_ON);
859	dd->do_drop = true;
860	} else {
861	atomic_set(v: &dd->drop_packet, DROP_PACKET_OFF);
862	dd->do_drop = false;
863	}
864
865	/ make sure the link is not "up" /
866	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
867	ppd = dd->pport + pidx;
868	ppd->linkup = `0`;
869	}
870
871	if (reinit)
872	ret = init_after_reset(dd);
873	else
874	ret = loadtime_init(dd);
875	if (ret)
876	goto done;
877
878	/ dd->rcd can be NULL if early initialization failed /
879	for (i = `0`; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
880	/*
881	* Set up the (kernel) rcvhdr queue and egr TIDs. If doing
882	* re-init, the simplest way to handle this is to free
883	* existing, and re-allocate.
884	* Need to re-create rest of ctxt 0 ctxtdata as well.
885	*/
886	rcd = hfi1_rcd_get_by_index(dd, ctxt: i);
887	if (!rcd)
888	continue;
889
890	lastfail = hfi1_create_rcvhdrq(dd, rcd);
891	if (!lastfail)
892	lastfail = hfi1_setup_eagerbufs(rcd);
893	if (!lastfail)
894	lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
895	if (lastfail) {
896	dd_dev_err(dd,
897	"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
898	ret = lastfail;
899	}
900	/ enable IRQ /
901	hfi1_rcd_put(rcd);
902	}
903
904	/ Allocate enough memory for user event notification. /
905	len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
906	sizeof(*dd->events));
907	dd->events = vmalloc_user(size: len);
908	if (!dd->events)
909	dd_dev_err(dd, "Failed to allocate user events page\n");
910	/*
911	* Allocate a page for device and port status.
912	* Page will be shared amongst all user processes.
913	*/
914	dd->status = vmalloc_user(PAGE_SIZE);
915	if (!dd->status)
916	dd_dev_err(dd, "Failed to allocate dev status page\n");
917	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
918	ppd = dd->pport + pidx;
919	if (dd->status)
920	/ Currently, we only have one port /
921	ppd->statusp = &dd->status->port;
922
923	set_mtu(ppd);
924	}
925
926	/ enable chip even if we have an error, so we can debug cause /
927	enable_chip(dd);
928
929	done:
930	/*
931	* Set status even if port serdes is not initialized
932	* so that diags will work.
933	*/
934	if (dd->status)
935	dd->status->dev \|= HFI1_STATUS_CHIP_PRESENT \|
936	HFI1_STATUS_INITTED;
937	if (!ret) {
938	/ enable all interrupts from the chip /
939	enable_general_intr(dd);
940	init_qsfp_int(dd);
941
942	/ chip is OK for user apps; mark it as initialized /
943	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
944	ppd = dd->pport + pidx;
945
946	/*
947	* start the serdes - must be after interrupts are
948	* enabled so we are notified when the link goes up
949	*/
950	lastfail = bringup_serdes(ppd);
951	if (lastfail)
952	dd_dev_info(dd,
953	"Failed to bring up port %u\n",
954	ppd->port);
955
956	/*
957	* Set status even if port serdes is not initialized
958	* so that diags will work.
959	*/
960	if (ppd->statusp)
961	*ppd->statusp \|= HFI1_STATUS_CHIP_PRESENT \|
962	HFI1_STATUS_INITTED;
963	if (!ppd->link_speed_enabled)
964	continue;
965	}
966	}
967
968	/ if ret is non-zero, we probably should do some cleanup here... /
969	return ret;
970	}
971
972	struct hfi1_devdata hfi1_lookup(int* unit)
973	{
974	return xa_load(&hfi1_dev_table, index: unit);
975	}
976
977	/*
978	* Stop the timers during unit shutdown, or after an error late
979	* in initialization.
980	*/
981	static void stop_timers(struct hfi1_devdata *dd)
982	{
983	struct hfi1_pportdata *ppd;
984	int pidx;
985
986	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
987	ppd = dd->pport + pidx;
988	if (ppd->led_override_timer.function) {
989	del_timer_sync(timer: &ppd->led_override_timer);
990	atomic_set(v: &ppd->led_override_timer_active, i: `0`);
991	}
992	}
993	}
994
995	/**
996	* shutdown_device - shut down a device
997	* @dd: the hfi1_ib device
998	*
999	* This is called to make the device quiet when we are about to
1000	* unload the driver, and also when the device is administratively
1001	* disabled. It does not free any data structures.
1002	* Everything it does has to be setup again by hfi1_init(dd, 1)
1003	*/
1004	static void shutdown_device(struct hfi1_devdata *dd)
1005	{
1006	struct hfi1_pportdata *ppd;
1007	struct hfi1_ctxtdata *rcd;
1008	unsigned pidx;
1009	int i;
1010
1011	if (dd->flags & HFI1_SHUTDOWN)
1012	return;
1013	dd->flags \|= HFI1_SHUTDOWN;
1014
1015	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1016	ppd = dd->pport + pidx;
1017
1018	ppd->linkup = `0`;
1019	if (ppd->statusp)
1020	*ppd->statusp &= ~(HFI1_STATUS_IB_CONF \|
1021	HFI1_STATUS_IB_READY);
1022	}
1023	dd->flags &= ~HFI1_INITTED;
1024
1025	/ mask and clean up interrupts /
1026	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, set: false);
1027	msix_clean_up_interrupts(dd);
1028
1029	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1030	for (i = `0`; i < dd->num_rcv_contexts; i++) {
1031	rcd = hfi1_rcd_get_by_index(dd, ctxt: i);
1032	hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS \|
1033	HFI1_RCVCTRL_CTXT_DIS \|
1034	HFI1_RCVCTRL_INTRAVAIL_DIS \|
1035	HFI1_RCVCTRL_PKEY_DIS \|
1036	HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
1037	hfi1_rcd_put(rcd);
1038	}
1039	/*
1040	* Gracefully stop all sends allowing any in progress to
1041	* trickle out first.
1042	*/
1043	for (i = `0`; i < dd->num_send_contexts; i++)
1044	sc_flush(sc: dd->send_contexts[i].sc);
1045	}
1046
1047	/*
1048	* Enough for anything that's going to trickle out to have actually
1049	* done so.
1050	*/
1051	udelay(`20`);
1052
1053	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1054	ppd = dd->pport + pidx;
1055
1056	/ disable all contexts /
1057	for (i = `0`; i < dd->num_send_contexts; i++)
1058	sc_disable(sc: dd->send_contexts[i].sc);
1059	/ disable the send device /
1060	pio_send_control(dd, PSC_GLOBAL_DISABLE);
1061
1062	shutdown_led_override(ppd);
1063
1064	/*
1065	* Clear SerdesEnable.
1066	* We can't count on interrupts since we are stopping.
1067	*/
1068	hfi1_quiet_serdes(ppd);
1069	if (ppd->hfi1_wq)
1070	flush_workqueue(ppd->hfi1_wq);
1071	if (ppd->link_wq)
1072	flush_workqueue(ppd->link_wq);
1073	}
1074	sdma_exit(dd);
1075	}
1076
1077	/**
1078	* hfi1_free_ctxtdata - free a context's allocated data
1079	* @dd: the hfi1_ib device
1080	* @rcd: the ctxtdata structure
1081	*
1082	* free up any allocated data for a context
1083	* It should never change any chip state, or global driver state.
1084	*/
1085	void hfi1_free_ctxtdata(struct hfi1_devdata dd, struct* hfi1_ctxtdata *rcd)
1086	{
1087	u32 e;
1088
1089	if (!rcd)
1090	return;
1091
1092	if (rcd->rcvhdrq) {
1093	dma_free_coherent(dev: &dd->pcidev->dev, size: rcvhdrq_size(rcd),
1094	cpu_addr: rcd->rcvhdrq, dma_handle: rcd->rcvhdrq_dma);
1095	rcd->rcvhdrq = NULL;
1096	if (hfi1_rcvhdrtail_kvaddr(rcd)) {
1097	dma_free_coherent(dev: &dd->pcidev->dev, PAGE_SIZE,
1098	cpu_addr: (void *)hfi1_rcvhdrtail_kvaddr(rcd),
1099	dma_handle: rcd->rcvhdrqtailaddr_dma);
1100	rcd->rcvhdrtail_kvaddr = NULL;
1101	}
1102	}
1103
1104	/ all the RcvArray entries should have been cleared by now /
1105	kfree(objp: rcd->egrbufs.rcvtids);
1106	rcd->egrbufs.rcvtids = NULL;
1107
1108	for (e = `0`; e < rcd->egrbufs.alloced; e++) {
1109	if (rcd->egrbufs.buffers[e].addr)
1110	dma_free_coherent(dev: &dd->pcidev->dev,
1111	size: rcd->egrbufs.buffers[e].len,
1112	cpu_addr: rcd->egrbufs.buffers[e].addr,
1113	dma_handle: rcd->egrbufs.buffers[e].dma);
1114	}
1115	kfree(objp: rcd->egrbufs.buffers);
1116	rcd->egrbufs.alloced = `0`;
1117	rcd->egrbufs.buffers = NULL;
1118
1119	sc_free(sc: rcd->sc);
1120	rcd->sc = NULL;
1121
1122	vfree(addr: rcd->subctxt_uregbase);
1123	vfree(addr: rcd->subctxt_rcvegrbuf);
1124	vfree(addr: rcd->subctxt_rcvhdr_base);
1125	kfree(objp: rcd->opstats);
1126
1127	rcd->subctxt_uregbase = NULL;
1128	rcd->subctxt_rcvegrbuf = NULL;
1129	rcd->subctxt_rcvhdr_base = NULL;
1130	rcd->opstats = NULL;
1131	}
1132
1133	/*
1134	* Release our hold on the shared asic data. If we are the last one,
1135	* return the structure to be finalized outside the lock. Must be
1136	* holding hfi1_dev_table lock.
1137	*/
1138	static struct hfi1_asic_data release_asic_data(struct* hfi1_devdata *dd)
1139	{
1140	struct hfi1_asic_data *ad;
1141	int other;
1142
1143	if (!dd->asic_data)
1144	return NULL;
1145	dd->asic_data->dds[dd->hfi1_id] = NULL;
1146	other = dd->hfi1_id ? `0` : `1`;
1147	ad = dd->asic_data;
1148	dd->asic_data = NULL;
1149	/ return NULL if the other dd still has a link /
1150	return ad->dds[other] ? NULL : ad;
1151	}
1152
/*
 * Tear down the i2c state attached to the asic data and free the
 * structure itself.  The caller in this file passes whatever
 * release_asic_data() returned, which can be NULL.
 */
static void finalize_asic_data(struct hfi1_devdata *dd,
			       struct hfi1_asic_data *ad)
{
	clean_up_i2c(dd, ad);
	kfree(ad);
}
1159
1160	/**
1161	* hfi1_free_devdata - cleans up and frees per-unit data structure
1162	* @dd: pointer to a valid devdata structure
1163	*
1164	* It cleans up and frees all data structures set up by
1165	* by hfi1_alloc_devdata().
1166	*/
1167	void hfi1_free_devdata(struct hfi1_devdata *dd)
1168	{
1169	struct hfi1_asic_data *ad;
1170	unsigned long flags;
1171
1172	xa_lock_irqsave(&hfi1_dev_table, flags);
1173	__xa_erase(&hfi1_dev_table, index: dd->unit);
1174	ad = release_asic_data(dd);
1175	xa_unlock_irqrestore(&hfi1_dev_table, flags);
1176
1177	finalize_asic_data(dd, ad);
1178	free_platform_config(dd);
1179	rcu_barrier(); / wait for rcu callbacks to complete /
1180	free_percpu(pdata: dd->int_counter);
1181	free_percpu(pdata: dd->rcv_limit);
1182	free_percpu(pdata: dd->send_schedule);
1183	free_percpu(pdata: dd->tx_opstats);
1184	dd->int_counter = NULL;
1185	dd->rcv_limit = NULL;
1186	dd->send_schedule = NULL;
1187	dd->tx_opstats = NULL;
1188	kfree(objp: dd->comp_vect);
1189	dd->comp_vect = NULL;
1190	if (dd->rcvhdrtail_dummy_kvaddr)
1191	dma_free_coherent(dev: &dd->pcidev->dev, size: sizeof(u64),
1192	cpu_addr: (void *)dd->rcvhdrtail_dummy_kvaddr,
1193	dma_handle: dd->rcvhdrtail_dummy_dma);
1194	dd->rcvhdrtail_dummy_kvaddr = NULL;
1195	sdma_clean(dd, num_engines: dd->num_sdma);
1196	rvt_dealloc_device(rdi: &dd->verbs_dev.rdi);
1197	}
1198
1199	/**
1200	* hfi1_alloc_devdata - Allocate our primary per-unit data structure.
1201	* @pdev: Valid PCI device
1202	* @extra: How many bytes to alloc past the default
1203	*
1204	* Must be done via verbs allocator, because the verbs cleanup process
1205	* both does cleanup and free of the data structure.
1206	* "extra" is for chip-specific data.
1207	*/
1208	static struct hfi1_devdata hfi1_alloc_devdata(struct* pci_dev *pdev,
1209	size_t extra)
1210	{
1211	struct hfi1_devdata *dd;
1212	int ret, nports;
1213
1214	/ extra is * number of ports /
1215	nports = extra / sizeof(struct hfi1_pportdata);
1216
1217	dd = (struct hfi1_devdata )rvt_alloc_device(size: sizeof(dd) + extra,
1218	nports);
1219	if (!dd)
1220	return ERR_PTR(error: -ENOMEM);
1221	dd->num_pports = nports;
1222	dd->pport = (struct hfi1_pportdata *)(dd + `1`);
1223	dd->pcidev = pdev;
1224	pci_set_drvdata(pdev, data: dd);
1225
1226	ret = xa_alloc_irq(xa: &hfi1_dev_table, id: &dd->unit, entry: dd, xa_limit_32b,
1227	GFP_KERNEL);
1228	if (ret < `0`) {
1229	dev_err(&pdev->dev,
1230	"Could not allocate unit ID: error %d\n", -ret);
1231	goto bail;
1232	}
1233	rvt_set_ibdev_name(rdi: &dd->verbs_dev.rdi, fmt: "%s_%d", name: class_name(), unit: dd->unit);
1234	/*
1235	* If the BIOS does not have the NUMA node information set, select
1236	* NUMA 0 so we get consistent performance.
1237	*/
1238	dd->node = pcibus_to_node(pdev->bus);
1239	if (dd->node == NUMA_NO_NODE) {
1240	dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
1241	dd->node = `0`;
1242	}
1243
1244	/*
1245	* Initialize all locks for the device. This needs to be as early as
1246	* possible so locks are usable.
1247	*/
1248	spin_lock_init(&dd->sc_lock);
1249	spin_lock_init(&dd->sendctrl_lock);
1250	spin_lock_init(&dd->rcvctrl_lock);
1251	spin_lock_init(&dd->uctxt_lock);
1252	spin_lock_init(&dd->hfi1_diag_trans_lock);
1253	spin_lock_init(&dd->sc_init_lock);
1254	spin_lock_init(&dd->dc8051_memlock);
1255	seqlock_init(&dd->sc2vl_lock);
1256	spin_lock_init(&dd->sde_map_lock);
1257	spin_lock_init(&dd->pio_map_lock);
1258	mutex_init(&dd->dc8051_lock);
1259	init_waitqueue_head(&dd->event_queue);
1260	spin_lock_init(&dd->irq_src_lock);
1261
1262	dd->int_counter = alloc_percpu(u64);
1263	if (!dd->int_counter) {
1264	ret = -ENOMEM;
1265	goto bail;
1266	}
1267
1268	dd->rcv_limit = alloc_percpu(u64);
1269	if (!dd->rcv_limit) {
1270	ret = -ENOMEM;
1271	goto bail;
1272	}
1273
1274	dd->send_schedule = alloc_percpu(u64);
1275	if (!dd->send_schedule) {
1276	ret = -ENOMEM;
1277	goto bail;
1278	}
1279
1280	dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
1281	if (!dd->tx_opstats) {
1282	ret = -ENOMEM;
1283	goto bail;
1284	}
1285
1286	dd->comp_vect = kzalloc(size: sizeof(*dd->comp_vect), GFP_KERNEL);
1287	if (!dd->comp_vect) {
1288	ret = -ENOMEM;
1289	goto bail;
1290	}
1291
1292	/ allocate dummy tail memory for all receive contexts /
1293	dd->rcvhdrtail_dummy_kvaddr =
1294	dma_alloc_coherent(dev: &dd->pcidev->dev, size: sizeof(u64),
1295	dma_handle: &dd->rcvhdrtail_dummy_dma, GFP_KERNEL);
1296	if (!dd->rcvhdrtail_dummy_kvaddr) {
1297	ret = -ENOMEM;
1298	goto bail;
1299	}
1300
1301	atomic_set(v: &dd->ipoib_rsm_usr_num, i: `0`);
1302	return dd;
1303
1304	bail:
1305	hfi1_free_devdata(dd);
1306	return ERR_PTR(error: ret);
1307	}
1308
1309	/*
1310	* Called from freeze mode handlers, and from PCI error
1311	* reporting code. Should be paranoid about state of
1312	* system and data structures.
1313	*/
1314	void hfi1_disable_after_error(struct hfi1_devdata *dd)
1315	{
1316	if (dd->flags & HFI1_INITTED) {
1317	u32 pidx;
1318
1319	dd->flags &= ~HFI1_INITTED;
1320	if (dd->pport)
1321	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1322	struct hfi1_pportdata *ppd;
1323
1324	ppd = dd->pport + pidx;
1325	if (dd->flags & HFI1_PRESENT)
1326	set_link_state(ppd, HLS_DN_DISABLE);
1327
1328	if (ppd->statusp)
1329	*ppd->statusp &= ~HFI1_STATUS_IB_READY;
1330	}
1331	}
1332
1333	/*
1334	* Mark as having had an error for driver, and also
1335	* for /sys and status word mapped to user programs.
1336	* This marks unit as not usable, until reset.
1337	*/
1338	if (dd->status)
1339	dd->status->dev \|= HFI1_STATUS_HWERROR;
1340	}
1341
1342	static void remove_one(struct pci_dev *);
1343	static int init_one(struct pci_dev , const* struct pci_device_id *);
1344	static void shutdown_one(struct pci_dev *);
1345
1346	#define DRIVER_LOAD_MSG "Cornelis " DRIVER_NAME " loaded: "
1347	#define PFX DRIVER_NAME ": "
1348
1349	const struct pci_device_id hfi1_pci_tbl[] = {
1350	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
1351	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
1352	{ `0`, }
1353	};
1354
1355	MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
1356
1357	static struct pci_driver hfi1_pci_driver = {
1358	.name = DRIVER_NAME,
1359	.probe = init_one,
1360	.remove = remove_one,
1361	.shutdown = shutdown_one,
1362	.id_table = hfi1_pci_tbl,
1363	.err_handler = &hfi1_pci_err_handler,
1364	};
1365
1366	static void __init compute_krcvqs(void)
1367	{
1368	int i;
1369
1370	for (i = `0`; i < krcvqsset; i++)
1371	n_krcvqs += krcvqs[i];
1372	}
1373
1374	/*
1375	* Do all the generic driver unit- and chip-independent memory
1376	* allocation and initialization.
1377	*/
1378	static int __init hfi1_mod_init(void)
1379	{
1380	int ret;
1381
1382	ret = dev_init();
1383	if (ret)
1384	goto bail;
1385
1386	ret = node_affinity_init();
1387	if (ret)
1388	goto bail;
1389
1390	/ validate max MTU before any devices start /
1391	if (!valid_opa_max_mtu(mtu: hfi1_max_mtu)) {
1392	pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
1393	hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
1394	hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
1395	}
1396	/ valid CUs run from 1-128 in powers of 2 /
1397	if (hfi1_cu > `128` \|\| !is_power_of_2(n: hfi1_cu))
1398	hfi1_cu = `1`;
1399	/ valid credit return threshold is 0-100, variable is unsigned /
1400	if (user_credit_return_threshold > `100`)
1401	user_credit_return_threshold = `100`;
1402
1403	compute_krcvqs();
1404	/*
1405	* sanitize receive interrupt count, time must wait until after
1406	* the hardware type is known
1407	*/
1408	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
1409	rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
1410	/ reject invalid combinations /
1411	if (rcv_intr_count == `0` && rcv_intr_timeout == `0`) {
1412	pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
1413	rcv_intr_count = `1`;
1414	}
1415	if (rcv_intr_count > `1` && rcv_intr_timeout == `0`) {
1416	/*
1417	* Avoid indefinite packet delivery by requiring a timeout
1418	* if count is > 1.
1419	*/
1420	pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
1421	rcv_intr_timeout = `1`;
1422	}
1423	if (rcv_intr_dynamic && !(rcv_intr_count > `1` && rcv_intr_timeout > `0`)) {
1424	/*
1425	* The dynamic algorithm expects a non-zero timeout
1426	* and a count > 1.
1427	*/
1428	pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
1429	rcv_intr_dynamic = `0`;
1430	}
1431
1432	/ sanitize link CRC options /
1433	link_crc_mask &= SUPPORTED_CRCS;
1434
1435	ret = opfn_init();
1436	if (ret < `0`) {
1437	pr_err("Failed to allocate opfn_wq");
1438	goto bail_dev;
1439	}
1440
1441	/*
1442	* These must be called before the driver is registered with
1443	* the PCI subsystem.
1444	*/
1445	hfi1_dbg_init();
1446	ret = pci_register_driver(&hfi1_pci_driver);
1447	if (ret < `0`) {
1448	pr_err("Unable to register driver: error %d\n", -ret);
1449	goto bail_dev;
1450	}
1451	goto bail; / all OK /
1452
1453	bail_dev:
1454	hfi1_dbg_exit();
1455	dev_cleanup();
1456	bail:
1457	return ret;
1458	}
1459
1460	module_init(hfi1_mod_init);
1461
1462	/*
1463	* Do the non-unit driver cleanup, memory free, etc. at unload.
1464	*/
1465	static void __exit hfi1_mod_cleanup(void)
1466	{
1467	pci_unregister_driver(dev: &hfi1_pci_driver);
1468	opfn_exit();
1469	node_affinity_destroy_all();
1470	hfi1_dbg_exit();
1471
1472	WARN_ON(!xa_empty(&hfi1_dev_table));
1473	dispose_firmware(); / asymmetric with obtain_firmware() /
1474	dev_cleanup();
1475	}
1476
1477	module_exit(hfi1_mod_cleanup);
1478
1479	/ this can only be called after a successful initialization /
1480	static void cleanup_device_data(struct hfi1_devdata *dd)
1481	{
1482	int ctxt;
1483	int pidx;
1484
1485	/ users can't do anything more with chip /
1486	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1487	struct hfi1_pportdata *ppd = &dd->pport[pidx];
1488	struct cc_state *cc_state;
1489	int i;
1490
1491	if (ppd->statusp)
1492	*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
1493
1494	for (i = `0`; i < OPA_MAX_SLS; i++)
1495	hrtimer_cancel(timer: &ppd->cca_timer[i].hrtimer);
1496
1497	spin_lock(lock: &ppd->cc_state_lock);
1498	cc_state = get_cc_state_protected(ppd);
1499	RCU_INIT_POINTER(ppd->cc_state, NULL);
1500	spin_unlock(lock: &ppd->cc_state_lock);
1501
1502	if (cc_state)
1503	kfree_rcu(cc_state, rcu);
1504	}
1505
1506	free_credit_return(dd);
1507
1508	/*
1509	* Free any resources still in use (usually just kernel contexts)
1510	* at unload; we do for ctxtcnt, because that's what we allocate.
1511	*/
1512	for (ctxt = `0`; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
1513	struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
1514
1515	if (rcd) {
1516	hfi1_free_ctxt_rcv_groups(rcd);
1517	hfi1_free_ctxt(rcd);
1518	}
1519	}
1520
1521	kfree(objp: dd->rcd);
1522	dd->rcd = NULL;
1523
1524	free_pio_map(dd);
1525	/ must follow rcv context free - need to remove rcv's hooks /
1526	for (ctxt = `0`; ctxt < dd->num_send_contexts; ctxt++)
1527	sc_free(sc: dd->send_contexts[ctxt].sc);
1528	dd->num_send_contexts = `0`;
1529	kfree(objp: dd->send_contexts);
1530	dd->send_contexts = NULL;
1531	kfree(objp: dd->hw_to_sw);
1532	dd->hw_to_sw = NULL;
1533	kfree(objp: dd->boardname);
1534	vfree(addr: dd->events);
1535	vfree(addr: dd->status);
1536	}
1537
1538	/*
1539	* Clean up on unit shutdown, or error during unit load after
1540	* successful initialization.
1541	*/
1542	static void postinit_cleanup(struct hfi1_devdata *dd)
1543	{
1544	hfi1_start_cleanup(dd);
1545	hfi1_comp_vectors_clean_up(dd);
1546	hfi1_dev_affinity_clean_up(dd);
1547
1548	hfi1_pcie_ddcleanup(dd);
1549	hfi1_pcie_cleanup(pdev: dd->pcidev);
1550
1551	cleanup_device_data(dd);
1552
1553	hfi1_free_devdata(dd);
1554	}
1555
1556	static int init_one(struct pci_dev pdev, const* struct pci_device_id *ent)
1557	{
1558	int ret = `0`, j, pidx, initfail;
1559	struct hfi1_devdata *dd;
1560	struct hfi1_pportdata *ppd;
1561
1562	/ First, lock the non-writable module parameters /
1563	HFI1_CAP_LOCK();
1564
1565	/ Validate dev ids /
1566	if (!(ent->device == PCI_DEVICE_ID_INTEL0 \|\|
1567	ent->device == PCI_DEVICE_ID_INTEL1)) {
1568	dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
1569	ent->device);
1570	ret = -ENODEV;
1571	goto bail;
1572	}
1573
1574	/ Allocate the dd so we can get to work /
1575	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
1576	sizeof(struct hfi1_pportdata));
1577	if (IS_ERR(ptr: dd)) {
1578	ret = PTR_ERR(ptr: dd);
1579	goto bail;
1580	}
1581
1582	/ Validate some global module parameters /
1583	ret = hfi1_validate_rcvhdrcnt(dd, thecnt: rcvhdrcnt);
1584	if (ret)
1585	goto bail;
1586
1587	/ use the encoding function as a sanitization check /
1588	if (!encode_rcv_header_entry_size(size: hfi1_hdrq_entsize)) {
1589	dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
1590	hfi1_hdrq_entsize);
1591	ret = -EINVAL;
1592	goto bail;
1593	}
1594
1595	/ The receive eager buffer size must be set before the receive*
1596	* contexts are created.
1597	*
1598	* Set the eager buffer size. Validate that it falls in a range
1599	* allowed by the hardware - all powers of 2 between the min and
1600	* max. The maximum valid MTU is within the eager buffer range
1601	* so we do not need to cap the max_mtu by an eager buffer size
1602	* setting.
1603	*/
1604	if (eager_buffer_size) {
1605	if (!is_power_of_2(n: eager_buffer_size))
1606	eager_buffer_size =
1607	roundup_pow_of_two(eager_buffer_size);
1608	eager_buffer_size =
1609	clamp_val(eager_buffer_size,
1610	MIN_EAGER_BUFFER * `8`,
1611	MAX_EAGER_BUFFER_TOTAL);
1612	dd_dev_info(dd, "Eager buffer size %u\n",
1613	eager_buffer_size);
1614	} else {
1615	dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
1616	ret = -EINVAL;
1617	goto bail;
1618	}
1619
1620	/ restrict value of hfi1_rcvarr_split /
1621	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, `0`, `100`);
1622
1623	ret = hfi1_pcie_init(dd);
1624	if (ret)
1625	goto bail;
1626
1627	/*
1628	* Do device-specific initialization, function table setup, dd
1629	* allocation, etc.
1630	*/
1631	ret = hfi1_init_dd(dd);
1632	if (ret)
1633	goto clean_bail; / error already printed /
1634
1635	ret = create_workqueues(dd);
1636	if (ret)
1637	goto clean_bail;
1638
1639	/ do the generic initialization /
1640	initfail = hfi1_init(dd, reinit: `0`);
1641
1642	ret = hfi1_register_ib_device(dd);
1643
1644	/*
1645	* Now ready for use. this should be cleared whenever we
1646	* detect a reset, or initiate one. If earlier failure,
1647	* we still create devices, so diags, etc. can be used
1648	* to determine cause of problem.
1649	*/
1650	if (!initfail && !ret) {
1651	dd->flags \|= HFI1_INITTED;
1652	/ create debufs files after init and ib register /
1653	hfi1_dbg_ibdev_init(ibd: &dd->verbs_dev);
1654	}
1655
1656	j = hfi1_device_create(dd);
1657	if (j)
1658	dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
1659
1660	if (initfail \|\| ret) {
1661	msix_clean_up_interrupts(dd);
1662	stop_timers(dd);
1663	flush_workqueue(ib_wq);
1664	for (pidx = `0`; pidx < dd->num_pports; ++pidx) {
1665	hfi1_quiet_serdes(ppd: dd->pport + pidx);
1666	ppd = dd->pport + pidx;
1667	if (ppd->hfi1_wq) {
1668	destroy_workqueue(wq: ppd->hfi1_wq);
1669	ppd->hfi1_wq = NULL;
1670	}
1671	if (ppd->link_wq) {
1672	destroy_workqueue(wq: ppd->link_wq);
1673	ppd->link_wq = NULL;
1674	}
1675	}
1676	if (!j)
1677	hfi1_device_remove(dd);
1678	if (!ret)
1679	hfi1_unregister_ib_device(dd);
1680	postinit_cleanup(dd);
1681	if (initfail)
1682	ret = initfail;
1683	goto bail; / everything already cleaned /
1684	}
1685
1686	sdma_start(dd);
1687
1688	return `0`;
1689
1690	clean_bail:
1691	hfi1_pcie_cleanup(pdev);
1692	bail:
1693	return ret;
1694	}
1695
1696	static void wait_for_clients(struct hfi1_devdata *dd)
1697	{
1698	/*
1699	* Remove the device init value and complete the device if there is
1700	* no clients or wait for active clients to finish.
1701	*/
1702	if (refcount_dec_and_test(r: &dd->user_refcount))
1703	complete(&dd->user_comp);
1704
1705	wait_for_completion(&dd->user_comp);
1706	}
1707
1708	static void remove_one(struct pci_dev *pdev)
1709	{
1710	struct hfi1_devdata *dd = pci_get_drvdata(pdev);
1711
1712	/ close debugfs files before ib unregister /
1713	hfi1_dbg_ibdev_exit(ibd: &dd->verbs_dev);
1714
1715	/ remove the /dev hfi1 interface /
1716	hfi1_device_remove(dd);
1717
1718	/ wait for existing user space clients to finish /
1719	wait_for_clients(dd);
1720
1721	/ unregister from IB core /
1722	hfi1_unregister_ib_device(dd);
1723
1724	/ free netdev data /
1725	hfi1_free_rx(dd);
1726
1727	/*
1728	* Disable the IB link, disable interrupts on the device,
1729	* clear dma engines, etc.
1730	*/
1731	shutdown_device(dd);
1732	destroy_workqueues(dd);
1733
1734	stop_timers(dd);
1735
1736	/ wait until all of our (qsfp) queue_work() calls complete /
1737	flush_workqueue(ib_wq);
1738
1739	postinit_cleanup(dd);
1740	}
1741
/* PCI shutdown callback: quiesce the device without freeing anything. */
static void shutdown_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	shutdown_device(dd);
}
1748
1749	/**
1750	* hfi1_create_rcvhdrq - create a receive header queue
1751	* @dd: the hfi1_ib device
1752	* @rcd: the context data
1753	*
1754	* This must be contiguous memory (from an i/o perspective), and must be
1755	* DMA'able (which means for some systems, it will go through an IOMMU,
1756	* or be forced into a low address range).
1757	*/
1758	int hfi1_create_rcvhdrq(struct hfi1_devdata dd, struct* hfi1_ctxtdata *rcd)
1759	{
1760	unsigned amt;
1761
1762	if (!rcd->rcvhdrq) {
1763	amt = rcvhdrq_size(rcd);
1764
1765	rcd->rcvhdrq = dma_alloc_coherent(dev: &dd->pcidev->dev, size: amt,
1766	dma_handle: &rcd->rcvhdrq_dma,
1767	GFP_KERNEL);
1768
1769	if (!rcd->rcvhdrq) {
1770	dd_dev_err(dd,
1771	"attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
1772	amt, rcd->ctxt);
1773	goto bail;
1774	}
1775
1776	if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) \|\|
1777	HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
1778	rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(dev: &dd->pcidev->dev,
1779	PAGE_SIZE,
1780	dma_handle: &rcd->rcvhdrqtailaddr_dma,
1781	GFP_KERNEL);
1782	if (!rcd->rcvhdrtail_kvaddr)
1783	goto bail_free;
1784	}
1785	}
1786
1787	set_hdrq_regs(dd: rcd->dd, ctxt: rcd->ctxt, entsize: rcd->rcvhdrqentsize,
1788	hdrcnt: rcd->rcvhdrq_cnt);
1789
1790	return `0`;
1791
1792	bail_free:
1793	dd_dev_err(dd,
1794	"attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
1795	rcd->ctxt);
1796	dma_free_coherent(dev: &dd->pcidev->dev, size: amt, cpu_addr: rcd->rcvhdrq,
1797	dma_handle: rcd->rcvhdrq_dma);
1798	rcd->rcvhdrq = NULL;
1799	bail:
1800	return -ENOMEM;
1801	}
1802
1803	/**
1804	* hfi1_setup_eagerbufs - llocate eager buffers, both kernel and user
1805	* contexts.
1806	* @rcd: the context we are setting up.
1807	*
1808	* Allocate the eager TID buffers and program them into hip.
1809	* They are no longer completely contiguous, we do multiple allocation
1810	* calls. Otherwise we get the OOM code involved, by asking for too
1811	* much per call, with disastrous results on some kernels.
1812	*/
1813	int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
1814	{
1815	struct hfi1_devdata *dd = rcd->dd;
1816	u32 max_entries, egrtop, alloced_bytes = `0`;
1817	u16 order, idx = `0`;
1818	int ret = `0`;
1819	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
1820
1821	/*
1822	* The minimum size of the eager buffers is a groups of MTU-sized
1823	* buffers.
1824	* The global eager_buffer_size parameter is checked against the
1825	* theoretical lower limit of the value. Here, we check against the
1826	* MTU.
1827	*/
1828	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
1829	rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
1830	/*
1831	* If using one-pkt-per-egr-buffer, lower the eager buffer
1832	* size to the max MTU (page-aligned).
1833	*/
1834	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
1835	rcd->egrbufs.rcvtid_size = round_mtu;
1836
1837	/*
1838	* Eager buffers sizes of 1MB or less require smaller TID sizes
1839	* to satisfy the "multiple of 8 RcvArray entries" requirement.
1840	*/
1841	if (rcd->egrbufs.size <= (`1` << `20`))
1842	rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
1843	rounddown_pow_of_two(rcd->egrbufs.size / `8`));
1844
1845	while (alloced_bytes < rcd->egrbufs.size &&
1846	rcd->egrbufs.alloced < rcd->egrbufs.count) {
1847	rcd->egrbufs.buffers[idx].addr =
1848	dma_alloc_coherent(dev: &dd->pcidev->dev,
1849	size: rcd->egrbufs.rcvtid_size,
1850	dma_handle: &rcd->egrbufs.buffers[idx].dma,
1851	GFP_KERNEL);
1852	if (rcd->egrbufs.buffers[idx].addr) {
1853	rcd->egrbufs.buffers[idx].len =
1854	rcd->egrbufs.rcvtid_size;
1855	rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
1856	rcd->egrbufs.buffers[idx].addr;
1857	rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
1858	rcd->egrbufs.buffers[idx].dma;
1859	rcd->egrbufs.alloced++;
1860	alloced_bytes += rcd->egrbufs.rcvtid_size;
1861	idx++;
1862	} else {
1863	u32 new_size, i, j;
1864	u64 offset = `0`;
1865
1866	/*
1867	* Fail the eager buffer allocation if:
1868	* - we are already using the lowest acceptable size
1869	* - we are using one-pkt-per-egr-buffer (this implies
1870	* that we are accepting only one size)
1871	*/
1872	if (rcd->egrbufs.rcvtid_size == round_mtu \|\|
1873	!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
1874	dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
1875	rcd->ctxt);
1876	ret = -ENOMEM;
1877	goto bail_rcvegrbuf_phys;
1878	}
1879
1880	new_size = rcd->egrbufs.rcvtid_size / `2`;
1881
1882	/*
1883	* If the first attempt to allocate memory failed, don't
1884	* fail everything but continue with the next lower
1885	* size.
1886	*/
1887	if (idx == `0`) {
1888	rcd->egrbufs.rcvtid_size = new_size;
1889	continue;
1890	}
1891
1892	/*
1893	* Re-partition already allocated buffers to a smaller
1894	* size.
1895	*/
1896	rcd->egrbufs.alloced = `0`;
1897	for (i = `0`, j = `0`, offset = `0`; j < idx; i++) {
1898	if (i >= rcd->egrbufs.count)
1899	break;
1900	rcd->egrbufs.rcvtids[i].dma =
1901	rcd->egrbufs.buffers[j].dma + offset;
1902	rcd->egrbufs.rcvtids[i].addr =
1903	rcd->egrbufs.buffers[j].addr + offset;
1904	rcd->egrbufs.alloced++;
1905	if ((rcd->egrbufs.buffers[j].dma + offset +
1906	new_size) ==
1907	(rcd->egrbufs.buffers[j].dma +
1908	rcd->egrbufs.buffers[j].len)) {
1909	j++;
1910	offset = `0`;
1911	} else {
1912	offset += new_size;
1913	}
1914	}
1915	rcd->egrbufs.rcvtid_size = new_size;
1916	}
1917	}
1918	rcd->egrbufs.numbufs = idx;
1919	rcd->egrbufs.size = alloced_bytes;
1920
1921	hfi1_cdbg(PROC,
1922	"ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB",
1923	rcd->ctxt, rcd->egrbufs.alloced,
1924	rcd->egrbufs.rcvtid_size / `1024`, rcd->egrbufs.size / `1024`);
1925
1926	/*
1927	* Set the contexts rcv array head update threshold to the closest
1928	* power of 2 (so we can use a mask instead of modulo) below half
1929	* the allocated entries.
1930	*/
1931	rcd->egrbufs.threshold =
1932	rounddown_pow_of_two(rcd->egrbufs.alloced / `2`);
1933	/*
1934	* Compute the expected RcvArray entry base. This is done after
1935	* allocating the eager buffers in order to maximize the
1936	* expected RcvArray entries for the context.
1937	*/
1938	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
1939	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
1940	rcd->expected_count = max_entries - egrtop;
1941	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * `2`)
1942	rcd->expected_count = MAX_TID_PAIR_ENTRIES * `2`;
1943
1944	rcd->expected_base = rcd->eager_base + egrtop;
1945	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u",
1946	rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
1947	rcd->eager_base, rcd->expected_base);
1948
1949	if (!hfi1_rcvbuf_validate(size: rcd->egrbufs.rcvtid_size, PT_EAGER, encode: &order)) {
1950	hfi1_cdbg(PROC,
1951	"ctxt%u: current Eager buffer size is invalid %u",
1952	rcd->ctxt, rcd->egrbufs.rcvtid_size);
1953	ret = -EINVAL;
1954	goto bail_rcvegrbuf_phys;
1955	}
1956
1957	for (idx = `0`; idx < rcd->egrbufs.alloced; idx++) {
1958	hfi1_put_tid(dd, index: rcd->eager_base + idx, PT_EAGER,
1959	pa: rcd->egrbufs.rcvtids[idx].dma, order);
1960	cond_resched();
1961	}
1962
1963	return `0`;
1964
1965	bail_rcvegrbuf_phys:
1966	for (idx = `0`; idx < rcd->egrbufs.alloced &&
1967	rcd->egrbufs.buffers[idx].addr;
1968	idx++) {
1969	dma_free_coherent(dev: &dd->pcidev->dev,
1970	size: rcd->egrbufs.buffers[idx].len,
1971	cpu_addr: rcd->egrbufs.buffers[idx].addr,
1972	dma_handle: rcd->egrbufs.buffers[idx].dma);
1973	rcd->egrbufs.buffers[idx].addr = NULL;
1974	rcd->egrbufs.buffers[idx].dma = `0`;
1975	rcd->egrbufs.buffers[idx].len = `0`;
1976	}
1977
1978	return ret;
1979	}
1980

/* source code of linux/drivers/infiniband/hw/hfi1/init.c */