i915_perf.c source code [linux/drivers/gpu/drm/i915/i915_perf.c]

1	/*
2	* Copyright © 2015-2016 Intel Corporation
3	*
4	* Permission is hereby granted, free of charge, to any person obtaining a
5	* copy of this software and associated documentation files (the "Software"),
6	* to deal in the Software without restriction, including without limitation
7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8	* and/or sell copies of the Software, and to permit persons to whom the
9	* Software is furnished to do so, subject to the following conditions:
10	*
11	* The above copyright notice and this permission notice (including the next
12	* paragraph) shall be included in all copies or substantial portions of the
13	* Software.
14	*
15	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21	* IN THE SOFTWARE.
22	*
23	* Authors:
24	* Robert Bragg <robert@sixbynine.org>
25	*/
26
27
28	/**
29	* DOC: i915 Perf Overview
30	*
31	* Gen graphics supports a large number of performance counters that can help
32	* driver and application developers understand and optimize their use of the
33	* GPU.
34	*
35	* This i915 perf interface enables userspace to configure and open a file
36	* descriptor representing a stream of GPU metrics which can then be read() as
37	* a stream of sample records.
38	*
39	* The interface is particularly suited to exposing buffered metrics that are
40	* captured by DMA from the GPU, unsynchronized with and unrelated to the CPU.
41	*
42	* Streams representing a single context are accessible to applications with a
43	* corresponding drm file descriptor, such that OpenGL can use the interface
44	* without special privileges. Access to system-wide metrics requires root
45	* privileges by default, unless changed via the dev.i915.perf_stream_paranoid
46	* sysctl option.
47	*
48	*/
49
50	/**
51	* DOC: i915 Perf History and Comparison with Core Perf
52	*
53	* The interface was initially inspired by the core Perf infrastructure but
54	* some notable differences are:
55	*
56	* i915 perf file descriptors represent a "stream" instead of an "event"; where
57	* a perf event primarily corresponds to a single 64bit value, while a stream
58	* might sample sets of tightly-coupled counters, depending on the
59	* configuration. For example the Gen OA unit isn't designed to support
60	* orthogonal configurations of individual counters; it's configured for a set
61	* of related counters. Samples for an i915 perf stream capturing OA metrics
62	* will include a set of counter values packed in a compact HW specific format.
63	* The OA unit supports a number of different packing formats which can be
64	* selected by the user opening the stream. Perf has support for grouping
65	* events, but each event in the group is configured, validated and
66	* authenticated individually with separate system calls.
67	*
68	* i915 perf stream configurations are provided as an array of u64 (key,value)
69	* pairs, instead of a fixed struct with multiple miscellaneous config members,
70	* interleaved with event-type specific members.
71	*
72	* i915 perf doesn't support exposing metrics via an mmap'd circular buffer.
73	* The supported metrics are being written to memory by the GPU unsynchronized
74	* with the CPU, using HW specific packing formats for counter sets. Sometimes
75	* the constraints on HW configuration require reports to be filtered before it
76	* would be acceptable to expose them to unprivileged applications - to hide
77	* the metrics of other processes/contexts. For these use cases a read() based
78	* interface is a good fit, and provides an opportunity to filter data as it
79	* gets copied from the GPU mapped buffers to userspace buffers.
80	*
81	*
82	* Issues hit with first prototype based on Core Perf
83	* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
84	*
85	* The first prototype of this driver was based on the core perf
86	* infrastructure, and while we did make that mostly work, with some changes to
87	* perf, we found we were breaking or working around too many assumptions baked
88	* into perf's currently cpu centric design.
89	*
90	* In the end we didn't see a clear benefit to making perf's implementation and
91	* interface more complex by changing design assumptions while we knew we still
92	* wouldn't be able to use any existing perf based userspace tools.
93	*
94	* Also considering the Gen specific nature of the Observability hardware and
95	* how userspace will sometimes need to combine i915 perf OA metrics with
96	* side-band OA data captured via MI_REPORT_PERF_COUNT commands; we're
97	* expecting the interface to be used by a platform specific userspace such as
98	* OpenGL or tools. This is to say; we aren't inherently missing out on having
99	* a standard vendor/architecture agnostic interface by not using perf.
100	*
101	*
102	* For posterity, in case we might re-visit trying to adapt core perf to be
103	* better suited to exposing i915 metrics these were the main pain points we
104	* hit:
105	*
106	* - The perf based OA PMU driver broke some significant design assumptions:
107	*
108	* Existing perf pmus are used for profiling work on a cpu and we were
109	* introducing the idea of _IS_DEVICE pmus with different security
110	* implications, the need to fake cpu-related data (such as user/kernel
111	* registers) to fit with perf's current design, and adding _DEVICE records
112	* as a way to forward device-specific status records.
113	*
114	* The OA unit writes reports of counters into a circular buffer, without
115	* involvement from the CPU, making our PMU driver the first of a kind.
116	*
117	* Given the way we were periodically forwarding data from the GPU-mapped, OA
118	* buffer to perf's buffer, those bursts of sample writes looked to perf like
119	* we were sampling too fast and so we had to subvert its throttling checks.
120	*
121	* Perf supports groups of counters and allows those to be read via
122	* transactions internally but transactions currently seem designed to be
123	* explicitly initiated from the cpu (say in response to a userspace read())
124	* and while we could pull a report out of the OA buffer we can't
125	* trigger a report from the cpu on demand.
126	*
127	* Related to being report based; the OA counters are configured in HW as a
128	* set while perf generally expects counter configurations to be orthogonal.
129	* Although counters can be associated with a group leader as they are
130	* opened, there's no clear precedent for being able to provide group-wide
131	* configuration attributes (for example we want to let userspace choose the
132	* OA unit report format used to capture all counters in a set, or specify a
133	* GPU context to filter metrics on). We avoided using perf's grouping
134	* feature and forwarded OA reports to userspace via perf's 'raw' sample
135	* field. This suited our userspace well considering how coupled the counters
136	* are when dealing with normalizing. It would be inconvenient to split
137	* counters up into separate events, only to require userspace to recombine
138	* them. For Mesa it's also convenient to be forwarded raw, periodic reports
139	* for combining with the side-band raw reports it captures using
140	* MI_REPORT_PERF_COUNT commands.
141	*
142	* - As a side note on perf's grouping feature; there was also some concern
143	* that using PERF_FORMAT_GROUP as a way to pack together counter values
144	* would quite drastically inflate our sample sizes, which would likely
145	* lower the effective sampling resolutions we could use when the available
146	* memory bandwidth is limited.
147	*
148	* With the OA unit's report formats, counters are packed together as 32
149	* or 40bit values, with the largest report size being 256 bytes.
150	*
151	* PERF_FORMAT_GROUP values are 64bit, but there doesn't appear to be a
152	* documented ordering to the values, implying PERF_FORMAT_ID must also be
153	* used to add a 64bit ID before each value; giving 16 bytes per counter.
154	*
155	* Related to counter orthogonality; we can't time share the OA unit, while
156	* event scheduling is a central design idea within perf for allowing
157	* userspace to open + enable more events than can be configured in HW at any
158	* one time. The OA unit is not designed to allow re-configuration while in
159	* use. We can't reconfigure the OA unit without losing internal OA unit
160	* state which we can't access explicitly to save and restore. Reconfiguring
161	* the OA unit is also relatively slow, involving ~100 register writes. From
162	* userspace Mesa also depends on a stable OA configuration when emitting
163	* MI_REPORT_PERF_COUNT commands and importantly the OA unit can't be
164	* disabled while there are outstanding MI_RPC commands lest we hang the
165	* command streamer.
166	*
167	* The contents of sample records aren't extensible by device drivers (i.e.
168	* the sample_type bits). As an example; Sourab Gupta had been looking to
169	* attach GPU timestamps to our OA samples. We were shoehorning OA reports
170	* into sample records by using the 'raw' field, but it's tricky to pack more
171	* than one thing into this field because events/core.c currently only lets a
172	* pmu give a single raw data pointer plus len which will be copied into the
173	* ring buffer. To include more than the OA report we'd have to copy the
174	* report into an intermediate larger buffer. I'd been considering allowing a
175	* vector of data+len values to be specified for copying the raw data, but
176	* it felt like a kludge to be using the raw field for this purpose.
177	*
178	* - It felt like our perf based PMU was making some technical compromises
179	* just for the sake of using perf:
180	*
181	* perf_event_open() requires events to either relate to a pid or a specific
182	* cpu core, while our device pmu related to neither. Events opened with a
183	* pid will be automatically enabled/disabled according to the scheduling of
184	* that process - so not appropriate for us. When an event is related to a
185	* cpu id, perf ensures pmu methods will be invoked via an inter-processor
186	* interrupt on that core. To avoid invasive changes our userspace opened OA
187	* perf events for a specific cpu. This was workable but it meant the
188	* majority of the OA driver ran in atomic context, including all OA report
189	* forwarding, which wasn't really necessary in our case and seems to make
190	* our locking requirements somewhat complex as we handled the interaction
191	* with the rest of the i915 driver.
192	*/
193
194	#include <linux/anon_inodes.h>
195	#include <linux/nospec.h>
196	#include <linux/sizes.h>
197	#include <linux/uuid.h>
198
199	#include "gem/i915_gem_context.h"
200	#include "gem/i915_gem_internal.h"
201	#include "gt/intel_engine_pm.h"
202	#include "gt/intel_engine_regs.h"
203	#include "gt/intel_engine_user.h"
204	#include "gt/intel_execlists_submission.h"
205	#include "gt/intel_gpu_commands.h"
206	#include "gt/intel_gt.h"
207	#include "gt/intel_gt_clock_utils.h"
208	#include "gt/intel_gt_mcr.h"
209	#include "gt/intel_gt_print.h"
210	#include "gt/intel_gt_regs.h"
211	#include "gt/intel_lrc.h"
212	#include "gt/intel_lrc_reg.h"
213	#include "gt/intel_rc6.h"
214	#include "gt/intel_ring.h"
215	#include "gt/uc/intel_guc_slpc.h"
216
217	#include "i915_drv.h"
218	#include "i915_file_private.h"
219	#include "i915_perf.h"
220	#include "i915_perf_oa_regs.h"
221	#include "i915_reg.h"
222
/* HW requires this to be a power of two, between 128k and 16M, though driver
 * is currently generally designed assuming the largest 16M size is used such
 * that the overflow cases are unlikely in normal operation.
 */
#define OA_BUFFER_SIZE		SZ_16M

/* Number of bytes between @head and @tail in the circular OA buffer, i.e.
 * (tail - head) wrapped to OA_BUFFER_SIZE. Both arguments are parenthesized
 * so that compound expressions (e.g. a conditional) expand as intended.
 */
#define OA_TAKEN(tail, head)	(((tail) - (head)) & (OA_BUFFER_SIZE - 1))
230
231	/**
232	* DOC: OA Tail Pointer Race
233	*
234	* There's a HW race condition between OA unit tail pointer register updates and
235	* writes to memory whereby the tail pointer can sometimes get ahead of what's
236	* been written out to the OA buffer so far (in terms of what's visible to the
237	* CPU).
238	*
239	* Although this can be observed explicitly while copying reports to userspace
240	* by checking for a zeroed report-id field in tail reports, we want to account
241	* for this earlier, as part of the oa_buffer_check_unlocked to avoid lots of
242	* redundant read() attempts.
243	*
244	* We workaround this issue in oa_buffer_check_unlocked() by reading the reports
245	* in the OA buffer, starting from the tail reported by the HW until we find a
246	* report with its first 2 dwords not 0 meaning its previous report is
247	* completely in memory and ready to be read. Those dwords are also set to 0
248	* once read and the whole buffer is cleared upon OA buffer initialization. The
249	* first dword is the reason for this report while the second is the timestamp,
250	* making the chances of having those 2 fields at 0 fairly unlikely. A more
251	* detailed explanation is available in oa_buffer_check_unlocked().
252	*
253	* Most of the implementation details for this workaround are in
254	* oa_buffer_check_unlocked() and _append_oa_reports()
255	*
256	* Note for posterity: previously the driver used to define an effective tail
257	* pointer that lagged the real pointer by a 'tail margin' measured in bytes
258	* derived from %OA_TAIL_MARGIN_NSEC and the configured sampling frequency.
259	* This was flawed considering that the OA unit may also automatically generate
260	* non-periodic reports (such as on context switch) or the OA unit may be
261	* enabled without any periodic sampling.
262	*/
/* Historical tail margin in nanoseconds; see "DOC: OA Tail Pointer Race". */
#define OA_TAIL_MARGIN_NSEC	100000ULL
/* Sentinel value for a tail pointer that holds no valid offset. */
#define INVALID_TAIL_PTR	0xffffffff
265
/* The default frequency for checking whether the OA unit has written new
 * reports to the circular OA buffer...
 */
#define DEFAULT_POLL_FREQUENCY_HZ 200
/* The same default expressed as a period in nanoseconds. */
#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ)
271
272	/ for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid /
273	static u32 i915_perf_stream_paranoid = true;
274
/* The maximum exponent the hardware accepts is 63 (essentially it selects one
 * of the 64bit timestamp bits to trigger reports from) but there's currently
 * no known use case for sampling as infrequently as once per 47 thousand years.
 *
 * Since the timestamps included in OA reports are only 32bits it seems
 * reasonable to limit the OA exponent where it's still possible to account for
 * overflow in OA report timestamps.
 */
#define OA_EXPONENT_MAX 31

/* Value written over a report's context ID field when it is squashed. */
#define INVALID_CTX_ID 0xffffffff
286
/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
#define OAREPORT_REASON_MASK           0x3f
#define OAREPORT_REASON_MASK_EXTENDED  0x7f
#define OAREPORT_REASON_SHIFT          19
#define OAREPORT_REASON_TIMER          (1<<0)
#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
#define OAREPORT_REASON_CLK_RATIO      (1<<5)

/* True when the device's full graphics IP version is at least 12.50. */
#define HAS_MI_SET_PREDICATE(i915) (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
296
297	/ For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate*
298	*
299	* The highest sampling frequency we can theoretically program the OA unit
300	* with is always half the timestamp frequency: E.g. 6.25Mhz for Haswell.
301	*
302	* Initialized just before we register the sysctl parameter.
303	*/
304	static int oa_sample_rate_hard_limit;
305
306	/ Theoretically we can program the OA unit to sample every 160ns but don't*
307	* allow that by default unless root...
308	*
309	* The default threshold of 100000Hz is based on perf's similar
310	* kernel.perf_event_max_sample_rate sysctl parameter.
311	*/
312	static u32 i915_oa_max_sample_rate = `100000`;
313
314	/ XXX: beware if future OA HW adds new report formats that the current*
315	* code assumes all reports have a power-of-two size and ~(size - 1) can
316	* be used as a mask to align the OA tail pointer.
317	*/
318	static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = {
319	[I915_OA_FORMAT_A13] = { `0`, `64` },
320	[I915_OA_FORMAT_A29] = { .format: `1`, .size: `128` },
321	[I915_OA_FORMAT_A13_B8_C8] = { .format: `2`, .size: `128` },
322	/ A29_B8_C8 Disallowed as 192 bytes doesn't factor into buffer size /
323	[I915_OA_FORMAT_B4_C8] = { .format: `4`, .size: `64` },
324	[I915_OA_FORMAT_A45_B8_C8] = { .format: `5`, .size: `256` },
325	[I915_OA_FORMAT_B4_C8_A16] = { .format: `6`, .size: `128` },
326	[I915_OA_FORMAT_C4_B8] = { .format: `7`, .size: `64` },
327	[I915_OA_FORMAT_A12] = { .format: `0`, .size: `64` },
328	[I915_OA_FORMAT_A12_B8_C8] = { .format: `2`, .size: `128` },
329	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { .format: `5`, .size: `256` },
330	[I915_OAR_FORMAT_A32u40_A4u32_B8_C8] = { .format: `5`, .size: `256` },
331	[I915_OA_FORMAT_A24u40_A14u32_B8_C8] = { .format: `5`, .size: `256` },
332	[I915_OAM_FORMAT_MPEC8u64_B8_C8] = { .format: `1`, .size: `192`, .type: TYPE_OAM, .header: HDR_64_BIT },
333	[I915_OAM_FORMAT_MPEC8u32_B8_C8] = { .format: `2`, .size: `128`, .type: TYPE_OAM, .header: HDR_64_BIT },
334	};
335
336	static const u32 mtl_oa_base[] = {
337	[PERF_GROUP_OAM_SAMEDIA_0] = `0x393000`,
338	};
339
340	#define SAMPLE_OA_REPORT (1<<0)
341
342	/**
343	* struct perf_open_properties - for validated properties given to open a stream
344	* @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
345	* @single_context: Whether a single or all gpu contexts should be monitored
346	* @hold_preemption: Whether the preemption is disabled for the filtered
347	* context
348	* @ctx_handle: A gem ctx handle for use with @single_context
349	* @metrics_set: An ID for an OA unit metric set advertised via sysfs
350	* @oa_format: An OA unit HW report format
351	* @oa_periodic: Whether to enable periodic OA unit sampling
352	* @oa_period_exponent: The OA unit sampling period is derived from this
353	* @engine: The engine (typically rcs0) being monitored by the OA unit
354	* @has_sseu: Whether @sseu was specified by userspace
355	* @sseu: internal SSEU configuration computed either from the userspace
356	* specified configuration in the opening parameters or a default value
357	* (see get_default_sseu_config())
358	* @poll_oa_period: The period in nanoseconds at which the CPU will check for OA
359	* data availability
360	*
361	* As read_properties_unlocked() enumerates and validates the properties given
362	* to open a stream of metrics the configuration is built up in the structure
363	* which starts out zero initialized.
364	*/
365	struct perf_open_properties {
366	u32 sample_flags;
367
368	u64 single_context:`1`;
369	u64 hold_preemption:`1`;
370	u64 ctx_handle;
371
372	/ OA sampling state /
373	int metrics_set;
374	int oa_format;
375	bool oa_periodic;
376	int oa_period_exponent;
377
378	struct intel_engine_cs *engine;
379
380	bool has_sseu;
381	struct intel_sseu sseu;
382
383	u64 poll_oa_period;
384	};
385
386	struct i915_oa_config_bo {
387	struct llist_node node;
388
389	struct i915_oa_config *oa_config;
390	struct i915_vma *vma;
391	};
392
/* NOTE(review): presumably the handle for the registered i915 perf sysctl
 * table - registration is outside this view, confirm.
 */
static struct ctl_table_header *sysctl_header;

/* Forward declaration; the hrtimer callback is defined later in the file. */
static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer);
396
397	void i915_oa_config_release(struct kref *ref)
398	{
399	struct i915_oa_config *oa_config =
400	container_of(ref, typeof(*oa_config), ref);
401
402	kfree(objp: oa_config->flex_regs);
403	kfree(objp: oa_config->b_counter_regs);
404	kfree(objp: oa_config->mux_regs);
405
406	kfree_rcu(oa_config, rcu);
407	}
408
409	struct i915_oa_config *
410	i915_perf_get_oa_config(struct i915_perf perf, int* metrics_set)
411	{
412	struct i915_oa_config *oa_config;
413
414	rcu_read_lock();
415	oa_config = idr_find(&perf->metrics_idr, id: metrics_set);
416	if (oa_config)
417	oa_config = i915_oa_config_get(oa_config);
418	rcu_read_unlock();
419
420	return oa_config;
421	}
422
423	static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
424	{
425	i915_oa_config_put(oa_config: oa_bo->oa_config);
426	i915_vma_put(vma: oa_bo->vma);
427	kfree(objp: oa_bo);
428	}
429
430	static inline const
431	struct i915_perf_regs __oa_regs(struct* i915_perf_stream *stream)
432	{
433	return &stream->engine->oa_group->regs;
434	}
435
436	static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
437	{
438	struct intel_uncore *uncore = stream->uncore;
439
440	return intel_uncore_read(uncore, reg: __oa_regs(stream)->oa_tail_ptr) &
441	GEN12_OAG_OATAILPTR_MASK;
442	}
443
444	static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream)
445	{
446	struct intel_uncore *uncore = stream->uncore;
447
448	return intel_uncore_read(uncore, GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
449	}
450
451	static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
452	{
453	struct intel_uncore *uncore = stream->uncore;
454	u32 oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
455
456	return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
457	}
458
/* True when the stream's OA buffer format uses 64-bit report header fields. */
#define oa_report_header_64bit(__s) \
	((__s)->oa_buffer.format->header == HDR_64_BIT)
461
462	static u64 oa_report_id(struct i915_perf_stream stream, void* *report)
463	{
464	return oa_report_header_64bit(stream) ? (u64 )report : (u32 )report;
465	}
466
467	static u64 oa_report_reason(struct i915_perf_stream stream, void* *report)
468	{
469	return (oa_report_id(stream, report) >> OAREPORT_REASON_SHIFT) &
470	(GRAPHICS_VER(stream->perf->i915) == `12` ?
471	OAREPORT_REASON_MASK_EXTENDED :
472	OAREPORT_REASON_MASK);
473	}
474
475	static void oa_report_id_clear(struct i915_perf_stream stream, u32 report)
476	{
477	if (oa_report_header_64bit(stream))
478	(u64 )report = `0`;
479	else
480	*report = `0`;
481	}
482
483	static bool oa_report_ctx_invalid(struct i915_perf_stream stream, void* *report)
484	{
485	return !(oa_report_id(stream, report) &
486	stream->perf->gen8_valid_ctx_bit);
487	}
488
489	static u64 oa_timestamp(struct i915_perf_stream stream, void* *report)
490	{
491	return oa_report_header_64bit(stream) ?
492	((u64 )report + `1`) :
493	((u32 )report + `1`);
494	}
495
496	static void oa_timestamp_clear(struct i915_perf_stream stream, u32 report)
497	{
498	if (oa_report_header_64bit(stream))
499	(u64 )&report[`2`] = `0`;
500	else
501	report[`1`] = `0`;
502	}
503
504	static u32 oa_context_id(struct i915_perf_stream stream, u32 report)
505	{
506	u32 ctx_id = oa_report_header_64bit(stream) ? report[`4`] : report[`2`];
507
508	return ctx_id & stream->specific_ctx_id_mask;
509	}
510
511	static void oa_context_id_squash(struct i915_perf_stream stream, u32 report)
512	{
513	if (oa_report_header_64bit(stream))
514	report[`4`] = INVALID_CTX_ID;
515	else
516	report[`2`] = INVALID_CTX_ID;
517	}
518
519	/**
520	* oa_buffer_check_unlocked - check for data and update tail ptr state
521	* @stream: i915 stream instance
522	*
523	* This is either called via fops (for blocking reads in user ctx) or the poll
524	* check hrtimer (atomic ctx) to check the OA buffer tail pointer and check
525	* if there is data available for userspace to read.
526	*
527	* This function is central to providing a workaround for the OA unit tail
528	* pointer having a race with respect to what data is visible to the CPU.
529	* It is responsible for reading tail pointers from the hardware and giving
530	* the pointers time to 'age' before they are made available for reading.
531	* (See description of OA_TAIL_MARGIN_NSEC above for further details.)
532	*
533	* Besides returning true when there is data available to read() this function
534	* also updates the tail in the oa_buffer object.
535	*
536	* Note: It's safe to read OA config state here unlocked, assuming that this is
537	* only called while the stream is enabled, while the global OA configuration
538	* can't be modified.
539	*
540	* Returns: %true if the OA buffer contains data, else %false
541	*/
542	static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
543	{
544	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
545	int report_size = stream->oa_buffer.format->size;
546	u32 tail, hw_tail;
547	unsigned long flags;
548	bool pollin;
549	u32 partial_report_size;
550
551	/ We have to consider the (unlikely) possibility that read() errors*
552	* could result in an OA buffer reset which might reset the head and
553	* tail state.
554	*/
555	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
556
557	hw_tail = stream->perf->ops.oa_hw_tail_read(stream);
558	hw_tail -= gtt_offset;
559
560	/ The tail pointer increases in 64 byte increments, not in report_size*
561	* steps. Also the report size may not be a power of 2. Compute
562	* potentially partially landed report in the OA buffer
563	*/
564	partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail);
565	partial_report_size %= report_size;
566
567	/ Subtract partial amount off the tail /
568	hw_tail = OA_TAKEN(hw_tail, partial_report_size);
569
570	tail = hw_tail;
571
572	/ Walk the stream backward until we find a report with report*
573	* id and timestmap not at 0. Since the circular buffer pointers
574	* progress by increments of 64 bytes and that reports can be up
575	* to 256 bytes long, we can't tell whether a report has fully
576	* landed in memory before the report id and timestamp of the
577	* following report have effectively landed.
578	*
579	* This is assuming that the writes of the OA unit land in
580	* memory in the order they were written to.
581	* If not : (╯°□°）╯︵ ┻━┻
582	*/
583	while (OA_TAKEN(tail, stream->oa_buffer.tail) >= report_size) {
584	void *report = stream->oa_buffer.vaddr + tail;
585
586	if (oa_report_id(stream, report) \|\|
587	oa_timestamp(stream, report))
588	break;
589
590	tail = (tail - report_size) & (OA_BUFFER_SIZE - `1`);
591	}
592
593	if (OA_TAKEN(hw_tail, tail) > report_size &&
594	__ratelimit(&stream->perf->tail_pointer_race))
595	drm_notice(&stream->uncore->i915->drm,
596	"unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n",
597	stream->oa_buffer.head, tail, hw_tail);
598
599	stream->oa_buffer.tail = tail;
600
601	pollin = OA_TAKEN(stream->oa_buffer.tail,
602	stream->oa_buffer.head) >= report_size;
603
604	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
605
606	return pollin;
607	}
608
609	/**
610	* append_oa_status - Appends a status record to a userspace read() buffer.
611	* @stream: An i915-perf stream opened for OA metrics
612	* @buf: destination buffer given by userspace
613	* @count: the number of bytes userspace wants to read
614	* @offset: (inout): the current position for writing into @buf
615	* @type: The kind of status to report to userspace
616	*
617	* Writes a status record (such as `DRM_I915_PERF_RECORD_OA_REPORT_LOST`)
618	* into the userspace read() buffer.
619	*
620	* The @buf @offset will only be updated on success.
621	*
622	* Returns: 0 on success, negative error code on failure.
623	*/
624	static int append_oa_status(struct i915_perf_stream *stream,
625	char __user *buf,
626	size_t count,
627	size_t *offset,
628	enum drm_i915_perf_record_type type)
629	{
630	struct drm_i915_perf_record_header header = { type, `0`, sizeof(header) };
631
632	if ((count - *offset) < header.size)
633	return -ENOSPC;
634
635	if (copy_to_user(to: buf + offset, from: &header, n: sizeof*(header)))
636	return -EFAULT;
637
638	(*offset) += header.size;
639
640	return `0`;
641	}
642
643	/**
644	* append_oa_sample - Copies single OA report into userspace read() buffer.
645	* @stream: An i915-perf stream opened for OA metrics
646	* @buf: destination buffer given by userspace
647	* @count: the number of bytes userspace wants to read
648	* @offset: (inout): the current position for writing into @buf
649	* @report: A single OA report to (optionally) include as part of the sample
650	*
651	* The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
652	* properties when opening a stream, tracked as `stream->sample_flags`. This
653	* function copies the requested components of a single sample to the given
654	* read() @buf.
655	*
656	* The @buf @offset will only be updated on success.
657	*
658	* Returns: 0 on success, negative error code on failure.
659	*/
660	static int append_oa_sample(struct i915_perf_stream *stream,
661	char __user *buf,
662	size_t count,
663	size_t *offset,
664	const u8 *report)
665	{
666	int report_size = stream->oa_buffer.format->size;
667	struct drm_i915_perf_record_header header;
668	int report_size_partial;
669	u8 *oa_buf_end;
670
671	header.type = DRM_I915_PERF_RECORD_SAMPLE;
672	header.pad = `0`;
673	header.size = stream->sample_size;
674
675	if ((count - *offset) < header.size)
676	return -ENOSPC;
677
678	buf += *offset;
679	if (copy_to_user(to: buf, from: &header, n: sizeof(header)))
680	return -EFAULT;
681	buf += sizeof(header);
682
683	oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE;
684	report_size_partial = oa_buf_end - report;
685
686	if (report_size_partial < report_size) {
687	if (copy_to_user(to: buf, from: report, n: report_size_partial))
688	return -EFAULT;
689	buf += report_size_partial;
690
691	if (copy_to_user(to: buf, from: stream->oa_buffer.vaddr,
692	n: report_size - report_size_partial))
693	return -EFAULT;
694	} else if (copy_to_user(to: buf, from: report, n: report_size)) {
695	return -EFAULT;
696	}
697
698	(*offset) += header.size;
699
700	return `0`;
701	}
702
703	/**
704	* gen8_append_oa_reports - Copies all buffered OA reports into
705	* userspace read() buffer.
706	* @stream: An i915-perf stream opened for OA metrics
707	* @buf: destination buffer given by userspace
708	* @count: the number of bytes userspace wants to read
709	* @offset: (inout): the current position for writing into @buf
710	*
711	* Notably any error condition resulting in a short read (-%ENOSPC or
712	* -%EFAULT) will be returned even though one or more records may
713	* have been successfully copied. In this case it's up to the caller
714	* to decide if the error should be squashed before returning to
715	* userspace.
716	*
717	* Note: reports are consumed from the head, and appended to the
718	* tail, so the tail chases the head?... If you think that's mad
719	* and back-to-front you're not alone, but this follows the
720	* Gen PRM naming convention.
721	*
722	* Returns: 0 on success, negative error code on failure.
723	*/
724	static int gen8_append_oa_reports(struct i915_perf_stream *stream,
725	char __user *buf,
726	size_t count,
727	size_t *offset)
728	{
729	struct intel_uncore *uncore = stream->uncore;
730	int report_size = stream->oa_buffer.format->size;
731	u8 *oa_buf_base = stream->oa_buffer.vaddr;
732	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
733	u32 mask = (OA_BUFFER_SIZE - `1`);
734	size_t start_offset = *offset;
735	unsigned long flags;
736	u32 head, tail;
737	int ret = `0`;
738
739	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
740	return -EIO;
741
742	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
743
744	head = stream->oa_buffer.head;
745	tail = stream->oa_buffer.tail;
746
747	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
748
749	/*
750	* An out of bounds or misaligned head or tail pointer implies a driver
751	* bug since we validate + align the tail pointers we read from the
752	* hardware and we are in full control of the head pointer which should
753	* only be incremented by multiples of the report size.
754	*/
755	if (drm_WARN_ONCE(&uncore->i915->drm,
756	head > OA_BUFFER_SIZE \|\|
757	tail > OA_BUFFER_SIZE,
758	"Inconsistent OA buffer pointers: head = %u, tail = %u\n",
759	head, tail))
760	return -EIO;
761
762
763	for (/ none /;
764	OA_TAKEN(tail, head);
765	head = (head + report_size) & mask) {
766	u8 *report = oa_buf_base + head;
767	u32 report32 = (void* *)report;
768	u32 ctx_id;
769	u64 reason;
770
771	/*
772	* The reason field includes flags identifying what
773	* triggered this specific report (mostly timer
774	* triggered or e.g. due to a context switch).
775	*/
776	reason = oa_report_reason(stream, report);
777	ctx_id = oa_context_id(stream, report: report32);
778
779	/*
780	* Squash whatever is in the CTX_ID field if it's marked as
781	* invalid to be sure we avoid false-positive, single-context
782	* filtering below...
783	*
784	* Note: that we don't clear the valid_ctx_bit so userspace can
785	* understand that the ID has been squashed by the kernel.
786	*
787	* Update:
788	*
789	* On XEHP platforms the behavior of context id valid bit has
790	* changed compared to prior platforms. To describe this, we
791	* define a few terms:
792	*
793	* context-switch-report: This is a report with the reason type
794	* being context-switch. It is generated when a context switches
795	* out.
796	*
797	* context-valid-bit: A bit that is set in the report ID field
798	* to indicate that a valid context has been loaded.
799	*
800	* gpu-idle: A condition characterized by a
801	* context-switch-report with context-valid-bit set to 0.
802	*
803	* On prior platforms, context-id-valid bit is set to 0 only
804	* when GPU goes idle. In all other reports, it is set to 1.
805	*
806	* On XEHP platforms, context-valid-bit is set to 1 in a context
807	* switch report if a new context switched in. For all other
808	* reports it is set to 0.
809	*
810	* This change in behavior causes an issue with MMIO triggered
811	* reports. MMIO triggered reports have the markers in the
812	* context ID field and the context-valid-bit is 0. The logic
813	* below to squash the context ID would render the report
814	* useless since the user will not be able to find it in the OA
815	* buffer. Since MMIO triggered reports exist only on XEHP,
816	* we should avoid squashing these for XEHP platforms.
817	*/
818
819	if (oa_report_ctx_invalid(stream, report) &&
820	GRAPHICS_VER_FULL(stream->engine->i915) < IP_VER(`12`, `50`)) {
821	ctx_id = INVALID_CTX_ID;
822	oa_context_id_squash(stream, report: report32);
823	}
824
825	/*
826	* NB: For Gen 8 the OA unit no longer supports clock gating
827	* off for a specific context and the kernel can't securely
828	* stop the counters from updating as system-wide / global
829	* values.
830	*
831	* Automatic reports now include a context ID so reports can be
832	* filtered on the cpu but it's not worth trying to
833	* automatically subtract/hide counter progress for other
834	* contexts while filtering since we can't stop userspace
835	* issuing MI_REPORT_PERF_COUNT commands which would still
836	* provide a side-band view of the real values.
837	*
838	* To allow userspace (such as Mesa/GL_INTEL_performance_query)
839	* to normalize counters for a single filtered context then it
840	* needs be forwarded bookend context-switch reports so that it
841	* can track switches in between MI_REPORT_PERF_COUNT commands
842	* and can itself subtract/ignore the progress of counters
843	* associated with other contexts. Note that the hardware
844	* automatically triggers reports when switching to a new
845	* context which are tagged with the ID of the newly active
846	* context. To avoid the complexity (and likely fragility) of
847	* reading ahead while parsing reports to try and minimize
848	* forwarding redundant context switch reports (i.e. between
849	* other, unrelated contexts) we simply elect to forward them
850	* all.
851	*
852	* We don't rely solely on the reason field to identify context
853	* switches since it's not-uncommon for periodic samples to
854	* identify a switch before any 'context switch' report.
855	*/
856	if (!stream->ctx \|\|
857	stream->specific_ctx_id == ctx_id \|\|
858	stream->oa_buffer.last_ctx_id == stream->specific_ctx_id \|\|
859	reason & OAREPORT_REASON_CTX_SWITCH) {
860
861	/*
862	* While filtering for a single context we avoid
863	* leaking the IDs of other contexts.
864	*/
865	if (stream->ctx &&
866	stream->specific_ctx_id != ctx_id) {
867	oa_context_id_squash(stream, report: report32);
868	}
869
870	ret = append_oa_sample(stream, buf, count, offset,
871	report);
872	if (ret)
873	break;
874
875	stream->oa_buffer.last_ctx_id = ctx_id;
876	}
877
878	if (is_power_of_2(n: report_size)) {
879	/*
880	* Clear out the report id and timestamp as a means
881	* to detect unlanded reports.
882	*/
883	oa_report_id_clear(stream, report: report32);
884	oa_timestamp_clear(stream, report: report32);
885	} else {
886	u8 *oa_buf_end = stream->oa_buffer.vaddr +
887	OA_BUFFER_SIZE;
888	u32 part = oa_buf_end - (u8 *)report32;
889
890	/ Zero out the entire report /
891	if (report_size <= part) {
892	memset(report32, `0`, report_size);
893	} else {
894	memset(report32, `0`, part);
895	memset(oa_buf_base, `0`, report_size - part);
896	}
897	}
898	}
899
900	if (start_offset != *offset) {
901	i915_reg_t oaheadptr;
902
903	oaheadptr = GRAPHICS_VER(stream->perf->i915) == `12` ?
904	__oa_regs(stream)->oa_head_ptr :
905	GEN8_OAHEADPTR;
906
907	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
908
909	/*
910	* We removed the gtt_offset for the copy loop above, indexing
911	* relative to oa_buf_base so put back here...
912	*/
913	intel_uncore_write(uncore, reg: oaheadptr,
914	val: (head + gtt_offset) & GEN12_OAG_OAHEADPTR_MASK);
915	stream->oa_buffer.head = head;
916
917	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
918	}
919
920	return ret;
921	}
922
923	/**
924	* gen8_oa_read - copy status records then buffered OA reports
925	* @stream: An i915-perf stream opened for OA metrics
926	* @buf: destination buffer given by userspace
927	* @count: the number of bytes userspace wants to read
928	* @offset: (inout): the current position for writing into @buf
929	*
930	* Checks OA unit status registers and if necessary appends corresponding
931	* status records for userspace (such as for a buffer full condition) and then
932	* initiate appending any buffered OA reports.
933	*
934	* Updates @offset according to the number of bytes successfully copied into
935	* the userspace buffer.
936	*
937	* NB: some data may be successfully copied to the userspace buffer
938	* even if an error is returned, and this is reflected in the
939	* updated @offset.
940	*
941	* Returns: zero on success or a negative error code
942	*/
943	static int gen8_oa_read(struct i915_perf_stream *stream,
944	char __user *buf,
945	size_t count,
946	size_t *offset)
947	{
948	struct intel_uncore *uncore = stream->uncore;
949	u32 oastatus;
950	i915_reg_t oastatus_reg;
951	int ret;
952
953	if (drm_WARN_ON(&uncore->i915->drm, !stream->oa_buffer.vaddr))
954	return -EIO;
955
956	oastatus_reg = GRAPHICS_VER(stream->perf->i915) == `12` ?
957	__oa_regs(stream)->oa_status :
958	GEN8_OASTATUS;
959
960	oastatus = intel_uncore_read(uncore, reg: oastatus_reg);
961
962	/*
963	* We treat OABUFFER_OVERFLOW as a significant error:
964	*
965	* Although theoretically we could handle this more gracefully
966	* sometimes, some Gens don't correctly suppress certain
967	* automatically triggered reports in this condition and so we
968	* have to assume that old reports are now being trampled
969	* over.
970	*
971	* Considering how we don't currently give userspace control
972	* over the OA buffer size and always configure a large 16MB
973	* buffer, then a buffer overflow does anyway likely indicate
974	* that something has gone quite badly wrong.
975	*/
976	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
977	ret = append_oa_status(stream, buf, count, offset,
978	type: DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
979	if (ret)
980	return ret;
981
982	drm_dbg(&stream->perf->i915->drm,
983	"OA buffer overflow (exponent = %d): force restart\n",
984	stream->period_exponent);
985
986	stream->perf->ops.oa_disable(stream);
987	stream->perf->ops.oa_enable(stream);
988
989	/*
990	* Note: .oa_enable() is expected to re-init the oabuffer and
991	* reset GEN8_OASTATUS for us
992	*/
993	oastatus = intel_uncore_read(uncore, reg: oastatus_reg);
994	}
995
996	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
997	ret = append_oa_status(stream, buf, count, offset,
998	type: DRM_I915_PERF_RECORD_OA_REPORT_LOST);
999	if (ret)
1000	return ret;
1001
1002	intel_uncore_rmw(uncore, reg: oastatus_reg,
1003	GEN8_OASTATUS_COUNTER_OVERFLOW \|
1004	GEN8_OASTATUS_REPORT_LOST,
1005	IS_GRAPHICS_VER(uncore->i915, `8`, `11`) ?
1006	(GEN8_OASTATUS_HEAD_POINTER_WRAP \|
1007	GEN8_OASTATUS_TAIL_POINTER_WRAP) : `0`);
1008	}
1009
1010	return gen8_append_oa_reports(stream, buf, count, offset);
1011	}
1012
1013	/**
1014	* gen7_append_oa_reports - Copies all buffered OA reports into
1015	* userspace read() buffer.
1016	* @stream: An i915-perf stream opened for OA metrics
1017	* @buf: destination buffer given by userspace
1018	* @count: the number of bytes userspace wants to read
1019	* @offset: (inout): the current position for writing into @buf
1020	*
1021	* Notably any error condition resulting in a short read (-%ENOSPC or
1022	* -%EFAULT) will be returned even though one or more records may
1023	* have been successfully copied. In this case it's up to the caller
1024	* to decide if the error should be squashed before returning to
1025	* userspace.
1026	*
1027	* Note: reports are consumed from the head, and appended to the
1028	* tail, so the tail chases the head?... If you think that's mad
1029	* and back-to-front you're not alone, but this follows the
1030	* Gen PRM naming convention.
1031	*
1032	* Returns: 0 on success, negative error code on failure.
1033	*/
1034	static int gen7_append_oa_reports(struct i915_perf_stream *stream,
1035	char __user *buf,
1036	size_t count,
1037	size_t *offset)
1038	{
1039	struct intel_uncore *uncore = stream->uncore;
1040	int report_size = stream->oa_buffer.format->size;
1041	u8 *oa_buf_base = stream->oa_buffer.vaddr;
1042	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
1043	u32 mask = (OA_BUFFER_SIZE - `1`);
1044	size_t start_offset = *offset;
1045	unsigned long flags;
1046	u32 head, tail;
1047	int ret = `0`;
1048
1049	if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled))
1050	return -EIO;
1051
1052	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1053
1054	head = stream->oa_buffer.head;
1055	tail = stream->oa_buffer.tail;
1056
1057	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
1058
1059	/ An out of bounds or misaligned head or tail pointer implies a driver*
1060	* bug since we validate + align the tail pointers we read from the
1061	* hardware and we are in full control of the head pointer which should
1062	* only be incremented by multiples of the report size (notably also
1063	* all a power of two).
1064	*/
1065	if (drm_WARN_ONCE(&uncore->i915->drm,
1066	head > OA_BUFFER_SIZE \|\| head % report_size \|\|
1067	tail > OA_BUFFER_SIZE \|\| tail % report_size,
1068	"Inconsistent OA buffer pointers: head = %u, tail = %u\n",
1069	head, tail))
1070	return -EIO;
1071
1072
1073	for (/ none /;
1074	OA_TAKEN(tail, head);
1075	head = (head + report_size) & mask) {
1076	u8 *report = oa_buf_base + head;
1077	u32 report32 = (void* *)report;
1078
1079	/ All the report sizes factor neatly into the buffer*
1080	* size so we never expect to see a report split
1081	* between the beginning and end of the buffer.
1082	*
1083	* Given the initial alignment check a misalignment
1084	* here would imply a driver bug that would result
1085	* in an overrun.
1086	*/
1087	if (drm_WARN_ON(&uncore->i915->drm,
1088	(OA_BUFFER_SIZE - head) < report_size)) {
1089	drm_err(&uncore->i915->drm,
1090	"Spurious OA head ptr: non-integral report offset\n");
1091	break;
1092	}
1093
1094	/ The report-ID field for periodic samples includes*
1095	* some undocumented flags related to what triggered
1096	* the report and is never expected to be zero so we
1097	* can check that the report isn't invalid before
1098	* copying it to userspace...
1099	*/
1100	if (report32[`0`] == `0`) {
1101	if (__ratelimit(&stream->perf->spurious_report_rs))
1102	drm_notice(&uncore->i915->drm,
1103	"Skipping spurious, invalid OA report\n");
1104	continue;
1105	}
1106
1107	ret = append_oa_sample(stream, buf, count, offset, report);
1108	if (ret)
1109	break;
1110
1111	/ Clear out the first 2 dwords as a mean to detect unlanded*
1112	* reports.
1113	*/
1114	report32[`0`] = `0`;
1115	report32[`1`] = `0`;
1116	}
1117
1118	if (start_offset != *offset) {
1119	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1120
1121	intel_uncore_write(uncore, GEN7_OASTATUS2,
1122	val: ((head + gtt_offset) & GEN7_OASTATUS2_HEAD_MASK) \|
1123	GEN7_OASTATUS2_MEM_SELECT_GGTT);
1124	stream->oa_buffer.head = head;
1125
1126	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
1127	}
1128
1129	return ret;
1130	}
1131
1132	/**
1133	* gen7_oa_read - copy status records then buffered OA reports
1134	* @stream: An i915-perf stream opened for OA metrics
1135	* @buf: destination buffer given by userspace
1136	* @count: the number of bytes userspace wants to read
1137	* @offset: (inout): the current position for writing into @buf
1138	*
1139	* Checks Gen 7 specific OA unit status registers and if necessary appends
1140	* corresponding status records for userspace (such as for a buffer full
1141	* condition) and then initiate appending any buffered OA reports.
1142	*
1143	* Updates @offset according to the number of bytes successfully copied into
1144	* the userspace buffer.
1145	*
1146	* Returns: zero on success or a negative error code
1147	*/
1148	static int gen7_oa_read(struct i915_perf_stream *stream,
1149	char __user *buf,
1150	size_t count,
1151	size_t *offset)
1152	{
1153	struct intel_uncore *uncore = stream->uncore;
1154	u32 oastatus1;
1155	int ret;
1156
1157	if (drm_WARN_ON(&uncore->i915->drm, !stream->oa_buffer.vaddr))
1158	return -EIO;
1159
1160	oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
1161
1162	/ XXX: On Haswell we don't have a safe way to clear oastatus1*
1163	* bits while the OA unit is enabled (while the tail pointer
1164	* may be updated asynchronously) so we ignore status bits
1165	* that have already been reported to userspace.
1166	*/
1167	oastatus1 &= ~stream->perf->gen7_latched_oastatus1;
1168
1169	/ We treat OABUFFER_OVERFLOW as a significant error:*
1170	*
1171	* - The status can be interpreted to mean that the buffer is
1172	* currently full (with a higher precedence than OA_TAKEN()
1173	* which will start to report a near-empty buffer after an
1174	* overflow) but it's awkward that we can't clear the status
1175	* on Haswell, so without a reset we won't be able to catch
1176	* the state again.
1177	*
1178	* - Since it also implies the HW has started overwriting old
1179	* reports it may also affect our sanity checks for invalid
1180	* reports when copying to userspace that assume new reports
1181	* are being written to cleared memory.
1182	*
1183	* - In the future we may want to introduce a flight recorder
1184	* mode where the driver will automatically maintain a safe
1185	* guard band between head/tail, avoiding this overflow
1186	* condition, but we avoid the added driver complexity for
1187	* now.
1188	*/
1189	if (unlikely(oastatus1 & GEN7_OASTATUS1_OABUFFER_OVERFLOW)) {
1190	ret = append_oa_status(stream, buf, count, offset,
1191	type: DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
1192	if (ret)
1193	return ret;
1194
1195	drm_dbg(&stream->perf->i915->drm,
1196	"OA buffer overflow (exponent = %d): force restart\n",
1197	stream->period_exponent);
1198
1199	stream->perf->ops.oa_disable(stream);
1200	stream->perf->ops.oa_enable(stream);
1201
1202	oastatus1 = intel_uncore_read(uncore, GEN7_OASTATUS1);
1203	}
1204
1205	if (unlikely(oastatus1 & GEN7_OASTATUS1_REPORT_LOST)) {
1206	ret = append_oa_status(stream, buf, count, offset,
1207	type: DRM_I915_PERF_RECORD_OA_REPORT_LOST);
1208	if (ret)
1209	return ret;
1210	stream->perf->gen7_latched_oastatus1 \|=
1211	GEN7_OASTATUS1_REPORT_LOST;
1212	}
1213
1214	return gen7_append_oa_reports(stream, buf, count, offset);
1215	}
1216
1217	/**
1218	* i915_oa_wait_unlocked - handles blocking IO until OA data available
1219	* @stream: An i915-perf stream opened for OA metrics
1220	*
1221	* Called when userspace tries to read() from a blocking stream FD opened
1222	* for OA metrics. It waits until the hrtimer callback finds a non-empty
1223	* OA buffer and wakes us.
1224	*
1225	* Note: it's acceptable to have this return with some false positives
1226	* since any subsequent read handling will return -EAGAIN if there isn't
1227	* really data ready for userspace yet.
1228	*
1229	* Returns: zero on success or a negative error code
1230	*/
1231	static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
1232	{
1233	/ We would wait indefinitely if periodic sampling is not enabled /
1234	if (!stream->periodic)
1235	return -EIO;
1236
1237	return wait_event_interruptible(stream->poll_wq,
1238	oa_buffer_check_unlocked(stream));
1239	}
1240
1241	/**
1242	* i915_oa_poll_wait - call poll_wait() for an OA stream poll()
1243	* @stream: An i915-perf stream opened for OA metrics
1244	* @file: An i915 perf stream file
1245	* @wait: poll() state table
1246	*
1247	* For handling userspace polling on an i915 perf stream opened for OA metrics,
1248	* this starts a poll_wait with the wait queue that our hrtimer callback wakes
1249	* when it sees data ready to read in the circular OA buffer.
1250	*/
1251	static void i915_oa_poll_wait(struct i915_perf_stream *stream,
1252	struct file *file,
1253	poll_table *wait)
1254	{
1255	poll_wait(filp: file, wait_address: &stream->poll_wq, p: wait);
1256	}
1257
1258	/**
1259	* i915_oa_read - just calls through to &i915_oa_ops->read
1260	* @stream: An i915-perf stream opened for OA metrics
1261	* @buf: destination buffer given by userspace
1262	* @count: the number of bytes userspace wants to read
1263	* @offset: (inout): the current position for writing into @buf
1264	*
1265	* Updates @offset according to the number of bytes successfully copied into
1266	* the userspace buffer.
1267	*
1268	* Returns: zero on success or a negative error code
1269	*/
1270	static int i915_oa_read(struct i915_perf_stream *stream,
1271	char __user *buf,
1272	size_t count,
1273	size_t *offset)
1274	{
1275	return stream->perf->ops.read(stream, buf, count, offset);
1276	}
1277
1278	static struct intel_context oa_pin_context(struct* i915_perf_stream *stream)
1279	{
1280	struct i915_gem_engines_iter it;
1281	struct i915_gem_context *ctx = stream->ctx;
1282	struct intel_context *ce;
1283	struct i915_gem_ww_ctx ww;
1284	int err = -ENODEV;
1285
1286	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
1287	if (ce->engine != stream->engine) / first match! /
1288	continue;
1289
1290	err = `0`;
1291	break;
1292	}
1293	i915_gem_context_unlock_engines(ctx);
1294
1295	if (err)
1296	return ERR_PTR(error: err);
1297
1298	i915_gem_ww_ctx_init(ctx: &ww, intr: true);
1299	retry:
1300	/*
1301	* As the ID is the gtt offset of the context's vma we
1302	* pin the vma to ensure the ID remains fixed.
1303	*/
1304	err = intel_context_pin_ww(ce, ww: &ww);
1305	if (err == -EDEADLK) {
1306	err = i915_gem_ww_ctx_backoff(ctx: &ww);
1307	if (!err)
1308	goto retry;
1309	}
1310	i915_gem_ww_ctx_fini(ctx: &ww);
1311
1312	if (err)
1313	return ERR_PTR(error: err);
1314
1315	stream->pinned_ctx = ce;
1316	return stream->pinned_ctx;
1317	}
1318
1319	static int
1320	__store_reg_to_mem(struct i915_request *rq, i915_reg_t reg, u32 ggtt_offset)
1321	{
1322	u32 *cs, cmd;
1323
1324	cmd = MI_STORE_REGISTER_MEM \| MI_SRM_LRM_GLOBAL_GTT;
1325	if (GRAPHICS_VER(rq->i915) >= `8`)
1326	cmd++;
1327
1328	cs = intel_ring_begin(rq, num_dwords: `4`);
1329	if (IS_ERR(ptr: cs))
1330	return PTR_ERR(ptr: cs);
1331
1332	*cs++ = cmd;
1333	*cs++ = i915_mmio_reg_offset(reg);
1334	*cs++ = ggtt_offset;
1335	*cs++ = `0`;
1336
1337	intel_ring_advance(rq, cs);
1338
1339	return `0`;
1340	}
1341
1342	static int
1343	__read_reg(struct intel_context *ce, i915_reg_t reg, u32 ggtt_offset)
1344	{
1345	struct i915_request *rq;
1346	int err;
1347
1348	rq = i915_request_create(ce);
1349	if (IS_ERR(ptr: rq))
1350	return PTR_ERR(ptr: rq);
1351
1352	i915_request_get(rq);
1353
1354	err = __store_reg_to_mem(rq, reg, ggtt_offset);
1355
1356	i915_request_add(rq);
1357	if (!err && i915_request_wait(rq, flags: `0`, HZ / `2`) < `0`)
1358	err = -ETIME;
1359
1360	i915_request_put(rq);
1361
1362	return err;
1363	}
1364
1365	static int
1366	gen12_guc_sw_ctx_id(struct intel_context ce, u32 ctx_id)
1367	{
1368	struct i915_vma *scratch;
1369	u32 *val;
1370	int err;
1371
1372	scratch = __vm_create_scratch_for_read_pinned(vm: &ce->engine->gt->ggtt->vm, size: `4`);
1373	if (IS_ERR(ptr: scratch))
1374	return PTR_ERR(ptr: scratch);
1375
1376	err = i915_vma_sync(vma: scratch);
1377	if (err)
1378	goto err_scratch;
1379
1380	err = __read_reg(ce, RING_EXECLIST_STATUS_HI(ce->engine->mmio_base),
1381	ggtt_offset: i915_ggtt_offset(vma: scratch));
1382	if (err)
1383	goto err_scratch;
1384
1385	val = i915_gem_object_pin_map_unlocked(obj: scratch->obj, type: I915_MAP_WB);
1386	if (IS_ERR(ptr: val)) {
1387	err = PTR_ERR(ptr: val);
1388	goto err_scratch;
1389	}
1390
1391	ctx_id = val;
1392	i915_gem_object_unpin_map(obj: scratch->obj);
1393
1394	err_scratch:
1395	i915_vma_unpin_and_release(p_vma: &scratch, flags: `0`);
1396	return err;
1397	}
1398
1399	/*
1400	* For execlist mode of submission, pick an unused context id
1401	* 0 - (NUM_CONTEXT_TAG -1) are used by other contexts
1402	* XXX_MAX_CONTEXT_HW_ID is used by idle context
1403	*
1404	* For GuC mode of submission read context id from the upper dword of the
1405	* EXECLIST_STATUS register. Note that we read this value only once and expect
1406	* that the value stays fixed for the entire OA use case. There are cases where
1407	* GuC KMD implementation may deregister a context to reuse it's context id, but
1408	* we prevent that from happening to the OA context by pinning it.
1409	*/
1410	static int gen12_get_render_context_id(struct i915_perf_stream *stream)
1411	{
1412	u32 ctx_id, mask;
1413	int ret;
1414
1415	if (intel_engine_uses_guc(engine: stream->engine)) {
1416	ret = gen12_guc_sw_ctx_id(ce: stream->pinned_ctx, ctx_id: &ctx_id);
1417	if (ret)
1418	return ret;
1419
1420	mask = ((`1U` << GEN12_GUC_SW_CTX_ID_WIDTH) - `1`) <<
1421	(GEN12_GUC_SW_CTX_ID_SHIFT - `32`);
1422	} else if (GRAPHICS_VER_FULL(stream->engine->i915) >= IP_VER(`12`, `50`)) {
1423	ctx_id = (XEHP_MAX_CONTEXT_HW_ID - `1`) <<
1424	(XEHP_SW_CTX_ID_SHIFT - `32`);
1425
1426	mask = ((`1U` << XEHP_SW_CTX_ID_WIDTH) - `1`) <<
1427	(XEHP_SW_CTX_ID_SHIFT - `32`);
1428	} else {
1429	ctx_id = (GEN12_MAX_CONTEXT_HW_ID - `1`) <<
1430	(GEN11_SW_CTX_ID_SHIFT - `32`);
1431
1432	mask = ((`1U` << GEN11_SW_CTX_ID_WIDTH) - `1`) <<
1433	(GEN11_SW_CTX_ID_SHIFT - `32`);
1434	}
1435	stream->specific_ctx_id = ctx_id & mask;
1436	stream->specific_ctx_id_mask = mask;
1437
1438	return `0`;
1439	}
1440
1441	static bool oa_find_reg_in_lri(u32 state, u32 reg, u32 offset, u32 end)
1442	{
1443	u32 idx = *offset;
1444	u32 len = min(MI_LRI_LEN(state[idx]) + idx, end);
1445	bool found = false;
1446
1447	idx++;
1448	for (; idx < len; idx += `2`) {
1449	if (state[idx] == reg) {
1450	found = true;
1451	break;
1452	}
1453	}
1454
1455	*offset = idx;
1456	return found;
1457	}
1458
1459	static u32 oa_context_image_offset(struct intel_context *ce, u32 reg)
1460	{
1461	u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / `4`;
1462	u32 *state = ce->lrc_reg_state;
1463
1464	if (drm_WARN_ON(&ce->engine->i915->drm, !state))
1465	return U32_MAX;
1466
1467	for (offset = `0`; offset < len; ) {
1468	if (IS_MI_LRI_CMD(state[offset])) {
1469	/*
1470	* We expect reg-value pairs in MI_LRI command, so
1471	* MI_LRI_LEN() should be even, if not, issue a warning.
1472	*/
1473	drm_WARN_ON(&ce->engine->i915->drm,
1474	MI_LRI_LEN(state[offset]) & `0x1`);
1475
1476	if (oa_find_reg_in_lri(state, reg, offset: &offset, end: len))
1477	break;
1478	} else {
1479	offset++;
1480	}
1481	}
1482
1483	return offset < len ? offset : U32_MAX;
1484	}
1485
1486	static int set_oa_ctx_ctrl_offset(struct intel_context *ce)
1487	{
1488	i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base);
1489	struct i915_perf *perf = &ce->engine->i915->perf;
1490	u32 offset = perf->ctx_oactxctrl_offset;
1491
1492	/ Do this only once. Failure is stored as offset of U32_MAX /
1493	if (offset)
1494	goto exit;
1495
1496	offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg));
1497	perf->ctx_oactxctrl_offset = offset;
1498
1499	drm_dbg(&ce->engine->i915->drm,
1500	"%s oa ctx control at 0x%08x dword offset\n",
1501	ce->engine->name, offset);
1502
1503	exit:
1504	return offset && offset != U32_MAX ? `0` : -ENODEV;
1505	}
1506
1507	static bool engine_supports_mi_query(struct intel_engine_cs *engine)
1508	{
1509	return engine->class == RENDER_CLASS;
1510	}
1511
1512	/**
1513	* oa_get_render_ctx_id - determine and hold ctx hw id
1514	* @stream: An i915-perf stream opened for OA metrics
1515	*
1516	* Determine the render context hw id, and ensure it remains fixed for the
1517	* lifetime of the stream. This ensures that we don't have to worry about
1518	* updating the context ID in OACONTROL on the fly.
1519	*
1520	* Returns: zero on success or a negative error code
1521	*/
1522	static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
1523	{
1524	struct intel_context *ce;
1525	int ret = `0`;
1526
1527	ce = oa_pin_context(stream);
1528	if (IS_ERR(ptr: ce))
1529	return PTR_ERR(ptr: ce);
1530
1531	if (engine_supports_mi_query(engine: stream->engine) &&
1532	HAS_LOGICAL_RING_CONTEXTS(stream->perf->i915)) {
1533	/*
1534	* We are enabling perf query here. If we don't find the context
1535	* offset here, just return an error.
1536	*/
1537	ret = set_oa_ctx_ctrl_offset(ce);
1538	if (ret) {
1539	intel_context_unpin(ce);
1540	drm_err(&stream->perf->i915->drm,
1541	"Enabling perf query failed for %s\n",
1542	stream->engine->name);
1543	return ret;
1544	}
1545	}
1546
1547	switch (GRAPHICS_VER(ce->engine->i915)) {
1548	case `7`: {
1549	/*
1550	* On Haswell we don't do any post processing of the reports
1551	* and don't need to use the mask.
1552	*/
1553	stream->specific_ctx_id = i915_ggtt_offset(vma: ce->state);
1554	stream->specific_ctx_id_mask = `0`;
1555	break;
1556	}
1557
1558	case `8`:
1559	case `9`:
1560	if (intel_engine_uses_guc(engine: ce->engine)) {
1561	/*
1562	* When using GuC, the context descriptor we write in
1563	* i915 is read by GuC and rewritten before it's
1564	* actually written into the hardware. The LRCA is
1565	* what is put into the context id field of the
1566	* context descriptor by GuC. Because it's aligned to
1567	* a page, the lower 12bits are always at 0 and
1568	* dropped by GuC. They won't be part of the context
1569	* ID in the OA reports, so squash those lower bits.
1570	*/
1571	stream->specific_ctx_id = ce->lrc.lrca >> `12`;
1572
1573	/*
1574	* GuC uses the top bit to signal proxy submission, so
1575	* ignore that bit.
1576	*/
1577	stream->specific_ctx_id_mask =
1578	(`1U` << (GEN8_CTX_ID_WIDTH - `1`)) - `1`;
1579	} else {
1580	stream->specific_ctx_id_mask =
1581	(`1U` << GEN8_CTX_ID_WIDTH) - `1`;
1582	stream->specific_ctx_id = stream->specific_ctx_id_mask;
1583	}
1584	break;
1585
1586	case `11`:
1587	case `12`:
1588	ret = gen12_get_render_context_id(stream);
1589	break;
1590
1591	default:
1592	MISSING_CASE(GRAPHICS_VER(ce->engine->i915));
1593	}
1594
1595	ce->tag = stream->specific_ctx_id;
1596
1597	drm_dbg(&stream->perf->i915->drm,
1598	"filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
1599	stream->specific_ctx_id,
1600	stream->specific_ctx_id_mask);
1601
1602	return ret;
1603	}
1604
1605	/**
1606	* oa_put_render_ctx_id - counterpart to oa_get_render_ctx_id releases hold
1607	* @stream: An i915-perf stream opened for OA metrics
1608	*
1609	* In case anything needed doing to ensure the context HW ID would remain valid
1610	* for the lifetime of the stream, then that can be undone here.
1611	*/
1612	static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
1613	{
1614	struct intel_context *ce;
1615
1616	ce = fetch_and_zero(&stream->pinned_ctx);
1617	if (ce) {
1618	ce->tag = `0`; / recomputed on next submission after parking /
1619	intel_context_unpin(ce);
1620	}
1621
1622	stream->specific_ctx_id = INVALID_CTX_ID;
1623	stream->specific_ctx_id_mask = `0`;
1624	}
1625
1626	static void
1627	free_oa_buffer(struct i915_perf_stream *stream)
1628	{
1629	i915_vma_unpin_and_release(p_vma: &stream->oa_buffer.vma,
1630	I915_VMA_RELEASE_MAP);
1631
1632	stream->oa_buffer.vaddr = NULL;
1633	}
1634
1635	static void
1636	free_oa_configs(struct i915_perf_stream *stream)
1637	{
1638	struct i915_oa_config_bo oa_bo, tmp;
1639
1640	i915_oa_config_put(oa_config: stream->oa_config);
1641	llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
1642	free_oa_config_bo(oa_bo);
1643	}
1644
1645	static void
1646	free_noa_wait(struct i915_perf_stream *stream)
1647	{
1648	i915_vma_unpin_and_release(p_vma: &stream->noa_wait, flags: `0`);
1649	}
1650
1651	static bool engine_supports_oa(const struct intel_engine_cs *engine)
1652	{
1653	return engine->oa_group;
1654	}
1655
1656	static bool engine_supports_oa_format(struct intel_engine_cs engine, int* type)
1657	{
1658	return engine->oa_group && engine->oa_group->type == type;
1659	}
1660
1661	static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
1662	{
1663	struct i915_perf *perf = stream->perf;
1664	struct intel_gt *gt = stream->engine->gt;
1665	struct i915_perf_group *g = stream->engine->oa_group;
1666
1667	if (WARN_ON(stream != g->exclusive_stream))
1668	return;
1669
1670	/*
1671	* Unset exclusive_stream first, it will be checked while disabling
1672	* the metric set on gen8+.
1673	*
1674	* See i915_oa_init_reg_state() and lrc_configure_all_contexts()
1675	*/
1676	WRITE_ONCE(g->exclusive_stream, NULL);
1677	perf->ops.disable_metric_set(stream);
1678
1679	free_oa_buffer(stream);
1680
1681	intel_uncore_forcewake_put(uncore: stream->uncore, domains: FORCEWAKE_ALL);
1682	intel_engine_pm_put(engine: stream->engine);
1683
1684	if (stream->ctx)
1685	oa_put_render_ctx_id(stream);
1686
1687	free_oa_configs(stream);
1688	free_noa_wait(stream);
1689
1690	if (perf->spurious_report_rs.missed) {
1691	gt_notice(gt, "%d spurious OA report notices suppressed due to ratelimiting\n",
1692	perf->spurious_report_rs.missed);
1693	}
1694	}
1695
1696	static void gen7_init_oa_buffer(struct i915_perf_stream *stream)
1697	{
1698	struct intel_uncore *uncore = stream->uncore;
1699	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
1700	unsigned long flags;
1701
1702	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1703
1704	/ Pre-DevBDW: OABUFFER must be set with counters off,*
1705	* before OASTATUS1, but after OASTATUS2
1706	*/
1707	intel_uncore_write(uncore, GEN7_OASTATUS2, / head /
1708	val: gtt_offset \| GEN7_OASTATUS2_MEM_SELECT_GGTT);
1709	stream->oa_buffer.head = `0`;
1710
1711	intel_uncore_write(uncore, GEN7_OABUFFER, val: gtt_offset);
1712
1713	intel_uncore_write(uncore, GEN7_OASTATUS1, / tail /
1714	val: gtt_offset \| OABUFFER_SIZE_16M);
1715
1716	/ Mark that we need updated tail pointers to read from... /
1717	stream->oa_buffer.tail = `0`;
1718
1719	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
1720
1721	/ On Haswell we have to track which OASTATUS1 flags we've*
1722	* already seen since they can't be cleared while periodic
1723	* sampling is enabled.
1724	*/
1725	stream->perf->gen7_latched_oastatus1 = `0`;
1726
1727	/ NB: although the OA buffer will initially be allocated*
1728	* zeroed via shmfs (and so this memset is redundant when
1729	* first allocating), we may re-init the OA buffer, either
1730	* when re-enabling a stream or in error/reset paths.
1731	*
1732	* The reason we clear the buffer for each re-init is for the
1733	* sanity check in gen7_append_oa_reports() that looks at the
1734	* report-id field to make sure it's non-zero which relies on
1735	* the assumption that new reports are being written to zeroed
1736	* memory...
1737	*/
1738	memset(stream->oa_buffer.vaddr, `0`, OA_BUFFER_SIZE);
1739	}
1740
1741	static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
1742	{
1743	struct intel_uncore *uncore = stream->uncore;
1744	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
1745	unsigned long flags;
1746
1747	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1748
1749	intel_uncore_write(uncore, GEN8_OASTATUS, val: `0`);
1750	intel_uncore_write(uncore, GEN8_OAHEADPTR, val: gtt_offset);
1751	stream->oa_buffer.head = `0`;
1752
1753	intel_uncore_write(uncore, GEN8_OABUFFER_UDW, val: `0`);
1754
1755	/*
1756	* PRM says:
1757	*
1758	* "This MMIO must be set before the OATAILPTR
1759	* register and after the OAHEADPTR register. This is
1760	* to enable proper functionality of the overflow
1761	* bit."
1762	*/
1763	intel_uncore_write(uncore, GEN8_OABUFFER, val: gtt_offset \|
1764	OABUFFER_SIZE_16M \| GEN8_OABUFFER_MEM_SELECT_GGTT);
1765	intel_uncore_write(uncore, GEN8_OATAILPTR, val: gtt_offset & GEN8_OATAILPTR_MASK);
1766
1767	/ Mark that we need updated tail pointers to read from... /
1768	stream->oa_buffer.tail = `0`;
1769
1770	/*
1771	* Reset state used to recognise context switches, affecting which
1772	* reports we will forward to userspace while filtering for a single
1773	* context.
1774	*/
1775	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
1776
1777	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
1778
1779	/*
1780	* NB: although the OA buffer will initially be allocated
1781	* zeroed via shmfs (and so this memset is redundant when
1782	* first allocating), we may re-init the OA buffer, either
1783	* when re-enabling a stream or in error/reset paths.
1784	*
1785	* The reason we clear the buffer for each re-init is for the
1786	* sanity check in gen8_append_oa_reports() that looks at the
1787	* reason field to make sure it's non-zero which relies on
1788	* the assumption that new reports are being written to zeroed
1789	* memory...
1790	*/
1791	memset(stream->oa_buffer.vaddr, `0`, OA_BUFFER_SIZE);
1792	}
1793
1794	static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
1795	{
1796	struct intel_uncore *uncore = stream->uncore;
1797	u32 gtt_offset = i915_ggtt_offset(vma: stream->oa_buffer.vma);
1798	unsigned long flags;
1799
1800	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
1801
1802	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_status, val: `0`);
1803	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_head_ptr,
1804	val: gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
1805	stream->oa_buffer.head = `0`;
1806
1807	/*
1808	* PRM says:
1809	*
1810	* "This MMIO must be set before the OATAILPTR
1811	* register and after the OAHEADPTR register. This is
1812	* to enable proper functionality of the overflow
1813	* bit."
1814	*/
1815	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_buffer, val: gtt_offset \|
1816	OABUFFER_SIZE_16M \| GEN8_OABUFFER_MEM_SELECT_GGTT);
1817	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_tail_ptr,
1818	val: gtt_offset & GEN12_OAG_OATAILPTR_MASK);
1819
1820	/ Mark that we need updated tail pointers to read from... /
1821	stream->oa_buffer.tail = `0`;
1822
1823	/*
1824	* Reset state used to recognise context switches, affecting which
1825	* reports we will forward to userspace while filtering for a single
1826	* context.
1827	*/
1828	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
1829
1830	spin_unlock_irqrestore(lock: &stream->oa_buffer.ptr_lock, flags);
1831
1832	/*
1833	* NB: although the OA buffer will initially be allocated
1834	* zeroed via shmfs (and so this memset is redundant when
1835	* first allocating), we may re-init the OA buffer, either
1836	* when re-enabling a stream or in error/reset paths.
1837	*
1838	* The reason we clear the buffer for each re-init is for the
1839	* sanity check in gen8_append_oa_reports() that looks at the
1840	* reason field to make sure it's non-zero which relies on
1841	* the assumption that new reports are being written to zeroed
1842	* memory...
1843	*/
1844	memset(stream->oa_buffer.vaddr, `0`,
1845	stream->oa_buffer.vma->size);
1846	}
1847
1848	static int alloc_oa_buffer(struct i915_perf_stream *stream)
1849	{
1850	struct drm_i915_private *i915 = stream->perf->i915;
1851	struct intel_gt *gt = stream->engine->gt;
1852	struct drm_i915_gem_object *bo;
1853	struct i915_vma *vma;
1854	int ret;
1855
1856	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.vma))
1857	return -ENODEV;
1858
1859	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
1860	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K \|\| OA_BUFFER_SIZE > SZ_16M);
1861
1862	bo = i915_gem_object_create_shmem(i915: stream->perf->i915, OA_BUFFER_SIZE);
1863	if (IS_ERR(ptr: bo)) {
1864	drm_err(&i915->drm, "Failed to allocate OA buffer\n");
1865	return PTR_ERR(ptr: bo);
1866	}
1867
1868	i915_gem_object_set_cache_coherency(obj: bo, cache_level: I915_CACHE_LLC);
1869
1870	/ PreHSW required 512K alignment, HSW requires 16M /
1871	vma = i915_vma_instance(obj: bo, vm: &gt->ggtt->vm, NULL);
1872	if (IS_ERR(ptr: vma)) {
1873	ret = PTR_ERR(ptr: vma);
1874	goto err_unref;
1875	}
1876
1877	/*
1878	* PreHSW required 512K alignment.
1879	* HSW and onwards, align to requested size of OA buffer.
1880	*/
1881	ret = i915_vma_pin(vma, size: `0`, SZ_16M, PIN_GLOBAL \| PIN_HIGH);
1882	if (ret) {
1883	gt_err(gt, "Failed to pin OA buffer %d\n", ret);
1884	goto err_unref;
1885	}
1886
1887	stream->oa_buffer.vma = vma;
1888
1889	stream->oa_buffer.vaddr =
1890	i915_gem_object_pin_map_unlocked(obj: bo, type: I915_MAP_WB);
1891	if (IS_ERR(ptr: stream->oa_buffer.vaddr)) {
1892	ret = PTR_ERR(ptr: stream->oa_buffer.vaddr);
1893	goto err_unpin;
1894	}
1895
1896	return `0`;
1897
1898	err_unpin:
1899	__i915_vma_unpin(vma);
1900
1901	err_unref:
1902	i915_gem_object_put(obj: bo);
1903
1904	stream->oa_buffer.vaddr = NULL;
1905	stream->oa_buffer.vma = NULL;
1906
1907	return ret;
1908	}
1909
1910	static u32 save_restore_register(struct* i915_perf_stream stream, u32 cs,
1911	bool save, i915_reg_t reg, u32 offset,
1912	u32 dword_count)
1913	{
1914	u32 cmd;
1915	u32 d;
1916
1917	cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
1918	cmd \|= MI_SRM_LRM_GLOBAL_GTT;
1919	if (GRAPHICS_VER(stream->perf->i915) >= `8`)
1920	cmd++;
1921
1922	for (d = `0`; d < dword_count; d++) {
1923	*cs++ = cmd;
1924	cs++ = i915_mmio_reg_offset(reg) + `4` d;
1925	cs++ = i915_ggtt_offset(vma: stream->noa_wait) + offset + `4` d;
1926	*cs++ = `0`;
1927	}
1928
1929	return cs;
1930	}
1931
1932	static int alloc_noa_wait(struct i915_perf_stream *stream)
1933	{
1934	struct drm_i915_private *i915 = stream->perf->i915;
1935	struct intel_gt *gt = stream->engine->gt;
1936	struct drm_i915_gem_object *bo;
1937	struct i915_vma *vma;
1938	const u64 delay_ticks = `0xffffffffffffffff` -
1939	intel_gt_ns_to_clock_interval(gt: to_gt(i915: stream->perf->i915),
1940	ns: atomic64_read(v: &stream->perf->noa_programming_delay));
1941	const u32 base = stream->engine->mmio_base;
1942	#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
1943	u32 batch, ts0, cs, jump;
1944	struct i915_gem_ww_ctx ww;
1945	int ret, i;
1946	enum {
1947	START_TS,
1948	NOW_TS,
1949	DELTA_TS,
1950	JUMP_PREDICATE,
1951	DELTA_TARGET,
1952	N_CS_GPR
1953	};
1954	i915_reg_t mi_predicate_result = HAS_MI_SET_PREDICATE(i915) ?
1955	MI_PREDICATE_RESULT_2_ENGINE(base) :
1956	MI_PREDICATE_RESULT_1(RENDER_RING_BASE);
1957
1958	/*
1959	* gt->scratch was being used to save/restore the GPR registers, but on
1960	* MTL the scratch uses stolen lmem. An MI_SRM to this memory region
1961	* causes an engine hang. Instead allocate an additional page here to
1962	* save/restore GPR registers
1963	*/
1964	bo = i915_gem_object_create_internal(i915, size: `8192`);
1965	if (IS_ERR(ptr: bo)) {
1966	drm_err(&i915->drm,
1967	"Failed to allocate NOA wait batchbuffer\n");
1968	return PTR_ERR(ptr: bo);
1969	}
1970
1971	i915_gem_ww_ctx_init(ctx: &ww, intr: true);
1972	retry:
1973	ret = i915_gem_object_lock(obj: bo, ww: &ww);
1974	if (ret)
1975	goto out_ww;
1976
1977	/*
1978	* We pin in GGTT because we jump into this buffer now because
1979	* multiple OA config BOs will have a jump to this address and it
1980	* needs to be fixed during the lifetime of the i915/perf stream.
1981	*/
1982	vma = i915_vma_instance(obj: bo, vm: &gt->ggtt->vm, NULL);
1983	if (IS_ERR(ptr: vma)) {
1984	ret = PTR_ERR(ptr: vma);
1985	goto out_ww;
1986	}
1987
1988	ret = i915_vma_pin_ww(vma, ww: &ww, size: `0`, alignment: `0`, PIN_GLOBAL \| PIN_HIGH);
1989	if (ret)
1990	goto out_ww;
1991
1992	batch = cs = i915_gem_object_pin_map(obj: bo, type: I915_MAP_WB);
1993	if (IS_ERR(ptr: batch)) {
1994	ret = PTR_ERR(ptr: batch);
1995	goto err_unpin;
1996	}
1997
1998	stream->noa_wait = vma;
1999
2000	#define GPR_SAVE_OFFSET 4096
2001	#define PREDICATE_SAVE_OFFSET 4160
2002
2003	/ Save registers. /
2004	for (i = `0`; i < N_CS_GPR; i++)
2005	cs = save_restore_register(
2006	stream, cs, save: true / save /, CS_GPR(i),
2007	GPR_SAVE_OFFSET + `8` * i, dword_count: `2`);
2008	cs = save_restore_register(
2009	stream, cs, save: true / save /, reg: mi_predicate_result,
2010	PREDICATE_SAVE_OFFSET, dword_count: `1`);
2011
2012	/ First timestamp snapshot location. /
2013	ts0 = cs;
2014
2015	/*
2016	* Initial snapshot of the timestamp register to implement the wait.
2017	* We work with 32b values, so clear out the top 32b bits of the
2018	* register because the ALU works 64bits.
2019	*/
2020	*cs++ = MI_LOAD_REGISTER_IMM(`1`);
2021	*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + `4`;
2022	*cs++ = `0`;
2023	*cs++ = MI_LOAD_REGISTER_REG \| (`3` - `2`);
2024	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
2025	*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
2026
2027	/*
2028	* This is the location we're going to jump back into until the
2029	* required amount of time has passed.
2030	*/
2031	jump = cs;
2032
2033	/*
2034	* Take another snapshot of the timestamp register. Take care to clear
2035	* up the top 32bits of CS_GPR(1) as we're using it for other
2036	* operations below.
2037	*/
2038	*cs++ = MI_LOAD_REGISTER_IMM(`1`);
2039	*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + `4`;
2040	*cs++ = `0`;
2041	*cs++ = MI_LOAD_REGISTER_REG \| (`3` - `2`);
2042	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
2043	*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
2044
2045	/*
2046	* Do a diff between the 2 timestamps and store the result back into
2047	* CS_GPR(1).
2048	*/
2049	*cs++ = MI_MATH(`5`);
2050	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
2051	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
2052	*cs++ = MI_MATH_SUB;
2053	*cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
2054	*cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
2055
2056	/*
2057	* Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
2058	* timestamp have rolled over the 32bits) into the predicate register
2059	* to be used for the predicated jump.
2060	*/
2061	*cs++ = MI_LOAD_REGISTER_REG \| (`3` - `2`);
2062	*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
2063	*cs++ = i915_mmio_reg_offset(mi_predicate_result);
2064
2065	if (HAS_MI_SET_PREDICATE(i915))
2066	*cs++ = MI_SET_PREDICATE \| `1`;
2067
2068	/ Restart from the beginning if we had timestamps roll over. /
2069	*cs++ = (GRAPHICS_VER(i915) < `8` ?
2070	MI_BATCH_BUFFER_START :
2071	MI_BATCH_BUFFER_START_GEN8) \|
2072	MI_BATCH_PREDICATE;
2073	cs++ = i915_ggtt_offset(vma) + (ts0 - batch) `4`;
2074	*cs++ = `0`;
2075
2076	if (HAS_MI_SET_PREDICATE(i915))
2077	*cs++ = MI_SET_PREDICATE;
2078
2079	/*
2080	* Now add the diff between to previous timestamps and add it to :
2081	* (((1 * << 64) - 1) - delay_ns)
2082	*
2083	* When the Carry Flag contains 1 this means the elapsed time is
2084	* longer than the expected delay, and we can exit the wait loop.
2085	*/
2086	*cs++ = MI_LOAD_REGISTER_IMM(`2`);
2087	*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
2088	*cs++ = lower_32_bits(delay_ticks);
2089	*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + `4`;
2090	*cs++ = upper_32_bits(delay_ticks);
2091
2092	*cs++ = MI_MATH(`4`);
2093	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
2094	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
2095	*cs++ = MI_MATH_ADD;
2096	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
2097
2098	*cs++ = MI_ARB_CHECK;
2099
2100	/*
2101	* Transfer the result into the predicate register to be used for the
2102	* predicated jump.
2103	*/
2104	*cs++ = MI_LOAD_REGISTER_REG \| (`3` - `2`);
2105	*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
2106	*cs++ = i915_mmio_reg_offset(mi_predicate_result);
2107
2108	if (HAS_MI_SET_PREDICATE(i915))
2109	*cs++ = MI_SET_PREDICATE \| `1`;
2110
2111	/ Predicate the jump. /
2112	*cs++ = (GRAPHICS_VER(i915) < `8` ?
2113	MI_BATCH_BUFFER_START :
2114	MI_BATCH_BUFFER_START_GEN8) \|
2115	MI_BATCH_PREDICATE;
2116	cs++ = i915_ggtt_offset(vma) + (jump - batch) `4`;
2117	*cs++ = `0`;
2118
2119	if (HAS_MI_SET_PREDICATE(i915))
2120	*cs++ = MI_SET_PREDICATE;
2121
2122	/ Restore registers. /
2123	for (i = `0`; i < N_CS_GPR; i++)
2124	cs = save_restore_register(
2125	stream, cs, save: false / restore /, CS_GPR(i),
2126	GPR_SAVE_OFFSET + `8` * i, dword_count: `2`);
2127	cs = save_restore_register(
2128	stream, cs, save: false / restore /, reg: mi_predicate_result,
2129	PREDICATE_SAVE_OFFSET, dword_count: `1`);
2130
2131	/ And return to the ring. /
2132	*cs++ = MI_BATCH_BUFFER_END;
2133
2134	GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
2135
2136	i915_gem_object_flush_map(obj: bo);
2137	__i915_gem_object_release_map(obj: bo);
2138
2139	goto out_ww;
2140
2141	err_unpin:
2142	i915_vma_unpin_and_release(p_vma: &vma, flags: `0`);
2143	out_ww:
2144	if (ret == -EDEADLK) {
2145	ret = i915_gem_ww_ctx_backoff(ctx: &ww);
2146	if (!ret)
2147	goto retry;
2148	}
2149	i915_gem_ww_ctx_fini(ctx: &ww);
2150	if (ret)
2151	i915_gem_object_put(obj: bo);
2152	return ret;
2153	}
2154
2155	static u32 write_cs_mi_lri(u32 cs,
2156	const struct i915_oa_reg *reg_data,
2157	u32 n_regs)
2158	{
2159	u32 i;
2160
2161	for (i = `0`; i < n_regs; i++) {
2162	if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == `0`) {
2163	u32 n_lri = min_t(u32,
2164	n_regs - i,
2165	MI_LOAD_REGISTER_IMM_MAX_REGS);
2166
2167	*cs++ = MI_LOAD_REGISTER_IMM(n_lri);
2168	}
2169	*cs++ = i915_mmio_reg_offset(reg_data[i].addr);
2170	*cs++ = reg_data[i].value;
2171	}
2172
2173	return cs;
2174	}
2175
2176	static int num_lri_dwords(int num_regs)
2177	{
2178	int count = `0`;
2179
2180	if (num_regs > `0`) {
2181	count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
2182	count += num_regs * `2`;
2183	}
2184
2185	return count;
2186	}
2187
2188	static struct i915_oa_config_bo *
2189	alloc_oa_config_buffer(struct i915_perf_stream *stream,
2190	struct i915_oa_config *oa_config)
2191	{
2192	struct drm_i915_gem_object *obj;
2193	struct i915_oa_config_bo *oa_bo;
2194	struct i915_gem_ww_ctx ww;
2195	size_t config_length = `0`;
2196	u32 *cs;
2197	int err;
2198
2199	oa_bo = kzalloc(size: sizeof(*oa_bo), GFP_KERNEL);
2200	if (!oa_bo)
2201	return ERR_PTR(error: -ENOMEM);
2202
2203	config_length += num_lri_dwords(num_regs: oa_config->mux_regs_len);
2204	config_length += num_lri_dwords(num_regs: oa_config->b_counter_regs_len);
2205	config_length += num_lri_dwords(num_regs: oa_config->flex_regs_len);
2206	config_length += `3`; / MI_BATCH_BUFFER_START /
2207	config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
2208
2209	obj = i915_gem_object_create_shmem(i915: stream->perf->i915, size: config_length);
2210	if (IS_ERR(ptr: obj)) {
2211	err = PTR_ERR(ptr: obj);
2212	goto err_free;
2213	}
2214
2215	i915_gem_ww_ctx_init(ctx: &ww, intr: true);
2216	retry:
2217	err = i915_gem_object_lock(obj, ww: &ww);
2218	if (err)
2219	goto out_ww;
2220
2221	cs = i915_gem_object_pin_map(obj, type: I915_MAP_WB);
2222	if (IS_ERR(ptr: cs)) {
2223	err = PTR_ERR(ptr: cs);
2224	goto out_ww;
2225	}
2226
2227	cs = write_cs_mi_lri(cs,
2228	reg_data: oa_config->mux_regs,
2229	n_regs: oa_config->mux_regs_len);
2230	cs = write_cs_mi_lri(cs,
2231	reg_data: oa_config->b_counter_regs,
2232	n_regs: oa_config->b_counter_regs_len);
2233	cs = write_cs_mi_lri(cs,
2234	reg_data: oa_config->flex_regs,
2235	n_regs: oa_config->flex_regs_len);
2236
2237	/ Jump into the active wait. /
2238	*cs++ = (GRAPHICS_VER(stream->perf->i915) < `8` ?
2239	MI_BATCH_BUFFER_START :
2240	MI_BATCH_BUFFER_START_GEN8);
2241	*cs++ = i915_ggtt_offset(vma: stream->noa_wait);
2242	*cs++ = `0`;
2243
2244	i915_gem_object_flush_map(obj);
2245	__i915_gem_object_release_map(obj);
2246
2247	oa_bo->vma = i915_vma_instance(obj,
2248	vm: &stream->engine->gt->ggtt->vm,
2249	NULL);
2250	if (IS_ERR(ptr: oa_bo->vma)) {
2251	err = PTR_ERR(ptr: oa_bo->vma);
2252	goto out_ww;
2253	}
2254
2255	oa_bo->oa_config = i915_oa_config_get(oa_config);
2256	llist_add(new: &oa_bo->node, head: &stream->oa_config_bos);
2257
2258	out_ww:
2259	if (err == -EDEADLK) {
2260	err = i915_gem_ww_ctx_backoff(ctx: &ww);
2261	if (!err)
2262	goto retry;
2263	}
2264	i915_gem_ww_ctx_fini(ctx: &ww);
2265
2266	if (err)
2267	i915_gem_object_put(obj);
2268	err_free:
2269	if (err) {
2270	kfree(objp: oa_bo);
2271	return ERR_PTR(error: err);
2272	}
2273	return oa_bo;
2274	}
2275
2276	static struct i915_vma *
2277	get_oa_vma(struct i915_perf_stream stream, struct* i915_oa_config *oa_config)
2278	{
2279	struct i915_oa_config_bo *oa_bo;
2280
2281	/*
2282	* Look for the buffer in the already allocated BOs attached
2283	* to the stream.
2284	*/
2285	llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
2286	if (oa_bo->oa_config == oa_config &&
2287	memcmp(p: oa_bo->oa_config->uuid,
2288	q: oa_config->uuid,
2289	size: sizeof(oa_config->uuid)) == `0`)
2290	goto out;
2291	}
2292
2293	oa_bo = alloc_oa_config_buffer(stream, oa_config);
2294	if (IS_ERR(ptr: oa_bo))
2295	return ERR_CAST(ptr: oa_bo);
2296
2297	out:
2298	return i915_vma_get(vma: oa_bo->vma);
2299	}
2300
2301	static int
2302	emit_oa_config(struct i915_perf_stream *stream,
2303	struct i915_oa_config *oa_config,
2304	struct intel_context *ce,
2305	struct i915_active *active)
2306	{
2307	struct i915_request *rq;
2308	struct i915_vma *vma;
2309	struct i915_gem_ww_ctx ww;
2310	int err;
2311
2312	vma = get_oa_vma(stream, oa_config);
2313	if (IS_ERR(ptr: vma))
2314	return PTR_ERR(ptr: vma);
2315
2316	i915_gem_ww_ctx_init(ctx: &ww, intr: true);
2317	retry:
2318	err = i915_gem_object_lock(obj: vma->obj, ww: &ww);
2319	if (err)
2320	goto err;
2321
2322	err = i915_vma_pin_ww(vma, ww: &ww, size: `0`, alignment: `0`, PIN_GLOBAL \| PIN_HIGH);
2323	if (err)
2324	goto err;
2325
2326	intel_engine_pm_get(engine: ce->engine);
2327	rq = i915_request_create(ce);
2328	intel_engine_pm_put(engine: ce->engine);
2329	if (IS_ERR(ptr: rq)) {
2330	err = PTR_ERR(ptr: rq);
2331	goto err_vma_unpin;
2332	}
2333
2334	if (!IS_ERR_OR_NULL(ptr: active)) {
2335	/ After all individual context modifications /
2336	err = i915_request_await_active(rq, ref: active,
2337	I915_ACTIVE_AWAIT_ACTIVE);
2338	if (err)
2339	goto err_add_request;
2340
2341	err = i915_active_add_request(ref: active, rq);
2342	if (err)
2343	goto err_add_request;
2344	}
2345
2346	err = i915_vma_move_to_active(vma, rq, flags: `0`);
2347	if (err)
2348	goto err_add_request;
2349
2350	err = rq->engine->emit_bb_start(rq,
2351	i915_vma_offset(vma), `0`,
2352	I915_DISPATCH_SECURE);
2353	if (err)
2354	goto err_add_request;
2355
2356	err_add_request:
2357	i915_request_add(rq);
2358	err_vma_unpin:
2359	i915_vma_unpin(vma);
2360	err:
2361	if (err == -EDEADLK) {
2362	err = i915_gem_ww_ctx_backoff(ctx: &ww);
2363	if (!err)
2364	goto retry;
2365	}
2366
2367	i915_gem_ww_ctx_fini(ctx: &ww);
2368	i915_vma_put(vma);
2369	return err;
2370	}
2371
2372	static struct intel_context oa_context(struct* i915_perf_stream *stream)
2373	{
2374	return stream->pinned_ctx ?: stream->engine->kernel_context;
2375	}
2376
2377	static int
2378	hsw_enable_metric_set(struct i915_perf_stream *stream,
2379	struct i915_active *active)
2380	{
2381	struct intel_uncore *uncore = stream->uncore;
2382
2383	/*
2384	* PRM:
2385	*
2386	* OA unit is using “crclk” for its functionality. When trunk
2387	* level clock gating takes place, OA clock would be gated,
2388	* unable to count the events from non-render clock domain.
2389	* Render clock gating must be disabled when OA is enabled to
2390	* count the events from non-render domain. Unit level clock
2391	* gating for RCS should also be disabled.
2392	*/
2393	intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
2394	GEN7_DOP_CLOCK_GATE_ENABLE, set: `0`);
2395	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
2396	clear: `0`, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
2397
2398	return emit_oa_config(stream,
2399	oa_config: stream->oa_config, ce: oa_context(stream),
2400	active);
2401	}
2402
2403	static void hsw_disable_metric_set(struct i915_perf_stream *stream)
2404	{
2405	struct intel_uncore *uncore = stream->uncore;
2406
2407	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
2408	GEN6_CSUNIT_CLOCK_GATE_DISABLE, set: `0`);
2409	intel_uncore_rmw(uncore, GEN7_MISCCPCTL,
2410	clear: `0`, GEN7_DOP_CLOCK_GATE_ENABLE);
2411
2412	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, set: `0`);
2413	}
2414
2415	static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
2416	i915_reg_t reg)
2417	{
2418	u32 mmio = i915_mmio_reg_offset(reg);
2419	int i;
2420
2421	/*
2422	* This arbitrary default will select the 'EU FPU0 Pipeline
2423	* Active' event. In the future it's anticipated that there
2424	* will be an explicit 'No Event' we can select, but not yet...
2425	*/
2426	if (!oa_config)
2427	return `0`;
2428
2429	for (i = `0`; i < oa_config->flex_regs_len; i++) {
2430	if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio)
2431	return oa_config->flex_regs[i].value;
2432	}
2433
2434	return `0`;
2435	}
2436	/*
2437	* NB: It must always remain pointer safe to run this even if the OA unit
2438	* has been disabled.
2439	*
2440	* It's fine to put out-of-date values into these per-context registers
2441	* in the case that the OA unit has been disabled.
2442	*/
2443	static void
2444	gen8_update_reg_state_unlocked(const struct intel_context *ce,
2445	const struct i915_perf_stream *stream)
2446	{
2447	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
2448	u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
2449	/ The MMIO offsets for Flex EU registers aren't contiguous /
2450	static const i915_reg_t flex_regs[] = {
2451	EU_PERF_CNTL0,
2452	EU_PERF_CNTL1,
2453	EU_PERF_CNTL2,
2454	EU_PERF_CNTL3,
2455	EU_PERF_CNTL4,
2456	EU_PERF_CNTL5,
2457	EU_PERF_CNTL6,
2458	};
2459	u32 *reg_state = ce->lrc_reg_state;
2460	int i;
2461
2462	reg_state[ctx_oactxctrl + `1`] =
2463	(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) \|
2464	(stream->periodic ? GEN8_OA_TIMER_ENABLE : `0`) \|
2465	GEN8_OA_COUNTER_RESUME;
2466
2467	for (i = `0`; i < ARRAY_SIZE(flex_regs); i++)
2468	reg_state[ctx_flexeu0 + i * `2` + `1`] =
2469	oa_config_flex_reg(oa_config: stream->oa_config, reg: flex_regs[i]);
2470	}
2471
2472	struct flex {
2473	i915_reg_t reg;
2474	u32 offset;
2475	u32 value;
2476	};
2477
2478	static int
2479	gen8_store_flex(struct i915_request *rq,
2480	struct intel_context *ce,
2481	const struct flex flex, unsigned* int count)
2482	{
2483	u32 offset;
2484	u32 *cs;
2485
2486	cs = intel_ring_begin(rq, num_dwords: `4` * count);
2487	if (IS_ERR(ptr: cs))
2488	return PTR_ERR(ptr: cs);
2489
2490	offset = i915_ggtt_offset(vma: ce->state) + LRC_STATE_OFFSET;
2491	do {
2492	*cs++ = MI_STORE_DWORD_IMM_GEN4 \| MI_USE_GGTT;
2493	cs++ = offset + flex->offset sizeof(u32);
2494	*cs++ = `0`;
2495	*cs++ = flex->value;
2496	} while (flex++, --count);
2497
2498	intel_ring_advance(rq, cs);
2499
2500	return `0`;
2501	}
2502
2503	static int
2504	gen8_load_flex(struct i915_request *rq,
2505	struct intel_context *ce,
2506	const struct flex flex, unsigned* int count)
2507	{
2508	u32 *cs;
2509
2510	GEM_BUG_ON(!count \|\| count > `63`);
2511
2512	cs = intel_ring_begin(rq, num_dwords: `2` * count + `2`);
2513	if (IS_ERR(ptr: cs))
2514	return PTR_ERR(ptr: cs);
2515
2516	*cs++ = MI_LOAD_REGISTER_IMM(count);
2517	do {
2518	*cs++ = i915_mmio_reg_offset(flex->reg);
2519	*cs++ = flex->value;
2520	} while (flex++, --count);
2521	*cs++ = MI_NOOP;
2522
2523	intel_ring_advance(rq, cs);
2524
2525	return `0`;
2526	}
2527
2528	static int gen8_modify_context(struct intel_context *ce,
2529	const struct flex flex, unsigned* int count)
2530	{
2531	struct i915_request *rq;
2532	int err;
2533
2534	rq = intel_engine_create_kernel_request(engine: ce->engine);
2535	if (IS_ERR(ptr: rq))
2536	return PTR_ERR(ptr: rq);
2537
2538	/ Serialise with the remote context /
2539	err = intel_context_prepare_remote_request(ce, rq);
2540	if (err == `0`)
2541	err = gen8_store_flex(rq, ce, flex, count);
2542
2543	i915_request_add(rq);
2544	return err;
2545	}
2546
2547	static int
2548	gen8_modify_self(struct intel_context *ce,
2549	const struct flex flex, unsigned* int count,
2550	struct i915_active *active)
2551	{
2552	struct i915_request *rq;
2553	int err;
2554
2555	intel_engine_pm_get(engine: ce->engine);
2556	rq = i915_request_create(ce);
2557	intel_engine_pm_put(engine: ce->engine);
2558	if (IS_ERR(ptr: rq))
2559	return PTR_ERR(ptr: rq);
2560
2561	if (!IS_ERR_OR_NULL(ptr: active)) {
2562	err = i915_active_add_request(ref: active, rq);
2563	if (err)
2564	goto err_add_request;
2565	}
2566
2567	err = gen8_load_flex(rq, ce, flex, count);
2568	if (err)
2569	goto err_add_request;
2570
2571	err_add_request:
2572	i915_request_add(rq);
2573	return err;
2574	}
2575
2576	static int gen8_configure_context(struct i915_perf_stream *stream,
2577	struct i915_gem_context *ctx,
2578	struct flex flex, unsigned* int count)
2579	{
2580	struct i915_gem_engines_iter it;
2581	struct intel_context *ce;
2582	int err = `0`;
2583
2584	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
2585	GEM_BUG_ON(ce == ce->engine->kernel_context);
2586
2587	if (ce->engine->class != RENDER_CLASS)
2588	continue;
2589
2590	/ Otherwise OA settings will be set upon first use /
2591	if (!intel_context_pin_if_active(ce))
2592	continue;
2593
2594	flex->value = intel_sseu_make_rpcs(gt: ce->engine->gt, req_sseu: &ce->sseu);
2595	err = gen8_modify_context(ce, flex, count);
2596
2597	intel_context_unpin(ce);
2598	if (err)
2599	break;
2600	}
2601	i915_gem_context_unlock_engines(ctx);
2602
2603	return err;
2604	}
2605
2606	static int gen12_configure_oar_context(struct i915_perf_stream *stream,
2607	struct i915_active *active)
2608	{
2609	int err;
2610	struct intel_context *ce = stream->pinned_ctx;
2611	u32 format = stream->oa_buffer.format->format;
2612	u32 offset = stream->perf->ctx_oactxctrl_offset;
2613	struct flex regs_context[] = {
2614	{
2615	GEN8_OACTXCONTROL,
2616	offset + `1`,
2617	active ? GEN8_OA_COUNTER_RESUME : `0`,
2618	},
2619	};
2620	/ Offsets in regs_lri are not used since this configuration is only*
2621	* applied using LRI. Initialize the correct offsets for posterity.
2622	*/
2623	#define GEN12_OAR_OACONTROL_OFFSET 0x5B0
2624	struct flex regs_lri[] = {
2625	{
2626	GEN12_OAR_OACONTROL,
2627	GEN12_OAR_OACONTROL_OFFSET + `1`,
2628	(format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) \|
2629	(active ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : `0`)
2630	},
2631	{
2632	RING_CONTEXT_CONTROL(ce->engine->mmio_base),
2633	CTX_CONTEXT_CONTROL,
2634	_MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE,
2635	active ?
2636	GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE :
2637	`0`)
2638	},
2639	};
2640
2641	/ Modify the context image of pinned context with regs_context /
2642	err = intel_context_lock_pinned(ce);
2643	if (err)
2644	return err;
2645
2646	err = gen8_modify_context(ce, flex: regs_context,
2647	ARRAY_SIZE(regs_context));
2648	intel_context_unlock_pinned(ce);
2649	if (err)
2650	return err;
2651
2652	/ Apply regs_lri using LRI with pinned context /
2653	return gen8_modify_self(ce, flex: regs_lri, ARRAY_SIZE(regs_lri), active);
2654	}
2655
2656	/*
2657	* Manages updating the per-context aspects of the OA stream
2658	* configuration across all contexts.
2659	*
2660	* The awkward consideration here is that OACTXCONTROL controls the
2661	* exponent for periodic sampling which is primarily used for system
2662	* wide profiling where we'd like a consistent sampling period even in
2663	* the face of context switches.
2664	*
2665	* Our approach of updating the register state context (as opposed to
2666	* say using a workaround batch buffer) ensures that the hardware
2667	* won't automatically reload an out-of-date timer exponent even
2668	* transiently before a WA BB could be parsed.
2669	*
2670	* This function needs to:
2671	* - Ensure the currently running context's per-context OA state is
2672	* updated
2673	* - Ensure that all existing contexts will have the correct per-context
2674	* OA state if they are scheduled for use.
2675	* - Ensure any new contexts will be initialized with the correct
2676	* per-context OA state.
2677	*
2678	* Note: it's only the RCS/Render context that has any OA state.
2679	* Note: the first flex register passed must always be R_PWR_CLK_STATE
2680	*/
2681	static int
2682	oa_configure_all_contexts(struct i915_perf_stream *stream,
2683	struct flex *regs,
2684	size_t num_regs,
2685	struct i915_active *active)
2686	{
2687	struct drm_i915_private *i915 = stream->perf->i915;
2688	struct intel_engine_cs *engine;
2689	struct intel_gt *gt = stream->engine->gt;
2690	struct i915_gem_context ctx, cn;
2691	int err;
2692
2693	lockdep_assert_held(&gt->perf.lock);
2694
2695	/*
2696	* The OA register config is setup through the context image. This image
2697	* might be written to by the GPU on context switch (in particular on
2698	* lite-restore). This means we can't safely update a context's image,
2699	* if this context is scheduled/submitted to run on the GPU.
2700	*
2701	* We could emit the OA register config through the batch buffer but
2702	* this might leave small interval of time where the OA unit is
2703	* configured at an invalid sampling period.
2704	*
2705	* Note that since we emit all requests from a single ring, there
2706	* is still an implicit global barrier here that may cause a high
2707	* priority context to wait for an otherwise independent low priority
2708	* context. Contexts idle at the time of reconfiguration are not
2709	* trapped behind the barrier.
2710	*/
2711	spin_lock(lock: &i915->gem.contexts.lock);
2712	list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) {
2713	if (!kref_get_unless_zero(kref: &ctx->ref))
2714	continue;
2715
2716	spin_unlock(lock: &i915->gem.contexts.lock);
2717
2718	err = gen8_configure_context(stream, ctx, flex: regs, count: num_regs);
2719	if (err) {
2720	i915_gem_context_put(ctx);
2721	return err;
2722	}
2723
2724	spin_lock(lock: &i915->gem.contexts.lock);
2725	list_safe_reset_next(ctx, cn, link);
2726	i915_gem_context_put(ctx);
2727	}
2728	spin_unlock(lock: &i915->gem.contexts.lock);
2729
2730	/*
2731	* After updating all other contexts, we need to modify ourselves.
2732	* If we don't modify the kernel_context, we do not get events while
2733	* idle.
2734	*/
2735	for_each_uabi_engine(engine, i915) {
2736	struct intel_context *ce = engine->kernel_context;
2737
2738	if (engine->class != RENDER_CLASS)
2739	continue;
2740
2741	regs[`0`].value = intel_sseu_make_rpcs(gt: engine->gt, req_sseu: &ce->sseu);
2742
2743	err = gen8_modify_self(ce, flex: regs, count: num_regs, active);
2744	if (err)
2745	return err;
2746	}
2747
2748	return `0`;
2749	}
2750
2751	static int
2752	gen12_configure_all_contexts(struct i915_perf_stream *stream,
2753	const struct i915_oa_config *oa_config,
2754	struct i915_active *active)
2755	{
2756	struct flex regs[] = {
2757	{
2758	GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
2759	CTX_R_PWR_CLK_STATE,
2760	},
2761	};
2762
2763	if (stream->engine->class != RENDER_CLASS)
2764	return `0`;
2765
2766	return oa_configure_all_contexts(stream,
2767	regs, ARRAY_SIZE(regs),
2768	active);
2769	}
2770
2771	static int
2772	lrc_configure_all_contexts(struct i915_perf_stream *stream,
2773	const struct i915_oa_config *oa_config,
2774	struct i915_active *active)
2775	{
2776	u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset;
2777	/ The MMIO offsets for Flex EU registers aren't contiguous /
2778	const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset;
2779	#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1)
2780	struct flex regs[] = {
2781	{
2782	GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
2783	CTX_R_PWR_CLK_STATE,
2784	},
2785	{
2786	GEN8_OACTXCONTROL,
2787	ctx_oactxctrl + `1`,
2788	},
2789	{ EU_PERF_CNTL0, ctx_flexeuN(`0`) },
2790	{ EU_PERF_CNTL1, ctx_flexeuN(`1`) },
2791	{ EU_PERF_CNTL2, ctx_flexeuN(`2`) },
2792	{ EU_PERF_CNTL3, ctx_flexeuN(`3`) },
2793	{ EU_PERF_CNTL4, ctx_flexeuN(`4`) },
2794	{ EU_PERF_CNTL5, ctx_flexeuN(`5`) },
2795	{ EU_PERF_CNTL6, ctx_flexeuN(`6`) },
2796	};
2797	#undef ctx_flexeuN
2798	int i;
2799
2800	regs[`1`].value =
2801	(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) \|
2802	(stream->periodic ? GEN8_OA_TIMER_ENABLE : `0`) \|
2803	GEN8_OA_COUNTER_RESUME;
2804
2805	for (i = `2`; i < ARRAY_SIZE(regs); i++)
2806	regs[i].value = oa_config_flex_reg(oa_config, reg: regs[i].reg);
2807
2808	return oa_configure_all_contexts(stream,
2809	regs, ARRAY_SIZE(regs),
2810	active);
2811	}
2812
2813	static int
2814	gen8_enable_metric_set(struct i915_perf_stream *stream,
2815	struct i915_active *active)
2816	{
2817	struct intel_uncore *uncore = stream->uncore;
2818	struct i915_oa_config *oa_config = stream->oa_config;
2819	int ret;
2820
2821	/*
2822	* We disable slice/unslice clock ratio change reports on SKL since
2823	* they are too noisy. The HW generates a lot of redundant reports
2824	* where the ratio hasn't really changed causing a lot of redundant
2825	* work to processes and increasing the chances we'll hit buffer
2826	* overruns.
2827	*
2828	* Although we don't currently use the 'disable overrun' OABUFFER
2829	* feature it's worth noting that clock ratio reports have to be
2830	* disabled before considering to use that feature since the HW doesn't
2831	* correctly block these reports.
2832	*
2833	* Currently none of the high-level metrics we have depend on knowing
2834	* this ratio to normalize.
2835	*
2836	* Note: This register is not power context saved and restored, but
2837	* that's OK considering that we disable RC6 while the OA unit is
2838	* enabled.
2839	*
2840	* The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
2841	* be read back from automatically triggered reports, as part of the
2842	* RPT_ID field.
2843	*/
2844	if (IS_GRAPHICS_VER(stream->perf->i915, `9`, `11`)) {
2845	intel_uncore_write(uncore, GEN8_OA_DEBUG,
2846	_MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS \|
2847	GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
2848	}
2849
2850	/*
2851	* Update all contexts prior writing the mux configurations as we need
2852	* to make sure all slices/subslices are ON before writing to NOA
2853	* registers.
2854	*/
2855	ret = lrc_configure_all_contexts(stream, oa_config, active);
2856	if (ret)
2857	return ret;
2858
2859	return emit_oa_config(stream,
2860	oa_config: stream->oa_config, ce: oa_context(stream),
2861	active);
2862	}
2863
2864	static u32 oag_report_ctx_switches(const struct i915_perf_stream *stream)
2865	{
2866	return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS,
2867	(stream->sample_flags & SAMPLE_OA_REPORT) ?
2868	`0` : GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS);
2869	}
2870
2871	static int
2872	gen12_enable_metric_set(struct i915_perf_stream *stream,
2873	struct i915_active *active)
2874	{
2875	struct drm_i915_private *i915 = stream->perf->i915;
2876	struct intel_uncore *uncore = stream->uncore;
2877	struct i915_oa_config *oa_config = stream->oa_config;
2878	bool periodic = stream->periodic;
2879	u32 period_exponent = stream->period_exponent;
2880	u32 sqcnt1;
2881	int ret;
2882
2883	/*
2884	* Wa_1508761755:xehpsdv, dg2
2885	* EU NOA signals behave incorrectly if EU clock gating is enabled.
2886	* Disable thread stall DOP gating and EU DOP gating.
2887	*/
2888	if (IS_XEHPSDV(i915) \|\| IS_DG2(i915)) {
2889	intel_gt_mcr_multicast_write(gt: uncore->gt, GEN8_ROW_CHICKEN,
2890	_MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
2891	intel_uncore_write(uncore, GEN7_ROW_CHICKEN2,
2892	_MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
2893	}
2894
2895	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_debug,
2896	/ Disable clk ratio reports, like previous Gens. /
2897	_MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS \|
2898	GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) \|
2899	/*
2900	* If the user didn't require OA reports, instruct
2901	* the hardware not to emit ctx switch reports.
2902	*/
2903	oag_report_ctx_switches(stream));
2904
2905	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_ctx_ctrl, val: periodic ?
2906	(GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME \|
2907	GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE \|
2908	(period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
2909	: `0`);
2910
2911	/*
2912	* Initialize Super Queue Internal Cnt Register
2913	* Set PMON Enable in order to collect valid metrics.
2914	* Enable byets per clock reporting in OA for XEHPSDV onward.
2915	*/
2916	sqcnt1 = GEN12_SQCNT1_PMON_ENABLE \|
2917	(HAS_OA_BPC_REPORTING(i915) ? GEN12_SQCNT1_OABPC : `0`);
2918
2919	intel_uncore_rmw(uncore, GEN12_SQCNT1, clear: `0`, set: sqcnt1);
2920
2921	/*
2922	* Update all contexts prior writing the mux configurations as we need
2923	* to make sure all slices/subslices are ON before writing to NOA
2924	* registers.
2925	*/
2926	ret = gen12_configure_all_contexts(stream, oa_config, active);
2927	if (ret)
2928	return ret;
2929
2930	/*
2931	* For Gen12, performance counters are context
2932	* saved/restored. Only enable it for the context that
2933	* requested this.
2934	*/
2935	if (stream->ctx) {
2936	ret = gen12_configure_oar_context(stream, active);
2937	if (ret)
2938	return ret;
2939	}
2940
2941	return emit_oa_config(stream,
2942	oa_config: stream->oa_config, ce: oa_context(stream),
2943	active);
2944	}
2945
2946	static void gen8_disable_metric_set(struct i915_perf_stream *stream)
2947	{
2948	struct intel_uncore *uncore = stream->uncore;
2949
2950	/ Reset all contexts' slices/subslices configurations. /
2951	lrc_configure_all_contexts(stream, NULL, NULL);
2952
2953	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, set: `0`);
2954	}
2955
2956	static void gen11_disable_metric_set(struct i915_perf_stream *stream)
2957	{
2958	struct intel_uncore *uncore = stream->uncore;
2959
2960	/ Reset all contexts' slices/subslices configurations. /
2961	lrc_configure_all_contexts(stream, NULL, NULL);
2962
2963	/ Make sure we disable noa to save power. /
2964	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, set: `0`);
2965	}
2966
2967	static void gen12_disable_metric_set(struct i915_perf_stream *stream)
2968	{
2969	struct intel_uncore *uncore = stream->uncore;
2970	struct drm_i915_private *i915 = stream->perf->i915;
2971	u32 sqcnt1;
2972
2973	/*
2974	* Wa_1508761755:xehpsdv, dg2
2975	* Enable thread stall DOP gating and EU DOP gating.
2976	*/
2977	if (IS_XEHPSDV(i915) \|\| IS_DG2(i915)) {
2978	intel_gt_mcr_multicast_write(gt: uncore->gt, GEN8_ROW_CHICKEN,
2979	_MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
2980	intel_uncore_write(uncore, GEN7_ROW_CHICKEN2,
2981	_MASKED_BIT_DISABLE(GEN12_DISABLE_DOP_GATING));
2982	}
2983
2984	/ Reset all contexts' slices/subslices configurations. /
2985	gen12_configure_all_contexts(stream, NULL, NULL);
2986
2987	/ disable the context save/restore or OAR counters /
2988	if (stream->ctx)
2989	gen12_configure_oar_context(stream, NULL);
2990
2991	/ Make sure we disable noa to save power. /
2992	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, set: `0`);
2993
2994	sqcnt1 = GEN12_SQCNT1_PMON_ENABLE \|
2995	(HAS_OA_BPC_REPORTING(i915) ? GEN12_SQCNT1_OABPC : `0`);
2996
2997	/ Reset PMON Enable to save power. /
2998	intel_uncore_rmw(uncore, GEN12_SQCNT1, clear: sqcnt1, set: `0`);
2999	}
3000
3001	static void gen7_oa_enable(struct i915_perf_stream *stream)
3002	{
3003	struct intel_uncore *uncore = stream->uncore;
3004	struct i915_gem_context *ctx = stream->ctx;
3005	u32 ctx_id = stream->specific_ctx_id;
3006	bool periodic = stream->periodic;
3007	u32 period_exponent = stream->period_exponent;
3008	u32 report_format = stream->oa_buffer.format->format;
3009
3010	/*
3011	* Reset buf pointers so we don't forward reports from before now.
3012	*
3013	* Think carefully if considering trying to avoid this, since it
3014	* also ensures status flags and the buffer itself are cleared
3015	* in error paths, and we have checks for invalid reports based
3016	* on the assumption that certain fields are written to zeroed
3017	* memory which this helps maintains.
3018	*/
3019	gen7_init_oa_buffer(stream);
3020
3021	intel_uncore_write(uncore, GEN7_OACONTROL,
3022	val: (ctx_id & GEN7_OACONTROL_CTX_MASK) \|
3023	(period_exponent <<
3024	GEN7_OACONTROL_TIMER_PERIOD_SHIFT) \|
3025	(periodic ? GEN7_OACONTROL_TIMER_ENABLE : `0`) \|
3026	(report_format << GEN7_OACONTROL_FORMAT_SHIFT) \|
3027	(ctx ? GEN7_OACONTROL_PER_CTX_ENABLE : `0`) \|
3028	GEN7_OACONTROL_ENABLE);
3029	}
3030
3031	static void gen8_oa_enable(struct i915_perf_stream *stream)
3032	{
3033	struct intel_uncore *uncore = stream->uncore;
3034	u32 report_format = stream->oa_buffer.format->format;
3035
3036	/*
3037	* Reset buf pointers so we don't forward reports from before now.
3038	*
3039	* Think carefully if considering trying to avoid this, since it
3040	* also ensures status flags and the buffer itself are cleared
3041	* in error paths, and we have checks for invalid reports based
3042	* on the assumption that certain fields are written to zeroed
3043	* memory which this helps maintains.
3044	*/
3045	gen8_init_oa_buffer(stream);
3046
3047	/*
3048	* Note: we don't rely on the hardware to perform single context
3049	* filtering and instead filter on the cpu based on the context-id
3050	* field of reports
3051	*/
3052	intel_uncore_write(uncore, GEN8_OACONTROL,
3053	val: (report_format << GEN8_OA_REPORT_FORMAT_SHIFT) \|
3054	GEN8_OA_COUNTER_ENABLE);
3055	}
3056
3057	static void gen12_oa_enable(struct i915_perf_stream *stream)
3058	{
3059	const struct i915_perf_regs *regs;
3060	u32 val;
3061
3062	/*
3063	* If we don't want OA reports from the OA buffer, then we don't even
3064	* need to program the OAG unit.
3065	*/
3066	if (!(stream->sample_flags & SAMPLE_OA_REPORT))
3067	return;
3068
3069	gen12_init_oa_buffer(stream);
3070
3071	regs = __oa_regs(stream);
3072	val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) \|
3073	GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE;
3074
3075	intel_uncore_write(uncore: stream->uncore, reg: regs->oa_ctrl, val);
3076	}
3077
3078	/**
3079	* i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
3080	* @stream: An i915 perf stream opened for OA metrics
3081	*
3082	* [Re]enables hardware periodic sampling according to the period configured
3083	* when opening the stream. This also starts a hrtimer that will periodically
3084	* check for data in the circular OA buffer for notifying userspace (e.g.
3085	* during a read() or poll()).
3086	*/
3087	static void i915_oa_stream_enable(struct i915_perf_stream *stream)
3088	{
3089	stream->pollin = false;
3090
3091	stream->perf->ops.oa_enable(stream);
3092
3093	if (stream->sample_flags & SAMPLE_OA_REPORT)
3094	hrtimer_start(timer: &stream->poll_check_timer,
3095	tim: ns_to_ktime(ns: stream->poll_oa_period),
3096	mode: HRTIMER_MODE_REL_PINNED);
3097	}
3098
3099	static void gen7_oa_disable(struct i915_perf_stream *stream)
3100	{
3101	struct intel_uncore *uncore = stream->uncore;
3102
3103	intel_uncore_write(uncore, GEN7_OACONTROL, val: `0`);
3104	if (intel_wait_for_register(uncore,
3105	GEN7_OACONTROL, GEN7_OACONTROL_ENABLE, value: `0`,
3106	timeout_ms: `50`))
3107	drm_err(&stream->perf->i915->drm,
3108	"wait for OA to be disabled timed out\n");
3109	}
3110
3111	static void gen8_oa_disable(struct i915_perf_stream *stream)
3112	{
3113	struct intel_uncore *uncore = stream->uncore;
3114
3115	intel_uncore_write(uncore, GEN8_OACONTROL, val: `0`);
3116	if (intel_wait_for_register(uncore,
3117	GEN8_OACONTROL, GEN8_OA_COUNTER_ENABLE, value: `0`,
3118	timeout_ms: `50`))
3119	drm_err(&stream->perf->i915->drm,
3120	"wait for OA to be disabled timed out\n");
3121	}
3122
3123	static void gen12_oa_disable(struct i915_perf_stream *stream)
3124	{
3125	struct intel_uncore *uncore = stream->uncore;
3126
3127	intel_uncore_write(uncore, reg: __oa_regs(stream)->oa_ctrl, val: `0`);
3128	if (intel_wait_for_register(uncore,
3129	reg: __oa_regs(stream)->oa_ctrl,
3130	GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, value: `0`,
3131	timeout_ms: `50`))
3132	drm_err(&stream->perf->i915->drm,
3133	"wait for OA to be disabled timed out\n");
3134
3135	intel_uncore_write(uncore, GEN12_OA_TLB_INV_CR, val: `1`);
3136	if (intel_wait_for_register(uncore,
3137	GEN12_OA_TLB_INV_CR,
3138	mask: `1`, value: `0`,
3139	timeout_ms: `50`))
3140	drm_err(&stream->perf->i915->drm,
3141	"wait for OA tlb invalidate timed out\n");
3142	}
3143
3144	/**
3145	* i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
3146	* @stream: An i915 perf stream opened for OA metrics
3147	*
3148	* Stops the OA unit from periodically writing counter reports into the
3149	* circular OA buffer. This also stops the hrtimer that periodically checks for
3150	* data in the circular OA buffer, for notifying userspace.
3151	*/
3152	static void i915_oa_stream_disable(struct i915_perf_stream *stream)
3153	{
3154	stream->perf->ops.oa_disable(stream);
3155
3156	if (stream->sample_flags & SAMPLE_OA_REPORT)
3157	hrtimer_cancel(timer: &stream->poll_check_timer);
3158	}
3159
3160	static const struct i915_perf_stream_ops i915_oa_stream_ops = {
3161	.destroy = i915_oa_stream_destroy,
3162	.enable = i915_oa_stream_enable,
3163	.disable = i915_oa_stream_disable,
3164	.wait_unlocked = i915_oa_wait_unlocked,
3165	.poll_wait = i915_oa_poll_wait,
3166	.read = i915_oa_read,
3167	};
3168
3169	static int i915_perf_stream_enable_sync(struct i915_perf_stream *stream)
3170	{
3171	struct i915_active *active;
3172	int err;
3173
3174	active = i915_active_create();
3175	if (!active)
3176	return -ENOMEM;
3177
3178	err = stream->perf->ops.enable_metric_set(stream, active);
3179	if (err == `0`)
3180	__i915_active_wait(ref: active, TASK_UNINTERRUPTIBLE);
3181
3182	i915_active_put(ref: active);
3183	return err;
3184	}
3185
3186	static void
3187	get_default_sseu_config(struct intel_sseu *out_sseu,
3188	struct intel_engine_cs *engine)
3189	{
3190	const struct sseu_dev_info *devinfo_sseu = &engine->gt->info.sseu;
3191
3192	*out_sseu = intel_sseu_from_device_info(sseu: devinfo_sseu);
3193
3194	if (GRAPHICS_VER(engine->i915) == `11`) {
3195	/*
3196	* We only need subslice count so it doesn't matter which ones
3197	* we select - just turn off low bits in the amount of half of
3198	* all available subslices per slice.
3199	*/
3200	out_sseu->subslice_mask =
3201	~(~`0` << (hweight8(out_sseu->subslice_mask) / `2`));
3202	out_sseu->slice_mask = `0x1`;
3203	}
3204	}
3205
3206	static int
3207	get_sseu_config(struct intel_sseu *out_sseu,
3208	struct intel_engine_cs *engine,
3209	const struct drm_i915_gem_context_param_sseu *drm_sseu)
3210	{
3211	if (drm_sseu->engine.engine_class != engine->uabi_class \|\|
3212	drm_sseu->engine.engine_instance != engine->uabi_instance)
3213	return -EINVAL;
3214
3215	return i915_gem_user_to_context_sseu(gt: engine->gt, user: drm_sseu, context: out_sseu);
3216	}
3217
3218	/*
3219	* OA timestamp frequency = CS timestamp frequency in most platforms. On some
3220	* platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such
3221	* cases, return the adjusted CS timestamp frequency to the user.
3222	*/
3223	u32 i915_perf_oa_timestamp_frequency(struct drm_i915_private *i915)
3224	{
3225	struct intel_gt *gt = to_gt(i915);
3226
3227	/ Wa_18013179988 /
3228	if (IS_DG2(i915) \|\| IS_GFX_GT_IP_RANGE(gt, IP_VER(`12`, `70`), IP_VER(`12`, `74`))) {
3229	intel_wakeref_t wakeref;
3230	u32 reg, shift;
3231
3232	with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref)
3233	reg = intel_uncore_read(uncore: to_gt(i915)->uncore, RPM_CONFIG0);
3234
3235	shift = REG_FIELD_GET(GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK,
3236	reg);
3237
3238	return to_gt(i915)->clock_frequency << (`3` - shift);
3239	}
3240
3241	return to_gt(i915)->clock_frequency;
3242	}
3243
3244	/**
3245	* i915_oa_stream_init - validate combined props for OA stream and init
3246	* @stream: An i915 perf stream
3247	* @param: The open parameters passed to `DRM_I915_PERF_OPEN`
3248	* @props: The property state that configures stream (individually validated)
3249	*
3250	* While read_properties_unlocked() validates properties in isolation it
3251	* doesn't ensure that the combination necessarily makes sense.
3252	*
3253	* At this point it has been determined that userspace wants a stream of
3254	* OA metrics, but still we need to further validate the combined
3255	* properties are OK.
3256	*
3257	* If the configuration makes sense then we can allocate memory for
3258	* a circular OA buffer and apply the requested metric set configuration.
3259	*
3260	* Returns: zero on success or a negative error code.
3261	*/
3262	static int i915_oa_stream_init(struct i915_perf_stream *stream,
3263	struct drm_i915_perf_open_param *param,
3264	struct perf_open_properties *props)
3265	{
3266	struct drm_i915_private *i915 = stream->perf->i915;
3267	struct i915_perf *perf = stream->perf;
3268	struct i915_perf_group *g;
3269	int ret;
3270
3271	if (!props->engine) {
3272	drm_dbg(&stream->perf->i915->drm,
3273	"OA engine not specified\n");
3274	return -EINVAL;
3275	}
3276	g = props->engine->oa_group;
3277
3278	/*
3279	* If the sysfs metrics/ directory wasn't registered for some
3280	* reason then don't let userspace try their luck with config
3281	* IDs
3282	*/
3283	if (!perf->metrics_kobj) {
3284	drm_dbg(&stream->perf->i915->drm,
3285	"OA metrics weren't advertised via sysfs\n");
3286	return -EINVAL;
3287	}
3288
3289	if (!(props->sample_flags & SAMPLE_OA_REPORT) &&
3290	(GRAPHICS_VER(perf->i915) < `12` \|\| !stream->ctx)) {
3291	drm_dbg(&stream->perf->i915->drm,
3292	"Only OA report sampling supported\n");
3293	return -EINVAL;
3294	}
3295
3296	if (!perf->ops.enable_metric_set) {
3297	drm_dbg(&stream->perf->i915->drm,
3298	"OA unit not supported\n");
3299	return -ENODEV;
3300	}
3301
3302	/*
3303	* To avoid the complexity of having to accurately filter
3304	* counter reports and marshal to the appropriate client
3305	* we currently only allow exclusive access
3306	*/
3307	if (g->exclusive_stream) {
3308	drm_dbg(&stream->perf->i915->drm,
3309	"OA unit already in use\n");
3310	return -EBUSY;
3311	}
3312
3313	if (!props->oa_format) {
3314	drm_dbg(&stream->perf->i915->drm,
3315	"OA report format not specified\n");
3316	return -EINVAL;
3317	}
3318
3319	stream->engine = props->engine;
3320	stream->uncore = stream->engine->gt->uncore;
3321
3322	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
3323
3324	stream->oa_buffer.format = &perf->oa_formats[props->oa_format];
3325	if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == `0`))
3326	return -EINVAL;
3327
3328	stream->sample_flags = props->sample_flags;
3329	stream->sample_size += stream->oa_buffer.format->size;
3330
3331	stream->hold_preemption = props->hold_preemption;
3332
3333	stream->periodic = props->oa_periodic;
3334	if (stream->periodic)
3335	stream->period_exponent = props->oa_period_exponent;
3336
3337	if (stream->ctx) {
3338	ret = oa_get_render_ctx_id(stream);
3339	if (ret) {
3340	drm_dbg(&stream->perf->i915->drm,
3341	"Invalid context id to filter with\n");
3342	return ret;
3343	}
3344	}
3345
3346	ret = alloc_noa_wait(stream);
3347	if (ret) {
3348	drm_dbg(&stream->perf->i915->drm,
3349	"Unable to allocate NOA wait batch buffer\n");
3350	goto err_noa_wait_alloc;
3351	}
3352
3353	stream->oa_config = i915_perf_get_oa_config(perf, metrics_set: props->metrics_set);
3354	if (!stream->oa_config) {
3355	drm_dbg(&stream->perf->i915->drm,
3356	"Invalid OA config id=%i\n", props->metrics_set);
3357	ret = -EINVAL;
3358	goto err_config;
3359	}
3360
3361	/ PRM - observability performance counters:*
3362	*
3363	* OACONTROL, performance counter enable, note:
3364	*
3365	* "When this bit is set, in order to have coherent counts,
3366	* RC6 power state and trunk clock gating must be disabled.
3367	* This can be achieved by programming MMIO registers as
3368	* 0xA094=0 and 0xA090[31]=1"
3369	*
3370	* In our case we are expecting that taking pm + FORCEWAKE
3371	* references will effectively disable RC6.
3372	*/
3373	intel_engine_pm_get(engine: stream->engine);
3374	intel_uncore_forcewake_get(uncore: stream->uncore, domains: FORCEWAKE_ALL);
3375
3376	ret = alloc_oa_buffer(stream);
3377	if (ret)
3378	goto err_oa_buf_alloc;
3379
3380	stream->ops = &i915_oa_stream_ops;
3381
3382	stream->engine->gt->perf.sseu = props->sseu;
3383	WRITE_ONCE(g->exclusive_stream, stream);
3384
3385	ret = i915_perf_stream_enable_sync(stream);
3386	if (ret) {
3387	drm_dbg(&stream->perf->i915->drm,
3388	"Unable to enable metric set\n");
3389	goto err_enable;
3390	}
3391
3392	drm_dbg(&stream->perf->i915->drm,
3393	"opening stream oa config uuid=%s\n",
3394	stream->oa_config->uuid);
3395
3396	hrtimer_init(timer: &stream->poll_check_timer,
3397	CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL);
3398	stream->poll_check_timer.function = oa_poll_check_timer_cb;
3399	init_waitqueue_head(&stream->poll_wq);
3400	spin_lock_init(&stream->oa_buffer.ptr_lock);
3401	mutex_init(&stream->lock);
3402
3403	return `0`;
3404
3405	err_enable:
3406	WRITE_ONCE(g->exclusive_stream, NULL);
3407	perf->ops.disable_metric_set(stream);
3408
3409	free_oa_buffer(stream);
3410
3411	err_oa_buf_alloc:
3412	intel_uncore_forcewake_put(uncore: stream->uncore, domains: FORCEWAKE_ALL);
3413	intel_engine_pm_put(engine: stream->engine);
3414
3415	free_oa_configs(stream);
3416
3417	err_config:
3418	free_noa_wait(stream);
3419
3420	err_noa_wait_alloc:
3421	if (stream->ctx)
3422	oa_put_render_ctx_id(stream);
3423
3424	return ret;
3425	}
3426
3427	void i915_oa_init_reg_state(const struct intel_context *ce,
3428	const struct intel_engine_cs *engine)
3429	{
3430	struct i915_perf_stream *stream;
3431
3432	if (engine->class != RENDER_CLASS)
3433	return;
3434
3435	/ perf.exclusive_stream serialised by lrc_configure_all_contexts() /
3436	stream = READ_ONCE(engine->oa_group->exclusive_stream);
3437	if (stream && GRAPHICS_VER(stream->perf->i915) < `12`)
3438	gen8_update_reg_state_unlocked(ce, stream);
3439	}
3440
3441	/**
3442	* i915_perf_read - handles read() FOP for i915 perf stream FDs
3443	* @file: An i915 perf stream file
3444	* @buf: destination buffer given by userspace
3445	* @count: the number of bytes userspace wants to read
3446	* @ppos: (inout) file seek position (unused)
3447	*
3448	* The entry point for handling a read() on a stream file descriptor from
3449	* userspace. Most of the work is left to the i915_perf_read_locked() and
3450	* &i915_perf_stream_ops->read but to save having stream implementations (of
3451	* which we might have multiple later) we handle blocking read here.
3452	*
3453	* We can also consistently treat trying to read from a disabled stream
3454	* as an IO error so implementations can assume the stream is enabled
3455	* while reading.
3456	*
3457	* Returns: The number of bytes copied or a negative error code on failure.
3458	*/
3459	static ssize_t i915_perf_read(struct file *file,
3460	char __user *buf,
3461	size_t count,
3462	loff_t *ppos)
3463	{
3464	struct i915_perf_stream *stream = file->private_data;
3465	size_t offset = `0`;
3466	int ret;
3467
3468	/ To ensure it's handled consistently we simply treat all reads of a*
3469	* disabled stream as an error. In particular it might otherwise lead
3470	* to a deadlock for blocking file descriptors...
3471	*/
3472	if (!stream->enabled \|\| !(stream->sample_flags & SAMPLE_OA_REPORT))
3473	return -EIO;
3474
3475	if (!(file->f_flags & O_NONBLOCK)) {
3476	/ There's the small chance of false positives from*
3477	* stream->ops->wait_unlocked.
3478	*
3479	* E.g. with single context filtering since we only wait until
3480	* oabuffer has >= 1 report we don't immediately know whether
3481	* any reports really belong to the current context
3482	*/
3483	do {
3484	ret = stream->ops->wait_unlocked(stream);
3485	if (ret)
3486	return ret;
3487
3488	mutex_lock(&stream->lock);
3489	ret = stream->ops->read(stream, buf, count, &offset);
3490	mutex_unlock(lock: &stream->lock);
3491	} while (!offset && !ret);
3492	} else {
3493	mutex_lock(&stream->lock);
3494	ret = stream->ops->read(stream, buf, count, &offset);
3495	mutex_unlock(lock: &stream->lock);
3496	}
3497
3498	/ We allow the poll checking to sometimes report false positive EPOLLIN*
3499	* events where we might actually report EAGAIN on read() if there's
3500	* not really any data available. In this situation though we don't
3501	* want to enter a busy loop between poll() reporting a EPOLLIN event
3502	* and read() returning -EAGAIN. Clearing the oa.pollin state here
3503	* effectively ensures we back off until the next hrtimer callback
3504	* before reporting another EPOLLIN event.
3505	* The exception to this is if ops->read() returned -ENOSPC which means
3506	* that more OA data is available than could fit in the user provided
3507	* buffer. In this case we want the next poll() call to not block.
3508	*/
3509	if (ret != -ENOSPC)
3510	stream->pollin = false;
3511
3512	/ Possible values for ret are 0, -EFAULT, -ENOSPC, -EIO, ... /
3513	return offset ?: (ret ?: -EAGAIN);
3514	}
3515
3516	static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
3517	{
3518	struct i915_perf_stream *stream =
3519	container_of(hrtimer, typeof(*stream), poll_check_timer);
3520
3521	if (oa_buffer_check_unlocked(stream)) {
3522	stream->pollin = true;
3523	wake_up(&stream->poll_wq);
3524	}
3525
3526	hrtimer_forward_now(timer: hrtimer,
3527	interval: ns_to_ktime(ns: stream->poll_oa_period));
3528
3529	return HRTIMER_RESTART;
3530	}
3531
3532	/**
3533	* i915_perf_poll_locked - poll_wait() with a suitable wait queue for stream
3534	* @stream: An i915 perf stream
3535	* @file: An i915 perf stream file
3536	* @wait: poll() state table
3537	*
3538	* For handling userspace polling on an i915 perf stream, this calls through to
3539	* &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that
3540	* will be woken for new stream data.
3541	*
3542	* Returns: any poll events that are ready without sleeping
3543	*/
3544	static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream,
3545	struct file *file,
3546	poll_table *wait)
3547	{
3548	__poll_t events = `0`;
3549
3550	stream->ops->poll_wait(stream, file, wait);
3551
3552	/ Note: we don't explicitly check whether there's something to read*
3553	* here since this path may be very hot depending on what else
3554	* userspace is polling, or on the timeout in use. We rely solely on
3555	* the hrtimer/oa_poll_check_timer_cb to notify us when there are
3556	* samples to read.
3557	*/
3558	if (stream->pollin)
3559	events \|= EPOLLIN;
3560
3561	return events;
3562	}
3563
3564	/**
3565	* i915_perf_poll - call poll_wait() with a suitable wait queue for stream
3566	* @file: An i915 perf stream file
3567	* @wait: poll() state table
3568	*
3569	* For handling userspace polling on an i915 perf stream, this ensures
3570	* poll_wait() gets called with a wait queue that will be woken for new stream
3571	* data.
3572	*
3573	* Note: Implementation deferred to i915_perf_poll_locked()
3574	*
3575	* Returns: any poll events that are ready without sleeping
3576	*/
3577	static __poll_t i915_perf_poll(struct file file, poll_table wait)
3578	{
3579	struct i915_perf_stream *stream = file->private_data;
3580	__poll_t ret;
3581
3582	mutex_lock(&stream->lock);
3583	ret = i915_perf_poll_locked(stream, file, wait);
3584	mutex_unlock(lock: &stream->lock);
3585
3586	return ret;
3587	}
3588
3589	/**
3590	* i915_perf_enable_locked - handle `I915_PERF_IOCTL_ENABLE` ioctl
3591	* @stream: A disabled i915 perf stream
3592	*
3593	* [Re]enables the associated capture of data for this stream.
3594	*
3595	* If a stream was previously enabled then there's currently no intention
3596	* to provide userspace any guarantee about the preservation of previously
3597	* buffered data.
3598	*/
3599	static void i915_perf_enable_locked(struct i915_perf_stream *stream)
3600	{
3601	if (stream->enabled)
3602	return;
3603
3604	/ Allow stream->ops->enable() to refer to this /
3605	stream->enabled = true;
3606
3607	if (stream->ops->enable)
3608	stream->ops->enable(stream);
3609
3610	if (stream->hold_preemption)
3611	intel_context_set_nopreempt(ce: stream->pinned_ctx);
3612	}
3613
3614	/**
3615	* i915_perf_disable_locked - handle `I915_PERF_IOCTL_DISABLE` ioctl
3616	* @stream: An enabled i915 perf stream
3617	*
3618	* Disables the associated capture of data for this stream.
3619	*
3620	* The intention is that disabling an re-enabling a stream will ideally be
3621	* cheaper than destroying and re-opening a stream with the same configuration,
3622	* though there are no formal guarantees about what state or buffered data
3623	* must be retained between disabling and re-enabling a stream.
3624	*
3625	* Note: while a stream is disabled it's considered an error for userspace
3626	* to attempt to read from the stream (-EIO).
3627	*/
3628	static void i915_perf_disable_locked(struct i915_perf_stream *stream)
3629	{
3630	if (!stream->enabled)
3631	return;
3632
3633	/ Allow stream->ops->disable() to refer to this /
3634	stream->enabled = false;
3635
3636	if (stream->hold_preemption)
3637	intel_context_clear_nopreempt(ce: stream->pinned_ctx);
3638
3639	if (stream->ops->disable)
3640	stream->ops->disable(stream);
3641	}
3642
3643	static long i915_perf_config_locked(struct i915_perf_stream *stream,
3644	unsigned long metrics_set)
3645	{
3646	struct i915_oa_config *config;
3647	long ret = stream->oa_config->id;
3648
3649	config = i915_perf_get_oa_config(perf: stream->perf, metrics_set);
3650	if (!config)
3651	return -EINVAL;
3652
3653	if (config != stream->oa_config) {
3654	int err;
3655
3656	/*
3657	* If OA is bound to a specific context, emit the
3658	* reconfiguration inline from that context. The update
3659	* will then be ordered with respect to submission on that
3660	* context.
3661	*
3662	* When set globally, we use a low priority kernel context,
3663	* so it will effectively take effect when idle.
3664	*/
3665	err = emit_oa_config(stream, oa_config: config, ce: oa_context(stream), NULL);
3666	if (!err)
3667	config = xchg(&stream->oa_config, config);
3668	else
3669	ret = err;
3670	}
3671
3672	i915_oa_config_put(oa_config: config);
3673
3674	return ret;
3675	}
3676
3677	/**
3678	* i915_perf_ioctl_locked - support ioctl() usage with i915 perf stream FDs
3679	* @stream: An i915 perf stream
3680	* @cmd: the ioctl request
3681	* @arg: the ioctl data
3682	*
3683	* Returns: zero on success or a negative error code. Returns -EINVAL for
3684	* an unknown ioctl request.
3685	*/
3686	static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
3687	unsigned int cmd,
3688	unsigned long arg)
3689	{
3690	switch (cmd) {
3691	case I915_PERF_IOCTL_ENABLE:
3692	i915_perf_enable_locked(stream);
3693	return `0`;
3694	case I915_PERF_IOCTL_DISABLE:
3695	i915_perf_disable_locked(stream);
3696	return `0`;
3697	case I915_PERF_IOCTL_CONFIG:
3698	return i915_perf_config_locked(stream, metrics_set: arg);
3699	}
3700
3701	return -EINVAL;
3702	}
3703
3704	/**
3705	* i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
3706	* @file: An i915 perf stream file
3707	* @cmd: the ioctl request
3708	* @arg: the ioctl data
3709	*
3710	* Implementation deferred to i915_perf_ioctl_locked().
3711	*
3712	* Returns: zero on success or a negative error code. Returns -EINVAL for
3713	* an unknown ioctl request.
3714	*/
3715	static long i915_perf_ioctl(struct file *file,
3716	unsigned int cmd,
3717	unsigned long arg)
3718	{
3719	struct i915_perf_stream *stream = file->private_data;
3720	long ret;
3721
3722	mutex_lock(&stream->lock);
3723	ret = i915_perf_ioctl_locked(stream, cmd, arg);
3724	mutex_unlock(lock: &stream->lock);
3725
3726	return ret;
3727	}
3728
3729	/**
3730	* i915_perf_destroy_locked - destroy an i915 perf stream
3731	* @stream: An i915 perf stream
3732	*
3733	* Frees all resources associated with the given i915 perf @stream, disabling
3734	* any associated data capture in the process.
3735	*
3736	* Note: The &gt->perf.lock mutex has been taken to serialize
3737	* with any non-file-operation driver hooks.
3738	*/
3739	static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
3740	{
3741	if (stream->enabled)
3742	i915_perf_disable_locked(stream);
3743
3744	if (stream->ops->destroy)
3745	stream->ops->destroy(stream);
3746
3747	if (stream->ctx)
3748	i915_gem_context_put(ctx: stream->ctx);
3749
3750	kfree(objp: stream);
3751	}
3752
3753	/**
3754	* i915_perf_release - handles userspace close() of a stream file
3755	* @inode: anonymous inode associated with file
3756	* @file: An i915 perf stream file
3757	*
3758	* Cleans up any resources associated with an open i915 perf stream file.
3759	*
3760	* NB: close() can't really fail from the userspace point of view.
3761	*
3762	* Returns: zero on success or a negative error code.
3763	*/
3764	static int i915_perf_release(struct inode inode, struct* file *file)
3765	{
3766	struct i915_perf_stream *stream = file->private_data;
3767	struct i915_perf *perf = stream->perf;
3768	struct intel_gt *gt = stream->engine->gt;
3769
3770	/*
3771	* Within this call, we know that the fd is being closed and we have no
3772	* other user of stream->lock. Use the perf lock to destroy the stream
3773	* here.
3774	*/
3775	mutex_lock(&gt->perf.lock);
3776	i915_perf_destroy_locked(stream);
3777	mutex_unlock(lock: &gt->perf.lock);
3778
3779	/ Release the reference the perf stream kept on the driver. /
3780	drm_dev_put(dev: &perf->i915->drm);
3781
3782	return `0`;
3783	}
3784
3785
3786	static const struct file_operations fops = {
3787	.owner = THIS_MODULE,
3788	.llseek = no_llseek,
3789	.release = i915_perf_release,
3790	.poll = i915_perf_poll,
3791	.read = i915_perf_read,
3792	.unlocked_ioctl = i915_perf_ioctl,
3793	/ Our ioctl have no arguments, so it's safe to use the same function*
3794	* to handle 32bits compatibility.
3795	*/
3796	.compat_ioctl = i915_perf_ioctl,
3797	};
3798
3799
3800	/**
3801	* i915_perf_open_ioctl_locked - DRM ioctl() for userspace to open a stream FD
3802	* @perf: i915 perf instance
3803	* @param: The open parameters passed to 'DRM_I915_PERF_OPEN`
3804	* @props: individually validated u64 property value pairs
3805	* @file: drm file
3806	*
3807	* See i915_perf_ioctl_open() for interface details.
3808	*
3809	* Implements further stream config validation and stream initialization on
3810	* behalf of i915_perf_open_ioctl() with the &gt->perf.lock mutex
3811	* taken to serialize with any non-file-operation driver hooks.
3812	*
3813	* Note: at this point the @props have only been validated in isolation and
3814	* it's still necessary to validate that the combination of properties makes
3815	* sense.
3816	*
3817	* In the case where userspace is interested in OA unit metrics then further
3818	* config validation and stream initialization details will be handled by
3819	* i915_oa_stream_init(). The code here should only validate config state that
3820	* will be relevant to all stream types / backends.
3821	*
3822	* Returns: zero on success or a negative error code.
3823	*/
3824	static int
3825	i915_perf_open_ioctl_locked(struct i915_perf *perf,
3826	struct drm_i915_perf_open_param *param,
3827	struct perf_open_properties *props,
3828	struct drm_file *file)
3829	{
3830	struct i915_gem_context *specific_ctx = NULL;
3831	struct i915_perf_stream *stream = NULL;
3832	unsigned long f_flags = `0`;
3833	bool privileged_op = true;
3834	int stream_fd;
3835	int ret;
3836
3837	if (props->single_context) {
3838	u32 ctx_handle = props->ctx_handle;
3839	struct drm_i915_file_private *file_priv = file->driver_priv;
3840
3841	specific_ctx = i915_gem_context_lookup(file_priv, id: ctx_handle);
3842	if (IS_ERR(ptr: specific_ctx)) {
3843	drm_dbg(&perf->i915->drm,
3844	"Failed to look up context with ID %u for opening perf stream\n",
3845	ctx_handle);
3846	ret = PTR_ERR(ptr: specific_ctx);
3847	goto err;
3848	}
3849	}
3850
3851	/*
3852	* On Haswell the OA unit supports clock gating off for a specific
3853	* context and in this mode there's no visibility of metrics for the
3854	* rest of the system, which we consider acceptable for a
3855	* non-privileged client.
3856	*
3857	* For Gen8->11 the OA unit no longer supports clock gating off for a
3858	* specific context and the kernel can't securely stop the counters
3859	* from updating as system-wide / global values. Even though we can
3860	* filter reports based on the included context ID we can't block
3861	* clients from seeing the raw / global counter values via
3862	* MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
3863	* enable the OA unit by default.
3864	*
3865	* For Gen12+ we gain a new OAR unit that only monitors the RCS on a
3866	* per context basis. So we can relax requirements there if the user
3867	* doesn't request global stream access (i.e. query based sampling
3868	* using MI_RECORD_PERF_COUNT.
3869	*/
3870	if (IS_HASWELL(perf->i915) && specific_ctx)
3871	privileged_op = false;
3872	else if (GRAPHICS_VER(perf->i915) == `12` && specific_ctx &&
3873	(props->sample_flags & SAMPLE_OA_REPORT) == `0`)
3874	privileged_op = false;
3875
3876	if (props->hold_preemption) {
3877	if (!props->single_context) {
3878	drm_dbg(&perf->i915->drm,
3879	"preemption disable with no context\n");
3880	ret = -EINVAL;
3881	goto err;
3882	}
3883	privileged_op = true;
3884	}
3885
3886	/*
3887	* Asking for SSEU configuration is a priviliged operation.
3888	*/
3889	if (props->has_sseu)
3890	privileged_op = true;
3891	else
3892	get_default_sseu_config(out_sseu: &props->sseu, engine: props->engine);
3893
3894	/ Similar to perf's kernel.perf_paranoid_cpu sysctl option*
3895	* we check a dev.i915.perf_stream_paranoid sysctl option
3896	* to determine if it's ok to access system wide OA counters
3897	* without CAP_PERFMON or CAP_SYS_ADMIN privileges.
3898	*/
3899	if (privileged_op &&
3900	i915_perf_stream_paranoid && !perfmon_capable()) {
3901	drm_dbg(&perf->i915->drm,
3902	"Insufficient privileges to open i915 perf stream\n");
3903	ret = -EACCES;
3904	goto err_ctx;
3905	}
3906
3907	stream = kzalloc(size: sizeof(*stream), GFP_KERNEL);
3908	if (!stream) {
3909	ret = -ENOMEM;
3910	goto err_ctx;
3911	}
3912
3913	stream->perf = perf;
3914	stream->ctx = specific_ctx;
3915	stream->poll_oa_period = props->poll_oa_period;
3916
3917	ret = i915_oa_stream_init(stream, param, props);
3918	if (ret)
3919	goto err_alloc;
3920
3921	/ we avoid simply assigning stream->sample_flags = props->sample_flags*
3922	* to have _stream_init check the combination of sample flags more
3923	* thoroughly, but still this is the expected result at this point.
3924	*/
3925	if (WARN_ON(stream->sample_flags != props->sample_flags)) {
3926	ret = -ENODEV;
3927	goto err_flags;
3928	}
3929
3930	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
3931	f_flags \|= O_CLOEXEC;
3932	if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
3933	f_flags \|= O_NONBLOCK;
3934
3935	stream_fd = anon_inode_getfd(name: "[i915_perf]", fops: &fops, priv: stream, flags: f_flags);
3936	if (stream_fd < `0`) {
3937	ret = stream_fd;
3938	goto err_flags;
3939	}
3940
3941	if (!(param->flags & I915_PERF_FLAG_DISABLED))
3942	i915_perf_enable_locked(stream);
3943
3944	/ Take a reference on the driver that will be kept with stream_fd*
3945	* until its release.
3946	*/
3947	drm_dev_get(dev: &perf->i915->drm);
3948
3949	return stream_fd;
3950
3951	err_flags:
3952	if (stream->ops->destroy)
3953	stream->ops->destroy(stream);
3954	err_alloc:
3955	kfree(objp: stream);
3956	err_ctx:
3957	if (specific_ctx)
3958	i915_gem_context_put(ctx: specific_ctx);
3959	err:
3960	return ret;
3961	}
3962
3963	static u64 oa_exponent_to_ns(struct i915_perf perf, int* exponent)
3964	{
3965	u64 nom = (`2ULL` << exponent) * NSEC_PER_SEC;
3966	u32 den = i915_perf_oa_timestamp_frequency(i915: perf->i915);
3967
3968	return div_u64(dividend: nom + den - `1`, divisor: den);
3969	}
3970
3971	static __always_inline bool
3972	oa_format_valid(struct i915_perf perf, enum* drm_i915_oa_format format)
3973	{
3974	return test_bit(format, perf->format_mask);
3975	}
3976
3977	static __always_inline void
3978	oa_format_add(struct i915_perf perf, enum* drm_i915_oa_format format)
3979	{
3980	__set_bit(format, perf->format_mask);
3981	}
3982
3983	/**
3984	* read_properties_unlocked - validate + copy userspace stream open properties
3985	* @perf: i915 perf instance
3986	* @uprops: The array of u64 key value pairs given by userspace
3987	* @n_props: The number of key value pairs expected in @uprops
3988	* @props: The stream configuration built up while validating properties
3989	*
3990	* Note this function only validates properties in isolation it doesn't
3991	* validate that the combination of properties makes sense or that all
3992	* properties necessary for a particular kind of stream have been set.
3993	*
3994	* Note that there currently aren't any ordering requirements for properties so
3995	* we shouldn't validate or assume anything about ordering here. This doesn't
3996	* rule out defining new properties with ordering requirements in the future.
3997	*/
3998	static int read_properties_unlocked(struct i915_perf *perf,
3999	u64 __user *uprops,
4000	u32 n_props,
4001	struct perf_open_properties *props)
4002	{
4003	struct drm_i915_gem_context_param_sseu user_sseu;
4004	const struct i915_oa_format *f;
4005	u64 __user *uprop = uprops;
4006	bool config_instance = false;
4007	bool config_class = false;
4008	bool config_sseu = false;
4009	u8 class, instance;
4010	u32 i;
4011	int ret;
4012
4013	memset(props, `0`, sizeof(struct perf_open_properties));
4014	props->poll_oa_period = DEFAULT_POLL_PERIOD_NS;
4015
4016	/ Considering that ID = 0 is reserved and assuming that we don't*
4017	* (currently) expect any configurations to ever specify duplicate
4018	* values for a particular property ID then the last _PROP_MAX value is
4019	* one greater than the maximum number of properties we expect to get
4020	* from userspace.
4021	*/
4022	if (!n_props \|\| n_props >= DRM_I915_PERF_PROP_MAX) {
4023	drm_dbg(&perf->i915->drm,
4024	"Invalid number of i915 perf properties given\n");
4025	return -EINVAL;
4026	}
4027
4028	/ Defaults when class:instance is not passed /
4029	class = I915_ENGINE_CLASS_RENDER;
4030	instance = `0`;
4031
4032	for (i = `0`; i < n_props; i++) {
4033	u64 oa_period, oa_freq_hz;
4034	u64 id, value;
4035
4036	ret = get_user(id, uprop);
4037	if (ret)
4038	return ret;
4039
4040	ret = get_user(value, uprop + `1`);
4041	if (ret)
4042	return ret;
4043
4044	if (id == `0` \|\| id >= DRM_I915_PERF_PROP_MAX) {
4045	drm_dbg(&perf->i915->drm,
4046	"Unknown i915 perf property ID\n");
4047	return -EINVAL;
4048	}
4049
4050	switch ((enum drm_i915_perf_property_id)id) {
4051	case DRM_I915_PERF_PROP_CTX_HANDLE:
4052	props->single_context = `1`;
4053	props->ctx_handle = value;
4054	break;
4055	case DRM_I915_PERF_PROP_SAMPLE_OA:
4056	if (value)
4057	props->sample_flags \|= SAMPLE_OA_REPORT;
4058	break;
4059	case DRM_I915_PERF_PROP_OA_METRICS_SET:
4060	if (value == `0`) {
4061	drm_dbg(&perf->i915->drm,
4062	"Unknown OA metric set ID\n");
4063	return -EINVAL;
4064	}
4065	props->metrics_set = value;
4066	break;
4067	case DRM_I915_PERF_PROP_OA_FORMAT:
4068	if (value == `0` \|\| value >= I915_OA_FORMAT_MAX) {
4069	drm_dbg(&perf->i915->drm,
4070	"Out-of-range OA report format %llu\n",
4071	value);
4072	return -EINVAL;
4073	}
4074	if (!oa_format_valid(perf, format: value)) {
4075	drm_dbg(&perf->i915->drm,
4076	"Unsupported OA report format %llu\n",
4077	value);
4078	return -EINVAL;
4079	}
4080	props->oa_format = value;
4081	break;
4082	case DRM_I915_PERF_PROP_OA_EXPONENT:
4083	if (value > OA_EXPONENT_MAX) {
4084	drm_dbg(&perf->i915->drm,
4085	"OA timer exponent too high (> %u)\n",
4086	OA_EXPONENT_MAX);
4087	return -EINVAL;
4088	}
4089
4090	/ Theoretically we can program the OA unit to sample*
4091	* e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns
4092	* for BXT. We don't allow such high sampling
4093	* frequencies by default unless root.
4094	*/
4095
4096	BUILD_BUG_ON(sizeof(oa_period) != `8`);
4097	oa_period = oa_exponent_to_ns(perf, exponent: value);
4098
4099	/ This check is primarily to ensure that oa_period <=*
4100	* UINT32_MAX (before passing to do_div which only
4101	* accepts a u32 denominator), but we can also skip
4102	* checking anything < 1Hz which implicitly can't be
4103	* limited via an integer oa_max_sample_rate.
4104	*/
4105	if (oa_period <= NSEC_PER_SEC) {
4106	u64 tmp = NSEC_PER_SEC;
4107	do_div(tmp, oa_period);
4108	oa_freq_hz = tmp;
4109	} else
4110	oa_freq_hz = `0`;
4111
4112	if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) {
4113	drm_dbg(&perf->i915->drm,
4114	"OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n",
4115	i915_oa_max_sample_rate);
4116	return -EACCES;
4117	}
4118
4119	props->oa_periodic = true;
4120	props->oa_period_exponent = value;
4121	break;
4122	case DRM_I915_PERF_PROP_HOLD_PREEMPTION:
4123	props->hold_preemption = !!value;
4124	break;
4125	case DRM_I915_PERF_PROP_GLOBAL_SSEU: {
4126	if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(`12`, `50`)) {
4127	drm_dbg(&perf->i915->drm,
4128	"SSEU config not supported on gfx %x\n",
4129	GRAPHICS_VER_FULL(perf->i915));
4130	return -ENODEV;
4131	}
4132
4133	if (copy_from_user(to: &user_sseu,
4134	u64_to_user_ptr(value),
4135	n: sizeof(user_sseu))) {
4136	drm_dbg(&perf->i915->drm,
4137	"Unable to copy global sseu parameter\n");
4138	return -EFAULT;
4139	}
4140	config_sseu = true;
4141	break;
4142	}
4143	case DRM_I915_PERF_PROP_POLL_OA_PERIOD:
4144	if (value < `100000` / 100us /) {
4145	drm_dbg(&perf->i915->drm,
4146	"OA availability timer too small (%lluns < 100us)\n",
4147	value);
4148	return -EINVAL;
4149	}
4150	props->poll_oa_period = value;
4151	break;
4152	case DRM_I915_PERF_PROP_OA_ENGINE_CLASS:
4153	class = (u8)value;
4154	config_class = true;
4155	break;
4156	case DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE:
4157	instance = (u8)value;
4158	config_instance = true;
4159	break;
4160	default:
4161	MISSING_CASE(id);
4162	return -EINVAL;
4163	}
4164
4165	uprop += `2`;
4166	}
4167
4168	if ((config_class && !config_instance) \|\|
4169	(config_instance && !config_class)) {
4170	drm_dbg(&perf->i915->drm,
4171	"OA engine-class and engine-instance parameters must be passed together\n");
4172	return -EINVAL;
4173	}
4174
4175	props->engine = intel_engine_lookup_user(i915: perf->i915, class, instance);
4176	if (!props->engine) {
4177	drm_dbg(&perf->i915->drm,
4178	"OA engine class and instance invalid %d:%d\n",
4179	class, instance);
4180	return -EINVAL;
4181	}
4182
4183	if (!engine_supports_oa(engine: props->engine)) {
4184	drm_dbg(&perf->i915->drm,
4185	"Engine not supported by OA %d:%d\n",
4186	class, instance);
4187	return -EINVAL;
4188	}
4189
4190	/*
4191	* Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media
4192	* C6 disable in BIOS. Fail if Media C6 is enabled on steppings where OAM
4193	* does not work as expected.
4194	*/
4195	if (IS_MEDIA_GT_IP_STEP(props->engine->gt, IP_VER(`13`, `0`), STEP_A0, STEP_C0) &&
4196	props->engine->oa_group->type == TYPE_OAM &&
4197	intel_check_bios_c6_setup(rc6: &props->engine->gt->rc6)) {
4198	drm_dbg(&perf->i915->drm,
4199	"OAM requires media C6 to be disabled in BIOS\n");
4200	return -EINVAL;
4201	}
4202
4203	i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX);
4204	f = &perf->oa_formats[i];
4205	if (!engine_supports_oa_format(engine: props->engine, type: f->type)) {
4206	drm_dbg(&perf->i915->drm,
4207	"Invalid OA format %d for class %d\n",
4208	f->type, props->engine->class);
4209	return -EINVAL;
4210	}
4211
4212	if (config_sseu) {
4213	ret = get_sseu_config(out_sseu: &props->sseu, engine: props->engine, drm_sseu: &user_sseu);
4214	if (ret) {
4215	drm_dbg(&perf->i915->drm,
4216	"Invalid SSEU configuration\n");
4217	return ret;
4218	}
4219	props->has_sseu = true;
4220	}
4221
4222	return `0`;
4223	}
4224
4225	/**
4226	* i915_perf_open_ioctl - DRM ioctl() for userspace to open a stream FD
4227	* @dev: drm device
4228	* @data: ioctl data copied from userspace (unvalidated)
4229	* @file: drm file
4230	*
4231	* Validates the stream open parameters given by userspace including flags
4232	* and an array of u64 key, value pair properties.
4233	*
4234	* Very little is assumed up front about the nature of the stream being
4235	* opened (for instance we don't assume it's for periodic OA unit metrics). An
4236	* i915-perf stream is expected to be a suitable interface for other forms of
4237	* buffered data written by the GPU besides periodic OA metrics.
4238	*
4239	* Note we copy the properties from userspace outside of the i915 perf
4240	* mutex to avoid an awkward lockdep with mmap_lock.
4241	*
4242	* Most of the implementation details are handled by
4243	* i915_perf_open_ioctl_locked() after taking the &gt->perf.lock
4244	* mutex for serializing with any non-file-operation driver hooks.
4245	*
4246	* Return: A newly opened i915 Perf stream file descriptor or negative
4247	* error code on failure.
4248	*/
4249	int i915_perf_open_ioctl(struct drm_device dev, void* *data,
4250	struct drm_file *file)
4251	{
4252	struct i915_perf *perf = &to_i915(dev)->perf;
4253	struct drm_i915_perf_open_param *param = data;
4254	struct intel_gt *gt;
4255	struct perf_open_properties props;
4256	u32 known_open_flags;
4257	int ret;
4258
4259	if (!perf->i915)
4260	return -ENOTSUPP;
4261
4262	known_open_flags = I915_PERF_FLAG_FD_CLOEXEC \|
4263	I915_PERF_FLAG_FD_NONBLOCK \|
4264	I915_PERF_FLAG_DISABLED;
4265	if (param->flags & ~known_open_flags) {
4266	drm_dbg(&perf->i915->drm,
4267	"Unknown drm_i915_perf_open_param flag\n");
4268	return -EINVAL;
4269	}
4270
4271	ret = read_properties_unlocked(perf,
4272	u64_to_user_ptr(param->properties_ptr),
4273	n_props: param->num_properties,
4274	props: &props);
4275	if (ret)
4276	return ret;
4277
4278	gt = props.engine->gt;
4279
4280	mutex_lock(&gt->perf.lock);
4281	ret = i915_perf_open_ioctl_locked(perf, param, props: &props, file);
4282	mutex_unlock(lock: &gt->perf.lock);
4283
4284	return ret;
4285	}
4286
4287	/**
4288	* i915_perf_register - exposes i915-perf to userspace
4289	* @i915: i915 device instance
4290	*
4291	* In particular OA metric sets are advertised under a sysfs metrics/
4292	* directory allowing userspace to enumerate valid IDs that can be
4293	* used to open an i915-perf stream.
4294	*/
4295	void i915_perf_register(struct drm_i915_private *i915)
4296	{
4297	struct i915_perf *perf = &i915->perf;
4298	struct intel_gt *gt = to_gt(i915);
4299
4300	if (!perf->i915)
4301	return;
4302
4303	/ To be sure we're synchronized with an attempted*
4304	* i915_perf_open_ioctl(); considering that we register after
4305	* being exposed to userspace.
4306	*/
4307	mutex_lock(&gt->perf.lock);
4308
4309	perf->metrics_kobj =
4310	kobject_create_and_add(name: "metrics",
4311	parent: &i915->drm.primary->kdev->kobj);
4312
4313	mutex_unlock(lock: &gt->perf.lock);
4314	}
4315
4316	/**
4317	* i915_perf_unregister - hide i915-perf from userspace
4318	* @i915: i915 device instance
4319	*
4320	* i915-perf state cleanup is split up into an 'unregister' and
4321	* 'deinit' phase where the interface is first hidden from
4322	* userspace by i915_perf_unregister() before cleaning up
4323	* remaining state in i915_perf_fini().
4324	*/
4325	void i915_perf_unregister(struct drm_i915_private *i915)
4326	{
4327	struct i915_perf *perf = &i915->perf;
4328
4329	if (!perf->metrics_kobj)
4330	return;
4331
4332	kobject_put(kobj: perf->metrics_kobj);
4333	perf->metrics_kobj = NULL;
4334	}
4335
4336	static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr)
4337	{
4338	static const i915_reg_t flex_eu_regs[] = {
4339	EU_PERF_CNTL0,
4340	EU_PERF_CNTL1,
4341	EU_PERF_CNTL2,
4342	EU_PERF_CNTL3,
4343	EU_PERF_CNTL4,
4344	EU_PERF_CNTL5,
4345	EU_PERF_CNTL6,
4346	};
4347	int i;
4348
4349	for (i = `0`; i < ARRAY_SIZE(flex_eu_regs); i++) {
4350	if (i915_mmio_reg_offset(flex_eu_regs[i]) == addr)
4351	return true;
4352	}
4353	return false;
4354	}
4355
4356	static bool reg_in_range_table(u32 addr, const struct i915_range *table)
4357	{
4358	while (table->start \|\| table->end) {
4359	if (addr >= table->start && addr <= table->end)
4360	return true;
4361
4362	table++;
4363	}
4364
4365	return false;
4366	}
4367
4368	#define REG_EQUAL(addr, mmio) \
4369	((addr) == i915_mmio_reg_offset(mmio))
4370
4371	static const struct i915_range gen7_oa_b_counters[] = {
4372	{ .start = `0x2710`, .end = `0x272c` }, / OASTARTTRIG[1-8] /
4373	{ .start = `0x2740`, .end = `0x275c` }, / OAREPORTTRIG[1-8] /
4374	{ .start = `0x2770`, .end = `0x27ac` }, / OACEC[0-7][0-1] /
4375	{}
4376	};
4377
4378	static const struct i915_range gen12_oa_b_counters[] = {
4379	{ .start = `0x2b2c`, .end = `0x2b2c` }, / GEN12_OAG_OA_PESS /
4380	{ .start = `0xd900`, .end = `0xd91c` }, / GEN12_OAG_OASTARTTRIG[1-8] /
4381	{ .start = `0xd920`, .end = `0xd93c` }, / GEN12_OAG_OAREPORTTRIG1[1-8] /
4382	{ .start = `0xd940`, .end = `0xd97c` }, / GEN12_OAG_CEC[0-7][0-1] /
4383	{ .start = `0xdc00`, .end = `0xdc3c` }, / GEN12_OAG_SCEC[0-7][0-1] /
4384	{ .start = `0xdc40`, .end = `0xdc40` }, / GEN12_OAG_SPCTR_CNF /
4385	{ .start = `0xdc44`, .end = `0xdc44` }, / GEN12_OAA_DBG_REG /
4386	{}
4387	};
4388
4389	static const struct i915_range mtl_oam_b_counters[] = {
4390	{ .start = `0x393000`, .end = `0x39301c` }, / GEN12_OAM_STARTTRIG1[1-8] /
4391	{ .start = `0x393020`, .end = `0x39303c` }, / GEN12_OAM_REPORTTRIG1[1-8] /
4392	{ .start = `0x393040`, .end = `0x39307c` }, / GEN12_OAM_CEC[0-7][0-1] /
4393	{ .start = `0x393200`, .end = `0x39323C` }, / MPES[0-7] /
4394	{}
4395	};
4396
4397	static const struct i915_range xehp_oa_b_counters[] = {
4398	{ .start = `0xdc48`, .end = `0xdc48` }, / OAA_ENABLE_REG /
4399	{ .start = `0xdd00`, .end = `0xdd48` }, / OAG_LCE0_0 - OAA_LENABLE_REG /
4400	{}
4401	};
4402
4403	static const struct i915_range gen7_oa_mux_regs[] = {
4404	{ .start = `0x91b8`, .end = `0x91cc` }, / OA_PERFCNT[1-2], OA_PERFMATRIX /
4405	{ .start = `0x9800`, .end = `0x9888` }, / MICRO_BP0_0 - NOA_WRITE /
4406	{ .start = `0xe180`, .end = `0xe180` }, / HALF_SLICE_CHICKEN2 /
4407	{}
4408	};
4409
4410	static const struct i915_range hsw_oa_mux_regs[] = {
4411	{ .start = `0x09e80`, .end = `0x09ea4` }, / HSW_MBVID2_NOA[0-9] /
4412	{ .start = `0x09ec0`, .end = `0x09ec0` }, / HSW_MBVID2_MISR0 /
4413	{ .start = `0x25100`, .end = `0x2ff90` },
4414	{}
4415	};
4416
4417	static const struct i915_range chv_oa_mux_regs[] = {
4418	{ .start = `0x182300`, .end = `0x1823a4` },
4419	{}
4420	};
4421
4422	static const struct i915_range gen8_oa_mux_regs[] = {
4423	{ .start = `0x0d00`, .end = `0x0d2c` }, / RPM_CONFIG[0-1], NOA_CONFIG[0-8] /
4424	{ .start = `0x20cc`, .end = `0x20cc` }, / WAIT_FOR_RC6_EXIT /
4425	{}
4426	};
4427
4428	static const struct i915_range gen11_oa_mux_regs[] = {
4429	{ .start = `0x91c8`, .end = `0x91dc` }, / OA_PERFCNT[3-4] /
4430	{}
4431	};
4432
4433	static const struct i915_range gen12_oa_mux_regs[] = {
4434	{ .start = `0x0d00`, .end = `0x0d04` }, / RPM_CONFIG[0-1] /
4435	{ .start = `0x0d0c`, .end = `0x0d2c` }, / NOA_CONFIG[0-8] /
4436	{ .start = `0x9840`, .end = `0x9840` }, / GDT_CHICKEN_BITS /
4437	{ .start = `0x9884`, .end = `0x9888` }, / NOA_WRITE /
4438	{ .start = `0x20cc`, .end = `0x20cc` }, / WAIT_FOR_RC6_EXIT /
4439	{}
4440	};
4441
4442	/*
4443	* Ref: 14010536224:
4444	* 0x20cc is repurposed on MTL, so use a separate array for MTL.
4445	*/
4446	static const struct i915_range mtl_oa_mux_regs[] = {
4447	{ .start = `0x0d00`, .end = `0x0d04` }, / RPM_CONFIG[0-1] /
4448	{ .start = `0x0d0c`, .end = `0x0d2c` }, / NOA_CONFIG[0-8] /
4449	{ .start = `0x9840`, .end = `0x9840` }, / GDT_CHICKEN_BITS /
4450	{ .start = `0x9884`, .end = `0x9888` }, / NOA_WRITE /
4451	{ .start = `0x38d100`, .end = `0x38d114`}, / VISACTL /
4452	{}
4453	};
4454
4455	static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
4456	{
4457	return reg_in_range_table(addr, table: gen7_oa_b_counters);
4458	}
4459
4460	static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
4461	{
4462	return reg_in_range_table(addr, table: gen7_oa_mux_regs) \|\|
4463	reg_in_range_table(addr, table: gen8_oa_mux_regs);
4464	}
4465
4466	static bool gen11_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
4467	{
4468	return reg_in_range_table(addr, table: gen7_oa_mux_regs) \|\|
4469	reg_in_range_table(addr, table: gen8_oa_mux_regs) \|\|
4470	reg_in_range_table(addr, table: gen11_oa_mux_regs);
4471	}
4472
4473	static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
4474	{
4475	return reg_in_range_table(addr, table: gen7_oa_mux_regs) \|\|
4476	reg_in_range_table(addr, table: hsw_oa_mux_regs);
4477	}
4478
4479	static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
4480	{
4481	return reg_in_range_table(addr, table: gen7_oa_mux_regs) \|\|
4482	reg_in_range_table(addr, table: chv_oa_mux_regs);
4483	}
4484
4485	static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
4486	{
4487	return reg_in_range_table(addr, table: gen12_oa_b_counters);
4488	}
4489
4490	static bool mtl_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
4491	{
4492	if (HAS_OAM(perf->i915) &&
4493	GRAPHICS_VER_FULL(perf->i915) >= IP_VER(`12`, `70`))
4494	return reg_in_range_table(addr, table: mtl_oam_b_counters);
4495
4496	return false;
4497	}
4498
4499	static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
4500	{
4501	return reg_in_range_table(addr, table: xehp_oa_b_counters) \|\|
4502	reg_in_range_table(addr, table: gen12_oa_b_counters) \|\|
4503	mtl_is_valid_oam_b_counter_addr(perf, addr);
4504	}
4505
4506	static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
4507	{
4508	if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(`12`, `70`))
4509	return reg_in_range_table(addr, table: mtl_oa_mux_regs);
4510	else
4511	return reg_in_range_table(addr, table: gen12_oa_mux_regs);
4512	}
4513
4514	static u32 mask_reg_value(u32 reg, u32 val)
4515	{
4516	/ HALF_SLICE_CHICKEN2 is programmed with a the*
4517	* WaDisableSTUnitPowerOptimization workaround. Make sure the value
4518	* programmed by userspace doesn't change this.
4519	*/
4520	if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
4521	val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
4522
4523	/ WAIT_FOR_RC6_EXIT has only one bit fullfilling the function*
4524	* indicated by its name and a bunch of selection fields used by OA
4525	* configs.
4526	*/
4527	if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT))
4528	val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
4529
4530	return val;
4531	}
4532
4533	static struct i915_oa_reg alloc_oa_regs(struct* i915_perf *perf,
4534	bool (is_valid)(struct* i915_perf *perf, u32 addr),
4535	u32 __user *regs,
4536	u32 n_regs)
4537	{
4538	struct i915_oa_reg *oa_regs;
4539	int err;
4540	u32 i;
4541
4542	if (!n_regs)
4543	return NULL;
4544
4545	/ No is_valid function means we're not allowing any register to be programmed. /
4546	GEM_BUG_ON(!is_valid);
4547	if (!is_valid)
4548	return ERR_PTR(error: -EINVAL);
4549
4550	oa_regs = kmalloc_array(n: n_regs, size: sizeof(*oa_regs), GFP_KERNEL);
4551	if (!oa_regs)
4552	return ERR_PTR(error: -ENOMEM);
4553
4554	for (i = `0`; i < n_regs; i++) {
4555	u32 addr, value;
4556
4557	err = get_user(addr, regs);
4558	if (err)
4559	goto addr_err;
4560
4561	if (!is_valid(perf, addr)) {
4562	drm_dbg(&perf->i915->drm,
4563	"Invalid oa_reg address: %X\n", addr);
4564	err = -EINVAL;
4565	goto addr_err;
4566	}
4567
4568	err = get_user(value, regs + `1`);
4569	if (err)
4570	goto addr_err;
4571
4572	oa_regs[i].addr = _MMIO(addr);
4573	oa_regs[i].value = mask_reg_value(reg: addr, val: value);
4574
4575	regs += `2`;
4576	}
4577
4578	return oa_regs;
4579
4580	addr_err:
4581	kfree(objp: oa_regs);
4582	return ERR_PTR(error: err);
4583	}
4584
4585	static ssize_t show_dynamic_id(struct kobject *kobj,
4586	struct kobj_attribute *attr,
4587	char *buf)
4588	{
4589	struct i915_oa_config *oa_config =
4590	container_of(attr, typeof(*oa_config), sysfs_metric_id);
4591
4592	return sprintf(buf, fmt: "%d\n", oa_config->id);
4593	}
4594
4595	static int create_dynamic_oa_sysfs_entry(struct i915_perf *perf,
4596	struct i915_oa_config *oa_config)
4597	{
4598	sysfs_attr_init(&oa_config->sysfs_metric_id.attr);
4599	oa_config->sysfs_metric_id.attr.name = "id";
4600	oa_config->sysfs_metric_id.attr.mode = S_IRUGO;
4601	oa_config->sysfs_metric_id.show = show_dynamic_id;
4602	oa_config->sysfs_metric_id.store = NULL;
4603
4604	oa_config->attrs[`0`] = &oa_config->sysfs_metric_id.attr;
4605	oa_config->attrs[`1`] = NULL;
4606
4607	oa_config->sysfs_metric.name = oa_config->uuid;
4608	oa_config->sysfs_metric.attrs = oa_config->attrs;
4609
4610	return sysfs_create_group(kobj: perf->metrics_kobj,
4611	grp: &oa_config->sysfs_metric);
4612	}
4613
4614	/**
4615	* i915_perf_add_config_ioctl - DRM ioctl() for userspace to add a new OA config
4616	* @dev: drm device
4617	* @data: ioctl data (pointer to struct drm_i915_perf_oa_config) copied from
4618	* userspace (unvalidated)
4619	* @file: drm file
4620	*
4621	* Validates the submitted OA register to be saved into a new OA config that
4622	* can then be used for programming the OA unit and its NOA network.
4623	*
4624	* Returns: A new allocated config number to be used with the perf open ioctl
4625	* or a negative error code on failure.
4626	*/
4627	int i915_perf_add_config_ioctl(struct drm_device dev, void* *data,
4628	struct drm_file *file)
4629	{
4630	struct i915_perf *perf = &to_i915(dev)->perf;
4631	struct drm_i915_perf_oa_config *args = data;
4632	struct i915_oa_config oa_config, tmp;
4633	struct i915_oa_reg *regs;
4634	int err, id;
4635
4636	if (!perf->i915)
4637	return -ENOTSUPP;
4638
4639	if (!perf->metrics_kobj) {
4640	drm_dbg(&perf->i915->drm,
4641	"OA metrics weren't advertised via sysfs\n");
4642	return -EINVAL;
4643	}
4644
4645	if (i915_perf_stream_paranoid && !perfmon_capable()) {
4646	drm_dbg(&perf->i915->drm,
4647	"Insufficient privileges to add i915 OA config\n");
4648	return -EACCES;
4649	}
4650
4651	if ((!args->mux_regs_ptr \|\| !args->n_mux_regs) &&
4652	(!args->boolean_regs_ptr \|\| !args->n_boolean_regs) &&
4653	(!args->flex_regs_ptr \|\| !args->n_flex_regs)) {
4654	drm_dbg(&perf->i915->drm,
4655	"No OA registers given\n");
4656	return -EINVAL;
4657	}
4658
4659	oa_config = kzalloc(size: sizeof(*oa_config), GFP_KERNEL);
4660	if (!oa_config) {
4661	drm_dbg(&perf->i915->drm,
4662	"Failed to allocate memory for the OA config\n");
4663	return -ENOMEM;
4664	}
4665
4666	oa_config->perf = perf;
4667	kref_init(kref: &oa_config->ref);
4668
4669	if (!uuid_is_valid(uuid: args->uuid)) {
4670	drm_dbg(&perf->i915->drm,
4671	"Invalid uuid format for OA config\n");
4672	err = -EINVAL;
4673	goto reg_err;
4674	}
4675
4676	/ Last character in oa_config->uuid will be 0 because oa_config is*
4677	* kzalloc.
4678	*/
4679	memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid));
4680
4681	oa_config->mux_regs_len = args->n_mux_regs;
4682	regs = alloc_oa_regs(perf,
4683	is_valid: perf->ops.is_valid_mux_reg,
4684	u64_to_user_ptr(args->mux_regs_ptr),
4685	n_regs: args->n_mux_regs);
4686
4687	if (IS_ERR(ptr: regs)) {
4688	drm_dbg(&perf->i915->drm,
4689	"Failed to create OA config for mux_regs\n");
4690	err = PTR_ERR(ptr: regs);
4691	goto reg_err;
4692	}
4693	oa_config->mux_regs = regs;
4694
4695	oa_config->b_counter_regs_len = args->n_boolean_regs;
4696	regs = alloc_oa_regs(perf,
4697	is_valid: perf->ops.is_valid_b_counter_reg,
4698	u64_to_user_ptr(args->boolean_regs_ptr),
4699	n_regs: args->n_boolean_regs);
4700
4701	if (IS_ERR(ptr: regs)) {
4702	drm_dbg(&perf->i915->drm,
4703	"Failed to create OA config for b_counter_regs\n");
4704	err = PTR_ERR(ptr: regs);
4705	goto reg_err;
4706	}
4707	oa_config->b_counter_regs = regs;
4708
4709	if (GRAPHICS_VER(perf->i915) < `8`) {
4710	if (args->n_flex_regs != `0`) {
4711	err = -EINVAL;
4712	goto reg_err;
4713	}
4714	} else {
4715	oa_config->flex_regs_len = args->n_flex_regs;
4716	regs = alloc_oa_regs(perf,
4717	is_valid: perf->ops.is_valid_flex_reg,
4718	u64_to_user_ptr(args->flex_regs_ptr),
4719	n_regs: args->n_flex_regs);
4720
4721	if (IS_ERR(ptr: regs)) {
4722	drm_dbg(&perf->i915->drm,
4723	"Failed to create OA config for flex_regs\n");
4724	err = PTR_ERR(ptr: regs);
4725	goto reg_err;
4726	}
4727	oa_config->flex_regs = regs;
4728	}
4729
4730	err = mutex_lock_interruptible(&perf->metrics_lock);
4731	if (err)
4732	goto reg_err;
4733
4734	/ We shouldn't have too many configs, so this iteration shouldn't be*
4735	* too costly.
4736	*/
4737	idr_for_each_entry(&perf->metrics_idr, tmp, id) {
4738	if (!strcmp(tmp->uuid, oa_config->uuid)) {
4739	drm_dbg(&perf->i915->drm,
4740	"OA config already exists with this uuid\n");
4741	err = -EADDRINUSE;
4742	goto sysfs_err;
4743	}
4744	}
4745
4746	err = create_dynamic_oa_sysfs_entry(perf, oa_config);
4747	if (err) {
4748	drm_dbg(&perf->i915->drm,
4749	"Failed to create sysfs entry for OA config\n");
4750	goto sysfs_err;
4751	}
4752
4753	/ Config id 0 is invalid, id 1 for kernel stored test config. /
4754	oa_config->id = idr_alloc(&perf->metrics_idr,
4755	ptr: oa_config, start: `2`,
4756	end: `0`, GFP_KERNEL);
4757	if (oa_config->id < `0`) {
4758	drm_dbg(&perf->i915->drm,
4759	"Failed to create sysfs entry for OA config\n");
4760	err = oa_config->id;
4761	goto sysfs_err;
4762	}
4763	id = oa_config->id;
4764
4765	drm_dbg(&perf->i915->drm,
4766	"Added config %s id=%i\n", oa_config->uuid, oa_config->id);
4767	mutex_unlock(lock: &perf->metrics_lock);
4768
4769	return id;
4770
4771	sysfs_err:
4772	mutex_unlock(lock: &perf->metrics_lock);
4773	reg_err:
4774	i915_oa_config_put(oa_config);
4775	drm_dbg(&perf->i915->drm,
4776	"Failed to add new OA config\n");
4777	return err;
4778	}
4779
4780	/**
4781	* i915_perf_remove_config_ioctl - DRM ioctl() for userspace to remove an OA config
4782	* @dev: drm device
4783	* @data: ioctl data (pointer to u64 integer) copied from userspace
4784	* @file: drm file
4785	*
4786	* Configs can be removed while being used, the will stop appearing in sysfs
4787	* and their content will be freed when the stream using the config is closed.
4788	*
4789	* Returns: 0 on success or a negative error code on failure.
4790	*/
4791	int i915_perf_remove_config_ioctl(struct drm_device dev, void* *data,
4792	struct drm_file *file)
4793	{
4794	struct i915_perf *perf = &to_i915(dev)->perf;
4795	u64 *arg = data;
4796	struct i915_oa_config *oa_config;
4797	int ret;
4798
4799	if (!perf->i915)
4800	return -ENOTSUPP;
4801
4802	if (i915_perf_stream_paranoid && !perfmon_capable()) {
4803	drm_dbg(&perf->i915->drm,
4804	"Insufficient privileges to remove i915 OA config\n");
4805	return -EACCES;
4806	}
4807
4808	ret = mutex_lock_interruptible(&perf->metrics_lock);
4809	if (ret)
4810	return ret;
4811
4812	oa_config = idr_find(&perf->metrics_idr, id: *arg);
4813	if (!oa_config) {
4814	drm_dbg(&perf->i915->drm,
4815	"Failed to remove unknown OA config\n");
4816	ret = -ENOENT;
4817	goto err_unlock;
4818	}
4819
4820	GEM_BUG_ON(*arg != oa_config->id);
4821
4822	sysfs_remove_group(kobj: perf->metrics_kobj, grp: &oa_config->sysfs_metric);
4823
4824	idr_remove(&perf->metrics_idr, id: *arg);
4825
4826	mutex_unlock(lock: &perf->metrics_lock);
4827
4828	drm_dbg(&perf->i915->drm,
4829	"Removed config %s id=%i\n", oa_config->uuid, oa_config->id);
4830
4831	i915_oa_config_put(oa_config);
4832
4833	return `0`;
4834
4835	err_unlock:
4836	mutex_unlock(lock: &perf->metrics_lock);
4837	return ret;
4838	}
4839
4840	static struct ctl_table oa_table[] = {
4841	{
4842	.procname = "perf_stream_paranoid",
4843	.data = &i915_perf_stream_paranoid,
4844	.maxlen = sizeof(i915_perf_stream_paranoid),
4845	.mode = `0644`,
4846	.proc_handler = proc_dointvec_minmax,
4847	.extra1 = SYSCTL_ZERO,
4848	.extra2 = SYSCTL_ONE,
4849	},
4850	{
4851	.procname = "oa_max_sample_rate",
4852	.data = &i915_oa_max_sample_rate,
4853	.maxlen = sizeof(i915_oa_max_sample_rate),
4854	.mode = `0644`,
4855	.proc_handler = proc_dointvec_minmax,
4856	.extra1 = SYSCTL_ZERO,
4857	.extra2 = &oa_sample_rate_hard_limit,
4858	},
4859	};
4860
4861	static u32 num_perf_groups_per_gt(struct intel_gt *gt)
4862	{
4863	return `1`;
4864	}
4865
4866	static u32 __oam_engine_group(struct intel_engine_cs *engine)
4867	{
4868	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `70`)) {
4869	/*
4870	* There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
4871	* within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
4872	*/
4873	drm_WARN_ON(&engine->i915->drm,
4874	engine->gt->type != GT_MEDIA);
4875
4876	return PERF_GROUP_OAM_SAMEDIA_0;
4877	}
4878
4879	return PERF_GROUP_INVALID;
4880	}
4881
4882	static u32 __oa_engine_group(struct intel_engine_cs *engine)
4883	{
4884	switch (engine->class) {
4885	case RENDER_CLASS:
4886	return PERF_GROUP_OAG;
4887
4888	case VIDEO_DECODE_CLASS:
4889	case VIDEO_ENHANCEMENT_CLASS:
4890	return __oam_engine_group(engine);
4891
4892	default:
4893	return PERF_GROUP_INVALID;
4894	}
4895	}
4896
4897	static struct i915_perf_regs __oam_regs(u32 base)
4898	{
4899	return (struct i915_perf_regs) {
4900	base,
4901	GEN12_OAM_HEAD_POINTER(base),
4902	GEN12_OAM_TAIL_POINTER(base),
4903	GEN12_OAM_BUFFER(base),
4904	GEN12_OAM_CONTEXT_CONTROL(base),
4905	GEN12_OAM_CONTROL(base),
4906	GEN12_OAM_DEBUG(base),
4907	GEN12_OAM_STATUS(base),
4908	GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
4909	};
4910	}
4911
4912	static struct i915_perf_regs __oag_regs(void)
4913	{
4914	return (struct i915_perf_regs) {
4915	`0`,
4916	GEN12_OAG_OAHEADPTR,
4917	GEN12_OAG_OATAILPTR,
4918	GEN12_OAG_OABUFFER,
4919	GEN12_OAG_OAGLBCTXCTRL,
4920	GEN12_OAG_OACONTROL,
4921	GEN12_OAG_OA_DEBUG,
4922	GEN12_OAG_OASTATUS,
4923	GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
4924	};
4925	}
4926
4927	static void oa_init_groups(struct intel_gt *gt)
4928	{
4929	int i, num_groups = gt->perf.num_perf_groups;
4930
4931	for (i = `0`; i < num_groups; i++) {
4932	struct i915_perf_group *g = &gt->perf.group[i];
4933
4934	/ Fused off engines can result in a group with num_engines == 0 /
4935	if (g->num_engines == `0`)
4936	continue;
4937
4938	if (i == PERF_GROUP_OAG && gt->type != GT_MEDIA) {
4939	g->regs = __oag_regs();
4940	g->type = TYPE_OAG;
4941	} else if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(`12`, `70`)) {
4942	g->regs = __oam_regs(base: mtl_oa_base[i]);
4943	g->type = TYPE_OAM;
4944	}
4945	}
4946	}
4947
4948	static int oa_init_gt(struct intel_gt *gt)
4949	{
4950	u32 num_groups = num_perf_groups_per_gt(gt);
4951	struct intel_engine_cs *engine;
4952	struct i915_perf_group *g;
4953	intel_engine_mask_t tmp;
4954
4955	g = kcalloc(n: num_groups, size: sizeof(*g), GFP_KERNEL);
4956	if (!g)
4957	return -ENOMEM;
4958
4959	for_each_engine_masked(engine, gt, ALL_ENGINES, tmp) {
4960	u32 index = __oa_engine_group(engine);
4961
4962	engine->oa_group = NULL;
4963	if (index < num_groups) {
4964	g[index].num_engines++;
4965	engine->oa_group = &g[index];
4966	}
4967	}
4968
4969	gt->perf.num_perf_groups = num_groups;
4970	gt->perf.group = g;
4971
4972	oa_init_groups(gt);
4973
4974	return `0`;
4975	}
4976
4977	static int oa_init_engine_groups(struct i915_perf *perf)
4978	{
4979	struct intel_gt *gt;
4980	int i, ret;
4981
4982	for_each_gt(gt, perf->i915, i) {
4983	ret = oa_init_gt(gt);
4984	if (ret)
4985	return ret;
4986	}
4987
4988	return `0`;
4989	}
4990
4991	static void oa_init_supported_formats(struct i915_perf *perf)
4992	{
4993	struct drm_i915_private *i915 = perf->i915;
4994	enum intel_platform platform = INTEL_INFO(i915)->platform;
4995
4996	switch (platform) {
4997	case INTEL_HASWELL:
4998	oa_format_add(perf, format: I915_OA_FORMAT_A13);
4999	oa_format_add(perf, format: I915_OA_FORMAT_A13);
5000	oa_format_add(perf, format: I915_OA_FORMAT_A29);
5001	oa_format_add(perf, format: I915_OA_FORMAT_A13_B8_C8);
5002	oa_format_add(perf, format: I915_OA_FORMAT_B4_C8);
5003	oa_format_add(perf, format: I915_OA_FORMAT_A45_B8_C8);
5004	oa_format_add(perf, format: I915_OA_FORMAT_B4_C8_A16);
5005	oa_format_add(perf, format: I915_OA_FORMAT_C4_B8);
5006	break;
5007
5008	case INTEL_BROADWELL:
5009	case INTEL_CHERRYVIEW:
5010	case INTEL_SKYLAKE:
5011	case INTEL_BROXTON:
5012	case INTEL_KABYLAKE:
5013	case INTEL_GEMINILAKE:
5014	case INTEL_COFFEELAKE:
5015	case INTEL_COMETLAKE:
5016	case INTEL_ICELAKE:
5017	case INTEL_ELKHARTLAKE:
5018	case INTEL_JASPERLAKE:
5019	case INTEL_TIGERLAKE:
5020	case INTEL_ROCKETLAKE:
5021	case INTEL_DG1:
5022	case INTEL_ALDERLAKE_S:
5023	case INTEL_ALDERLAKE_P:
5024	oa_format_add(perf, format: I915_OA_FORMAT_A12);
5025	oa_format_add(perf, format: I915_OA_FORMAT_A12_B8_C8);
5026	oa_format_add(perf, format: I915_OA_FORMAT_A32u40_A4u32_B8_C8);
5027	oa_format_add(perf, format: I915_OA_FORMAT_C4_B8);
5028	break;
5029
5030	case INTEL_DG2:
5031	oa_format_add(perf, format: I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
5032	oa_format_add(perf, format: I915_OA_FORMAT_A24u40_A14u32_B8_C8);
5033	break;
5034
5035	case INTEL_METEORLAKE:
5036	oa_format_add(perf, format: I915_OAR_FORMAT_A32u40_A4u32_B8_C8);
5037	oa_format_add(perf, format: I915_OA_FORMAT_A24u40_A14u32_B8_C8);
5038	oa_format_add(perf, format: I915_OAM_FORMAT_MPEC8u64_B8_C8);
5039	oa_format_add(perf, format: I915_OAM_FORMAT_MPEC8u32_B8_C8);
5040	break;
5041
5042	default:
5043	MISSING_CASE(platform);
5044	}
5045	}
5046
5047	static void i915_perf_init_info(struct drm_i915_private *i915)
5048	{
5049	struct i915_perf *perf = &i915->perf;
5050
5051	switch (GRAPHICS_VER(i915)) {
5052	case `8`:
5053	perf->ctx_oactxctrl_offset = `0x120`;
5054	perf->ctx_flexeu0_offset = `0x2ce`;
5055	perf->gen8_valid_ctx_bit = BIT(`25`);
5056	break;
5057	case `9`:
5058	perf->ctx_oactxctrl_offset = `0x128`;
5059	perf->ctx_flexeu0_offset = `0x3de`;
5060	perf->gen8_valid_ctx_bit = BIT(`16`);
5061	break;
5062	case `11`:
5063	perf->ctx_oactxctrl_offset = `0x124`;
5064	perf->ctx_flexeu0_offset = `0x78e`;
5065	perf->gen8_valid_ctx_bit = BIT(`16`);
5066	break;
5067	case `12`:
5068	perf->gen8_valid_ctx_bit = BIT(`16`);
5069	/*
5070	* Calculate offset at runtime in oa_pin_context for gen12 and
5071	* cache the value in perf->ctx_oactxctrl_offset.
5072	*/
5073	break;
5074	default:
5075	MISSING_CASE(GRAPHICS_VER(i915));
5076	}
5077	}
5078
5079	/**
5080	* i915_perf_init - initialize i915-perf state on module bind
5081	* @i915: i915 device instance
5082	*
5083	* Initializes i915-perf state without exposing anything to userspace.
5084	*
5085	* Note: i915-perf initialization is split into an 'init' and 'register'
5086	* phase with the i915_perf_register() exposing state to userspace.
5087	*/
5088	int i915_perf_init(struct drm_i915_private *i915)
5089	{
5090	struct i915_perf *perf = &i915->perf;
5091
5092	perf->oa_formats = oa_formats;
5093	if (IS_HASWELL(i915)) {
5094	perf->ops.is_valid_b_counter_reg = gen7_is_valid_b_counter_addr;
5095	perf->ops.is_valid_mux_reg = hsw_is_valid_mux_addr;
5096	perf->ops.is_valid_flex_reg = NULL;
5097	perf->ops.enable_metric_set = hsw_enable_metric_set;
5098	perf->ops.disable_metric_set = hsw_disable_metric_set;
5099	perf->ops.oa_enable = gen7_oa_enable;
5100	perf->ops.oa_disable = gen7_oa_disable;
5101	perf->ops.read = gen7_oa_read;
5102	perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
5103	} else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
5104	/ Note: that although we could theoretically also support the*
5105	* legacy ringbuffer mode on BDW (and earlier iterations of
5106	* this driver, before upstreaming did this) it didn't seem
5107	* worth the complexity to maintain now that BDW+ enable
5108	* execlist mode by default.
5109	*/
5110	perf->ops.read = gen8_oa_read;
5111	i915_perf_init_info(i915);
5112
5113	if (IS_GRAPHICS_VER(i915, `8`, `9`)) {
5114	perf->ops.is_valid_b_counter_reg =
5115	gen7_is_valid_b_counter_addr;
5116	perf->ops.is_valid_mux_reg =
5117	gen8_is_valid_mux_addr;
5118	perf->ops.is_valid_flex_reg =
5119	gen8_is_valid_flex_addr;
5120
5121	if (IS_CHERRYVIEW(i915)) {
5122	perf->ops.is_valid_mux_reg =
5123	chv_is_valid_mux_addr;
5124	}
5125
5126	perf->ops.oa_enable = gen8_oa_enable;
5127	perf->ops.oa_disable = gen8_oa_disable;
5128	perf->ops.enable_metric_set = gen8_enable_metric_set;
5129	perf->ops.disable_metric_set = gen8_disable_metric_set;
5130	perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
5131	} else if (GRAPHICS_VER(i915) == `11`) {
5132	perf->ops.is_valid_b_counter_reg =
5133	gen7_is_valid_b_counter_addr;
5134	perf->ops.is_valid_mux_reg =
5135	gen11_is_valid_mux_addr;
5136	perf->ops.is_valid_flex_reg =
5137	gen8_is_valid_flex_addr;
5138
5139	perf->ops.oa_enable = gen8_oa_enable;
5140	perf->ops.oa_disable = gen8_oa_disable;
5141	perf->ops.enable_metric_set = gen8_enable_metric_set;
5142	perf->ops.disable_metric_set = gen11_disable_metric_set;
5143	perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
5144	} else if (GRAPHICS_VER(i915) == `12`) {
5145	perf->ops.is_valid_b_counter_reg =
5146	HAS_OA_SLICE_CONTRIB_LIMITS(i915) ?
5147	xehp_is_valid_b_counter_addr :
5148	gen12_is_valid_b_counter_addr;
5149	perf->ops.is_valid_mux_reg =
5150	gen12_is_valid_mux_addr;
5151	perf->ops.is_valid_flex_reg =
5152	gen8_is_valid_flex_addr;
5153
5154	perf->ops.oa_enable = gen12_oa_enable;
5155	perf->ops.oa_disable = gen12_oa_disable;
5156	perf->ops.enable_metric_set = gen12_enable_metric_set;
5157	perf->ops.disable_metric_set = gen12_disable_metric_set;
5158	perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
5159	}
5160	}
5161
5162	if (perf->ops.enable_metric_set) {
5163	struct intel_gt *gt;
5164	int i, ret;
5165
5166	for_each_gt(gt, i915, i)
5167	mutex_init(&gt->perf.lock);
5168
5169	/ Choose a representative limit /
5170	oa_sample_rate_hard_limit = to_gt(i915)->clock_frequency / `2`;
5171
5172	mutex_init(&perf->metrics_lock);
5173	idr_init_base(idr: &perf->metrics_idr, base: `1`);
5174
5175	/ We set up some ratelimit state to potentially throttle any*
5176	* _NOTES about spurious, invalid OA reports which we don't
5177	* forward to userspace.
5178	*
5179	* We print a _NOTE about any throttling when closing the
5180	* stream instead of waiting until driver _fini which no one
5181	* would ever see.
5182	*
5183	* Using the same limiting factors as printk_ratelimit()
5184	*/
5185	ratelimit_state_init(rs: &perf->spurious_report_rs, interval: `5` * HZ, burst: `10`);
5186	/ Since we use a DRM_NOTE for spurious reports it would be*
5187	* inconsistent to let __ratelimit() automatically print a
5188	* warning for throttling.
5189	*/
5190	ratelimit_set_flags(rs: &perf->spurious_report_rs,
5191	RATELIMIT_MSG_ON_RELEASE);
5192
5193	ratelimit_state_init(rs: &perf->tail_pointer_race,
5194	interval: `5` * HZ, burst: `10`);
5195	ratelimit_set_flags(rs: &perf->tail_pointer_race,
5196	RATELIMIT_MSG_ON_RELEASE);
5197
5198	atomic64_set(v: &perf->noa_programming_delay,
5199	i: `500` * `1000` / 500us /);
5200
5201	perf->i915 = i915;
5202
5203	ret = oa_init_engine_groups(perf);
5204	if (ret) {
5205	drm_err(&i915->drm,
5206	"OA initialization failed %d\n", ret);
5207	return ret;
5208	}
5209
5210	oa_init_supported_formats(perf);
5211	}
5212
5213	return `0`;
5214	}
5215
/*
 * idr_for_each() callback: drop the reference held on the OA config @p.
 * @id and @data are unused. Always returns 0 so the iteration continues.
 */
static int destroy_config(int id, void *p, void *data)
{
	i915_oa_config_put(p);
	return 0;
}
5221
5222	int i915_perf_sysctl_register(void)
5223	{
5224	sysctl_header = register_sysctl("dev/i915", oa_table);
5225	return `0`;
5226	}
5227
5228	void i915_perf_sysctl_unregister(void)
5229	{
5230	unregister_sysctl_table(table: sysctl_header);
5231	}
5232
5233	/**
5234	* i915_perf_fini - Counter part to i915_perf_init()
5235	* @i915: i915 device instance
5236	*/
5237	void i915_perf_fini(struct drm_i915_private *i915)
5238	{
5239	struct i915_perf *perf = &i915->perf;
5240	struct intel_gt *gt;
5241	int i;
5242
5243	if (!perf->i915)
5244	return;
5245
5246	for_each_gt(gt, perf->i915, i)
5247	kfree(objp: gt->perf.group);
5248
5249	idr_for_each(&perf->metrics_idr, fn: destroy_config, data: perf);
5250	idr_destroy(&perf->metrics_idr);
5251
5252	memset(&perf->ops, `0`, sizeof(perf->ops));
5253	perf->i915 = NULL;
5254	}
5255
5256	/**
5257	* i915_perf_ioctl_version - Version of the i915-perf subsystem
5258	* @i915: The i915 device
5259	*
5260	* This version number is used by userspace to detect available features.
5261	*/
5262	int i915_perf_ioctl_version(struct drm_i915_private *i915)
5263	{
5264	/*
5265	* 1: Initial version
5266	* I915_PERF_IOCTL_ENABLE
5267	* I915_PERF_IOCTL_DISABLE
5268	*
5269	* 2: Added runtime modification of OA config.
5270	* I915_PERF_IOCTL_CONFIG
5271	*
5272	* 3: Add DRM_I915_PERF_PROP_HOLD_PREEMPTION parameter to hold
5273	* preemption on a particular context so that performance data is
5274	* accessible from a delta of MI_RPC reports without looking at the
5275	* OA buffer.
5276	*
5277	* 4: Add DRM_I915_PERF_PROP_ALLOWED_SSEU to limit what contexts can
5278	* be run for the duration of the performance recording based on
5279	* their SSEU configuration.
5280	*
5281	* 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
5282	* interval for the hrtimer used to check for OA data.
5283	*
5284	* 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
5285	* DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
5286	*
5287	* 7: Add support for video decode and enhancement classes.
5288	*/
5289
5290	/*
5291	* Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media
5292	* C6 disable in BIOS. If Media C6 is enabled in BIOS, return version 6
5293	* to indicate that OA media is not supported.
5294	*/
5295	if (IS_MEDIA_GT_IP_STEP(i915->media_gt, IP_VER(`13`, `0`), STEP_A0, STEP_C0) &&
5296	intel_check_bios_c6_setup(rc6: &i915->media_gt->rc6))
5297	return `6`;
5298
5299	return `7`;
5300	}
5301
5302	#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5303	#include "selftests/i915_perf.c"
5304	#endif
5305

source code of linux/drivers/gpu/drm/i915/i915_perf.c