1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Resource Director Technology (RDT) |
4 | * |
5 | * Pseudo-locking support built on top of Cache Allocation Technology (CAT) |
6 | * |
7 | * Copyright (C) 2018 Intel Corporation |
8 | * |
9 | * Author: Reinette Chatre <reinette.chatre@intel.com> |
10 | */ |
11 | |
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | |
14 | #include <linux/cacheinfo.h> |
15 | #include <linux/cpu.h> |
16 | #include <linux/cpumask.h> |
17 | #include <linux/debugfs.h> |
18 | #include <linux/kthread.h> |
19 | #include <linux/mman.h> |
20 | #include <linux/perf_event.h> |
21 | #include <linux/pm_qos.h> |
22 | #include <linux/slab.h> |
23 | #include <linux/uaccess.h> |
24 | |
25 | #include <asm/cacheflush.h> |
26 | #include <asm/intel-family.h> |
27 | #include <asm/resctrl.h> |
28 | #include <asm/perf_event.h> |
29 | |
30 | #include "../../events/perf_event.h" /* For X86_CONFIG() */ |
31 | #include "internal.h" |
32 | |
33 | #define CREATE_TRACE_POINTS |
34 | #include "pseudo_lock_event.h" |
35 | |
36 | /* |
37 | * The bits needed to disable hardware prefetching vary based on the |
38 | * platform. During initialization we will discover which bits to use. |
39 | */ |
40 | static u64 prefetch_disable_bits; |
41 | |
42 | /* |
43 | * Major number assigned to and shared by all devices exposing |
44 | * pseudo-locked regions. |
45 | */ |
46 | static unsigned int pseudo_lock_major; |
47 | static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); |
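/*
 * Every minor starts out available (bit set). pseudo_lock_minor_get()
 * hands one out by clearing its bit and pseudo_lock_minor_release()
 * sets the bit again.
 */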
48 | |
49 | static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) |
50 | { |
51 | const struct rdtgroup *rdtgrp; |
52 | |
53 | rdtgrp = dev_get_drvdata(dev); |
54 | if (mode) |
55 | *mode = 0600; |
56 | return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); |
57 | } |
58 | |
59 | static const struct class pseudo_lock_class = { |
60 | .name = "pseudo_lock", |
61 | .devnode = pseudo_lock_devnode, |
62 | }; |
63 | |
64 | /** |
65 | * get_prefetch_disable_bits - prefetch disable bits of supported platforms |
66 | * @void: It takes no parameters. |
67 | * |
68 | * Capture the list of platforms that have been validated to support |
69 | * pseudo-locking. This includes testing to ensure pseudo-locked regions |
70 | * with low cache miss rates can be created under variety of load conditions |
71 | * as well as that these pseudo-locked regions can maintain their low cache |
72 | * miss rates under variety of load conditions for significant lengths of time. |
73 | * |
74 | * After a platform has been validated to support pseudo-locking its |
75 | * hardware prefetch disable bits are included here as they are documented |
76 | * in the SDM. |
77 | * |
78 | * When adding a platform here also add support for its cache events to |
79 | * measure_cycles_perf_fn() |
80 | * |
81 | * Return: |
82 | * If platform is supported, the bits to disable hardware prefetchers, 0 |
83 | * if platform is not supported. |
84 | */ |
85 | static u64 get_prefetch_disable_bits(void) |
86 | { |
87 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || |
88 | boot_cpu_data.x86 != 6) |
89 | return 0; |
90 | |
91 | switch (boot_cpu_data.x86_model) { |
92 | case INTEL_FAM6_BROADWELL_X: |
93 | /* |
94 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register |
95 | * as: |
96 | * 0 L2 Hardware Prefetcher Disable (R/W) |
97 | * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W) |
98 | * 2 DCU Hardware Prefetcher Disable (R/W) |
99 | * 3 DCU IP Prefetcher Disable (R/W) |
100 | * 63:4 Reserved |
101 | */ |
102 | return 0xF; |
103 | case INTEL_FAM6_ATOM_GOLDMONT: |
104 | case INTEL_FAM6_ATOM_GOLDMONT_PLUS: |
105 | /* |
106 | * SDM defines bits of MSR_MISC_FEATURE_CONTROL register |
107 | * as: |
108 | * 0 L2 Hardware Prefetcher Disable (R/W) |
109 | * 1 Reserved |
110 | * 2 DCU Hardware Prefetcher Disable (R/W) |
111 | * 63:3 Reserved |
112 | */ |
113 | return 0x5; |
114 | } |
115 | |
116 | return 0; |
117 | } |
118 | |
119 | /** |
120 | * pseudo_lock_minor_get - Obtain available minor number |
121 | * @minor: Pointer to where new minor number will be stored |
122 | * |
123 | * A bitmask is used to track available minor numbers. Here the next free |
124 | * minor number is marked as unavailable and returned. |
125 | * |
126 | * Return: 0 on success, <0 on failure. |
127 | */ |
128 | static int pseudo_lock_minor_get(unsigned int *minor) |
129 | { |
130 | unsigned long first_bit; |
131 | |
132 | first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); |
133 | |
134 | if (first_bit == MINORBITS) |
135 | return -ENOSPC; |
136 | |
137 | __clear_bit(first_bit, &pseudo_lock_minor_avail); |
138 | *minor = first_bit; |
139 | |
140 | return 0; |
141 | } |
142 | |
143 | /** |
144 | * pseudo_lock_minor_release - Return minor number to available |
145 | * @minor: The minor number made available |
146 | */ |
147 | static void pseudo_lock_minor_release(unsigned int minor) |
148 | { |
149 | __set_bit(minor, &pseudo_lock_minor_avail); |
150 | } |
151 | |
152 | /** |
153 | * region_find_by_minor - Locate a pseudo-lock region by inode minor number |
154 | * @minor: The minor number of the device representing pseudo-locked region |
155 | * |
156 | * When the character device is accessed we need to determine which |
157 | * pseudo-locked region it belongs to. This is done by matching the minor |
158 | * number of the device to the pseudo-locked region it belongs to. |
159 | * |
160 | * Minor numbers are assigned at the time a pseudo-locked region is associated |
161 | * with a cache instance. |
162 | * |
163 | * Return: On success return pointer to resource group owning the pseudo-locked |
164 | * region, NULL on failure. |
165 | */ |
166 | static struct rdtgroup *region_find_by_minor(unsigned int minor) |
167 | { |
168 | struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; |
169 | |
170 | list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { |
171 | if (rdtgrp->plr && rdtgrp->plr->minor == minor) { |
172 | rdtgrp_match = rdtgrp; |
173 | break; |
174 | } |
175 | } |
176 | return rdtgrp_match; |
177 | } |
178 | |
179 | /** |
180 | * struct pseudo_lock_pm_req - A power management QoS request list entry |
181 | * @list: Entry within the @pm_reqs list for a pseudo-locked region |
182 | * @req: PM QoS request |
183 | */ |
184 | struct pseudo_lock_pm_req { |
185 | struct list_head list; |
186 | struct dev_pm_qos_request req; |
187 | }; |
188 | |
189 | static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) |
190 | { |
191 | struct pseudo_lock_pm_req *pm_req, *next; |
192 | |
193 | list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { |
194 | dev_pm_qos_remove_request(&pm_req->req); |
195 | list_del(&pm_req->list); |
196 | kfree(pm_req); |
197 | } |
198 | } |
199 | |
200 | /** |
201 | * pseudo_lock_cstates_constrain - Restrict cores from entering C6 |
202 | * @plr: Pseudo-locked region |
203 | * |
204 | * To prevent the cache from being affected by power management, entering |
205 | * C6 has to be avoided. This is accomplished by requesting a latency |
206 | * requirement lower than the lowest C6 exit latency of all supported |
207 | * platforms as found in the cpuidle state tables in the intel_idle driver. |
208 | * At this time it is possible to do so with a single latency requirement |
209 | * for all supported platforms. |
210 | * |
211 | * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, |
212 | * the ACPI latencies need to be considered while keeping in mind that C2 |
213 | * may be set to map to deeper sleep states. In this case the latency |
214 | * requirement needs to prevent entering C2 also. |
215 | * |
216 | * Return: 0 on success, <0 on failure |
217 | */ |
218 | static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) |
219 | { |
220 | struct pseudo_lock_pm_req *pm_req; |
221 | int cpu; |
222 | int ret; |
223 | |
224 | for_each_cpu(cpu, &plr->d->cpu_mask) { |
225 | pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); |
226 | if (!pm_req) { |
227 | rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); |
228 | ret = -ENOMEM; |
229 | goto out_err; |
230 | } |
231 | ret = dev_pm_qos_add_request(get_cpu_device(cpu), |
232 | &pm_req->req, |
233 | DEV_PM_QOS_RESUME_LATENCY, |
234 | 30); |
235 | if (ret < 0) { |
236 | rdt_last_cmd_printf("Failed to add latency req CPU%d\n", |
237 | cpu); |
238 | kfree(pm_req); |
239 | ret = -1; |
240 | goto out_err; |
241 | } |
242 | list_add(&pm_req->list, &plr->pm_reqs); |
243 | } |
244 | |
245 | return 0; |
246 | |
247 | out_err: |
248 | pseudo_lock_cstates_relax(plr); |
249 | return ret; |
250 | } |
251 | |
252 | /** |
253 | * pseudo_lock_region_clear - Reset pseudo-lock region data |
254 | * @plr: pseudo-lock region |
255 | * |
256 | * All content of the pseudo-locked region is reset - any memory allocated |
257 | * is freed. |
258 | * |
259 | * Return: void |
260 | */ |
261 | static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) |
262 | { |
263 | plr->size = 0; |
264 | plr->line_size = 0; |
265 | kfree(plr->kmem); |
266 | plr->kmem = NULL; |
267 | plr->s = NULL; |
268 | if (plr->d) |
269 | plr->d->plr = NULL; |
270 | plr->d = NULL; |
271 | plr->cbm = 0; |
272 | plr->debugfs_dir = NULL; |
273 | } |
274 | |
275 | /** |
276 | * pseudo_lock_region_init - Initialize pseudo-lock region information |
277 | * @plr: pseudo-lock region |
278 | * |
279 | * Called after user provided a schemata to be pseudo-locked. From the |
280 | * schemata the &struct pseudo_lock_region is on entry already initialized |
281 | * with the resource, domain, and capacity bitmask. Here the information |
282 | * required for pseudo-locking is deduced from this data and &struct |
283 | * pseudo_lock_region initialized further. This information includes: |
284 | * - size in bytes of the region to be pseudo-locked |
285 | * - cache line size to know the stride with which data needs to be accessed |
286 | * to be pseudo-locked |
287 | * - a cpu associated with the cache instance on which the pseudo-locking |
288 | * flow can be executed |
289 | * |
290 | * Return: 0 on success, <0 on failure. Descriptive error will be written |
291 | * to last_cmd_status buffer. |
292 | */ |
293 | static int pseudo_lock_region_init(struct pseudo_lock_region *plr) |
294 | { |
295 | struct cpu_cacheinfo *ci; |
296 | int ret; |
297 | int i; |
298 | |
299 | /* Pick the first cpu we find that is associated with the cache. */ |
300 | plr->cpu = cpumask_first(&plr->d->cpu_mask); |
301 | |
302 | if (!cpu_online(plr->cpu)) { |
303 | rdt_last_cmd_printf("CPU %u associated with cache not online\n", |
304 | plr->cpu); |
305 | ret = -ENODEV; |
306 | goto out_region; |
307 | } |
308 | |
309 | ci = get_cpu_cacheinfo(plr->cpu); |
310 | |
311 | plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); |
312 | |
313 | for (i = 0; i < ci->num_leaves; i++) { |
314 | if (ci->info_list[i].level == plr->s->res->cache_level) { |
315 | plr->line_size = ci->info_list[i].coherency_line_size; |
316 | return 0; |
317 | } |
318 | } |
319 | |
320 | ret = -1; |
321 | rdt_last_cmd_puts("Unable to determine cache line size\n"); |
322 | out_region: |
323 | pseudo_lock_region_clear(plr); |
324 | return ret; |
325 | } |
326 | |
327 | /** |
328 | * pseudo_lock_init - Initialize a pseudo-lock region |
329 | * @rdtgrp: resource group to which new pseudo-locked region will belong |
330 | * |
331 | * A pseudo-locked region is associated with a resource group. When this |
332 | * association is created the pseudo-locked region is initialized. The |
333 | * details of the pseudo-locked region are not known at this time so only |
334 | * allocation is done and association established. |
335 | * |
336 | * Return: 0 on success, <0 on failure |
337 | */ |
338 | static int pseudo_lock_init(struct rdtgroup *rdtgrp) |
339 | { |
340 | struct pseudo_lock_region *plr; |
341 | |
342 | plr = kzalloc(sizeof(*plr), GFP_KERNEL); |
343 | if (!plr) |
344 | return -ENOMEM; |
345 | |
346 | init_waitqueue_head(&plr->lock_thread_wq); |
347 | INIT_LIST_HEAD(&plr->pm_reqs); |
348 | rdtgrp->plr = plr; |
349 | return 0; |
350 | } |
351 | |
352 | /** |
353 | * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked |
354 | * @plr: pseudo-lock region |
355 | * |
356 | * Initialize the details required to set up the pseudo-locked region and |
357 | * allocate the contiguous memory that will be pseudo-locked to the cache. |
358 | * |
359 | * Return: 0 on success, <0 on failure. Descriptive error will be written |
360 | * to last_cmd_status buffer. |
361 | */ |
362 | static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) |
363 | { |
364 | int ret; |
365 | |
366 | ret = pseudo_lock_region_init(plr); |
367 | if (ret < 0) |
368 | return ret; |
369 | |
370 | /* |
371 | * We do not yet support contiguous regions larger than |
372 | * KMALLOC_MAX_SIZE. |
373 | */ |
374 | if (plr->size > KMALLOC_MAX_SIZE) { |
375 | rdt_last_cmd_puts("Requested region exceeds maximum size\n"); |
376 | ret = -E2BIG; |
377 | goto out_region; |
378 | } |
379 | |
380 | plr->kmem = kzalloc(plr->size, GFP_KERNEL); |
381 | if (!plr->kmem) { |
382 | rdt_last_cmd_puts("Unable to allocate memory\n"); |
383 | ret = -ENOMEM; |
384 | goto out_region; |
385 | } |
386 | |
387 | ret = 0; |
388 | goto out; |
389 | out_region: |
390 | pseudo_lock_region_clear(plr); |
391 | out: |
392 | return ret; |
393 | } |
394 | |
395 | /** |
396 | * pseudo_lock_free - Free a pseudo-locked region |
397 | * @rdtgrp: resource group to which pseudo-locked region belonged |
398 | * |
399 | * The pseudo-locked region's resources have already been released, or not |
400 | * yet created at this point. Now it can be freed and disassociated from the |
401 | * resource group. |
402 | * |
403 | * Return: void |
404 | */ |
405 | static void pseudo_lock_free(struct rdtgroup *rdtgrp) |
406 | { |
407 | pseudo_lock_region_clear(rdtgrp->plr); |
408 | kfree(rdtgrp->plr); |
409 | rdtgrp->plr = NULL; |
410 | } |
411 | |
412 | /** |
413 | * pseudo_lock_fn - Load kernel memory into cache |
414 | * @_rdtgrp: resource group to which pseudo-lock region belongs |
415 | * |
416 | * This is the core pseudo-locking flow. |
417 | * |
418 | * First we ensure that the kernel memory cannot be found in the cache. |
419 | * Then, while taking care that there will be as little interference as |
420 | * possible, the memory to be loaded is accessed while core is running |
421 | * with class of service set to the bitmask of the pseudo-locked region. |
422 | * After this is complete no future CAT allocations will be allowed to |
423 | * overlap with this bitmask. |
424 | * |
425 | * Local register variables are utilized to ensure that the memory region |
426 | * to be locked is the only memory access made during the critical locking |
427 | * loop. |
428 | * |
429 | * Return: 0. Waiter on waitqueue will be woken on completion. |
430 | */ |
431 | static int pseudo_lock_fn(void *_rdtgrp) |
432 | { |
433 | struct rdtgroup *rdtgrp = _rdtgrp; |
434 | struct pseudo_lock_region *plr = rdtgrp->plr; |
435 | u32 rmid_p, closid_p; |
436 | unsigned long i; |
437 | u64 saved_msr; |
438 | #ifdef CONFIG_KASAN |
439 | /* |
440 | * The registers used for local register variables are also used |
441 | * when KASAN is active. When KASAN is active we use a regular |
442 | * variable to ensure we always use a valid pointer, but the cost |
443 | * is that this variable will enter the cache through evicting the |
444 | * memory we are trying to lock into the cache. Thus expect lower |
445 | * pseudo-locking success rate when KASAN is active. |
446 | */ |
447 | unsigned int line_size; |
448 | unsigned int size; |
449 | void *mem_r; |
450 | #else |
451 | register unsigned int line_size asm("esi"); |
452 | register unsigned int size asm("edi"); |
453 | register void *mem_r asm(_ASM_BX); |
454 | #endif /* CONFIG_KASAN */ |
455 | |
456 | /* |
457 | * Make sure none of the allocated memory is cached. If it is, the |
458 | * loop below will get cache hits from outside of the pseudo-locked |
459 | * region. |
460 | * wbinvd (as opposed to clflush/clflushopt) is required to |
461 | * increase likelihood that allocated cache portion will be filled |
462 | * with associated memory. |
463 | */ |
464 | native_wbinvd(); |
465 | |
466 | /* |
467 | * Always called with interrupts enabled. By disabling interrupts we |
468 | * ensure that we will not be preempted during this critical section. |
469 | */ |
470 | local_irq_disable(); |
471 | |
472 | /* |
473 | * Call wrmsr and rdmsr as directly as possible to avoid tracing |
474 | * clobbering local register variables or affecting cache accesses. |
475 | * |
476 | * Disable the hardware prefetcher so that when the end of the memory |
477 | * being pseudo-locked is reached the hardware will not read beyond |
478 | * the buffer and evict pseudo-locked memory read earlier from the |
479 | * cache. |
480 | */ |
481 | saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); |
482 | __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); |
483 | closid_p = this_cpu_read(pqr_state.cur_closid); |
484 | rmid_p = this_cpu_read(pqr_state.cur_rmid); |
485 | mem_r = plr->kmem; |
486 | size = plr->size; |
487 | line_size = plr->line_size; |
488 | /* |
489 | * Critical section begin: start by writing the closid associated |
490 | * with the capacity bitmask of the cache region being |
491 | * pseudo-locked followed by reading of kernel memory to load it |
492 | * into the cache. |
493 | */ |
494 | __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); |
495 | /* |
496 | * Cache was flushed earlier. Now access kernel memory to read it |
497 | * into cache region associated with just activated plr->closid. |
498 | * Loop over data twice: |
499 | * - In first loop the cache region is shared with the page walker |
500 | * as it populates the paging structure caches (including TLB). |
501 | * - In the second loop the paging structure caches are used and |
502 | * cache region is populated with the memory being referenced. |
503 | */ |
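/*
 * Each asm statement below performs a single 32-bit load from
 * mem_r + i (base plus index, scale 1) into %eax. The loaded value is
 * discarded; only the cache and TLB side effects of the access matter.
 */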
504 | for (i = 0; i < size; i += PAGE_SIZE) { |
505 | /* |
506 | * Add a barrier to prevent speculative execution of this |
507 | * loop reading beyond the end of the buffer. |
508 | */ |
509 | rmb(); |
510 | asm volatile("mov (%0,%1,1), %%eax\n\t" |
511 | : |
512 | : "r" (mem_r), "r" (i) |
513 | : "%eax", "memory"); |
514 | } |
515 | for (i = 0; i < size; i += line_size) { |
516 | /* |
517 | * Add a barrier to prevent speculative execution of this |
518 | * loop reading beyond the end of the buffer. |
519 | */ |
520 | rmb(); |
521 | asm volatile("mov (%0,%1,1), %%eax\n\t" |
522 | : |
523 | : "r" (mem_r), "r" (i) |
524 | : "%eax", "memory"); |
525 | } |
526 | /* |
527 | * Critical section end: restore closid with capacity bitmask that |
528 | * does not overlap with pseudo-locked region. |
529 | */ |
530 | __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); |
531 | |
532 | /* Re-enable the hardware prefetcher(s) */ |
533 | wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); |
534 | local_irq_enable(); |
535 | |
536 | plr->thread_done = 1; |
537 | wake_up_interruptible(&plr->lock_thread_wq); |
538 | return 0; |
539 | } |
540 | |
541 | /** |
542 | * rdtgroup_monitor_in_progress - Test if monitoring in progress |
543 | * @rdtgrp: resource group being queried |
544 | * |
545 | * Return: 1 if monitor groups have been created for this resource |
546 | * group, 0 otherwise. |
547 | */ |
548 | static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) |
549 | { |
550 | return !list_empty(&rdtgrp->mon.crdtgrp_list); |
551 | } |
552 | |
553 | /** |
554 | * rdtgroup_locksetup_user_restrict - Restrict user access to group |
555 | * @rdtgrp: resource group needing access restricted |
556 | * |
557 | * A resource group used for cache pseudo-locking cannot have cpus or tasks |
558 | * assigned to it. This is communicated to the user by restricting access |
559 | * to all the files that can be used to make such changes. |
560 | * |
561 | * Permissions restored with rdtgroup_locksetup_user_restore() |
562 | * |
563 | * Return: 0 on success, <0 on failure. If a failure occurs during the |
564 | * restriction of access an attempt will be made to restore permissions but |
565 | * the state of the mode of these files will be uncertain when a failure |
566 | * occurs. |
567 | */ |
568 | static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) |
569 | { |
570 | int ret; |
571 | |
572 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); |
573 | if (ret) |
574 | return ret; |
575 | |
576 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); |
577 | if (ret) |
578 | goto err_tasks; |
579 | |
580 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); |
581 | if (ret) |
582 | goto err_cpus; |
583 | |
584 | if (rdt_mon_capable) { |
585 | ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); |
586 | if (ret) |
587 | goto err_cpus_list; |
588 | } |
589 | |
590 | ret = 0; |
591 | goto out; |
592 | |
593 | err_cpus_list: |
594 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); |
595 | err_cpus: |
596 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); |
597 | err_tasks: |
598 | rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); |
599 | out: |
600 | return ret; |
601 | } |
602 | |
603 | /** |
604 | * rdtgroup_locksetup_user_restore - Restore user access to group |
605 | * @rdtgrp: resource group needing access restored |
606 | * |
607 | * Restore all file access previously removed using |
608 | * rdtgroup_locksetup_user_restrict() |
609 | * |
610 | * Return: 0 on success, <0 on failure. If a failure occurs during the |
611 | * restoration of access an attempt will be made to restrict permissions |
612 | * again but the state of the mode of these files will be uncertain when |
613 | * a failure occurs. |
614 | */ |
615 | static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) |
616 | { |
617 | int ret; |
618 | |
619 | ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); |
620 | if (ret) |
621 | return ret; |
622 | |
623 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); |
624 | if (ret) |
625 | goto err_tasks; |
626 | |
627 | ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); |
628 | if (ret) |
629 | goto err_cpus; |
630 | |
631 | if (rdt_mon_capable) { |
632 | ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); |
633 | if (ret) |
634 | goto err_cpus_list; |
635 | } |
636 | |
637 | ret = 0; |
638 | goto out; |
639 | |
640 | err_cpus_list: |
641 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); |
642 | err_cpus: |
643 | rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); |
644 | err_tasks: |
645 | rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); |
646 | out: |
647 | return ret; |
648 | } |
649 | |
650 | /** |
651 | * rdtgroup_locksetup_enter - Resource group enters locksetup mode |
652 | * @rdtgrp: resource group requested to enter locksetup mode |
653 | * |
654 | * A resource group enters locksetup mode to reflect that it would be used |
655 | * to represent a pseudo-locked region and is in the process of being set |
656 | * up to do so. A resource group used for a pseudo-locked region would |
657 | * lose the closid associated with it so we cannot allow it to have any |
658 | * tasks or cpus assigned nor permit tasks or cpus to be assigned in the |
659 | * future. Monitoring of a pseudo-locked region is not allowed either. |
660 | * |
661 | * The above and more restrictions on a pseudo-locked region are checked |
662 | * for and enforced before the resource group enters the locksetup mode. |
663 | * |
664 | * Returns: 0 if the resource group successfully entered locksetup mode, <0 |
665 | * on failure. On failure the last_cmd_status buffer is updated with text to |
666 | * communicate details of failure to the user. |
667 | */ |
668 | int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) |
669 | { |
670 | int ret; |
671 | |
672 | /* |
673 | * The default resource group can neither be removed nor lose the |
674 | * default closid associated with it. |
675 | */ |
676 | if (rdtgrp == &rdtgroup_default) { |
677 | rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); |
678 | return -EINVAL; |
679 | } |
680 | |
681 | /* |
682 | * Cache Pseudo-locking not supported when CDP is enabled. |
683 | * |
684 | * Some things to consider if you would like to enable this |
685 | * support (using L3 CDP as example): |
686 | * - When CDP is enabled two separate resources are exposed, |
687 | * L3DATA and L3CODE, but they are actually on the same cache. |
688 | * The implication for pseudo-locking is that if a |
689 | * pseudo-locked region is created on a domain of one |
690 | * resource (eg. L3CODE), then a pseudo-locked region cannot |
691 | * be created on that same domain of the other resource |
692 | * (eg. L3DATA). This is because the creation of a |
693 | * pseudo-locked region involves a call to wbinvd that will |
694 | * affect all cache allocations on a particular domain. |
695 | * - Considering the previous, it may be possible to only |
696 | * expose one of the CDP resources to pseudo-locking and |
697 | * hide the other. For example, we could consider only |
698 | * exposing L3DATA; since the L3 cache is unified it is |
699 | * still possible to place instructions there and execute them. |
700 | * - If only one region is exposed to pseudo-locking we should |
701 | * still keep in mind that availability of a portion of cache |
702 | * for pseudo-locking should take into account both resources. |
703 | * Similarly, if a pseudo-locked region is created in one |
704 | * resource, the portion of cache used by it should be made |
705 | * unavailable to all future allocations from both resources. |
706 | */ |
707 | if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || |
708 | resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { |
709 | rdt_last_cmd_puts("CDP enabled\n"); |
710 | return -EINVAL; |
711 | } |
712 | |
713 | /* |
714 | * Not knowing the bits to disable prefetching implies that this |
715 | * platform does not support Cache Pseudo-Locking. |
716 | */ |
717 | prefetch_disable_bits = get_prefetch_disable_bits(); |
718 | if (prefetch_disable_bits == 0) { |
719 | rdt_last_cmd_puts("Pseudo-locking not supported\n"); |
720 | return -EINVAL; |
721 | } |
722 | |
723 | if (rdtgroup_monitor_in_progress(rdtgrp)) { |
724 | rdt_last_cmd_puts("Monitoring in progress\n"); |
725 | return -EINVAL; |
726 | } |
727 | |
728 | if (rdtgroup_tasks_assigned(rdtgrp)) { |
729 | rdt_last_cmd_puts("Tasks assigned to resource group\n"); |
730 | return -EINVAL; |
731 | } |
732 | |
733 | if (!cpumask_empty(&rdtgrp->cpu_mask)) { |
734 | rdt_last_cmd_puts("CPUs assigned to resource group\n"); |
735 | return -EINVAL; |
736 | } |
737 | |
738 | if (rdtgroup_locksetup_user_restrict(rdtgrp)) { |
739 | rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); |
740 | return -EIO; |
741 | } |
742 | |
743 | ret = pseudo_lock_init(rdtgrp); |
744 | if (ret) { |
745 | rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); |
746 | goto out_release; |
747 | } |
748 | |
749 | /* |
750 | * If this system is capable of monitoring an RMID would have been |
751 | * allocated when the control group was created. This is not needed |
752 | * anymore now that this group will be used for pseudo-locking. This |
753 | * is safe to call on platforms not capable of monitoring. |
754 | */ |
755 | free_rmid(rdtgrp->mon.rmid); |
756 | |
757 | ret = 0; |
758 | goto out; |
759 | |
760 | out_release: |
761 | rdtgroup_locksetup_user_restore(rdtgrp); |
762 | out: |
763 | return ret; |
764 | } |
765 | |
766 | /** |
767 | * rdtgroup_locksetup_exit - resource group exits locksetup mode |
768 | * @rdtgrp: resource group |
769 | * |
770 | * When a resource group exits locksetup mode the earlier restrictions are |
771 | * lifted. |
772 | * |
773 | * Return: 0 on success, <0 on failure |
774 | */ |
775 | int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) |
776 | { |
777 | int ret; |
778 | |
779 | if (rdt_mon_capable) { |
780 | ret = alloc_rmid(); |
781 | if (ret < 0) { |
782 | rdt_last_cmd_puts("Out of RMIDs\n"); |
783 | return ret; |
784 | } |
785 | rdtgrp->mon.rmid = ret; |
786 | } |
787 | |
788 | ret = rdtgroup_locksetup_user_restore(rdtgrp); |
789 | if (ret) { |
790 | free_rmid(rdtgrp->mon.rmid); |
791 | return ret; |
792 | } |
793 | |
794 | pseudo_lock_free(rdtgrp); |
795 | return 0; |
796 | } |
797 | |
798 | /** |
799 | * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked |
800 | * @d: RDT domain |
801 | * @cbm: CBM to test |
802 | * |
803 | * @d represents a cache instance and @cbm a capacity bitmask that is |
804 | * considered for it. Determine if @cbm overlaps with any existing |
805 | * pseudo-locked region on @d. |
806 | * |
807 | * @cbm is unsigned long, even if only 32 bits are used, to make the |
808 | * bitmap functions work correctly. |
809 | * |
810 | * Return: true if @cbm overlaps with pseudo-locked region on @d, false |
811 | * otherwise. |
812 | */ |
813 | bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) |
814 | { |
815 | unsigned int cbm_len; |
816 | unsigned long cbm_b; |
817 | |
818 | if (d->plr) { |
819 | cbm_len = d->plr->s->res->cache.cbm_len; |
820 | cbm_b = d->plr->cbm; |
821 | if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) |
822 | return true; |
823 | } |
824 | return false; |
825 | } |
826 | |
827 | /** |
828 | * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy |
829 | * @d: RDT domain under test |
830 | * |
831 | * The setup of a pseudo-locked region affects all cache instances within |
832 | * the hierarchy of the region. It is thus essential to know if any |
833 | * pseudo-locked regions exist within a cache hierarchy to prevent any |
834 | * attempts to create new pseudo-locked regions in the same hierarchy. |
835 | * |
836 | * Return: true if a pseudo-locked region exists in the hierarchy of @d or |
837 | * if it is not possible to test due to a memory allocation issue, |
838 | * false otherwise. |
839 | */ |
840 | bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) |
841 | { |
842 | cpumask_var_t cpu_with_psl; |
843 | struct rdt_resource *r; |
844 | struct rdt_domain *d_i; |
845 | bool ret = false; |
846 | |
847 | if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) |
848 | return true; |
849 | |
850 | /* |
851 | * First determine which cpus have pseudo-locked regions |
852 | * associated with them. |
853 | */ |
854 | for_each_alloc_capable_rdt_resource(r) { |
855 | list_for_each_entry(d_i, &r->domains, list) { |
856 | if (d_i->plr) |
857 | cpumask_or(cpu_with_psl, cpu_with_psl, |
858 | &d_i->cpu_mask); |
859 | } |
860 | } |
861 | |
862 | /* |
863 | * Next test if new pseudo-locked region would intersect with |
864 | * existing region. |
865 | */ |
866 | if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) |
867 | ret = true; |
868 | |
869 | free_cpumask_var(cpu_with_psl); |
870 | return ret; |
871 | } |
872 | |
873 | /** |
874 | * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory |
875 | * @_plr: pseudo-lock region to measure |
876 | * |
877 | * There is no deterministic way to test if a memory region is cached. One |
878 | * way is to measure how long it takes to read the memory, the speed of |
879 | * access is a good way to learn how close to the cpu the data was. Even |
880 | * more, if the prefetcher is disabled and the memory is read at a stride |
881 | * of half the cache line, then a cache miss will be easy to spot since the |
882 | * read of the first half would be significantly slower than the read of |
883 | * the second half. |
884 | * |
885 | * Return: 0. Waiter on waitqueue will be woken on completion. |
886 | */ |
887 | static int measure_cycles_lat_fn(void *_plr) |
888 | { |
889 | struct pseudo_lock_region *plr = _plr; |
890 | u32 saved_low, saved_high; |
891 | unsigned long i; |
892 | u64 start, end; |
893 | void *mem_r; |
894 | |
895 | local_irq_disable(); |
896 | /* |
897 | * Disable hardware prefetchers. |
898 | */ |
899 | rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); |
900 | wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); |
901 | mem_r = READ_ONCE(plr->kmem); |
902 | /* |
903 | * Dummy execute of the time measurement to load the needed |
904 | * instructions into the L1 instruction cache. |
905 | */ |
906 | start = rdtsc_ordered(); |
907 | for (i = 0; i < plr->size; i += 32) { |
908 | start = rdtsc_ordered(); |
909 | asm volatile("mov (%0,%1,1), %%eax\n\t" |
910 | : |
911 | : "r" (mem_r), "r" (i) |
912 | : "%eax", "memory"); |
913 | end = rdtsc_ordered(); |
914 | trace_pseudo_lock_mem_latency((u32)(end - start)); |
915 | } |
916 | wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); |
917 | local_irq_enable(); |
918 | plr->thread_done = 1; |
919 | wake_up_interruptible(&plr->lock_thread_wq); |
920 | return 0; |
921 | } |
922 | |
923 | /* |
924 | * Create a perf_event_attr for the hit and miss perf events that will |
925 | * be used during the performance measurement. A perf_event maintains |
926 | * a pointer to its perf_event_attr so a unique attribute structure is |
927 | * created for each perf_event. |
928 | * |
929 | * The actual configuration of the event is set right before use in order |
930 | * to use the X86_CONFIG macro. |
931 | */ |
932 | static struct perf_event_attr perf_miss_attr = { |
933 | .type = PERF_TYPE_RAW, |
934 | .size = sizeof(struct perf_event_attr), |
935 | .pinned = 1, |
936 | .disabled = 0, |
937 | .exclude_user = 1, |
938 | }; |
939 | |
940 | static struct perf_event_attr perf_hit_attr = { |
941 | .type = PERF_TYPE_RAW, |
942 | .size = sizeof(struct perf_event_attr), |
943 | .pinned = 1, |
944 | .disabled = 0, |
945 | .exclude_user = 1, |
946 | }; |
947 | |
948 | struct residency_counts { |
949 | u64 miss_before, hits_before; |
950 | u64 miss_after, hits_after; |
951 | }; |
952 | |
953 | static int measure_residency_fn(struct perf_event_attr *miss_attr, |
954 | struct perf_event_attr *hit_attr, |
955 | struct pseudo_lock_region *plr, |
956 | struct residency_counts *counts) |
957 | { |
958 | u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; |
959 | struct perf_event *miss_event, *hit_event; |
960 | int hit_pmcnum, miss_pmcnum; |
961 | u32 saved_low, saved_high; |
962 | unsigned int line_size; |
963 | unsigned int size; |
964 | unsigned long i; |
965 | void *mem_r; |
966 | u64 tmp; |
967 | |
968 | miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu, |
969 | NULL, NULL, NULL); |
970 | if (IS_ERR(miss_event)) |
971 | goto out; |
972 | |
973 | hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu, |
974 | NULL, NULL, NULL); |
975 | if (IS_ERR(hit_event)) |
976 | goto out_miss; |
977 | |
978 | local_irq_disable(); |
979 | /* |
980 | * Check any possible error state of events used by performing |
981 | * one local read. |
982 | */ |
983 | if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) { |
984 | local_irq_enable(); |
985 | goto out_hit; |
986 | } |
987 | if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) { |
988 | local_irq_enable(); |
989 | goto out_hit; |
990 | } |
991 | |
992 | /* |
993 | * Disable hardware prefetchers. |
994 | */ |
995 | rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); |
996 | wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); |
997 | |
998 | /* Initialize rest of local variables */ |
999 | /* |
1000 | * Performance event has been validated right before this with |
1001 | * interrupts disabled - it is thus safe to read the counter index. |
1002 | */ |
1003 | miss_pmcnum = x86_perf_rdpmc_index(miss_event); |
1004 | hit_pmcnum = x86_perf_rdpmc_index(hit_event); |
1005 | line_size = READ_ONCE(plr->line_size); |
1006 | mem_r = READ_ONCE(plr->kmem); |
1007 | size = READ_ONCE(plr->size); |
1008 | |
1009 | /* |
1010 | * Read counter variables twice - first to load the instructions |
1011 | * used in L1 cache, second to capture accurate value that does not |
1012 | * include cache misses incurred because of instruction loads. |
1013 | */ |
1014 | rdpmcl(hit_pmcnum, hits_before); |
1015 | rdpmcl(miss_pmcnum, miss_before); |
1016 | /* |
1017 | * From SDM: Performing back-to-back fast reads is not guaranteed |
1018 | * to be monotonic. |
1019 | * Use LFENCE to ensure all previous instructions are retired |
1020 | * before proceeding. |
1021 | */ |
1022 | rmb(); |
1023 | rdpmcl(hit_pmcnum, hits_before); |
1024 | rdpmcl(miss_pmcnum, miss_before); |
1025 | /* |
1026 | * Use LFENCE to ensure all previous instructions are retired |
1027 | * before proceeding. |
1028 | */ |
1029 | rmb(); |
1030 | for (i = 0; i < size; i += line_size) { |
1031 | /* |
1032 | * Add a barrier to prevent speculative execution of this |
1033 | * loop reading beyond the end of the buffer. |
1034 | */ |
1035 | rmb(); |
1036 | asm volatile("mov (%0,%1,1), %%eax\n\t" |
1037 | : |
1038 | : "r" (mem_r), "r" (i) |
1039 | : "%eax", "memory"); |
1040 | } |
1041 | /* |
1042 | * Use LFENCE to ensure all previous instructions are retired |
1043 | * before proceeding. |
1044 | */ |
1045 | rmb(); |
1046 | rdpmcl(hit_pmcnum, hits_after); |
1047 | rdpmcl(miss_pmcnum, miss_after); |
1048 | /* |
1049 | * Use LFENCE to ensure all previous instructions are retired |
1050 | * before proceeding. |
1051 | */ |
1052 | rmb(); |
1053 | /* Re-enable hardware prefetchers */ |
1054 | wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); |
1055 | local_irq_enable(); |
1056 | out_hit: |
1057 | perf_event_release_kernel(hit_event); |
1058 | out_miss: |
1059 | perf_event_release_kernel(miss_event); |
1060 | out: |
1061 | /* |
1062 | * All counts will be zero on failure. |
1063 | */ |
1064 | counts->miss_before = miss_before; |
1065 | counts->hits_before = hits_before; |
1066 | counts->miss_after = miss_after; |
1067 | counts->hits_after = hits_after; |
1068 | return 0; |
1069 | } |
1070 | |
1071 | static int measure_l2_residency(void *_plr) |
1072 | { |
1073 | struct pseudo_lock_region *plr = _plr; |
1074 | struct residency_counts counts = {0}; |
1075 | |
1076 | /* |
1077 | * Non-architectural event for the Goldmont Microarchitecture |
1078 | * from Intel x86 Architecture Software Developer Manual (SDM): |
1079 | * MEM_LOAD_UOPS_RETIRED D1H (event number) |
1080 | * Umask values: |
1081 | * L2_HIT 02H |
1082 | * L2_MISS 10H |
1083 | */ |
1084 | switch (boot_cpu_data.x86_model) { |
1085 | case INTEL_FAM6_ATOM_GOLDMONT: |
1086 | case INTEL_FAM6_ATOM_GOLDMONT_PLUS: |
1087 | perf_miss_attr.config = X86_CONFIG(.event = 0xd1, |
1088 | .umask = 0x10); |
1089 | perf_hit_attr.config = X86_CONFIG(.event = 0xd1, |
1090 | .umask = 0x2); |
1091 | break; |
1092 | default: |
1093 | goto out; |
1094 | } |
1095 | |
1096 | measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); |
1097 | /* |
1098 | * If a failure prevented the measurements from succeeding |
1099 | * tracepoints will still be written and all counts will be zero. |
1100 | */ |
1101 | trace_pseudo_lock_l2(counts.hits_after - counts.hits_before, |
1102 | counts.miss_after - counts.miss_before); |
1103 | out: |
1104 | plr->thread_done = 1; |
1105 | wake_up_interruptible(&plr->lock_thread_wq); |
1106 | return 0; |
1107 | } |
1108 | |
1109 | static int measure_l3_residency(void *_plr) |
1110 | { |
1111 | struct pseudo_lock_region *plr = _plr; |
1112 | struct residency_counts counts = {0}; |
1113 | |
1114 | /* |
1115 | * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event |
1116 | * has two "no fix" errata associated with it: BDM35 and BDM100. On |
1117 | * this platform the following events are used instead: |
1118 | * LONGEST_LAT_CACHE 2EH (Documented in SDM) |
1119 | * REFERENCE 4FH |
1120 | * MISS 41H |
1121 | */ |
1122 | |
1123 | switch (boot_cpu_data.x86_model) { |
1124 | case INTEL_FAM6_BROADWELL_X: |
1125 | /* On BDW the hit event counts references, not hits */ |
1126 | perf_hit_attr.config = X86_CONFIG(.event = 0x2e, |
1127 | .umask = 0x4f); |
1128 | perf_miss_attr.config = X86_CONFIG(.event = 0x2e, |
1129 | .umask = 0x41); |
1130 | break; |
1131 | default: |
1132 | goto out; |
1133 | } |
1134 | |
1135 | measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts); |
1136 | /* |
1137 | * If a failure prevented the measurements from succeeding |
1138 | * tracepoints will still be written and all counts will be zero. |
1139 | */ |
1140 | |
1141 | counts.miss_after -= counts.miss_before; |
1142 | if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { |
1143 | /* |
1144 | * On BDW references and misses are counted, need to adjust. |
1145 | * Sometimes the "hits" counter is a bit more than the |
1146 | * references, for example, x references but x + 1 hits. |
1147 | * To not report invalid hit values in this case we treat |
1148 | * that as misses equal to references. |
1149 | */ |
1150 | /* First compute the number of cache references measured */ |
1151 | counts.hits_after -= counts.hits_before; |
1152 | /* Next convert references to cache hits */ |
1153 | counts.hits_after -= min(counts.miss_after, counts.hits_after); |
1154 | } else { |
1155 | counts.hits_after -= counts.hits_before; |
1156 | } |
1157 | |
1158 | trace_pseudo_lock_l3(counts.hits_after, counts.miss_after); |
1159 | out: |
1160 | plr->thread_done = 1; |
1161 | wake_up_interruptible(&plr->lock_thread_wq); |
1162 | return 0; |
1163 | } |
1164 | |
1165 | /** |
1166 | * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region |
1167 | * @rdtgrp: Resource group to which the pseudo-locked region belongs. |
1168 | * @sel: Selector of which measurement to perform on a pseudo-locked region. |
1169 | * |
1170 | * The measurement of latency to access a pseudo-locked region should be |
1171 | * done from a cpu that is associated with that pseudo-locked region. |
1172 | * Determine which cpu is associated with this region and start a thread on |
1173 | * that cpu to perform the measurement, wait for that thread to complete. |
1174 | * |
1175 | * Return: 0 on success, <0 on failure |
1176 | */ |
1177 | static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) |
1178 | { |
1179 | struct pseudo_lock_region *plr = rdtgrp->plr; |
1180 | struct task_struct *thread; |
1181 | unsigned int cpu; |
1182 | int ret = -1; |
1183 | |
1184 | cpus_read_lock(); |
1185 | mutex_lock(&rdtgroup_mutex); |
1186 | |
1187 | if (rdtgrp->flags & RDT_DELETED) { |
1188 | ret = -ENODEV; |
1189 | goto out; |
1190 | } |
1191 | |
1192 | if (!plr->d) { |
1193 | ret = -ENODEV; |
1194 | goto out; |
1195 | } |
1196 | |
1197 | plr->thread_done = 0; |
1198 | cpu = cpumask_first(&plr->d->cpu_mask); |
1199 | if (!cpu_online(cpu)) { |
1200 | ret = -ENODEV; |
1201 | goto out; |
1202 | } |
1203 | |
1204 | plr->cpu = cpu; |
1205 | |
1206 | if (sel == 1) |
1207 | thread = kthread_create_on_node(measure_cycles_lat_fn, plr, |
1208 | cpu_to_node(cpu), |
1209 | "pseudo_lock_measure/%u", |
1210 | cpu); |
1211 | else if (sel == 2) |
1212 | thread = kthread_create_on_node(measure_l2_residency, plr, |
1213 | cpu_to_node(cpu), |
1214 | "pseudo_lock_measure/%u", |
1215 | cpu); |
1216 | else if (sel == 3) |
1217 | thread = kthread_create_on_node(measure_l3_residency, plr, |
1218 | cpu_to_node(cpu), |
1219 | "pseudo_lock_measure/%u", |
1220 | cpu); |
1221 | else |
1222 | goto out; |
1223 | |
1224 | if (IS_ERR(thread)) { |
1225 | ret = PTR_ERR(thread); |
1226 | goto out; |
1227 | } |
1228 | kthread_bind(thread, cpu); |
1229 | wake_up_process(thread); |
1230 | |
1231 | ret = wait_event_interruptible(plr->lock_thread_wq, |
1232 | plr->thread_done == 1); |
1233 | if (ret < 0) |
1234 | goto out; |
1235 | |
1236 | ret = 0; |
1237 | |
1238 | out: |
1239 | mutex_unlock(&rdtgroup_mutex); |
1240 | cpus_read_unlock(); |
1241 | return ret; |
1242 | } |
1243 | |
1244 | static ssize_t pseudo_lock_measure_trigger(struct file *file, |
1245 | const char __user *user_buf, |
1246 | size_t count, loff_t *ppos) |
1247 | { |
1248 | struct rdtgroup *rdtgrp = file->private_data; |
1249 | size_t buf_size; |
1250 | char buf[32]; |
1251 | int ret; |
1252 | int sel; |
1253 | |
1254 | buf_size = min(count, (sizeof(buf) - 1)); |
1255 | if (copy_from_user(buf, user_buf, buf_size)) |
1256 | return -EFAULT; |
1257 | |
1258 | buf[buf_size] = '\0'; |
1259 | ret = kstrtoint(buf, 10, &sel); |
1260 | if (ret == 0) { |
1261 | if (sel != 1 && sel != 2 && sel != 3) |
1262 | return -EINVAL; |
1263 | ret = debugfs_file_get(file->f_path.dentry); |
1264 | if (ret) |
1265 | return ret; |
1266 | ret = pseudo_lock_measure_cycles(rdtgrp, sel); |
1267 | if (ret == 0) |
1268 | ret = count; |
1269 | debugfs_file_put(file->f_path.dentry); |
1270 | } |
1271 | |
1272 | return ret; |
1273 | } |
1274 | |
1275 | static const struct file_operations pseudo_measure_fops = { |
1276 | .write = pseudo_lock_measure_trigger, |
1277 | .open = simple_open, |
1278 | .llseek = default_llseek, |
1279 | }; |
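/*
 * Writing "1", "2" or "3" to the pseudo_lock_measure debugfs file of a
 * pseudo-locked region runs measure_cycles_lat_fn(), measure_l2_residency()
 * or measure_l3_residency() respectively, with the results reported via
 * tracepoints. For example, assuming debugfs is mounted at
 * /sys/kernel/debug:
 *
 *   echo 1 > /sys/kernel/debug/resctrl/<group>/pseudo_lock_measure
 */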
1280 | |
1281 | /** |
1282 | * rdtgroup_pseudo_lock_create - Create a pseudo-locked region |
1283 | * @rdtgrp: resource group to which pseudo-lock region belongs |
1284 | * |
1285 | * Called when a resource group in the pseudo-locksetup mode receives a |
1286 | * valid schemata that should be pseudo-locked. Since the resource group is |
1287 | * in pseudo-locksetup mode the &struct pseudo_lock_region has already been |
1288 | * allocated and initialized with the essential information. If a failure |
1289 | * occurs the resource group remains in the pseudo-locksetup mode with the |
1290 | * &struct pseudo_lock_region associated with it, but cleared from all |
1291 | * information and ready for the user to re-attempt pseudo-locking by |
1292 | * writing the schemata again. |
1293 | * |
1294 | * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 |
1295 | * on failure. Descriptive error will be written to last_cmd_status buffer. |
1296 | */ |
1297 | int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) |
1298 | { |
1299 | struct pseudo_lock_region *plr = rdtgrp->plr; |
1300 | struct task_struct *thread; |
1301 | unsigned int new_minor; |
1302 | struct device *dev; |
1303 | int ret; |
1304 | |
1305 | ret = pseudo_lock_region_alloc(plr); |
1306 | if (ret < 0) |
1307 | return ret; |
1308 | |
1309 | ret = pseudo_lock_cstates_constrain(plr); |
1310 | if (ret < 0) { |
1311 | ret = -EINVAL; |
1312 | goto out_region; |
1313 | } |
1314 | |
1315 | plr->thread_done = 0; |
1316 | |
1317 | thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, |
1318 | cpu_to_node(plr->cpu), |
1319 | "pseudo_lock/%u", plr->cpu); |
1320 | if (IS_ERR(thread)) { |
1321 | ret = PTR_ERR(thread); |
1322 | rdt_last_cmd_printf("Locking thread returned error %d\n", ret); |
1323 | goto out_cstates; |
1324 | } |
1325 | |
1326 | kthread_bind(thread, plr->cpu); |
1327 | wake_up_process(thread); |
1328 | |
1329 | ret = wait_event_interruptible(plr->lock_thread_wq, |
1330 | plr->thread_done == 1); |
1331 | if (ret < 0) { |
1332 | /* |
1333 | * If the thread does not get on the CPU for whatever |
1334 | * reason and the process which sets up the region is |
1335 | * interrupted then this will leave the thread in runnable |
1336 | * state and once it gets on the CPU it will dereference |
1337 | * the cleared, but not freed, plr struct resulting in an |
1338 | * empty pseudo-locking loop. |
1339 | */ |
1340 | rdt_last_cmd_puts("Locking thread interrupted\n"); |
1341 | goto out_cstates; |
1342 | } |
1343 | |
1344 | ret = pseudo_lock_minor_get(&new_minor); |
1345 | if (ret < 0) { |
1346 | rdt_last_cmd_puts("Unable to obtain a new minor number\n"); |
1347 | goto out_cstates; |
1348 | } |
1349 | |
1350 | /* |
1351 | * Unlock access but do not release the reference. The |
1352 | * pseudo-locked region will still be here on return. |
1353 | * |
1354 | * The mutex has to be released temporarily to avoid a potential |
1355 | * deadlock with the mm->mmap_lock which is obtained in the |
1356 | * device_create() and debugfs_create_dir() callpath below as well as |
1357 | * before the mmap() callback is called. |
1358 | */ |
1359 | mutex_unlock(&rdtgroup_mutex); |
1360 | |
1361 | if (!IS_ERR_OR_NULL(debugfs_resctrl)) { |
1362 | plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, |
1363 | debugfs_resctrl); |
1364 | if (!IS_ERR_OR_NULL(plr->debugfs_dir)) |
1365 | debugfs_create_file("pseudo_lock_measure", 0200, |
1366 | plr->debugfs_dir, rdtgrp, |
1367 | &pseudo_measure_fops); |
1368 | } |
1369 | |
1370 | dev = device_create(&pseudo_lock_class, NULL, |
1371 | MKDEV(pseudo_lock_major, new_minor), |
1372 | rdtgrp, "%s", rdtgrp->kn->name); |
1373 | |
1374 | mutex_lock(&rdtgroup_mutex); |
1375 | |
1376 | if (IS_ERR(dev)) { |
1377 | ret = PTR_ERR(dev); |
1378 | rdt_last_cmd_printf("Failed to create character device: %d\n", |
1379 | ret); |
1380 | goto out_debugfs; |
1381 | } |
1382 | |
1383 | /* We released the mutex - check if group was removed while we did so */ |
1384 | if (rdtgrp->flags & RDT_DELETED) { |
1385 | ret = -ENODEV; |
1386 | goto out_device; |
1387 | } |
1388 | |
1389 | plr->minor = new_minor; |
1390 | |
1391 | rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; |
1392 | closid_free(rdtgrp->closid); |
1393 | rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); |
1394 | rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); |
1395 | |
1396 | ret = 0; |
1397 | goto out; |
1398 | |
1399 | out_device: |
1400 | device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); |
1401 | out_debugfs: |
1402 | debugfs_remove_recursive(plr->debugfs_dir); |
1403 | pseudo_lock_minor_release(new_minor); |
1404 | out_cstates: |
1405 | pseudo_lock_cstates_relax(plr); |
1406 | out_region: |
1407 | pseudo_lock_region_clear(plr); |
1408 | out: |
1409 | return ret; |
1410 | } |
1411 | |
1412 | /** |
1413 | * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region |
1414 | * @rdtgrp: resource group to which the pseudo-locked region belongs |
1415 | * |
1416 | * The removal of a pseudo-locked region can be initiated when the resource |
1417 | * group is removed via a "rmdir" from userspace or the |
1418 | * unmount of the resctrl filesystem. On removal the resource group does |
1419 | * not go back to pseudo-locksetup mode before it is removed, instead it is |
1420 | * removed directly. There is thus asymmetry with the creation where the |
1421 | * &struct pseudo_lock_region is removed here while it was not created in |
1422 | * rdtgroup_pseudo_lock_create(). |
1423 | * |
1424 | * Return: void |
1425 | */ |
1426 | void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) |
1427 | { |
1428 | struct pseudo_lock_region *plr = rdtgrp->plr; |
1429 | |
1430 | if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { |
1431 | /* |
1432 | * Default group cannot be a pseudo-locked region so we can |
1433 | * free closid here. |
1434 | */ |
1435 | closid_free(rdtgrp->closid); |
1436 | goto free; |
1437 | } |
1438 | |
1439 | pseudo_lock_cstates_relax(plr); |
1440 | debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); |
1441 | device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); |
1442 | pseudo_lock_minor_release(plr->minor); |
1443 | |
1444 | free: |
1445 | pseudo_lock_free(rdtgrp); |
1446 | } |
1447 | |
1448 | static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) |
1449 | { |
1450 | struct rdtgroup *rdtgrp; |
1451 | |
1452 | mutex_lock(&rdtgroup_mutex); |
1453 | |
1454 | rdtgrp = region_find_by_minor(iminor(inode)); |
1455 | if (!rdtgrp) { |
1456 | mutex_unlock(&rdtgroup_mutex); |
1457 | return -ENODEV; |
1458 | } |
1459 | |
1460 | filp->private_data = rdtgrp; |
1461 | atomic_inc(&rdtgrp->waitcount); |
1462 | /* Perform a non-seekable open - llseek is not supported */ |
1463 | filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); |
1464 | |
1465 | mutex_unlock(&rdtgroup_mutex); |
1466 | |
1467 | return 0; |
1468 | } |
1469 | |
1470 | static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) |
1471 | { |
1472 | struct rdtgroup *rdtgrp; |
1473 | |
1474 | mutex_lock(&rdtgroup_mutex); |
1475 | rdtgrp = filp->private_data; |
1476 | WARN_ON(!rdtgrp); |
1477 | if (!rdtgrp) { |
1478 | mutex_unlock(&rdtgroup_mutex); |
1479 | return -ENODEV; |
1480 | } |
1481 | filp->private_data = NULL; |
1482 | atomic_dec(&rdtgrp->waitcount); |
1483 | mutex_unlock(&rdtgroup_mutex); |
1484 | return 0; |
1485 | } |
1486 | |
1487 | static int pseudo_lock_dev_mremap(struct vm_area_struct *area) |
1488 | { |
1489 | /* Not supported */ |
1490 | return -EINVAL; |
1491 | } |
1492 | |
1493 | static const struct vm_operations_struct pseudo_mmap_ops = { |
1494 | .mremap = pseudo_lock_dev_mremap, |
1495 | }; |
1496 | |
1497 | static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) |
1498 | { |
1499 | unsigned long vsize = vma->vm_end - vma->vm_start; |
1500 | unsigned long off = vma->vm_pgoff << PAGE_SHIFT; |
1501 | struct pseudo_lock_region *plr; |
1502 | struct rdtgroup *rdtgrp; |
1503 | unsigned long physical; |
1504 | unsigned long psize; |
1505 | |
1506 | mutex_lock(&rdtgroup_mutex); |
1507 | |
1508 | rdtgrp = filp->private_data; |
1509 | WARN_ON(!rdtgrp); |
1510 | if (!rdtgrp) { |
1511 | mutex_unlock(&rdtgroup_mutex); |
1512 | return -ENODEV; |
1513 | } |
1514 | |
1515 | plr = rdtgrp->plr; |
1516 | |
1517 | if (!plr->d) { |
1518 | mutex_unlock(&rdtgroup_mutex); |
1519 | return -ENODEV; |
1520 | } |
1521 | |
1522 | /* |
1523 | * Task is required to run with affinity to the cpus associated |
1524 | * with the pseudo-locked region. If this is not the case the task |
1525 | * may be scheduled elsewhere and invalidate entries in the |
1526 | * pseudo-locked region. |
1527 | */ |
1528 | if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { |
1529 | mutex_unlock(&rdtgroup_mutex); |
1530 | return -EINVAL; |
1531 | } |
1532 | |
1533 | physical = __pa(plr->kmem) >> PAGE_SHIFT; |
1534 | psize = plr->size - off; |
1535 | |
1536 | if (off > plr->size) { |
1537 | mutex_unlock(&rdtgroup_mutex); |
1538 | return -ENOSPC; |
1539 | } |
1540 | |
1541 | /* |
1542 | * Ensure changes are carried directly to the memory being mapped, |
1543 | * do not allow copy-on-write mapping. |
1544 | */ |
1545 | if (!(vma->vm_flags & VM_SHARED)) { |
1546 | mutex_unlock(&rdtgroup_mutex); |
1547 | return -EINVAL; |
1548 | } |
1549 | |
1550 | if (vsize > psize) { |
1551 | mutex_unlock(&rdtgroup_mutex); |
1552 | return -ENOSPC; |
1553 | } |
1554 | |
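/*
 * Zero the range being mapped so a new mapping never observes data
 * left behind by a previous user of the pseudo-locked region.
 */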
1555 | memset(plr->kmem + off, 0, vsize); |
1556 | |
1557 | if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, |
1558 | vsize, vma->vm_page_prot)) { |
1559 | mutex_unlock(&rdtgroup_mutex); |
1560 | return -EAGAIN; |
1561 | } |
1562 | vma->vm_ops = &pseudo_mmap_ops; |
1563 | mutex_unlock(&rdtgroup_mutex); |
1564 | return 0; |
1565 | } |
1566 | |
1567 | static const struct file_operations pseudo_lock_dev_fops = { |
1568 | .owner = THIS_MODULE, |
1569 | .llseek = no_llseek, |
1570 | .read = NULL, |
1571 | .write = NULL, |
1572 | .open = pseudo_lock_dev_open, |
1573 | .release = pseudo_lock_dev_release, |
1574 | .mmap = pseudo_lock_dev_mmap, |
1575 | }; |
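/*
 * Userspace consumes a pseudo-locked region through the character device
 * created for it. A minimal sketch (device name follows the resource group
 * name):
 *
 *   fd = open("/dev/pseudo_lock/<group>", O_RDWR);
 *   // Affinity must be a subset of the CPUs of the cache instance,
 *   // otherwise the mmap() below fails with -EINVAL.
 *   sched_setaffinity(0, sizeof(mask), &mask);
 *   ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * MAP_SHARED is required; pseudo_lock_dev_mmap() rejects copy-on-write
 * mappings.
 */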
1576 | |
1577 | int rdt_pseudo_lock_init(void) |
1578 | { |
1579 | int ret; |
1580 | |
1581 | ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); |
1582 | if (ret < 0) |
1583 | return ret; |
1584 | |
1585 | pseudo_lock_major = ret; |
1586 | |
1587 | ret = class_register(&pseudo_lock_class); |
1588 | if (ret) { |
1589 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); |
1590 | return ret; |
1591 | } |
1592 | |
1593 | return 0; |
1594 | } |
1595 | |
1596 | void rdt_pseudo_lock_release(void) |
1597 | { |
1598 | class_unregister(&pseudo_lock_class); |
1599 | unregister_chrdev(pseudo_lock_major, "pseudo_lock"); |
1600 | pseudo_lock_major = 0; |
1601 | } |
1602 | |