1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Ram backed block device driver. |
4 | * |
5 | * Copyright (C) 2007 Nick Piggin |
6 | * Copyright (C) 2007 Novell Inc. |
7 | * |
8 | * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright |
9 | * of their respective owners. |
10 | */ |
11 | |
12 | #include <linux/init.h> |
13 | #include <linux/initrd.h> |
14 | #include <linux/module.h> |
15 | #include <linux/moduleparam.h> |
16 | #include <linux/major.h> |
17 | #include <linux/blkdev.h> |
18 | #include <linux/bio.h> |
19 | #include <linux/highmem.h> |
20 | #include <linux/mutex.h> |
21 | #include <linux/pagemap.h> |
22 | #include <linux/xarray.h> |
23 | #include <linux/fs.h> |
24 | #include <linux/slab.h> |
25 | #include <linux/backing-dev.h> |
26 | #include <linux/debugfs.h> |
27 | |
28 | #include <linux/uaccess.h> |
29 | |
30 | /* |
31 | * Each block ramdisk device has a xarray brd_pages of pages that stores |
32 | * the pages containing the block device's contents. A brd page's ->index is |
33 | * its offset in PAGE_SIZE units. This is similar to, but in no way connected |
34 | * with, the kernel's pagecache or buffer cache (which sit above our block |
35 | * device). |
36 | */ |
37 | struct brd_device { |
38 | int brd_number; |
39 | struct gendisk *brd_disk; |
40 | struct list_head brd_list; |
41 | |
42 | /* |
43 | * Backing store of pages. This is the contents of the block device. |
44 | */ |
45 | struct xarray brd_pages; |
46 | u64 brd_nr_pages; |
47 | }; |
48 | |
49 | /* |
50 | * Look up and return a brd's page for a given sector. |
51 | */ |
52 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) |
53 | { |
54 | pgoff_t idx; |
55 | struct page *page; |
56 | |
57 | idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ |
58 | page = xa_load(&brd->brd_pages, index: idx); |
59 | |
60 | BUG_ON(page && page->index != idx); |
61 | |
62 | return page; |
63 | } |
64 | |
65 | /* |
66 | * Insert a new page for a given sector, if one does not already exist. |
67 | */ |
68 | static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) |
69 | { |
70 | pgoff_t idx; |
71 | struct page *page, *cur; |
72 | int ret = 0; |
73 | |
74 | page = brd_lookup_page(brd, sector); |
75 | if (page) |
76 | return 0; |
77 | |
78 | page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); |
79 | if (!page) |
80 | return -ENOMEM; |
81 | |
82 | xa_lock(&brd->brd_pages); |
83 | |
84 | idx = sector >> PAGE_SECTORS_SHIFT; |
85 | page->index = idx; |
86 | |
87 | cur = __xa_cmpxchg(&brd->brd_pages, index: idx, NULL, entry: page, gfp); |
88 | |
89 | if (unlikely(cur)) { |
90 | __free_page(page); |
91 | ret = xa_err(entry: cur); |
92 | if (!ret && (cur->index != idx)) |
93 | ret = -EIO; |
94 | } else { |
95 | brd->brd_nr_pages++; |
96 | } |
97 | |
98 | xa_unlock(&brd->brd_pages); |
99 | |
100 | return ret; |
101 | } |
102 | |
103 | /* |
104 | * Free all backing store pages and xarray. This must only be called when |
105 | * there are no other users of the device. |
106 | */ |
107 | static void brd_free_pages(struct brd_device *brd) |
108 | { |
109 | struct page *page; |
110 | pgoff_t idx; |
111 | |
112 | xa_for_each(&brd->brd_pages, idx, page) { |
113 | __free_page(page); |
114 | cond_resched(); |
115 | } |
116 | |
117 | xa_destroy(&brd->brd_pages); |
118 | } |
119 | |
120 | /* |
121 | * copy_to_brd_setup must be called before copy_to_brd. It may sleep. |
122 | */ |
123 | static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n, |
124 | gfp_t gfp) |
125 | { |
126 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
127 | size_t copy; |
128 | int ret; |
129 | |
130 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
131 | ret = brd_insert_page(brd, sector, gfp); |
132 | if (ret) |
133 | return ret; |
134 | if (copy < n) { |
135 | sector += copy >> SECTOR_SHIFT; |
136 | ret = brd_insert_page(brd, sector, gfp); |
137 | } |
138 | return ret; |
139 | } |
140 | |
141 | /* |
142 | * Copy n bytes from src to the brd starting at sector. Does not sleep. |
143 | */ |
144 | static void copy_to_brd(struct brd_device *brd, const void *src, |
145 | sector_t sector, size_t n) |
146 | { |
147 | struct page *page; |
148 | void *dst; |
149 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
150 | size_t copy; |
151 | |
152 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
153 | page = brd_lookup_page(brd, sector); |
154 | BUG_ON(!page); |
155 | |
156 | dst = kmap_atomic(page); |
157 | memcpy(dst + offset, src, copy); |
158 | kunmap_atomic(dst); |
159 | |
160 | if (copy < n) { |
161 | src += copy; |
162 | sector += copy >> SECTOR_SHIFT; |
163 | copy = n - copy; |
164 | page = brd_lookup_page(brd, sector); |
165 | BUG_ON(!page); |
166 | |
167 | dst = kmap_atomic(page); |
168 | memcpy(dst, src, copy); |
169 | kunmap_atomic(dst); |
170 | } |
171 | } |
172 | |
173 | /* |
174 | * Copy n bytes to dst from the brd starting at sector. Does not sleep. |
175 | */ |
176 | static void copy_from_brd(void *dst, struct brd_device *brd, |
177 | sector_t sector, size_t n) |
178 | { |
179 | struct page *page; |
180 | void *src; |
181 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
182 | size_t copy; |
183 | |
184 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
185 | page = brd_lookup_page(brd, sector); |
186 | if (page) { |
187 | src = kmap_atomic(page); |
188 | memcpy(dst, src + offset, copy); |
189 | kunmap_atomic(src); |
190 | } else |
191 | memset(dst, 0, copy); |
192 | |
193 | if (copy < n) { |
194 | dst += copy; |
195 | sector += copy >> SECTOR_SHIFT; |
196 | copy = n - copy; |
197 | page = brd_lookup_page(brd, sector); |
198 | if (page) { |
199 | src = kmap_atomic(page); |
200 | memcpy(dst, src, copy); |
201 | kunmap_atomic(src); |
202 | } else |
203 | memset(dst, 0, copy); |
204 | } |
205 | } |
206 | |
207 | /* |
208 | * Process a single bvec of a bio. |
209 | */ |
210 | static int brd_do_bvec(struct brd_device *brd, struct page *page, |
211 | unsigned int len, unsigned int off, blk_opf_t opf, |
212 | sector_t sector) |
213 | { |
214 | void *mem; |
215 | int err = 0; |
216 | |
217 | if (op_is_write(op: opf)) { |
218 | /* |
219 | * Must use NOIO because we don't want to recurse back into the |
220 | * block or filesystem layers from page reclaim. |
221 | */ |
222 | gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO; |
223 | |
224 | err = copy_to_brd_setup(brd, sector, n: len, gfp); |
225 | if (err) |
226 | goto out; |
227 | } |
228 | |
229 | mem = kmap_atomic(page); |
230 | if (!op_is_write(op: opf)) { |
231 | copy_from_brd(dst: mem + off, brd, sector, n: len); |
232 | flush_dcache_page(page); |
233 | } else { |
234 | flush_dcache_page(page); |
235 | copy_to_brd(brd, src: mem + off, sector, n: len); |
236 | } |
237 | kunmap_atomic(mem); |
238 | |
239 | out: |
240 | return err; |
241 | } |
242 | |
243 | static void brd_submit_bio(struct bio *bio) |
244 | { |
245 | struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; |
246 | sector_t sector = bio->bi_iter.bi_sector; |
247 | struct bio_vec bvec; |
248 | struct bvec_iter iter; |
249 | |
250 | bio_for_each_segment(bvec, bio, iter) { |
251 | unsigned int len = bvec.bv_len; |
252 | int err; |
253 | |
254 | /* Don't support un-aligned buffer */ |
255 | WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || |
256 | (len & (SECTOR_SIZE - 1))); |
257 | |
258 | err = brd_do_bvec(brd, page: bvec.bv_page, len, off: bvec.bv_offset, |
259 | opf: bio->bi_opf, sector); |
260 | if (err) { |
261 | if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) { |
262 | bio_wouldblock_error(bio); |
263 | return; |
264 | } |
265 | bio_io_error(bio); |
266 | return; |
267 | } |
268 | sector += len >> SECTOR_SHIFT; |
269 | } |
270 | |
271 | bio_endio(bio); |
272 | } |
273 | |
274 | static const struct block_device_operations brd_fops = { |
275 | .owner = THIS_MODULE, |
276 | .submit_bio = brd_submit_bio, |
277 | }; |
278 | |
279 | /* |
280 | * And now the modules code and kernel interface. |
281 | */ |
282 | static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; |
283 | module_param(rd_nr, int, 0444); |
284 | MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices" ); |
285 | |
286 | unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE; |
287 | module_param(rd_size, ulong, 0444); |
288 | MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes." ); |
289 | |
290 | static int max_part = 1; |
291 | module_param(max_part, int, 0444); |
292 | MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices" ); |
293 | |
294 | MODULE_LICENSE("GPL" ); |
295 | MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); |
296 | MODULE_ALIAS("rd" ); |
297 | |
298 | #ifndef MODULE |
299 | /* Legacy boot options - nonmodular */ |
300 | static int __init ramdisk_size(char *str) |
301 | { |
302 | rd_size = simple_strtol(str, NULL, 0); |
303 | return 1; |
304 | } |
305 | __setup("ramdisk_size=" , ramdisk_size); |
306 | #endif |
307 | |
/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */

/* All allocated brd devices, linked through brd_device::brd_list. */
static LIST_HEAD(brd_devices);
/* debugfs directory that holds one page-count file per device. */
static struct dentry *brd_debugfs_dir;
314 | |
315 | static int brd_alloc(int i) |
316 | { |
317 | struct brd_device *brd; |
318 | struct gendisk *disk; |
319 | char buf[DISK_NAME_LEN]; |
320 | int err = -ENOMEM; |
321 | |
322 | list_for_each_entry(brd, &brd_devices, brd_list) |
323 | if (brd->brd_number == i) |
324 | return -EEXIST; |
325 | brd = kzalloc(size: sizeof(*brd), GFP_KERNEL); |
326 | if (!brd) |
327 | return -ENOMEM; |
328 | brd->brd_number = i; |
329 | list_add_tail(new: &brd->brd_list, head: &brd_devices); |
330 | |
331 | xa_init(xa: &brd->brd_pages); |
332 | |
333 | snprintf(buf, DISK_NAME_LEN, fmt: "ram%d" , i); |
334 | if (!IS_ERR_OR_NULL(ptr: brd_debugfs_dir)) |
335 | debugfs_create_u64(name: buf, mode: 0444, parent: brd_debugfs_dir, |
336 | value: &brd->brd_nr_pages); |
337 | |
338 | disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); |
339 | if (!disk) |
340 | goto out_free_dev; |
341 | |
342 | disk->major = RAMDISK_MAJOR; |
343 | disk->first_minor = i * max_part; |
344 | disk->minors = max_part; |
345 | disk->fops = &brd_fops; |
346 | disk->private_data = brd; |
347 | strscpy(p: disk->disk_name, q: buf, DISK_NAME_LEN); |
348 | set_capacity(disk, size: rd_size * 2); |
349 | |
350 | /* |
351 | * This is so fdisk will align partitions on 4k, because of |
352 | * direct_access API needing 4k alignment, returning a PFN |
353 | * (This is only a problem on very small devices <= 4M, |
354 | * otherwise fdisk will align on 1M. Regardless this call |
355 | * is harmless) |
356 | */ |
357 | blk_queue_physical_block_size(disk->queue, PAGE_SIZE); |
358 | |
359 | /* Tell the block layer that this is not a rotational device */ |
360 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q: disk->queue); |
361 | blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q: disk->queue); |
362 | blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q: disk->queue); |
363 | err = add_disk(disk); |
364 | if (err) |
365 | goto out_cleanup_disk; |
366 | |
367 | return 0; |
368 | |
369 | out_cleanup_disk: |
370 | put_disk(disk); |
371 | out_free_dev: |
372 | list_del(entry: &brd->brd_list); |
373 | kfree(objp: brd); |
374 | return err; |
375 | } |
376 | |
377 | static void brd_probe(dev_t dev) |
378 | { |
379 | brd_alloc(MINOR(dev) / max_part); |
380 | } |
381 | |
382 | static void brd_cleanup(void) |
383 | { |
384 | struct brd_device *brd, *next; |
385 | |
386 | debugfs_remove_recursive(dentry: brd_debugfs_dir); |
387 | |
388 | list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { |
389 | del_gendisk(gp: brd->brd_disk); |
390 | put_disk(disk: brd->brd_disk); |
391 | brd_free_pages(brd); |
392 | list_del(entry: &brd->brd_list); |
393 | kfree(objp: brd); |
394 | } |
395 | } |
396 | |
397 | static inline void brd_check_and_reset_par(void) |
398 | { |
399 | if (unlikely(!max_part)) |
400 | max_part = 1; |
401 | |
402 | /* |
403 | * make sure 'max_part' can be divided exactly by (1U << MINORBITS), |
404 | * otherwise, it is possiable to get same dev_t when adding partitions. |
405 | */ |
406 | if ((1U << MINORBITS) % max_part != 0) |
407 | max_part = 1UL << fls(x: max_part); |
408 | |
409 | if (max_part > DISK_MAX_PARTS) { |
410 | pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n" , |
411 | DISK_MAX_PARTS, DISK_MAX_PARTS); |
412 | max_part = DISK_MAX_PARTS; |
413 | } |
414 | } |
415 | |
416 | static int __init brd_init(void) |
417 | { |
418 | int err, i; |
419 | |
420 | brd_check_and_reset_par(); |
421 | |
422 | brd_debugfs_dir = debugfs_create_dir(name: "ramdisk_pages" , NULL); |
423 | |
424 | for (i = 0; i < rd_nr; i++) { |
425 | err = brd_alloc(i); |
426 | if (err) |
427 | goto out_free; |
428 | } |
429 | |
430 | /* |
431 | * brd module now has a feature to instantiate underlying device |
432 | * structure on-demand, provided that there is an access dev node. |
433 | * |
434 | * (1) if rd_nr is specified, create that many upfront. else |
435 | * it defaults to CONFIG_BLK_DEV_RAM_COUNT |
436 | * (2) User can further extend brd devices by create dev node themselves |
437 | * and have kernel automatically instantiate actual device |
438 | * on-demand. Example: |
439 | * mknod /path/devnod_name b 1 X # 1 is the rd major |
440 | * fdisk -l /path/devnod_name |
441 | * If (X / max_part) was not already created it will be created |
442 | * dynamically. |
443 | */ |
444 | |
445 | if (__register_blkdev(RAMDISK_MAJOR, name: "ramdisk" , probe: brd_probe)) { |
446 | err = -EIO; |
447 | goto out_free; |
448 | } |
449 | |
450 | pr_info("brd: module loaded\n" ); |
451 | return 0; |
452 | |
453 | out_free: |
454 | brd_cleanup(); |
455 | |
456 | pr_info("brd: module NOT loaded !!!\n" ); |
457 | return err; |
458 | } |
459 | |
460 | static void __exit brd_exit(void) |
461 | { |
462 | |
463 | unregister_blkdev(RAMDISK_MAJOR, name: "ramdisk" ); |
464 | brd_cleanup(); |
465 | |
466 | pr_info("brd: module unloaded\n" ); |
467 | } |
468 | |
/* Module entry and exit points. */
module_init(brd_init);
module_exit(brd_exit);
471 | |
472 | |