1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Ram backed block device driver. |
4 | * |
5 | * Copyright (C) 2007 Nick Piggin |
6 | * Copyright (C) 2007 Novell Inc. |
7 | * |
8 | * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright |
9 | * of their respective owners. |
10 | */ |
11 | |
12 | #include <linux/init.h> |
13 | #include <linux/initrd.h> |
14 | #include <linux/module.h> |
15 | #include <linux/moduleparam.h> |
16 | #include <linux/major.h> |
17 | #include <linux/blkdev.h> |
18 | #include <linux/bio.h> |
19 | #include <linux/highmem.h> |
20 | #include <linux/mutex.h> |
21 | #include <linux/pagemap.h> |
22 | #include <linux/xarray.h> |
23 | #include <linux/fs.h> |
24 | #include <linux/slab.h> |
25 | #include <linux/backing-dev.h> |
26 | #include <linux/debugfs.h> |
27 | |
28 | #include <linux/uaccess.h> |
29 | |
30 | /* |
31 | * Each block ramdisk device has a xarray brd_pages of pages that stores |
32 | * the pages containing the block device's contents. A brd page's ->index is |
33 | * its offset in PAGE_SIZE units. This is similar to, but in no way connected |
34 | * with, the kernel's pagecache or buffer cache (which sit above our block |
35 | * device). |
36 | */ |
37 | struct brd_device { |
38 | int brd_number; |
39 | struct gendisk *brd_disk; |
40 | struct list_head brd_list; |
41 | |
42 | /* |
43 | * Backing store of pages. This is the contents of the block device. |
44 | */ |
45 | struct xarray brd_pages; |
46 | u64 brd_nr_pages; |
47 | }; |
48 | |
49 | /* |
50 | * Look up and return a brd's page for a given sector. |
51 | */ |
52 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) |
53 | { |
54 | pgoff_t idx; |
55 | struct page *page; |
56 | |
57 | idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ |
58 | page = xa_load(&brd->brd_pages, index: idx); |
59 | |
60 | BUG_ON(page && page->index != idx); |
61 | |
62 | return page; |
63 | } |
64 | |
65 | /* |
66 | * Insert a new page for a given sector, if one does not already exist. |
67 | */ |
68 | static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) |
69 | { |
70 | pgoff_t idx; |
71 | struct page *page, *cur; |
72 | int ret = 0; |
73 | |
74 | page = brd_lookup_page(brd, sector); |
75 | if (page) |
76 | return 0; |
77 | |
78 | page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); |
79 | if (!page) |
80 | return -ENOMEM; |
81 | |
82 | xa_lock(&brd->brd_pages); |
83 | |
84 | idx = sector >> PAGE_SECTORS_SHIFT; |
85 | page->index = idx; |
86 | |
87 | cur = __xa_cmpxchg(&brd->brd_pages, index: idx, NULL, entry: page, gfp); |
88 | |
89 | if (unlikely(cur)) { |
90 | __free_page(page); |
91 | ret = xa_err(entry: cur); |
92 | if (!ret && (cur->index != idx)) |
93 | ret = -EIO; |
94 | } else { |
95 | brd->brd_nr_pages++; |
96 | } |
97 | |
98 | xa_unlock(&brd->brd_pages); |
99 | |
100 | return ret; |
101 | } |
102 | |
103 | /* |
104 | * Free all backing store pages and xarray. This must only be called when |
105 | * there are no other users of the device. |
106 | */ |
107 | static void brd_free_pages(struct brd_device *brd) |
108 | { |
109 | struct page *page; |
110 | pgoff_t idx; |
111 | |
112 | xa_for_each(&brd->brd_pages, idx, page) { |
113 | __free_page(page); |
114 | cond_resched(); |
115 | } |
116 | |
117 | xa_destroy(&brd->brd_pages); |
118 | } |
119 | |
120 | /* |
121 | * copy_to_brd_setup must be called before copy_to_brd. It may sleep. |
122 | */ |
123 | static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n, |
124 | gfp_t gfp) |
125 | { |
126 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
127 | size_t copy; |
128 | int ret; |
129 | |
130 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
131 | ret = brd_insert_page(brd, sector, gfp); |
132 | if (ret) |
133 | return ret; |
134 | if (copy < n) { |
135 | sector += copy >> SECTOR_SHIFT; |
136 | ret = brd_insert_page(brd, sector, gfp); |
137 | } |
138 | return ret; |
139 | } |
140 | |
141 | /* |
142 | * Copy n bytes from src to the brd starting at sector. Does not sleep. |
143 | */ |
144 | static void copy_to_brd(struct brd_device *brd, const void *src, |
145 | sector_t sector, size_t n) |
146 | { |
147 | struct page *page; |
148 | void *dst; |
149 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
150 | size_t copy; |
151 | |
152 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
153 | page = brd_lookup_page(brd, sector); |
154 | BUG_ON(!page); |
155 | |
156 | dst = kmap_atomic(page); |
157 | memcpy(dst + offset, src, copy); |
158 | kunmap_atomic(dst); |
159 | |
160 | if (copy < n) { |
161 | src += copy; |
162 | sector += copy >> SECTOR_SHIFT; |
163 | copy = n - copy; |
164 | page = brd_lookup_page(brd, sector); |
165 | BUG_ON(!page); |
166 | |
167 | dst = kmap_atomic(page); |
168 | memcpy(dst, src, copy); |
169 | kunmap_atomic(dst); |
170 | } |
171 | } |
172 | |
173 | /* |
174 | * Copy n bytes to dst from the brd starting at sector. Does not sleep. |
175 | */ |
176 | static void copy_from_brd(void *dst, struct brd_device *brd, |
177 | sector_t sector, size_t n) |
178 | { |
179 | struct page *page; |
180 | void *src; |
181 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; |
182 | size_t copy; |
183 | |
184 | copy = min_t(size_t, n, PAGE_SIZE - offset); |
185 | page = brd_lookup_page(brd, sector); |
186 | if (page) { |
187 | src = kmap_atomic(page); |
188 | memcpy(dst, src + offset, copy); |
189 | kunmap_atomic(src); |
190 | } else |
191 | memset(dst, 0, copy); |
192 | |
193 | if (copy < n) { |
194 | dst += copy; |
195 | sector += copy >> SECTOR_SHIFT; |
196 | copy = n - copy; |
197 | page = brd_lookup_page(brd, sector); |
198 | if (page) { |
199 | src = kmap_atomic(page); |
200 | memcpy(dst, src, copy); |
201 | kunmap_atomic(src); |
202 | } else |
203 | memset(dst, 0, copy); |
204 | } |
205 | } |
206 | |
207 | /* |
208 | * Process a single bvec of a bio. |
209 | */ |
210 | static int brd_do_bvec(struct brd_device *brd, struct page *page, |
211 | unsigned int len, unsigned int off, blk_opf_t opf, |
212 | sector_t sector) |
213 | { |
214 | void *mem; |
215 | int err = 0; |
216 | |
217 | if (op_is_write(op: opf)) { |
218 | /* |
219 | * Must use NOIO because we don't want to recurse back into the |
220 | * block or filesystem layers from page reclaim. |
221 | */ |
222 | gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO; |
223 | |
224 | err = copy_to_brd_setup(brd, sector, n: len, gfp); |
225 | if (err) |
226 | goto out; |
227 | } |
228 | |
229 | mem = kmap_atomic(page); |
230 | if (!op_is_write(op: opf)) { |
231 | copy_from_brd(dst: mem + off, brd, sector, n: len); |
232 | flush_dcache_page(page); |
233 | } else { |
234 | flush_dcache_page(page); |
235 | copy_to_brd(brd, src: mem + off, sector, n: len); |
236 | } |
237 | kunmap_atomic(mem); |
238 | |
239 | out: |
240 | return err; |
241 | } |
242 | |
243 | static void brd_submit_bio(struct bio *bio) |
244 | { |
245 | struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; |
246 | sector_t sector = bio->bi_iter.bi_sector; |
247 | struct bio_vec bvec; |
248 | struct bvec_iter iter; |
249 | |
250 | bio_for_each_segment(bvec, bio, iter) { |
251 | unsigned int len = bvec.bv_len; |
252 | int err; |
253 | |
254 | /* Don't support un-aligned buffer */ |
255 | WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || |
256 | (len & (SECTOR_SIZE - 1))); |
257 | |
258 | err = brd_do_bvec(brd, page: bvec.bv_page, len, off: bvec.bv_offset, |
259 | opf: bio->bi_opf, sector); |
260 | if (err) { |
261 | if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) { |
262 | bio_wouldblock_error(bio); |
263 | return; |
264 | } |
265 | bio_io_error(bio); |
266 | return; |
267 | } |
268 | sector += len >> SECTOR_SHIFT; |
269 | } |
270 | |
271 | bio_endio(bio); |
272 | } |
273 | |
274 | static const struct block_device_operations brd_fops = { |
275 | .owner = THIS_MODULE, |
276 | .submit_bio = brd_submit_bio, |
277 | }; |
278 | |
279 | /* |
280 | * And now the modules code and kernel interface. |
281 | */ |
282 | static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; |
283 | module_param(rd_nr, int, 0444); |
284 | MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices" ); |
285 | |
286 | unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE; |
287 | module_param(rd_size, ulong, 0444); |
288 | MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes." ); |
289 | |
290 | static int max_part = 1; |
291 | module_param(max_part, int, 0444); |
292 | MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices" ); |
293 | |
294 | MODULE_LICENSE("GPL" ); |
295 | MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); |
296 | MODULE_ALIAS("rd" ); |
297 | |
298 | #ifndef MODULE |
299 | /* Legacy boot options - nonmodular */ |
300 | static int __init ramdisk_size(char *str) |
301 | { |
302 | rd_size = simple_strtol(str, NULL, 0); |
303 | return 1; |
304 | } |
305 | __setup("ramdisk_size=" , ramdisk_size); |
306 | #endif |
307 | |
/*
 * The device scheme is derived from loop.c.  Keep the two in sync where
 * possible (they should share code eventually).
 */

/* All brd devices that currently exist, linked through brd_list. */
static LIST_HEAD(brd_devices);
/* debugfs directory holding one page-count file per device. */
static struct dentry *brd_debugfs_dir;
314 | |
315 | static int brd_alloc(int i) |
316 | { |
317 | struct brd_device *brd; |
318 | struct gendisk *disk; |
319 | char buf[DISK_NAME_LEN]; |
320 | int err = -ENOMEM; |
321 | struct queue_limits lim = { |
322 | /* |
323 | * This is so fdisk will align partitions on 4k, because of |
324 | * direct_access API needing 4k alignment, returning a PFN |
325 | * (This is only a problem on very small devices <= 4M, |
326 | * otherwise fdisk will align on 1M. Regardless this call |
327 | * is harmless) |
328 | */ |
329 | .physical_block_size = PAGE_SIZE, |
330 | }; |
331 | |
332 | list_for_each_entry(brd, &brd_devices, brd_list) |
333 | if (brd->brd_number == i) |
334 | return -EEXIST; |
335 | brd = kzalloc(size: sizeof(*brd), GFP_KERNEL); |
336 | if (!brd) |
337 | return -ENOMEM; |
338 | brd->brd_number = i; |
339 | list_add_tail(new: &brd->brd_list, head: &brd_devices); |
340 | |
341 | xa_init(xa: &brd->brd_pages); |
342 | |
343 | snprintf(buf, DISK_NAME_LEN, fmt: "ram%d" , i); |
344 | if (!IS_ERR_OR_NULL(ptr: brd_debugfs_dir)) |
345 | debugfs_create_u64(name: buf, mode: 0444, parent: brd_debugfs_dir, |
346 | value: &brd->brd_nr_pages); |
347 | |
348 | disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); |
349 | if (IS_ERR(ptr: disk)) { |
350 | err = PTR_ERR(ptr: disk); |
351 | goto out_free_dev; |
352 | } |
353 | disk->major = RAMDISK_MAJOR; |
354 | disk->first_minor = i * max_part; |
355 | disk->minors = max_part; |
356 | disk->fops = &brd_fops; |
357 | disk->private_data = brd; |
358 | strscpy(disk->disk_name, buf, DISK_NAME_LEN); |
359 | set_capacity(disk, size: rd_size * 2); |
360 | |
361 | /* Tell the block layer that this is not a rotational device */ |
362 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q: disk->queue); |
363 | blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q: disk->queue); |
364 | blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q: disk->queue); |
365 | err = add_disk(disk); |
366 | if (err) |
367 | goto out_cleanup_disk; |
368 | |
369 | return 0; |
370 | |
371 | out_cleanup_disk: |
372 | put_disk(disk); |
373 | out_free_dev: |
374 | list_del(entry: &brd->brd_list); |
375 | kfree(objp: brd); |
376 | return err; |
377 | } |
378 | |
379 | static void brd_probe(dev_t dev) |
380 | { |
381 | brd_alloc(MINOR(dev) / max_part); |
382 | } |
383 | |
384 | static void brd_cleanup(void) |
385 | { |
386 | struct brd_device *brd, *next; |
387 | |
388 | debugfs_remove_recursive(dentry: brd_debugfs_dir); |
389 | |
390 | list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { |
391 | del_gendisk(gp: brd->brd_disk); |
392 | put_disk(disk: brd->brd_disk); |
393 | brd_free_pages(brd); |
394 | list_del(entry: &brd->brd_list); |
395 | kfree(objp: brd); |
396 | } |
397 | } |
398 | |
399 | static inline void brd_check_and_reset_par(void) |
400 | { |
401 | if (unlikely(!max_part)) |
402 | max_part = 1; |
403 | |
404 | /* |
405 | * make sure 'max_part' can be divided exactly by (1U << MINORBITS), |
406 | * otherwise, it is possiable to get same dev_t when adding partitions. |
407 | */ |
408 | if ((1U << MINORBITS) % max_part != 0) |
409 | max_part = 1UL << fls(x: max_part); |
410 | |
411 | if (max_part > DISK_MAX_PARTS) { |
412 | pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n" , |
413 | DISK_MAX_PARTS, DISK_MAX_PARTS); |
414 | max_part = DISK_MAX_PARTS; |
415 | } |
416 | } |
417 | |
418 | static int __init brd_init(void) |
419 | { |
420 | int err, i; |
421 | |
422 | brd_check_and_reset_par(); |
423 | |
424 | brd_debugfs_dir = debugfs_create_dir(name: "ramdisk_pages" , NULL); |
425 | |
426 | for (i = 0; i < rd_nr; i++) { |
427 | err = brd_alloc(i); |
428 | if (err) |
429 | goto out_free; |
430 | } |
431 | |
432 | /* |
433 | * brd module now has a feature to instantiate underlying device |
434 | * structure on-demand, provided that there is an access dev node. |
435 | * |
436 | * (1) if rd_nr is specified, create that many upfront. else |
437 | * it defaults to CONFIG_BLK_DEV_RAM_COUNT |
438 | * (2) User can further extend brd devices by create dev node themselves |
439 | * and have kernel automatically instantiate actual device |
440 | * on-demand. Example: |
441 | * mknod /path/devnod_name b 1 X # 1 is the rd major |
442 | * fdisk -l /path/devnod_name |
443 | * If (X / max_part) was not already created it will be created |
444 | * dynamically. |
445 | */ |
446 | |
447 | if (__register_blkdev(RAMDISK_MAJOR, name: "ramdisk" , probe: brd_probe)) { |
448 | err = -EIO; |
449 | goto out_free; |
450 | } |
451 | |
452 | pr_info("brd: module loaded\n" ); |
453 | return 0; |
454 | |
455 | out_free: |
456 | brd_cleanup(); |
457 | |
458 | pr_info("brd: module NOT loaded !!!\n" ); |
459 | return err; |
460 | } |
461 | |
462 | static void __exit brd_exit(void) |
463 | { |
464 | |
465 | unregister_blkdev(RAMDISK_MAJOR, name: "ramdisk" ); |
466 | brd_cleanup(); |
467 | |
468 | pr_info("brd: module unloaded\n" ); |
469 | } |
470 | |
/* Module entry and exit points. */
module_init(brd_init);
module_exit(brd_exit);
473 | |
474 | |