1 | /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ |
2 | /* |
3 | * aoedev.c |
4 | * AoE device utility functions; maintains device list. |
5 | */ |
6 | |
7 | #include <linux/hdreg.h> |
8 | #include <linux/blk-mq.h> |
9 | #include <linux/netdevice.h> |
10 | #include <linux/delay.h> |
11 | #include <linux/slab.h> |
12 | #include <linux/bitmap.h> |
13 | #include <linux/kdev_t.h> |
14 | #include <linux/moduleparam.h> |
15 | #include <linux/string.h> |
16 | #include "aoe.h" |
17 | |
18 | static void freetgt(struct aoedev *d, struct aoetgt *t); |
19 | static void skbpoolfree(struct aoedev *d); |
20 | |
21 | static int aoe_dyndevs = 1; |
22 | module_param(aoe_dyndevs, int, 0644); |
23 | MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices." ); |
24 | |
25 | static struct aoedev *devlist; |
26 | static DEFINE_SPINLOCK(devlist_lock); |
27 | |
28 | /* Because some systems will have one, many, or no |
29 | * - partitions, |
30 | * - slots per shelf, |
31 | * - or shelves, |
32 | * we need some flexibility in the way the minor numbers |
33 | * are allocated. So they are dynamic. |
34 | */ |
35 | #define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) |
36 | |
37 | static DEFINE_SPINLOCK(used_minors_lock); |
38 | static DECLARE_BITMAP(used_minors, N_DEVS); |
39 | |
40 | static int |
41 | minor_get_dyn(ulong *sysminor) |
42 | { |
43 | ulong flags; |
44 | ulong n; |
45 | int error = 0; |
46 | |
47 | spin_lock_irqsave(&used_minors_lock, flags); |
48 | n = find_first_zero_bit(addr: used_minors, N_DEVS); |
49 | if (n < N_DEVS) |
50 | set_bit(nr: n, addr: used_minors); |
51 | else |
52 | error = -1; |
53 | spin_unlock_irqrestore(lock: &used_minors_lock, flags); |
54 | |
55 | *sysminor = n * AOE_PARTITIONS; |
56 | return error; |
57 | } |
58 | |
59 | static int |
60 | minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) |
61 | { |
62 | ulong flags; |
63 | ulong n; |
64 | int error = 0; |
65 | enum { |
66 | /* for backwards compatibility when !aoe_dyndevs, |
67 | * a static number of supported slots per shelf */ |
68 | NPERSHELF = 16, |
69 | }; |
70 | |
71 | if (aoemin >= NPERSHELF) { |
72 | pr_err("aoe: %s %d slots per shelf\n" , |
73 | "static minor device numbers support only" , |
74 | NPERSHELF); |
75 | error = -1; |
76 | goto out; |
77 | } |
78 | |
79 | n = aoemaj * NPERSHELF + aoemin; |
80 | if (n >= N_DEVS) { |
81 | pr_err("aoe: %s with e%ld.%d\n" , |
82 | "cannot use static minor device numbers" , |
83 | aoemaj, aoemin); |
84 | error = -1; |
85 | goto out; |
86 | } |
87 | |
88 | spin_lock_irqsave(&used_minors_lock, flags); |
89 | if (test_bit(n, used_minors)) { |
90 | pr_err("aoe: %s %lu\n" , |
91 | "existing device already has static minor number" , |
92 | n); |
93 | error = -1; |
94 | } else |
95 | set_bit(nr: n, addr: used_minors); |
96 | spin_unlock_irqrestore(lock: &used_minors_lock, flags); |
97 | *sysminor = n * AOE_PARTITIONS; |
98 | out: |
99 | return error; |
100 | } |
101 | |
102 | static int |
103 | minor_get(ulong *sysminor, ulong aoemaj, int aoemin) |
104 | { |
105 | if (aoe_dyndevs) |
106 | return minor_get_dyn(sysminor); |
107 | else |
108 | return minor_get_static(sysminor, aoemaj, aoemin); |
109 | } |
110 | |
111 | static void |
112 | minor_free(ulong minor) |
113 | { |
114 | ulong flags; |
115 | |
116 | minor /= AOE_PARTITIONS; |
117 | BUG_ON(minor >= N_DEVS); |
118 | |
119 | spin_lock_irqsave(&used_minors_lock, flags); |
120 | BUG_ON(!test_bit(minor, used_minors)); |
121 | clear_bit(nr: minor, addr: used_minors); |
122 | spin_unlock_irqrestore(lock: &used_minors_lock, flags); |
123 | } |
124 | |
/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and must be responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */
136 | |
137 | void |
138 | aoedev_put(struct aoedev *d) |
139 | { |
140 | ulong flags; |
141 | |
142 | spin_lock_irqsave(&devlist_lock, flags); |
143 | d->ref--; |
144 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
145 | } |
146 | |
147 | static void |
148 | dummy_timer(struct timer_list *t) |
149 | { |
150 | struct aoedev *d; |
151 | |
152 | d = from_timer(d, t, timer); |
153 | if (d->flags & DEVFL_TKILL) |
154 | return; |
155 | d->timer.expires = jiffies + HZ; |
156 | add_timer(timer: &d->timer); |
157 | } |
158 | |
159 | static void |
160 | aoe_failip(struct aoedev *d) |
161 | { |
162 | struct request *rq; |
163 | struct aoe_req *req; |
164 | struct bio *bio; |
165 | |
166 | aoe_failbuf(d, d->ip.buf); |
167 | rq = d->ip.rq; |
168 | if (rq == NULL) |
169 | return; |
170 | |
171 | req = blk_mq_rq_to_pdu(rq); |
172 | while ((bio = d->ip.nxbio)) { |
173 | bio->bi_status = BLK_STS_IOERR; |
174 | d->ip.nxbio = bio->bi_next; |
175 | req->nr_bios--; |
176 | } |
177 | |
178 | if (!req->nr_bios) |
179 | aoe_end_request(d, rq, 0); |
180 | } |
181 | |
182 | static void |
183 | downdev_frame(struct list_head *pos) |
184 | { |
185 | struct frame *f; |
186 | |
187 | f = list_entry(pos, struct frame, head); |
188 | list_del(entry: pos); |
189 | if (f->buf) { |
190 | f->buf->nframesout--; |
191 | aoe_failbuf(f->t->d, f->buf); |
192 | } |
193 | aoe_freetframe(f); |
194 | } |
195 | |
196 | void |
197 | aoedev_downdev(struct aoedev *d) |
198 | { |
199 | struct aoetgt *t, **tt, **te; |
200 | struct list_head *head, *pos, *nx; |
201 | int i; |
202 | |
203 | d->flags &= ~DEVFL_UP; |
204 | |
205 | /* clean out active and to-be-retransmitted buffers */ |
206 | for (i = 0; i < NFACTIVE; i++) { |
207 | head = &d->factive[i]; |
208 | list_for_each_safe(pos, nx, head) |
209 | downdev_frame(pos); |
210 | } |
211 | head = &d->rexmitq; |
212 | list_for_each_safe(pos, nx, head) |
213 | downdev_frame(pos); |
214 | |
215 | /* reset window dressings */ |
216 | tt = d->targets; |
217 | te = tt + d->ntargets; |
218 | for (; tt < te && (t = *tt); tt++) { |
219 | aoecmd_wreset(t); |
220 | t->nout = 0; |
221 | } |
222 | |
223 | /* clean out the in-process request (if any) */ |
224 | aoe_failip(d); |
225 | |
226 | /* fast fail all pending I/O */ |
227 | if (d->blkq) { |
228 | /* UP is cleared, freeze+quiesce to insure all are errored */ |
229 | blk_mq_freeze_queue(q: d->blkq); |
230 | blk_mq_quiesce_queue(q: d->blkq); |
231 | blk_mq_unquiesce_queue(q: d->blkq); |
232 | blk_mq_unfreeze_queue(q: d->blkq); |
233 | } |
234 | |
235 | if (d->gd) |
236 | set_capacity(disk: d->gd, size: 0); |
237 | } |
238 | |
239 | /* return whether the user asked for this particular |
240 | * device to be flushed |
241 | */ |
242 | static int |
243 | user_req(char *s, size_t slen, struct aoedev *d) |
244 | { |
245 | const char *p; |
246 | size_t lim; |
247 | |
248 | if (!d->gd) |
249 | return 0; |
250 | p = kbasename(path: d->gd->disk_name); |
251 | lim = sizeof(d->gd->disk_name); |
252 | lim -= p - d->gd->disk_name; |
253 | if (slen < lim) |
254 | lim = slen; |
255 | |
256 | return !strncmp(s, p, lim); |
257 | } |
258 | |
259 | static void |
260 | freedev(struct aoedev *d) |
261 | { |
262 | struct aoetgt **t, **e; |
263 | int freeing = 0; |
264 | unsigned long flags; |
265 | |
266 | spin_lock_irqsave(&d->lock, flags); |
267 | if (d->flags & DEVFL_TKILL |
268 | && !(d->flags & DEVFL_FREEING)) { |
269 | d->flags |= DEVFL_FREEING; |
270 | freeing = 1; |
271 | } |
272 | spin_unlock_irqrestore(lock: &d->lock, flags); |
273 | if (!freeing) |
274 | return; |
275 | |
276 | del_timer_sync(timer: &d->timer); |
277 | if (d->gd) { |
278 | aoedisk_rm_debugfs(d); |
279 | del_gendisk(gp: d->gd); |
280 | put_disk(disk: d->gd); |
281 | blk_mq_free_tag_set(set: &d->tag_set); |
282 | } |
283 | t = d->targets; |
284 | e = t + d->ntargets; |
285 | for (; t < e && *t; t++) |
286 | freetgt(d, t: *t); |
287 | |
288 | mempool_destroy(pool: d->bufpool); |
289 | skbpoolfree(d); |
290 | minor_free(minor: d->sysminor); |
291 | |
292 | spin_lock_irqsave(&d->lock, flags); |
293 | d->flags |= DEVFL_FREED; |
294 | spin_unlock_irqrestore(lock: &d->lock, flags); |
295 | } |
296 | |
297 | enum flush_parms { |
298 | NOT_EXITING = 0, |
299 | EXITING = 1, |
300 | }; |
301 | |
302 | static int |
303 | flush(const char __user *str, size_t cnt, int exiting) |
304 | { |
305 | ulong flags; |
306 | struct aoedev *d, **dd; |
307 | char buf[16]; |
308 | int all = 0; |
309 | int specified = 0; /* flush a specific device */ |
310 | unsigned int skipflags; |
311 | |
312 | skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; |
313 | |
314 | if (!exiting && cnt >= 3) { |
315 | if (cnt > sizeof buf) |
316 | cnt = sizeof buf; |
317 | if (copy_from_user(to: buf, from: str, n: cnt)) |
318 | return -EFAULT; |
319 | all = !strncmp(buf, "all" , 3); |
320 | if (!all) |
321 | specified = 1; |
322 | } |
323 | |
324 | flush_workqueue(aoe_wq); |
325 | /* pass one: do aoedev_downdev, which might sleep */ |
326 | restart1: |
327 | spin_lock_irqsave(&devlist_lock, flags); |
328 | for (d = devlist; d; d = d->next) { |
329 | spin_lock(lock: &d->lock); |
330 | if (d->flags & DEVFL_TKILL) |
331 | goto cont; |
332 | |
333 | if (exiting) { |
334 | /* unconditionally take each device down */ |
335 | } else if (specified) { |
336 | if (!user_req(s: buf, slen: cnt, d)) |
337 | goto cont; |
338 | } else if ((!all && (d->flags & DEVFL_UP)) |
339 | || d->flags & skipflags |
340 | || d->nopen |
341 | || d->ref) |
342 | goto cont; |
343 | |
344 | spin_unlock(lock: &d->lock); |
345 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
346 | aoedev_downdev(d); |
347 | d->flags |= DEVFL_TKILL; |
348 | goto restart1; |
349 | cont: |
350 | spin_unlock(lock: &d->lock); |
351 | } |
352 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
353 | |
354 | /* pass two: call freedev, which might sleep, |
355 | * for aoedevs marked with DEVFL_TKILL |
356 | */ |
357 | restart2: |
358 | spin_lock_irqsave(&devlist_lock, flags); |
359 | for (d = devlist; d; d = d->next) { |
360 | spin_lock(lock: &d->lock); |
361 | if (d->flags & DEVFL_TKILL |
362 | && !(d->flags & DEVFL_FREEING)) { |
363 | spin_unlock(lock: &d->lock); |
364 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
365 | freedev(d); |
366 | goto restart2; |
367 | } |
368 | spin_unlock(lock: &d->lock); |
369 | } |
370 | |
371 | /* pass three: remove aoedevs marked with DEVFL_FREED */ |
372 | for (dd = &devlist, d = *dd; d; d = *dd) { |
373 | struct aoedev *doomed = NULL; |
374 | |
375 | spin_lock(lock: &d->lock); |
376 | if (d->flags & DEVFL_FREED) { |
377 | *dd = d->next; |
378 | doomed = d; |
379 | } else { |
380 | dd = &d->next; |
381 | } |
382 | spin_unlock(lock: &d->lock); |
383 | if (doomed) |
384 | kfree(objp: doomed->targets); |
385 | kfree(objp: doomed); |
386 | } |
387 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
388 | |
389 | return 0; |
390 | } |
391 | |
392 | int |
393 | aoedev_flush(const char __user *str, size_t cnt) |
394 | { |
395 | return flush(str, cnt, exiting: NOT_EXITING); |
396 | } |
397 | |
398 | /* This has been confirmed to occur once with Tms=3*1000 due to the |
399 | * driver changing link and not processing its transmit ring. The |
400 | * problem is hard enough to solve by returning an error that I'm |
401 | * still punting on "solving" this. |
402 | */ |
403 | static void |
404 | skbfree(struct sk_buff *skb) |
405 | { |
406 | enum { Sms = 250, Tms = 30 * 1000}; |
407 | int i = Tms / Sms; |
408 | |
409 | if (skb == NULL) |
410 | return; |
411 | while (atomic_read(v: &skb_shinfo(skb)->dataref) != 1 && i-- > 0) |
412 | msleep(msecs: Sms); |
413 | if (i < 0) { |
414 | printk(KERN_ERR |
415 | "aoe: %s holds ref: %s\n" , |
416 | skb->dev ? skb->dev->name : "netif" , |
417 | "cannot free skb -- memory leaked." ); |
418 | return; |
419 | } |
420 | skb->truesize -= skb->data_len; |
421 | skb_shinfo(skb)->nr_frags = skb->data_len = 0; |
422 | skb_trim(skb, len: 0); |
423 | dev_kfree_skb(skb); |
424 | } |
425 | |
426 | static void |
427 | skbpoolfree(struct aoedev *d) |
428 | { |
429 | struct sk_buff *skb, *tmp; |
430 | |
431 | skb_queue_walk_safe(&d->skbpool, skb, tmp) |
432 | skbfree(skb); |
433 | |
434 | __skb_queue_head_init(list: &d->skbpool); |
435 | } |
436 | |
437 | /* find it or allocate it */ |
438 | struct aoedev * |
439 | aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) |
440 | { |
441 | struct aoedev *d; |
442 | int i; |
443 | ulong flags; |
444 | ulong sysminor = 0; |
445 | |
446 | spin_lock_irqsave(&devlist_lock, flags); |
447 | |
448 | for (d=devlist; d; d=d->next) |
449 | if (d->aoemajor == maj && d->aoeminor == min) { |
450 | spin_lock(lock: &d->lock); |
451 | if (d->flags & DEVFL_TKILL) { |
452 | spin_unlock(lock: &d->lock); |
453 | d = NULL; |
454 | goto out; |
455 | } |
456 | d->ref++; |
457 | spin_unlock(lock: &d->lock); |
458 | break; |
459 | } |
460 | if (d || !do_alloc || minor_get(sysminor: &sysminor, aoemaj: maj, aoemin: min) < 0) |
461 | goto out; |
462 | d = kcalloc(n: 1, size: sizeof *d, GFP_ATOMIC); |
463 | if (!d) |
464 | goto out; |
465 | d->targets = kcalloc(n: NTARGETS, size: sizeof(*d->targets), GFP_ATOMIC); |
466 | if (!d->targets) { |
467 | kfree(objp: d); |
468 | d = NULL; |
469 | goto out; |
470 | } |
471 | d->ntargets = NTARGETS; |
472 | INIT_WORK(&d->work, aoecmd_sleepwork); |
473 | spin_lock_init(&d->lock); |
474 | INIT_LIST_HEAD(list: &d->rq_list); |
475 | skb_queue_head_init(list: &d->skbpool); |
476 | timer_setup(&d->timer, dummy_timer, 0); |
477 | d->timer.expires = jiffies + HZ; |
478 | add_timer(timer: &d->timer); |
479 | d->bufpool = NULL; /* defer to aoeblk_gdalloc */ |
480 | d->tgt = d->targets; |
481 | d->ref = 1; |
482 | for (i = 0; i < NFACTIVE; i++) |
483 | INIT_LIST_HEAD(list: &d->factive[i]); |
484 | INIT_LIST_HEAD(list: &d->rexmitq); |
485 | d->sysminor = sysminor; |
486 | d->aoemajor = maj; |
487 | d->aoeminor = min; |
488 | d->rttavg = RTTAVG_INIT; |
489 | d->rttdev = RTTDEV_INIT; |
490 | d->next = devlist; |
491 | devlist = d; |
492 | out: |
493 | spin_unlock_irqrestore(lock: &devlist_lock, flags); |
494 | return d; |
495 | } |
496 | |
497 | static void |
498 | freetgt(struct aoedev *d, struct aoetgt *t) |
499 | { |
500 | struct frame *f; |
501 | struct list_head *pos, *nx, *head; |
502 | struct aoeif *ifp; |
503 | |
504 | for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { |
505 | if (!ifp->nd) |
506 | break; |
507 | dev_put(dev: ifp->nd); |
508 | } |
509 | |
510 | head = &t->ffree; |
511 | list_for_each_safe(pos, nx, head) { |
512 | list_del(entry: pos); |
513 | f = list_entry(pos, struct frame, head); |
514 | skbfree(skb: f->skb); |
515 | kfree(objp: f); |
516 | } |
517 | kfree(objp: t); |
518 | } |
519 | |
520 | void |
521 | aoedev_exit(void) |
522 | { |
523 | flush_workqueue(aoe_wq); |
524 | flush(NULL, cnt: 0, exiting: EXITING); |
525 | } |
526 | |
527 | int __init |
528 | aoedev_init(void) |
529 | { |
530 | return 0; |
531 | } |
532 | |