1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright 2017 Omnibond Systems, L.L.C. |
4 | */ |
5 | |
6 | #include "protocol.h" |
7 | #include "orangefs-kernel.h" |
8 | #include "orangefs-bufmap.h" |
9 | |
/*
 * Header of one "part": a batch of directory data received from the
 * userspace daemon.  parse_readdir() overlays this header on the start
 * of the received trailer buffer, and fill_from_part() reads the encoded
 * entries that follow the response header in that same buffer.
 */
struct orangefs_dir_part {
	/* Next part in the singly linked list, or NULL for the last part. */
	struct orangefs_dir_part *next;
	/* Number of bytes of encoded entries (trailer size minus header). */
	size_t len;
};
14 | |
15 | struct orangefs_dir { |
16 | __u64 token; |
17 | struct orangefs_dir_part *part; |
18 | loff_t end; |
19 | int error; |
20 | }; |
21 | |
/*
 * Layout of ctx->pos: the bits at and above PART_SHIFT select the part,
 * the bits below it are the byte offset inside that part.
 *
 * These are deliberately plain (signed) int expressions: PART_MASK is a
 * negative int, so it sign-extends when combined with a 64-bit loff_t
 * and keeps every high bit of the position.
 */
#define PART_SHIFT (24)
#define PART_SIZE (1 << PART_SHIFT)
#define PART_MASK (~(PART_SIZE - 1))
25 | |
26 | /* |
27 | * There can be up to 512 directory entries. Each entry is encoded as |
28 | * follows: |
29 | * 4 bytes: string size (n) |
30 | * n bytes: string |
31 | * 1 byte: trailing zero |
32 | * padding to 8 bytes |
33 | * 16 bytes: khandle |
34 | * padding to 8 bytes |
35 | * |
36 | * The trailer_buf starts with a struct orangefs_readdir_response_s |
37 | * which must be skipped to get to the directory data. |
38 | * |
39 | * The data which is received from the userspace daemon is termed a |
40 | * part and is stored in a linked list in case more than one part is |
41 | * needed for a large directory. |
42 | * |
43 | * The position pointer (ctx->pos) encodes the part and offset on which |
44 | * to begin reading at. Bits above PART_SHIFT encode the part and bits |
45 | * below PART_SHIFT encode the offset. Parts are stored in a linked |
46 | * list which grows as data is received from the server. The overhead |
47 | * associated with managing the list is presumed to be small compared to |
48 | * the overhead of communicating with the server. |
49 | * |
50 | * As data is received from the server, it is placed at the end of the |
51 | * part list. Data is parsed from the current position as it is needed. |
52 | * When data is determined to be corrupt, it is either because the |
53 | * userspace component has sent back corrupt data or because the file |
54 | * pointer has been moved to an invalid location. Since the two cannot |
55 | * be differentiated, return EIO. |
56 | * |
57 | * Part zero is synthesized to contain `.' and `..'. Part one is the |
58 | * first part of the part list. |
59 | */ |
60 | |
61 | static int do_readdir(struct orangefs_dir *od, struct inode *inode, |
62 | struct orangefs_kernel_op_s *op) |
63 | { |
64 | struct orangefs_inode_s *oi = ORANGEFS_I(inode); |
65 | struct orangefs_readdir_response_s *resp; |
66 | int bufi, r; |
67 | |
68 | /* |
69 | * Despite the badly named field, readdir does not use shared |
70 | * memory. However, there are a limited number of readdir |
71 | * slots, which must be allocated here. This flag simply tells |
72 | * the op scheduler to return the op here for retry. |
73 | */ |
74 | op->uses_shared_memory = 1; |
75 | op->upcall.req.readdir.refn = oi->refn; |
76 | op->upcall.req.readdir.token = od->token; |
77 | op->upcall.req.readdir.max_dirent_count = |
78 | ORANGEFS_MAX_DIRENT_COUNT_READDIR; |
79 | |
80 | again: |
81 | bufi = orangefs_readdir_index_get(); |
82 | if (bufi < 0) { |
83 | od->error = bufi; |
84 | return bufi; |
85 | } |
86 | |
87 | op->upcall.req.readdir.buf_index = bufi; |
88 | |
89 | r = service_operation(op, op_name: "orangefs_readdir" , |
90 | get_interruptible_flag(inode)); |
91 | |
92 | orangefs_readdir_index_put(buffer_index: bufi); |
93 | |
94 | if (op_state_purged(op)) { |
95 | if (r == -EAGAIN) { |
96 | vfree(addr: op->downcall.trailer_buf); |
97 | goto again; |
98 | } else if (r == -EIO) { |
99 | vfree(addr: op->downcall.trailer_buf); |
100 | od->error = r; |
101 | return r; |
102 | } |
103 | } |
104 | |
105 | if (r < 0) { |
106 | vfree(addr: op->downcall.trailer_buf); |
107 | od->error = r; |
108 | return r; |
109 | } else if (op->downcall.status) { |
110 | vfree(addr: op->downcall.trailer_buf); |
111 | od->error = op->downcall.status; |
112 | return op->downcall.status; |
113 | } |
114 | |
115 | /* |
116 | * The maximum size is size per entry times the 512 entries plus |
117 | * the header. This is well under the limit. |
118 | */ |
119 | if (op->downcall.trailer_size > PART_SIZE) { |
120 | vfree(addr: op->downcall.trailer_buf); |
121 | od->error = -EIO; |
122 | return -EIO; |
123 | } |
124 | |
125 | resp = (struct orangefs_readdir_response_s *) |
126 | op->downcall.trailer_buf; |
127 | od->token = resp->token; |
128 | return 0; |
129 | } |
130 | |
131 | static int parse_readdir(struct orangefs_dir *od, |
132 | struct orangefs_kernel_op_s *op) |
133 | { |
134 | struct orangefs_dir_part *part, *new; |
135 | size_t count; |
136 | |
137 | count = 1; |
138 | part = od->part; |
139 | while (part) { |
140 | count++; |
141 | if (part->next) |
142 | part = part->next; |
143 | else |
144 | break; |
145 | } |
146 | |
147 | new = (void *)op->downcall.trailer_buf; |
148 | new->next = NULL; |
149 | new->len = op->downcall.trailer_size - |
150 | sizeof(struct orangefs_readdir_response_s); |
151 | if (!od->part) |
152 | od->part = new; |
153 | else |
154 | part->next = new; |
155 | count++; |
156 | od->end = count << PART_SHIFT; |
157 | |
158 | return 0; |
159 | } |
160 | |
161 | static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode) |
162 | { |
163 | struct orangefs_kernel_op_s *op; |
164 | int r; |
165 | |
166 | op = op_alloc(ORANGEFS_VFS_OP_READDIR); |
167 | if (!op) { |
168 | od->error = -ENOMEM; |
169 | return -ENOMEM; |
170 | } |
171 | r = do_readdir(od, inode, op); |
172 | if (r) { |
173 | od->error = r; |
174 | goto out; |
175 | } |
176 | r = parse_readdir(od, op); |
177 | if (r) { |
178 | od->error = r; |
179 | goto out; |
180 | } |
181 | |
182 | od->error = 0; |
183 | out: |
184 | op_release(op); |
185 | return od->error; |
186 | } |
187 | |
188 | static int fill_from_part(struct orangefs_dir_part *part, |
189 | struct dir_context *ctx) |
190 | { |
191 | const int offset = sizeof(struct orangefs_readdir_response_s); |
192 | struct orangefs_khandle *khandle; |
193 | __u32 *len, padlen; |
194 | loff_t i; |
195 | char *s; |
196 | i = ctx->pos & ~PART_MASK; |
197 | |
198 | /* The file offset from userspace is too large. */ |
199 | if (i > part->len) |
200 | return 1; |
201 | |
202 | /* |
203 | * If the seek pointer is positioned just before an entry it |
204 | * should find the next entry. |
205 | */ |
206 | if (i % 8) |
207 | i = i + (8 - i%8)%8; |
208 | |
209 | while (i < part->len) { |
210 | if (part->len < i + sizeof *len) |
211 | break; |
212 | len = (void *)part + offset + i; |
213 | /* |
214 | * len is the size of the string itself. padlen is the |
215 | * total size of the encoded string. |
216 | */ |
217 | padlen = (sizeof *len + *len + 1) + |
218 | (8 - (sizeof *len + *len + 1)%8)%8; |
219 | if (part->len < i + padlen + sizeof *khandle) |
220 | goto next; |
221 | s = (void *)part + offset + i + sizeof *len; |
222 | if (s[*len] != 0) |
223 | goto next; |
224 | khandle = (void *)part + offset + i + padlen; |
225 | if (!dir_emit(ctx, name: s, namelen: *len, |
226 | ino: orangefs_khandle_to_ino(khandle), |
227 | DT_UNKNOWN)) |
228 | return 0; |
229 | i += padlen + sizeof *khandle; |
230 | i = i + (8 - i%8)%8; |
231 | BUG_ON(i > part->len); |
232 | ctx->pos = (ctx->pos & PART_MASK) | i; |
233 | continue; |
234 | next: |
235 | i += 8; |
236 | } |
237 | return 1; |
238 | } |
239 | |
240 | static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx) |
241 | { |
242 | struct orangefs_dir_part *part; |
243 | size_t count; |
244 | |
245 | count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; |
246 | |
247 | part = od->part; |
248 | while (part->next && count) { |
249 | count--; |
250 | part = part->next; |
251 | } |
252 | /* This means the userspace file offset is invalid. */ |
253 | if (count) { |
254 | od->error = -EIO; |
255 | return -EIO; |
256 | } |
257 | |
258 | while (part && part->len) { |
259 | int r; |
260 | r = fill_from_part(part, ctx); |
261 | if (r < 0) { |
262 | od->error = r; |
263 | return r; |
264 | } else if (r == 0) { |
265 | /* Userspace buffer is full. */ |
266 | break; |
267 | } else { |
268 | /* |
269 | * The part ran out of data. Move to the next |
270 | * part. */ |
271 | ctx->pos = (ctx->pos & PART_MASK) + |
272 | (1 << PART_SHIFT); |
273 | part = part->next; |
274 | } |
275 | } |
276 | return 0; |
277 | } |
278 | |
279 | static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, |
280 | int whence) |
281 | { |
282 | struct orangefs_dir *od = file->private_data; |
283 | /* |
284 | * Delete the stored data so userspace sees new directory |
285 | * entries. |
286 | */ |
287 | if (!whence && offset < od->end) { |
288 | struct orangefs_dir_part *part = od->part; |
289 | while (part) { |
290 | struct orangefs_dir_part *next = part->next; |
291 | vfree(addr: part); |
292 | part = next; |
293 | } |
294 | od->token = ORANGEFS_ITERATE_START; |
295 | od->part = NULL; |
296 | od->end = 1 << PART_SHIFT; |
297 | } |
298 | return default_llseek(file, offset, whence); |
299 | } |
300 | |
301 | static int orangefs_dir_iterate(struct file *file, |
302 | struct dir_context *ctx) |
303 | { |
304 | struct orangefs_dir *od = file->private_data; |
305 | struct inode *inode = file_inode(f: file); |
306 | int r; |
307 | |
308 | if (od->error) |
309 | return od->error; |
310 | |
311 | if (ctx->pos == 0) { |
312 | if (!dir_emit_dot(file, ctx)) |
313 | return 0; |
314 | ctx->pos++; |
315 | } |
316 | if (ctx->pos == 1) { |
317 | if (!dir_emit_dotdot(file, ctx)) |
318 | return 0; |
319 | ctx->pos = 1 << PART_SHIFT; |
320 | } |
321 | |
322 | /* |
323 | * The seek position is in the first synthesized part but is not |
324 | * valid. |
325 | */ |
326 | if ((ctx->pos & PART_MASK) == 0) |
327 | return -EIO; |
328 | |
329 | r = 0; |
330 | |
331 | /* |
332 | * Must read more if the user has sought past what has been read |
333 | * so far. Stop a user who has sought past the end. |
334 | */ |
335 | while (od->token != ORANGEFS_ITERATE_END && |
336 | ctx->pos > od->end) { |
337 | r = orangefs_dir_more(od, inode); |
338 | if (r) |
339 | return r; |
340 | } |
341 | if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) |
342 | return -EIO; |
343 | |
344 | /* Then try to fill if there's any left in the buffer. */ |
345 | if (ctx->pos < od->end) { |
346 | r = orangefs_dir_fill(od, ctx); |
347 | if (r) |
348 | return r; |
349 | } |
350 | |
351 | /* Finally get some more and try to fill. */ |
352 | if (od->token != ORANGEFS_ITERATE_END) { |
353 | r = orangefs_dir_more(od, inode); |
354 | if (r) |
355 | return r; |
356 | r = orangefs_dir_fill(od, ctx); |
357 | } |
358 | |
359 | return r; |
360 | } |
361 | |
362 | static int orangefs_dir_open(struct inode *inode, struct file *file) |
363 | { |
364 | struct orangefs_dir *od; |
365 | file->private_data = kmalloc(size: sizeof(struct orangefs_dir), |
366 | GFP_KERNEL); |
367 | if (!file->private_data) |
368 | return -ENOMEM; |
369 | od = file->private_data; |
370 | od->token = ORANGEFS_ITERATE_START; |
371 | od->part = NULL; |
372 | od->end = 1 << PART_SHIFT; |
373 | od->error = 0; |
374 | return 0; |
375 | } |
376 | |
377 | static int orangefs_dir_release(struct inode *inode, struct file *file) |
378 | { |
379 | struct orangefs_dir *od = file->private_data; |
380 | struct orangefs_dir_part *part = od->part; |
381 | while (part) { |
382 | struct orangefs_dir_part *next = part->next; |
383 | vfree(addr: part); |
384 | part = next; |
385 | } |
386 | kfree(objp: od); |
387 | return 0; |
388 | } |
389 | |
390 | const struct file_operations orangefs_dir_operations = { |
391 | .llseek = orangefs_dir_llseek, |
392 | .read = generic_read_dir, |
393 | .iterate_shared = orangefs_dir_iterate, |
394 | .open = orangefs_dir_open, |
395 | .release = orangefs_dir_release |
396 | }; |
397 | |