1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright 2017 Omnibond Systems, L.L.C. |
4 | */ |
5 | |
6 | #include "protocol.h" |
7 | #include "orangefs-kernel.h" |
8 | #include "orangefs-bufmap.h" |
9 | |
/*
 * One chunk of directory data returned by a single readdir upcall.
 * parse_readdir() overlays this header on the start of the trailer
 * buffer; fill_from_part() finds the encoded entries
 * sizeof(struct orangefs_readdir_response_s) bytes past the start of
 * the part.
 */
struct orangefs_dir_part {
	/* Next part in the singly linked list, or NULL for the last one. */
	struct orangefs_dir_part *next;
	/* Trailer size minus the readdir response header. */
	size_t len;
};
14 | |
/*
 * Per-open-file directory state, stored in file->private_data by
 * orangefs_dir_open() and freed by orangefs_dir_release().
 */
struct orangefs_dir {
	/*
	 * Token sent with the next readdir upcall and replaced by the
	 * token in each response; starts at ORANGEFS_ITERATE_START.
	 */
	__u64 token;
	/* Head of the list of parts received so far (NULL if none). */
	struct orangefs_dir_part *part;
	/*
	 * Position just past the last part received: the number of
	 * parts, counting the synthesized part zero, shifted left by
	 * PART_SHIFT.
	 */
	loff_t end;
	/* Last error recorded; once non-zero, iterate returns it at once. */
	int error;
};
21 | |
/*
 * Layout of ctx->pos: the bits at and above PART_SHIFT select the part,
 * the bits below it are the byte offset inside that part.
 */
#define PART_SHIFT	(24)
#define PART_SIZE	(1 << PART_SHIFT)
#define PART_MASK	(~(PART_SIZE - 1))
25 | |
26 | /* |
27 | * There can be up to 512 directory entries. Each entry is encoded as |
28 | * follows: |
29 | * 4 bytes: string size (n) |
30 | * n bytes: string |
31 | * 1 byte: trailing zero |
32 | * padding to 8 bytes |
33 | * 16 bytes: khandle |
34 | * padding to 8 bytes |
35 | * |
36 | * The trailer_buf starts with a struct orangefs_readdir_response_s |
37 | * which must be skipped to get to the directory data. |
38 | * |
39 | * The data which is received from the userspace daemon is termed a |
40 | * part and is stored in a linked list in case more than one part is |
41 | * needed for a large directory. |
42 | * |
 * The position pointer (ctx->pos) encodes the part and offset at which
 * to begin reading. Bits above PART_SHIFT encode the part and bits
45 | * below PART_SHIFT encode the offset. Parts are stored in a linked |
46 | * list which grows as data is received from the server. The overhead |
47 | * associated with managing the list is presumed to be small compared to |
48 | * the overhead of communicating with the server. |
49 | * |
50 | * As data is received from the server, it is placed at the end of the |
51 | * part list. Data is parsed from the current position as it is needed. |
52 | * When data is determined to be corrupt, it is either because the |
53 | * userspace component has sent back corrupt data or because the file |
54 | * pointer has been moved to an invalid location. Since the two cannot |
55 | * be differentiated, return EIO. |
56 | * |
 * Part zero is synthesized to contain `.' and `..'. Part one is the
 * first part of the part list.
59 | */ |
60 | |
61 | static int do_readdir(struct orangefs_inode_s *oi, |
62 | struct orangefs_dir *od, struct dentry *dentry, |
63 | struct orangefs_kernel_op_s *op) |
64 | { |
65 | struct orangefs_readdir_response_s *resp; |
66 | int bufi, r; |
67 | |
68 | /* |
69 | * Despite the badly named field, readdir does not use shared |
70 | * memory. However, there are a limited number of readdir |
71 | * slots, which must be allocated here. This flag simply tells |
72 | * the op scheduler to return the op here for retry. |
73 | */ |
74 | op->uses_shared_memory = 1; |
75 | op->upcall.req.readdir.refn = oi->refn; |
76 | op->upcall.req.readdir.token = od->token; |
77 | op->upcall.req.readdir.max_dirent_count = |
78 | ORANGEFS_MAX_DIRENT_COUNT_READDIR; |
79 | |
80 | again: |
81 | bufi = orangefs_readdir_index_get(); |
82 | if (bufi < 0) { |
83 | od->error = bufi; |
84 | return bufi; |
85 | } |
86 | |
87 | op->upcall.req.readdir.buf_index = bufi; |
88 | |
89 | r = service_operation(op, op_name: "orangefs_readdir" , |
90 | get_interruptible_flag(dentry->d_inode)); |
91 | |
92 | orangefs_readdir_index_put(buffer_index: bufi); |
93 | |
94 | if (op_state_purged(op)) { |
95 | if (r == -EAGAIN) { |
96 | vfree(addr: op->downcall.trailer_buf); |
97 | goto again; |
98 | } else if (r == -EIO) { |
99 | vfree(addr: op->downcall.trailer_buf); |
100 | od->error = r; |
101 | return r; |
102 | } |
103 | } |
104 | |
105 | if (r < 0) { |
106 | vfree(addr: op->downcall.trailer_buf); |
107 | od->error = r; |
108 | return r; |
109 | } else if (op->downcall.status) { |
110 | vfree(addr: op->downcall.trailer_buf); |
111 | od->error = op->downcall.status; |
112 | return op->downcall.status; |
113 | } |
114 | |
115 | /* |
116 | * The maximum size is size per entry times the 512 entries plus |
117 | * the header. This is well under the limit. |
118 | */ |
119 | if (op->downcall.trailer_size > PART_SIZE) { |
120 | vfree(addr: op->downcall.trailer_buf); |
121 | od->error = -EIO; |
122 | return -EIO; |
123 | } |
124 | |
125 | resp = (struct orangefs_readdir_response_s *) |
126 | op->downcall.trailer_buf; |
127 | od->token = resp->token; |
128 | return 0; |
129 | } |
130 | |
131 | static int parse_readdir(struct orangefs_dir *od, |
132 | struct orangefs_kernel_op_s *op) |
133 | { |
134 | struct orangefs_dir_part *part, *new; |
135 | size_t count; |
136 | |
137 | count = 1; |
138 | part = od->part; |
139 | while (part) { |
140 | count++; |
141 | if (part->next) |
142 | part = part->next; |
143 | else |
144 | break; |
145 | } |
146 | |
147 | new = (void *)op->downcall.trailer_buf; |
148 | new->next = NULL; |
149 | new->len = op->downcall.trailer_size - |
150 | sizeof(struct orangefs_readdir_response_s); |
151 | if (!od->part) |
152 | od->part = new; |
153 | else |
154 | part->next = new; |
155 | count++; |
156 | od->end = count << PART_SHIFT; |
157 | |
158 | return 0; |
159 | } |
160 | |
161 | static int orangefs_dir_more(struct orangefs_inode_s *oi, |
162 | struct orangefs_dir *od, struct dentry *dentry) |
163 | { |
164 | struct orangefs_kernel_op_s *op; |
165 | int r; |
166 | |
167 | op = op_alloc(ORANGEFS_VFS_OP_READDIR); |
168 | if (!op) { |
169 | od->error = -ENOMEM; |
170 | return -ENOMEM; |
171 | } |
172 | r = do_readdir(oi, od, dentry, op); |
173 | if (r) { |
174 | od->error = r; |
175 | goto out; |
176 | } |
177 | r = parse_readdir(od, op); |
178 | if (r) { |
179 | od->error = r; |
180 | goto out; |
181 | } |
182 | |
183 | od->error = 0; |
184 | out: |
185 | op_release(op); |
186 | return od->error; |
187 | } |
188 | |
189 | static int fill_from_part(struct orangefs_dir_part *part, |
190 | struct dir_context *ctx) |
191 | { |
192 | const int offset = sizeof(struct orangefs_readdir_response_s); |
193 | struct orangefs_khandle *khandle; |
194 | __u32 *len, padlen; |
195 | loff_t i; |
196 | char *s; |
197 | i = ctx->pos & ~PART_MASK; |
198 | |
199 | /* The file offset from userspace is too large. */ |
200 | if (i > part->len) |
201 | return 1; |
202 | |
203 | /* |
204 | * If the seek pointer is positioned just before an entry it |
205 | * should find the next entry. |
206 | */ |
207 | if (i % 8) |
208 | i = i + (8 - i%8)%8; |
209 | |
210 | while (i < part->len) { |
211 | if (part->len < i + sizeof *len) |
212 | break; |
213 | len = (void *)part + offset + i; |
214 | /* |
215 | * len is the size of the string itself. padlen is the |
216 | * total size of the encoded string. |
217 | */ |
218 | padlen = (sizeof *len + *len + 1) + |
219 | (8 - (sizeof *len + *len + 1)%8)%8; |
220 | if (part->len < i + padlen + sizeof *khandle) |
221 | goto next; |
222 | s = (void *)part + offset + i + sizeof *len; |
223 | if (s[*len] != 0) |
224 | goto next; |
225 | khandle = (void *)part + offset + i + padlen; |
226 | if (!dir_emit(ctx, name: s, namelen: *len, |
227 | ino: orangefs_khandle_to_ino(khandle), |
228 | DT_UNKNOWN)) |
229 | return 0; |
230 | i += padlen + sizeof *khandle; |
231 | i = i + (8 - i%8)%8; |
232 | BUG_ON(i > part->len); |
233 | ctx->pos = (ctx->pos & PART_MASK) | i; |
234 | continue; |
235 | next: |
236 | i += 8; |
237 | } |
238 | return 1; |
239 | } |
240 | |
241 | static int orangefs_dir_fill(struct orangefs_inode_s *oi, |
242 | struct orangefs_dir *od, struct dentry *dentry, |
243 | struct dir_context *ctx) |
244 | { |
245 | struct orangefs_dir_part *part; |
246 | size_t count; |
247 | |
248 | count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; |
249 | |
250 | part = od->part; |
251 | while (part->next && count) { |
252 | count--; |
253 | part = part->next; |
254 | } |
255 | /* This means the userspace file offset is invalid. */ |
256 | if (count) { |
257 | od->error = -EIO; |
258 | return -EIO; |
259 | } |
260 | |
261 | while (part && part->len) { |
262 | int r; |
263 | r = fill_from_part(part, ctx); |
264 | if (r < 0) { |
265 | od->error = r; |
266 | return r; |
267 | } else if (r == 0) { |
268 | /* Userspace buffer is full. */ |
269 | break; |
270 | } else { |
271 | /* |
272 | * The part ran out of data. Move to the next |
273 | * part. */ |
274 | ctx->pos = (ctx->pos & PART_MASK) + |
275 | (1 << PART_SHIFT); |
276 | part = part->next; |
277 | } |
278 | } |
279 | return 0; |
280 | } |
281 | |
282 | static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, |
283 | int whence) |
284 | { |
285 | struct orangefs_dir *od = file->private_data; |
286 | /* |
287 | * Delete the stored data so userspace sees new directory |
288 | * entries. |
289 | */ |
290 | if (!whence && offset < od->end) { |
291 | struct orangefs_dir_part *part = od->part; |
292 | while (part) { |
293 | struct orangefs_dir_part *next = part->next; |
294 | vfree(addr: part); |
295 | part = next; |
296 | } |
297 | od->token = ORANGEFS_ITERATE_START; |
298 | od->part = NULL; |
299 | od->end = 1 << PART_SHIFT; |
300 | } |
301 | return default_llseek(file, offset, whence); |
302 | } |
303 | |
304 | static int orangefs_dir_iterate(struct file *file, |
305 | struct dir_context *ctx) |
306 | { |
307 | struct orangefs_inode_s *oi; |
308 | struct orangefs_dir *od; |
309 | struct dentry *dentry; |
310 | int r; |
311 | |
312 | dentry = file->f_path.dentry; |
313 | oi = ORANGEFS_I(inode: dentry->d_inode); |
314 | od = file->private_data; |
315 | |
316 | if (od->error) |
317 | return od->error; |
318 | |
319 | if (ctx->pos == 0) { |
320 | if (!dir_emit_dot(file, ctx)) |
321 | return 0; |
322 | ctx->pos++; |
323 | } |
324 | if (ctx->pos == 1) { |
325 | if (!dir_emit_dotdot(file, ctx)) |
326 | return 0; |
327 | ctx->pos = 1 << PART_SHIFT; |
328 | } |
329 | |
330 | /* |
331 | * The seek position is in the first synthesized part but is not |
332 | * valid. |
333 | */ |
334 | if ((ctx->pos & PART_MASK) == 0) |
335 | return -EIO; |
336 | |
337 | r = 0; |
338 | |
339 | /* |
340 | * Must read more if the user has sought past what has been read |
341 | * so far. Stop a user who has sought past the end. |
342 | */ |
343 | while (od->token != ORANGEFS_ITERATE_END && |
344 | ctx->pos > od->end) { |
345 | r = orangefs_dir_more(oi, od, dentry); |
346 | if (r) |
347 | return r; |
348 | } |
349 | if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) |
350 | return -EIO; |
351 | |
352 | /* Then try to fill if there's any left in the buffer. */ |
353 | if (ctx->pos < od->end) { |
354 | r = orangefs_dir_fill(oi, od, dentry, ctx); |
355 | if (r) |
356 | return r; |
357 | } |
358 | |
359 | /* Finally get some more and try to fill. */ |
360 | if (od->token != ORANGEFS_ITERATE_END) { |
361 | r = orangefs_dir_more(oi, od, dentry); |
362 | if (r) |
363 | return r; |
364 | r = orangefs_dir_fill(oi, od, dentry, ctx); |
365 | } |
366 | |
367 | return r; |
368 | } |
369 | |
370 | static int orangefs_dir_open(struct inode *inode, struct file *file) |
371 | { |
372 | struct orangefs_dir *od; |
373 | file->private_data = kmalloc(size: sizeof(struct orangefs_dir), |
374 | GFP_KERNEL); |
375 | if (!file->private_data) |
376 | return -ENOMEM; |
377 | od = file->private_data; |
378 | od->token = ORANGEFS_ITERATE_START; |
379 | od->part = NULL; |
380 | od->end = 1 << PART_SHIFT; |
381 | od->error = 0; |
382 | return 0; |
383 | } |
384 | |
385 | static int orangefs_dir_release(struct inode *inode, struct file *file) |
386 | { |
387 | struct orangefs_dir *od = file->private_data; |
388 | struct orangefs_dir_part *part = od->part; |
389 | while (part) { |
390 | struct orangefs_dir_part *next = part->next; |
391 | vfree(addr: part); |
392 | part = next; |
393 | } |
394 | kfree(objp: od); |
395 | return 0; |
396 | } |
397 | |
398 | const struct file_operations orangefs_dir_operations = { |
399 | .llseek = orangefs_dir_llseek, |
400 | .read = generic_read_dir, |
401 | .iterate_shared = orangefs_dir_iterate, |
402 | .open = orangefs_dir_open, |
403 | .release = orangefs_dir_release |
404 | }; |
405 | |