// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2017 Omnibond Systems, L.L.C.
 */
5
6#include "protocol.h"
7#include "orangefs-kernel.h"
8#include "orangefs-bufmap.h"
9
/*
 * One chunk ("part") of directory data received from the userspace
 * daemon.
 *
 * This header is written over the start of the readdir trailer buffer
 * (see parse_readdir()), so the buffer itself becomes the list node.
 * The encoded directory entries sit further into the same buffer, at
 * an offset of sizeof(struct orangefs_readdir_response_s) from the
 * start of the part.
 */
struct orangefs_dir_part {
	/* Next part in the singly linked list, or NULL for the last one. */
	struct orangefs_dir_part *next;
	/* Number of bytes of encoded entry data carried by this part. */
	size_t len;
};
14
15struct orangefs_dir {
16 __u64 token;
17 struct orangefs_dir_part *part;
18 loff_t end;
19 int error;
20};
21
/*
 * Layout of the directory position (ctx->pos): the bits at and above
 * PART_SHIFT select the part, the bits below it are the byte offset
 * inside that part.
 *
 * PART_SIZE is derived from PART_SHIFT so the two cannot drift apart.
 * All three constants are deliberately plain ints: PART_MASK is then a
 * negative int that sign-extends when combined with a wider position,
 * keeping every high bit of the part number.
 */
#define PART_SHIFT (24)
#define PART_SIZE (1 << PART_SHIFT)
#define PART_MASK (~(PART_SIZE - 1))
25
/*
 * There can be up to 512 directory entries. Each entry is encoded as
 * follows:
 * 4 bytes: string size (n)
 * n bytes: string
 * 1 byte: trailing zero
 * padding to 8 bytes
 * 16 bytes: khandle
 * padding to 8 bytes
 *
 * The trailer_buf starts with a struct orangefs_readdir_response_s
 * which must be skipped to get to the directory data.
 *
 * The data which is received from the userspace daemon is termed a
 * part and is stored in a linked list in case more than one part is
 * needed for a large directory.
 *
 * The position pointer (ctx->pos) encodes the part and offset on which
 * to begin reading at. Bits above PART_SHIFT encode the part and bits
 * below PART_SHIFT encode the offset. Parts are stored in a linked
 * list which grows as data is received from the server. The overhead
 * associated with managing the list is presumed to be small compared to
 * the overhead of communicating with the server.
 *
 * As data is received from the server, it is placed at the end of the
 * part list. Data is parsed from the current position as it is needed.
 * When data is determined to be corrupt, it is either because the
 * userspace component has sent back corrupt data or because the file
 * pointer has been moved to an invalid location. Since the two cannot
 * be differentiated, return EIO.
 *
 * Part zero is synthesized to contain `.' and `..'. Part one is the
 * first part of the part list.
 */
60
61static int do_readdir(struct orangefs_dir *od, struct inode *inode,
62 struct orangefs_kernel_op_s *op)
63{
64 struct orangefs_inode_s *oi = ORANGEFS_I(inode);
65 struct orangefs_readdir_response_s *resp;
66 int bufi, r;
67
68 /*
69 * Despite the badly named field, readdir does not use shared
70 * memory. However, there are a limited number of readdir
71 * slots, which must be allocated here. This flag simply tells
72 * the op scheduler to return the op here for retry.
73 */
74 op->uses_shared_memory = 1;
75 op->upcall.req.readdir.refn = oi->refn;
76 op->upcall.req.readdir.token = od->token;
77 op->upcall.req.readdir.max_dirent_count =
78 ORANGEFS_MAX_DIRENT_COUNT_READDIR;
79
80again:
81 bufi = orangefs_readdir_index_get();
82 if (bufi < 0) {
83 od->error = bufi;
84 return bufi;
85 }
86
87 op->upcall.req.readdir.buf_index = bufi;
88
89 r = service_operation(op, op_name: "orangefs_readdir",
90 get_interruptible_flag(inode));
91
92 orangefs_readdir_index_put(buffer_index: bufi);
93
94 if (op_state_purged(op)) {
95 if (r == -EAGAIN) {
96 vfree(addr: op->downcall.trailer_buf);
97 goto again;
98 } else if (r == -EIO) {
99 vfree(addr: op->downcall.trailer_buf);
100 od->error = r;
101 return r;
102 }
103 }
104
105 if (r < 0) {
106 vfree(addr: op->downcall.trailer_buf);
107 od->error = r;
108 return r;
109 } else if (op->downcall.status) {
110 vfree(addr: op->downcall.trailer_buf);
111 od->error = op->downcall.status;
112 return op->downcall.status;
113 }
114
115 /*
116 * The maximum size is size per entry times the 512 entries plus
117 * the header. This is well under the limit.
118 */
119 if (op->downcall.trailer_size > PART_SIZE) {
120 vfree(addr: op->downcall.trailer_buf);
121 od->error = -EIO;
122 return -EIO;
123 }
124
125 resp = (struct orangefs_readdir_response_s *)
126 op->downcall.trailer_buf;
127 od->token = resp->token;
128 return 0;
129}
130
131static int parse_readdir(struct orangefs_dir *od,
132 struct orangefs_kernel_op_s *op)
133{
134 struct orangefs_dir_part *part, *new;
135 size_t count;
136
137 count = 1;
138 part = od->part;
139 while (part) {
140 count++;
141 if (part->next)
142 part = part->next;
143 else
144 break;
145 }
146
147 new = (void *)op->downcall.trailer_buf;
148 new->next = NULL;
149 new->len = op->downcall.trailer_size -
150 sizeof(struct orangefs_readdir_response_s);
151 if (!od->part)
152 od->part = new;
153 else
154 part->next = new;
155 count++;
156 od->end = count << PART_SHIFT;
157
158 return 0;
159}
160
161static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode)
162{
163 struct orangefs_kernel_op_s *op;
164 int r;
165
166 op = op_alloc(ORANGEFS_VFS_OP_READDIR);
167 if (!op) {
168 od->error = -ENOMEM;
169 return -ENOMEM;
170 }
171 r = do_readdir(od, inode, op);
172 if (r) {
173 od->error = r;
174 goto out;
175 }
176 r = parse_readdir(od, op);
177 if (r) {
178 od->error = r;
179 goto out;
180 }
181
182 od->error = 0;
183out:
184 op_release(op);
185 return od->error;
186}
187
188static int fill_from_part(struct orangefs_dir_part *part,
189 struct dir_context *ctx)
190{
191 const int offset = sizeof(struct orangefs_readdir_response_s);
192 struct orangefs_khandle *khandle;
193 __u32 *len, padlen;
194 loff_t i;
195 char *s;
196 i = ctx->pos & ~PART_MASK;
197
198 /* The file offset from userspace is too large. */
199 if (i > part->len)
200 return 1;
201
202 /*
203 * If the seek pointer is positioned just before an entry it
204 * should find the next entry.
205 */
206 if (i % 8)
207 i = i + (8 - i%8)%8;
208
209 while (i < part->len) {
210 if (part->len < i + sizeof *len)
211 break;
212 len = (void *)part + offset + i;
213 /*
214 * len is the size of the string itself. padlen is the
215 * total size of the encoded string.
216 */
217 padlen = (sizeof *len + *len + 1) +
218 (8 - (sizeof *len + *len + 1)%8)%8;
219 if (part->len < i + padlen + sizeof *khandle)
220 goto next;
221 s = (void *)part + offset + i + sizeof *len;
222 if (s[*len] != 0)
223 goto next;
224 khandle = (void *)part + offset + i + padlen;
225 if (!dir_emit(ctx, name: s, namelen: *len,
226 ino: orangefs_khandle_to_ino(khandle),
227 DT_UNKNOWN))
228 return 0;
229 i += padlen + sizeof *khandle;
230 i = i + (8 - i%8)%8;
231 BUG_ON(i > part->len);
232 ctx->pos = (ctx->pos & PART_MASK) | i;
233 continue;
234next:
235 i += 8;
236 }
237 return 1;
238}
239
240static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx)
241{
242 struct orangefs_dir_part *part;
243 size_t count;
244
245 count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
246
247 part = od->part;
248 while (part->next && count) {
249 count--;
250 part = part->next;
251 }
252 /* This means the userspace file offset is invalid. */
253 if (count) {
254 od->error = -EIO;
255 return -EIO;
256 }
257
258 while (part && part->len) {
259 int r;
260 r = fill_from_part(part, ctx);
261 if (r < 0) {
262 od->error = r;
263 return r;
264 } else if (r == 0) {
265 /* Userspace buffer is full. */
266 break;
267 } else {
268 /*
269 * The part ran out of data. Move to the next
270 * part. */
271 ctx->pos = (ctx->pos & PART_MASK) +
272 (1 << PART_SHIFT);
273 part = part->next;
274 }
275 }
276 return 0;
277}
278
279static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
280 int whence)
281{
282 struct orangefs_dir *od = file->private_data;
283 /*
284 * Delete the stored data so userspace sees new directory
285 * entries.
286 */
287 if (!whence && offset < od->end) {
288 struct orangefs_dir_part *part = od->part;
289 while (part) {
290 struct orangefs_dir_part *next = part->next;
291 vfree(addr: part);
292 part = next;
293 }
294 od->token = ORANGEFS_ITERATE_START;
295 od->part = NULL;
296 od->end = 1 << PART_SHIFT;
297 }
298 return default_llseek(file, offset, whence);
299}
300
301static int orangefs_dir_iterate(struct file *file,
302 struct dir_context *ctx)
303{
304 struct orangefs_dir *od = file->private_data;
305 struct inode *inode = file_inode(f: file);
306 int r;
307
308 if (od->error)
309 return od->error;
310
311 if (ctx->pos == 0) {
312 if (!dir_emit_dot(file, ctx))
313 return 0;
314 ctx->pos++;
315 }
316 if (ctx->pos == 1) {
317 if (!dir_emit_dotdot(file, ctx))
318 return 0;
319 ctx->pos = 1 << PART_SHIFT;
320 }
321
322 /*
323 * The seek position is in the first synthesized part but is not
324 * valid.
325 */
326 if ((ctx->pos & PART_MASK) == 0)
327 return -EIO;
328
329 r = 0;
330
331 /*
332 * Must read more if the user has sought past what has been read
333 * so far. Stop a user who has sought past the end.
334 */
335 while (od->token != ORANGEFS_ITERATE_END &&
336 ctx->pos > od->end) {
337 r = orangefs_dir_more(od, inode);
338 if (r)
339 return r;
340 }
341 if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
342 return -EIO;
343
344 /* Then try to fill if there's any left in the buffer. */
345 if (ctx->pos < od->end) {
346 r = orangefs_dir_fill(od, ctx);
347 if (r)
348 return r;
349 }
350
351 /* Finally get some more and try to fill. */
352 if (od->token != ORANGEFS_ITERATE_END) {
353 r = orangefs_dir_more(od, inode);
354 if (r)
355 return r;
356 r = orangefs_dir_fill(od, ctx);
357 }
358
359 return r;
360}
361
362static int orangefs_dir_open(struct inode *inode, struct file *file)
363{
364 struct orangefs_dir *od;
365 file->private_data = kmalloc(size: sizeof(struct orangefs_dir),
366 GFP_KERNEL);
367 if (!file->private_data)
368 return -ENOMEM;
369 od = file->private_data;
370 od->token = ORANGEFS_ITERATE_START;
371 od->part = NULL;
372 od->end = 1 << PART_SHIFT;
373 od->error = 0;
374 return 0;
375}
376
377static int orangefs_dir_release(struct inode *inode, struct file *file)
378{
379 struct orangefs_dir *od = file->private_data;
380 struct orangefs_dir_part *part = od->part;
381 while (part) {
382 struct orangefs_dir_part *next = part->next;
383 vfree(addr: part);
384 part = next;
385 }
386 kfree(objp: od);
387 return 0;
388}
389
390const struct file_operations orangefs_dir_operations = {
391 .llseek = orangefs_dir_llseek,
392 .read = generic_read_dir,
393 .iterate_shared = orangefs_dir_iterate,
394 .open = orangefs_dir_open,
395 .release = orangefs_dir_release
396};
397

/* Source: linux/fs/orangefs/dir.c */