20993df143b571ab65d8d1e5e13eb2f30f2b0a8a
[deliverable/linux.git] / drivers / staging / lustre / lustre / llite / file.c
1 /*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26 /*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32 /*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
50
51 #include "../include/cl_object.h"
52
53 static int
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 bool *lease_broken);
58
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
62
63 static struct ll_file_data *ll_file_data_get(void)
64 {
65 struct ll_file_data *fd;
66
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
68 if (fd == NULL)
69 return NULL;
70 fd->fd_write_failed = false;
71 return fd;
72 }
73
74 static void ll_file_data_put(struct ll_file_data *fd)
75 {
76 if (fd != NULL)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78 }
79
/**
 * Pack the inode's current attributes (mode, times, size, blocks, flags),
 * its IO epoch, the MDS capability and, optionally, an open file handle
 * into @op_data for an MDS RPC.
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
{
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	/* ia_attr_flags lives in the Lustre-extended iattr; convert the
	 * kernel inode flags to their ext (on-wire) representation. */
	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
					ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
	if (fh)
		op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);

	/* Tell the MDS that data was modified under this open, so it can
	 * clear the corresponding flag on close. */
	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
100
/**
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	/* These attributes are always sent back to the MDS on close. */
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	/* Without Size-on-MDS support (or for non-regular files) the client
	 * reports size/blocks itself; otherwise close the IO epoch so the
	 * MDS can gather Size-on-MDS attributes. */
	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
125
/**
 * Send a CLOSE RPC to the MDS for open handle @och on @inode.
 *
 * If @data_version is non-NULL this close is part of an HSM release:
 * MDS_HSM_RELEASE is set and the given data version is sent along.
 *
 * Consumes @och: it is freed here unless the inode must still wait for a
 * DONE_WRITING epoch close, in which case that path takes ownership.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle %#llx\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		/* XXX We leak openhandle and request here. */
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	/* Remember before md_close() whether this close ends the IO epoch;
	 * needed for the DONE_WRITING decision in the out: path below. */
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			/* Best effort: the close itself succeeded. */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	/* For HSM release, verify the server actually released the file. */
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	/* If the epoch is still open for a writable handle under SoM, defer
	 * freeing @och: the DONE_WRITING path will finish the close. */
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
217
218 int ll_md_real_close(struct inode *inode, fmode_t fmode)
219 {
220 struct ll_inode_info *lli = ll_i2info(inode);
221 struct obd_client_handle **och_p;
222 struct obd_client_handle *och;
223 __u64 *och_usecount;
224 int rc = 0;
225
226 if (fmode & FMODE_WRITE) {
227 och_p = &lli->lli_mds_write_och;
228 och_usecount = &lli->lli_open_fd_write_count;
229 } else if (fmode & FMODE_EXEC) {
230 och_p = &lli->lli_mds_exec_och;
231 och_usecount = &lli->lli_open_fd_exec_count;
232 } else {
233 LASSERT(fmode & FMODE_READ);
234 och_p = &lli->lli_mds_read_och;
235 och_usecount = &lli->lli_open_fd_read_count;
236 }
237
238 mutex_lock(&lli->lli_och_mutex);
239 if (*och_usecount > 0) {
240 /* There are still users of this handle, so skip
241 * freeing it. */
242 mutex_unlock(&lli->lli_och_mutex);
243 return 0;
244 }
245
246 och=*och_p;
247 *och_p = NULL;
248 mutex_unlock(&lli->lli_och_mutex);
249
250 if (och != NULL) {
251 /* There might be a race and this handle may already
252 be closed. */
253 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
254 inode, och, NULL);
255 }
256
257 return rc;
258 }
259
/**
 * Close the MDS state attached to file descriptor @file on @inode.
 *
 * Releases any group lock and leftover lease, closes a private handle
 * (fd_och) directly if present, otherwise decrements the shared open
 * count and — when no cached OPEN DLM lock remains — performs the real
 * MDS close.  Always detaches and frees the ll_file_data.
 */
static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		       struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		/* Private handle (taken over for a lease): close it directly,
		 * bypassing the shared handle accounting below. */
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	if (file->f_dentry->d_inode) { /* Can this ever be false? */
		int lockmode;
		__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
		struct lustre_handle lockh;
		struct inode *inode = file->f_dentry->d_inode;
		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};

		/* Drop this descriptor's reference on the mode-specific
		 * open count under the och mutex. */
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_omode & FMODE_WRITE) {
			lockmode = LCK_CW;
			LASSERT(lli->lli_open_fd_write_count);
			lli->lli_open_fd_write_count--;
		} else if (fd->fd_omode & FMODE_EXEC) {
			lockmode = LCK_PR;
			LASSERT(lli->lli_open_fd_exec_count);
			lli->lli_open_fd_exec_count--;
		} else {
			lockmode = LCK_CR;
			LASSERT(lli->lli_open_fd_read_count);
			lli->lli_open_fd_read_count--;
		}
		mutex_unlock(&lli->lli_och_mutex);

		/* TEST_LOCK match only: if no cached OPEN lock covers this
		 * mode, the MDS must be told about the close now. */
		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
				   LDLM_IBITS, &policy, lockmode,
				   &lockh)) {
			rc = ll_md_real_close(file->f_dentry->d_inode,
					      fd->fd_omode);
		}
	} else {
		CERROR("Releasing a file %p with negative dentry %p. Name %s",
		       file, file->f_dentry, file->f_dentry->d_name.name);
	}

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);

	return rc;
}
332
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL bookkeeping is torn down when the filesystem
	 * root itself is released. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The mount root was never opened on the MDS (see ll_file_open),
	 * so there is nothing to close — just drop the file data. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	/* Collect any deferred async write errors before closing. */
	if (!S_ISDIR(inode->i_mode)) {
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
392
/**
 * Send an open intent to the MDS for @file and attach the resulting
 * state (inode attributes, lock data) to the dentry/intent.
 *
 * @lmm/@lmmsize, when non-NULL/non-zero, carry striping information and
 * suppress the request for an OPEN lock.
 */
static int ll_intent_file_open(struct file *file, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
	struct dentry *parent = file->f_dentry->d_parent;
	const char *name = file->f_dentry->d_name.name;
	const int len = file->f_dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	if (!parent)
		return -ENOENT;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode,
				     file->f_dentry->d_inode, name, len,
				     O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	/* Re-open by FID: the file is already known, no lookup needed. */
	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		 * with messages with -ESTALE errors.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		/* Drop the server-side openhandle before bailing out. */
		ll_release_openhandle(file->f_dentry, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
				 itp, NULL);

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	return rc;
}
464
465 /**
466 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
467 * not believe attributes if a few ioepoch holders exist. Attributes for
468 * previous ioepoch if new one is opened are also skipped by MDS.
469 */
470 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
471 {
472 if (ioepoch && lli->lli_ioepoch != ioepoch) {
473 lli->lli_ioepoch = ioepoch;
474 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
475 ioepoch, PFID(&lli->lli_fid));
476 }
477 }
478
/* Fill an obd_client_handle from the MDT reply attached to intent @it
 * (open handle, FID, lease cookie, flags) and register it for open
 * replay after recovery. */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->handle;
	och->och_fid = body->fid1;
	/* The lock handle of the open intent doubles as the lease cookie. */
	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}
494
/**
 * Finish the client-side part of an open: fill @och from the intent
 * reply (if a new MDS handle was obtained), attach @fd to @file and
 * initialize read-ahead state.
 *
 * On success @fd is owned by @file; the caller must not free it.
 */
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		/* Record the IO epoch the MDS opened for this handle. */
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the effective open mode for the eventual close. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
	return 0;
}
523
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_openerr, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* The first opener of a directory becomes the owner of the
		 * statahead thread for it. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* The mount root needs no MDS open handle. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			/* The intent now carries a disposition; retake the
			 * mutex and retry the whole handle selection. */
			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* @fd now belongs to the file; don't free it in the error path. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		GOTO(out_och_free, rc);
	}
	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	/* Drop the request reference held by the open intent, if any. */
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
720
721 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
722 struct ldlm_lock_desc *desc, void *data, int flag)
723 {
724 int rc;
725 struct lustre_handle lockh;
726
727 switch (flag) {
728 case LDLM_CB_BLOCKING:
729 ldlm_lock2handle(lock, &lockh);
730 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
731 if (rc < 0) {
732 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
733 return rc;
734 }
735 break;
736 case LDLM_CB_CANCELING:
737 /* do nothing */
738 break;
739 }
740 return 0;
741 }
742
/**
 * Acquire a lease and open the file.
 *
 * \param fmode      exactly one of FMODE_READ or FMODE_WRITE
 * \param open_flags extra MDS open flags for the intent
 *
 * If @file is non-NULL, its existing MDS openhandle is taken over (only
 * possible when this descriptor is the sole opener of that mode) so the
 * MDT can tell the lease request comes from the same owner.
 *
 * \retval an obd_client_handle holding the lease, or ERR_PTR(rc).
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		/* The requested lease mode must be within the file's open
		 * mode, and exec opens cannot carry a lease. */
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			/* Only one lease per descriptor. */
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			/* Take over the shared handle only if we are the
			 * sole opener of this mode. */
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
			    ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
		       it.d.lustre.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	return och;

out_close:
	/* Undo the successful open before reporting the protocol error. */
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	OBD_FREE_PTR(och);
	return ERR_PTR(rc);
}
876
877 /**
878 * Release lease and close the file.
879 * It will check if the lease has ever broken.
880 */
881 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
882 bool *lease_broken)
883 {
884 struct ldlm_lock *lock;
885 bool cancelled = true;
886 int rc;
887
888 lock = ldlm_handle2lock(&och->och_lease_handle);
889 if (lock != NULL) {
890 lock_res_and_lock(lock);
891 cancelled = ldlm_is_cancel(lock);
892 unlock_res_and_lock(lock);
893 ldlm_lock_put(lock);
894 }
895
896 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
897 PFID(&ll_i2info(inode)->lli_fid), cancelled);
898
899 if (!cancelled)
900 ldlm_cli_cancel(&och->och_lease_handle, 0);
901 if (lease_broken != NULL)
902 *lease_broken = cancelled;
903
904 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
905 NULL);
906 return rc;
907 }
908
/* Fills the obdo with the attributes for the lsm */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obd_capa *capa, struct obdo *obdo,
			  __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info oinfo = { { { 0 } } };
	int rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	oinfo.oi_capa = capa;
	/* @sync requests the getattr under a server-side lock. */
	if (sync) {
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	/* Fire the getattr to all stripes asynchronously and wait for the
	 * whole set to complete. */
	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("can't allocate ptlrpc set\n");
		rc = -ENOMEM;
	} else {
		rc = obd_getattr_async(exp, &oinfo, set);
		if (rc == 0)
			rc = ptlrpc_set_wait(set);
		ptlrpc_set_destroy(set);
	}
	/* Keep only the attributes the OSTs are authoritative for. */
	if (rc == 0)
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
954
/**
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 */
int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
		     __u64 ioepoch, int sync)
{
	struct obd_capa *capa = ll_mdscapa_get(inode);
	struct lov_stripe_md *lsm;
	int rc;

	lsm = ccc_inode_lsm_get(inode);
	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
			    capa, obdo, ioepoch, sync);
	capa_put(capa);
	if (rc == 0) {
		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;

		/* Merge the OST-provided attributes into the inode. */
		obdo_refresh_inode(inode, obdo, obdo->o_valid);
		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
		       (unsigned long long)inode->i_blocks,
		       (unsigned long)ll_inode_blksize(inode));
	}
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
982
/**
 * Merge the size and timestamps cached in the cl_object (from the OSTs)
 * with the MDS-provided values in the inode, keeping the newer of each,
 * all under the inode size lock.
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;

	lvb.lvb_size = i_size_read(inode);
	lvb.lvb_blocks = inode->i_blocks;
	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
	lvb.lvb_atime = LTIME_S(inode->i_atime);
	lvb.lvb_ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Keep the newer timestamp of MDS vs. OST for each field. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
		       PFID(&lli->lli_fid), attr->cat_size);
		/* Size lock is already held; write size without relocking. */
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1030
1031 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1032 lstat_t *st)
1033 {
1034 struct obdo obdo = { 0 };
1035 int rc;
1036
1037 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1038 if (rc == 0) {
1039 st->st_size = obdo.o_size;
1040 st->st_blocks = obdo.o_blocks;
1041 st->st_mtime = obdo.o_mtime;
1042 st->st_atime = obdo.o_atime;
1043 st->st_ctime = obdo.o_ctime;
1044 }
1045 return rc;
1046 }
1047
1048 static bool file_is_noatime(const struct file *file)
1049 {
1050 const struct vfsmount *mnt = file->f_path.mnt;
1051 const struct inode *inode = file->f_path.dentry->d_inode;
1052
1053 /* Adapted from file_accessed() and touch_atime().*/
1054 if (file->f_flags & O_NOATIME)
1055 return true;
1056
1057 if (inode->i_flags & S_NOATIME)
1058 return true;
1059
1060 if (IS_NOATIME(inode))
1061 return true;
1062
1063 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1064 return true;
1065
1066 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1067 return true;
1068
1069 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1070 return true;
1071
1072 return false;
1073 }
1074
1075 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1076 {
1077 struct inode *inode = file->f_dentry->d_inode;
1078
1079 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1080 if (write) {
1081 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1082 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1083 file->f_flags & O_DIRECT ||
1084 IS_SYNC(inode);
1085 }
1086 io->ci_obj = ll_i2info(inode)->lli_clob;
1087 io->ci_lockreq = CILR_MAYBE;
1088 if (ll_file_nolock(file)) {
1089 io->ci_lockreq = CILR_NEVER;
1090 io->ci_no_srvlock = 1;
1091 } else if (file->f_flags & O_APPEND) {
1092 io->ci_lockreq = CILR_MANDATORY;
1093 }
1094
1095 io->ci_noatime = file_is_noatime(file);
1096 }
1097
/**
 * Common engine for read/write/splice IO.
 *
 * Builds a cl_io, takes the llite-level serialization (per-inode write
 * mutex for normal writes, truncate semaphore for reads) and runs
 * cl_io_loop().  If the layout changed mid-IO and nothing was
 * transferred, the whole IO is restarted from scratch.
 *
 * \param env    lu_env of the current thread
 * \param args   IO_NORMAL (iter/iocb) or IO_SPLICE arguments
 * \param file   file the IO applies to
 * \param iot    CIT_READ or CIT_WRITE
 * \param ppos   starting offset; updated to the final position
 * \param count  number of bytes requested
 *
 * \retval bytes transferred on success, negative errno on failure
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct cl_io *io;
	ssize_t result;

restart:
	io = ccc_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		struct vvp_io *vio = vvp_env_io(env);
		struct ccc_io *cio = ccc_env_io(env);
		int write_mutex_locked = 0;

		cio->cui_fd = LUSTRE_FPRIVATE(file);
		vio->cui_io_subtype = args->via_io_subtype;

		switch (vio->cui_io_subtype) {
		case IO_NORMAL:
			cio->cui_iter = args->u.normal.via_iter;
			cio->cui_iocb = args->u.normal.via_iocb;
			/* Writes are serialized per inode unless the caller
			 * holds a group lock; reads only need to be kept
			 * out of a concurrent truncate. */
			if ((iot == CIT_WRITE) &&
			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				if (mutex_lock_interruptible(&lli->
							lli_write_mutex))
					GOTO(out, result = -ERESTARTSYS);
				write_mutex_locked = 1;
			} else if (iot == CIT_READ) {
				down_read(&lli->lli_trunc_sem);
			}
			break;
		case IO_SPLICE:
			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
			vio->u.splice.cui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
			LBUG();
		}
		result = cl_io_loop(env, io);
		/* drop whichever serialization was taken above */
		if (write_mutex_locked)
			mutex_unlock(&lli->lli_write_mutex);
		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
			up_read(&lli->lli_trunc_sem);
	} else {
		/* cl_io_rw_init() handled IO */
		result = io->ci_result;
	}

	if (io->ci_nob > 0) {
		/* partial success: report bytes moved and advance *ppos */
		result = io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos;
	}
	GOTO(out, result);
out:
	cl_io_fini(env, io);
	/* If any bit been read/written (result != 0), we just return
	 * short read/write instead of restart io. */
	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
		       iot == CIT_READ ? "read" : "write",
		       file->f_dentry->d_name.name, *ppos, count);
		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result >= 0)
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result >= 0) {
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result != -ERESTARTSYS) {
			/* remember real write failures so fsync can report
			 * them; -ERESTARTSYS is not a failure */
			fd->fd_write_failed = true;
		}
	}

	return result;
}
1185
1186 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1187 {
1188 struct lu_env *env;
1189 struct vvp_io_args *args;
1190 ssize_t result;
1191 int refcheck;
1192
1193 env = cl_env_get(&refcheck);
1194 if (IS_ERR(env))
1195 return PTR_ERR(env);
1196
1197 args = vvp_env_args(env, IO_NORMAL);
1198 args->u.normal.via_iter = to;
1199 args->u.normal.via_iocb = iocb;
1200
1201 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1202 &iocb->ki_pos, iov_iter_count(to));
1203 cl_env_put(env, &refcheck);
1204 return result;
1205 }
1206
1207 /*
1208 * Write to a file (through the page cache).
1209 */
1210 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1211 {
1212 struct lu_env *env;
1213 struct vvp_io_args *args;
1214 ssize_t result;
1215 int refcheck;
1216
1217 env = cl_env_get(&refcheck);
1218 if (IS_ERR(env))
1219 return PTR_ERR(env);
1220
1221 args = vvp_env_args(env, IO_NORMAL);
1222 args->u.normal.via_iter = from;
1223 args->u.normal.via_iocb = iocb;
1224
1225 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1226 &iocb->ki_pos, iov_iter_count(from));
1227 cl_env_put(env, &refcheck);
1228 return result;
1229 }
1230
1231 /*
1232 * Send file content (through pagecache) somewhere with helper
1233 */
1234 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1235 struct pipe_inode_info *pipe, size_t count,
1236 unsigned int flags)
1237 {
1238 struct lu_env *env;
1239 struct vvp_io_args *args;
1240 ssize_t result;
1241 int refcheck;
1242
1243 env = cl_env_get(&refcheck);
1244 if (IS_ERR(env))
1245 return PTR_ERR(env);
1246
1247 args = vvp_env_args(env, IO_SPLICE);
1248 args->u.splice.via_pipe = pipe;
1249 args->u.splice.via_flags = flags;
1250
1251 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1252 cl_env_put(env, &refcheck);
1253 return result;
1254 }
1255
/**
 * Ask the OSTs to recreate the object @oi on stripe index @ost_idx
 * using the inode's current layout.
 *
 * Works on a private copy of the lsm so obd_create() can modify it
 * freely; the copy and the obdo are released on all paths.
 *
 * \retval 0 on success; -ENOENT if the file has no objects,
 *         -ENOMEM on allocation failure, or the obd_create() error
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm))
		GOTO(out, rc = -ENOENT);

	/* size of an lsm with its trailing per-stripe lov_oinfo array */
	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL)
		GOTO(out, rc = -ENOMEM);

	/* o_nlink carries the target OST index for the recreate request */
	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	/* hand obd_create() a writable copy of the layout */
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	GOTO(out, rc);
out:
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1299
1300 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1301 {
1302 struct ll_recreate_obj ucreat;
1303 struct ost_id oi;
1304
1305 if (!capable(CFS_CAP_SYS_ADMIN))
1306 return -EPERM;
1307
1308 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1309 sizeof(ucreat)))
1310 return -EFAULT;
1311
1312 ostid_set_seq_mdt0(&oi);
1313 ostid_set_id(&oi, ucreat.lrc_id);
1314 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1315 }
1316
1317 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1318 {
1319 struct lu_fid fid;
1320 struct ost_id oi;
1321 u32 ost_idx;
1322
1323 if (!capable(CFS_CAP_SYS_ADMIN))
1324 return -EPERM;
1325
1326 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1327 return -EFAULT;
1328
1329 fid_to_ostid(&fid, &oi);
1330 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1331 return ll_lov_recreate(inode, &oi, ost_idx);
1332 }
1333
/**
 * Set the striping EA on @inode by re-opening the file with the given
 * lov_user_md attached to the open intent.  Fails with -EEXIST if the
 * file already has a layout.
 *
 * Note the unusual label layout: out_req_free sits after the function
 * epilogue so the request is only dropped on the it_status error path
 * before jumping back to the common exit.
 *
 * \retval 0 on success, negative errno on failure
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	/* a layout can only be set once; -EEXIST if one is present */
	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		GOTO(out, rc = -EEXIST);
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
	if (rc)
		GOTO(out_unlock, rc);
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		GOTO(out_req_free, rc);

	/* close the MDS open handle obtained by the intent above */
	ll_release_openhandle(file->f_dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is NULL here; put is a no-op kept for symmetry */
	ccc_inode_lsm_put(inode, lsm);
out:
	cl_lov_delay_create_clear(&file->f_flags);
	return rc;
out_req_free:
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1370
/**
 * Fetch the striping EA for @filename (relative to directory @inode)
 * from the MDS.
 *
 * On success *lmmp points INTO the reply buffer of *request: the caller
 * owns *request and must ptlrpc_req_finished() it, which also frees the
 * lmm.  The lmm is converted to host endianness before return.
 *
 * \retval 0 on success; -ENODATA if no striping EA exists,
 *         -EPROTO on an unrecognized magic, or an MDC error
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	/* ask for the striping EA of either a file or a directory */
	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0) {
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* released files carry no valid per-object entries */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				    stripe_count);
		}
	}

out:
	/* outputs are valid even on error: lmm may be NULL, req must
	 * still be finished by the caller */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1454
1455 static int ll_lov_setea(struct inode *inode, struct file *file,
1456 unsigned long arg)
1457 {
1458 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1459 struct lov_user_md *lump;
1460 int lum_size = sizeof(struct lov_user_md) +
1461 sizeof(struct lov_user_ost_data);
1462 int rc;
1463
1464 if (!capable(CFS_CAP_SYS_ADMIN))
1465 return -EPERM;
1466
1467 OBD_ALLOC_LARGE(lump, lum_size);
1468 if (lump == NULL)
1469 return -ENOMEM;
1470
1471 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1472 OBD_FREE_LARGE(lump, lum_size);
1473 return -EFAULT;
1474 }
1475
1476 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1477
1478 OBD_FREE_LARGE(lump, lum_size);
1479 return rc;
1480 }
1481
1482 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1483 unsigned long arg)
1484 {
1485 struct lov_user_md_v3 lumv3;
1486 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1487 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1488 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1489 int lum_size, rc;
1490 int flags = FMODE_WRITE;
1491
1492 /* first try with v1 which is smaller than v3 */
1493 lum_size = sizeof(struct lov_user_md_v1);
1494 if (copy_from_user(lumv1, lumv1p, lum_size))
1495 return -EFAULT;
1496
1497 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1498 lum_size = sizeof(struct lov_user_md_v3);
1499 if (copy_from_user(&lumv3, lumv3p, lum_size))
1500 return -EFAULT;
1501 }
1502
1503 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1504 if (rc == 0) {
1505 struct lov_stripe_md *lsm;
1506 __u32 gen;
1507
1508 put_user(0, &lumv1p->lmm_stripe_count);
1509
1510 ll_layout_refresh(inode, &gen);
1511 lsm = ccc_inode_lsm_get(inode);
1512 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1513 0, lsm, (void *)arg);
1514 ccc_inode_lsm_put(inode, lsm);
1515 }
1516 return rc;
1517 }
1518
1519 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1520 {
1521 struct lov_stripe_md *lsm;
1522 int rc = -ENODATA;
1523
1524 lsm = ccc_inode_lsm_get(inode);
1525 if (lsm != NULL)
1526 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1527 lsm, (void *)arg);
1528 ccc_inode_lsm_put(inode, lsm);
1529 return rc;
1530 }
1531
/**
 * LL_IOC_GROUP_LOCK handler: take a group (GID) extent lock for @file.
 *
 * The fd state is checked under lli_lock, but cl_get_grouplock() must
 * sleep, so the lock is dropped and the check repeated afterwards to
 * catch a racing thread; the loser releases its freshly acquired lock.
 *
 * \param arg the group lock gid
 *
 * \retval 0 on success; -EOPNOTSUPP with nolock mounts, -EINVAL if a
 *         group lock is already held, or the cl_get_grouplock() error
 */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* may block, so the spinlock cannot be held across this call */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	/* re-check: another thread may have installed a lock meanwhile */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1573
/**
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on @file.
 *
 * The fd state is detached under lli_lock; the (blocking) release of
 * the underlying cl lock happens after the spinlock is dropped.
 *
 * \param arg the gid the caller believes it holds; must match
 *
 * \retval 0 on success, -EINVAL if no/mismatched group lock is held
 */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* detach from the fd under the lock, release outside it */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1604
/**
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval 0     success
 * \retval <0    failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		return 0;

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		return 0;

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	/* transfer the MDS open handle from the intent into och */
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       inode, och, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	return rc;
}
1648
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * \param fiemap    user fiemap request/reply buffer of @num_bytes,
 *                  filled in place via obd_get_info(KEY_FIEMAP)
 *
 * \retval 0 on success; -EBADR for unsupported flags (the supported
 *         set is written back into fm_flags), -ENOENT without a
 *         layout, -EOPNOTSUPP for striped files without
 *         FIEMAP_FLAG_DEVICE_ORDER, or an OSC error
 */
static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
			size_t num_bytes)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	__u32 vallen = num_bytes;
	int rc;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* tell the caller which flags we do support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		return -ENOENT;

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
		GOTO(out, rc = -EOPNOTSUPP);

	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		GOTO(out, rc = 0);
	}

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
			  fiemap, lsm);
	if (rc)
		CERROR("obd_get_info failed: rc = %d\n", rc);

out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1708
1709 int ll_fid2path(struct inode *inode, void __user *arg)
1710 {
1711 struct obd_export *exp = ll_i2mdexp(inode);
1712 const struct getinfo_fid2path __user *gfin = arg;
1713 struct getinfo_fid2path *gfout;
1714 u32 pathlen;
1715 size_t outsize;
1716 int rc;
1717
1718 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1719 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1720 return -EPERM;
1721
1722 /* Only need to get the buflen */
1723 if (get_user(pathlen, &gfin->gf_pathlen))
1724 return -EFAULT;
1725
1726 if (pathlen > PATH_MAX)
1727 return -EINVAL;
1728
1729 outsize = sizeof(*gfout) + pathlen;
1730
1731 OBD_ALLOC(gfout, outsize);
1732 if (gfout == NULL)
1733 return -ENOMEM;
1734
1735 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1736 GOTO(gf_free, rc = -EFAULT);
1737
1738 /* Call mdc_iocontrol */
1739 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1740 if (rc != 0)
1741 GOTO(gf_free, rc);
1742
1743 if (copy_to_user(arg, gfout, outsize))
1744 rc = -EFAULT;
1745
1746 gf_free:
1747 OBD_FREE(gfout, outsize);
1748 return rc;
1749 }
1750
/**
 * FIEMAP ioctl front end: size and allocate the kernel fiemap buffer
 * from the user's fm_extent_count, run ll_do_fiemap(), and copy the
 * header plus mapped extents back to userspace.
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
{
	struct ll_user_fiemap *fiemap_s;
	size_t num_bytes, ret_bytes;
	unsigned int extent_count;
	int rc = 0;

	/* Get the extent count so we can calculate the size of
	 * required fiemap buffer */
	if (get_user(extent_count,
	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
		return -EFAULT;

	/* reject counts that would overflow the size computation below */
	if (extent_count >=
	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
		return -EINVAL;
	num_bytes = sizeof(*fiemap_s) + (extent_count *
					 sizeof(struct ll_fiemap_extent));

	OBD_ALLOC_LARGE(fiemap_s, num_bytes);
	if (fiemap_s == NULL)
		return -ENOMEM;

	/* get the fiemap value */
	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
			   sizeof(*fiemap_s)))
		GOTO(error, rc = -EFAULT);

	/* If fm_extent_count is non-zero, read the first extent since
	 * it is used to calculate end_offset and device from previous
	 * fiemap call. */
	if (extent_count) {
		if (copy_from_user(&fiemap_s->fm_extents[0],
		    (char __user *)arg + sizeof(*fiemap_s),
		    sizeof(struct ll_fiemap_extent)))
			GOTO(error, rc = -EFAULT);
	}

	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
	if (rc)
		GOTO(error, rc);

	/* copy back the header plus only the extents actually mapped */
	ret_bytes = sizeof(struct ll_user_fiemap);

	if (extent_count != 0)
		ret_bytes += (fiemap_s->fm_mapped_extents *
				 sizeof(struct ll_fiemap_extent));

	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
		rc = -EFAULT;

error:
	OBD_FREE_LARGE(fiemap_s, num_bytes);
	return rc;
}
1806
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param extent_lock  Take extent lock. Not needed if a process is already
 *		       holding the OST object group locks.
 */
int ll_data_version(struct inode *inode, __u64 *data_version,
		    int extent_lock)
{
	struct lov_stripe_md *lsm = NULL;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obdo *obdo = NULL;
	int rc;

	/* If no stripe, we consider version is 0. */
	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		*data_version = 0;
		CDEBUG(D_INODE, "No object for inode\n");
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(obdo);
	if (obdo == NULL)
		GOTO(out, rc = -ENOMEM);

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
	if (rc == 0) {
		/* old servers may not report a data version at all */
		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
			rc = -EOPNOTSUPP;
		else
			*data_version = obdo->o_data_version;
	}

	OBD_FREE_PTR(obdo);
out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1849
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease to guarantee exclusivity, snapshots the data
 * version and merged [am]times, then closes the lease handle with
 * MDS_OPEN_RELEASE semantics.  Ownership of the och passes to
 * ll_close_inode_openhandle(); it is only closed via ll_lease_close()
 * on the error paths where the handle was obtained but not consumed.
 */
int ll_hsm_release(struct inode *inode)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;


	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och))
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, 1);
	if (rc != 0)
		GOTO(out, rc);

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		GOTO(out, rc = PTR_ERR(env));

	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	/* och was consumed above regardless of rc */
	och = NULL;


out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}
1896
/* State carried through ll_swap_layouts(): saved [am]times to restore
 * after the swap, the data versions to verify, and the two inodes in
 * FID order once sequentialized. */
struct ll_swap_stack {
	struct iattr ia1, ia2;		/* saved timestamps of each file */
	__u64 dv1, dv2;			/* expected data versions */
	struct inode *inode1, *inode2;
	bool check_dv1, check_dv2;	/* verify dvX before swapping */
};
1903
/**
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * @file1 and @file2 on the MDT.
 *
 * The pair is ordered by FID so concurrent swaps cannot deadlock; an
 * optional group lock (lsl->sl_gid) flushes dirty cache first; the
 * data versions are optionally verified just before the swap; and the
 * original [am]times are restored afterwards when requested.
 *
 * \retval 0 on success; -EINVAL/-EPERM/-EXDEV on bad arguments,
 *         -EAGAIN if a data version changed, or an MDC error
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts msl;
	struct md_op_data *op_data;
	__u32 gid;
	__u64 dv;
	struct ll_swap_stack *llss = NULL;
	int rc;

	OBD_ALLOC_PTR(llss);
	if (llss == NULL)
		return -ENOMEM;

	llss->inode1 = file1->f_dentry->d_inode;
	llss->inode2 = file2->f_dentry->d_inode;

	if (!S_ISREG(llss->inode2->i_mode))
		GOTO(free, rc = -EINVAL);

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE))
		GOTO(free, rc = -EPERM);

	if (llss->inode2->i_sb != llss->inode1->i_sb)
		GOTO(free, rc = -EXDEV);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */
		GOTO(free, rc = 0);

	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			GOTO(free, rc);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			GOTO(free, rc);
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swapping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data))
		GOTO(free, rc = PTR_ERR(op_data));

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

putgl:
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		GOTO(free, rc);

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested; note the saved times follow the
	 * layouts: ia2 is applied to inode1 and vice versa */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
		if (rc == 0)
			rc = rc1;
	}

free:
	if (llss != NULL)
		OBD_FREE_PTR(llss);

	return rc;
}
2057
2058 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2059 {
2060 struct md_op_data *op_data;
2061 int rc;
2062
2063 /* Non-root users are forbidden to set or clear flags which are
2064 * NOT defined in HSM_USER_MASK. */
2065 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2066 !capable(CFS_CAP_SYS_ADMIN))
2067 return -EPERM;
2068
2069 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2070 LUSTRE_OPC_ANY, hss);
2071 if (IS_ERR(op_data))
2072 return PTR_ERR(op_data);
2073
2074 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2075 sizeof(*op_data), op_data, NULL);
2076
2077 ll_finish_md_op_data(op_data);
2078
2079 return rc;
2080 }
2081
/**
 * LL_IOC_HSM_IMPORT handler: mark a regular file as an HSM-released
 * copy of archived data, then stamp it with the user-supplied
 * ownership, mode, size and timestamps.
 *
 * \param hui archive id and the attributes to restore
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
{
	struct hsm_state_set *hss = NULL;
	struct iattr *attr = NULL;
	int rc;


	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	/* set HSM flags */
	OBD_ALLOC_PTR(hss);
	if (hss == NULL)
		GOTO(out, rc = -ENOMEM);

	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);
	if (rc != 0)
		GOTO(out, rc);

	OBD_ALLOC_PTR(attr);
	if (attr == NULL)
		GOTO(out, rc = -ENOMEM);

	/* restore the archived file's attributes; force S_IFREG and
	 * strip everything but the permission bits from the mode */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	mutex_lock(&inode->i_mutex);

	rc = ll_setattr_raw(file->f_dentry, attr, true);
	/* a released file has no data, so -ENODATA is expected here */
	if (rc == -ENODATA)
		rc = 0;

	mutex_unlock(&inode->i_mutex);

out:
	if (hss != NULL)
		OBD_FREE_PTR(hss);

	if (attr != NULL)
		OBD_FREE_PTR(attr);

	return rc;
}
2141
/*
 * Main ioctl entry point for regular Lustre files.
 *
 * Dispatches Lustre-private commands (striping, HSM, leases, layout
 * swap, FID translation, ...). Commands not handled here are first
 * offered to dynamically registered handlers (ll_iocontrol_call()) and
 * finally forwarded to the data export via obd_iocontrol().
 */
static long
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch (cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 * not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				   sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* both files must be opened for writing for the swap */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;
		int rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		rc = ll_data_version(inode, &idv.idv_version,
				!(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int*)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;
		int rc;

		OBD_ALLOC_PTR(hus);
		if (hus == NULL)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		/* hus was filled in by the MDT through op_data above */
		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set *hss;
		int rc;

		OBD_ALLOC_PTR(hss);
		if (hss == NULL)
			return -ENOMEM;

		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
			OBD_FREE_PTR(hss);
			return -EFAULT;
		}

		rc = ll_hsm_state_set(inode, hss);

		OBD_FREE_PTR(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;
		int rc;

		OBD_ALLOC_PTR(hca);
		if (hca == NULL)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			/* detach the lease handle under the och mutex,
			 * then close it outside the lock */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			/* report the lease mode only while the backing
			 * DLM lock is still granted (not being cancelled) */
			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
					     (FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		OBD_ALLOC_PTR(hui);
		if (hui == NULL)
			return -ENOMEM;

		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
			OBD_FREE_PTR(hui);
			return -EFAULT;
		}

		rc = ll_hsm_import(inode, file, hui);

		OBD_FREE_PTR(hui);
		return rc;
	}
	default: {
		int err;

		if (LLIOC_STOP ==
		    ll_iocontrol_call(inode, file, cmd, arg, &err))
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2460
2461
/*
 * ->llseek() handler. For SEEK_END/SEEK_HOLE/SEEK_DATA the cluster-wide
 * file size must be known, so the size is glimpsed from the OSTs first;
 * the actual position update is delegated to generic_file_llseek_size()
 * bounded by ll_file_maxbytes().
 */
static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_dentry->d_inode;
	loff_t retval, eof = 0;

	/* this provisional value is only used for the trace below; the
	 * real result comes from generic_file_llseek_size() */
	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
			   (origin == SEEK_CUR) ? file->f_pos : 0);
	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
	       inode->i_ino, inode->i_generation, inode, retval, retval,
	       origin);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		retval = ll_glimpse_size(inode);
		if (retval != 0)
			return retval;
		eof = i_size_read(inode);
	}

	retval = generic_file_llseek_size(file, offset, origin,
					  ll_file_maxbytes(inode), eof);
	return retval;
}
2485
/*
 * ->flush() handler: report asynchronous writeback errors previously
 * recorded on the inode/cl-object. It does not force out dirty pages
 * itself; it only collects and clears the saved error state.
 */
static int ll_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int rc, err;

	LASSERT(!S_ISDIR(inode->i_mode));

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	rc = lli->lli_async_rc;
	lli->lli_async_rc = 0;
	err = lov_read_and_clear_async_rc(lli->lli_clob);
	if (rc == 0)
		rc = err;

	/* The application has been told write failure already.
	 * Do not report failure again. */
	if (fd->fd_write_failed)
		return 0;
	return rc ? -EIO : 0;
}
2509
/**
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * \param inode          file whose range is to be written out
 * \param start, end     byte range (inclusive of \a end)
 * \param mode           one of the CL_FSYNC_* modes
 * \param ignore_layout  non-zero to proceed even without a layout lock
 *
 * Return how many pages have been written, or a negative errno.
 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* on success report the page count accumulated by the sync io */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2562
/*
 * NOTE(review): this comment appears stale — ll_fsync() below takes no
 * separate dentry argument and always derives the dentry from
 * file->f_dentry. Historically, when a dentry was passed in (the old
 * 'else' case), *file->f_dentry could be null and the explicit dentry
 * had to be used directly instead of being pulled from *file->f_dentry.
 */
2568
/*
 * ->fsync() handler: flush dirty pages in the range, propagate recorded
 * async writeback errors, sync metadata on the MDS (md_sync) and, for
 * regular files, force dirty data to the OSTs via
 * cl_sync_file_range(CL_FSYNC_ALL).
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	/* req is only valid when md_sync() succeeded */
	if (!err)
		ptlrpc_req_finished(req);

	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		/* remember the outcome so ll_flush() can avoid double
		 * reporting the same failure to the application */
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2621
/*
 * ->lock()/->flock() handler: translate the VFS file_lock into an LDLM
 * flock enqueue on the MDT, then mirror the result into the kernel's
 * local posix/flock lists so the VFS bookkeeping matches the
 * cluster-wide lock state.
 */
static int
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	__u64 flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	if (file_lock->fl_flags & FL_FLOCK)
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
	else if (!(file_lock->fl_flags & FL_POSIX))
		return -EINVAL;

	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
	flock.l_flock.pid = file_lock->fl_pid;
	flock.l_flock.start = file_lock->fl_start;
	flock.l_flock.end = file_lock->fl_end;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
		       file_lock->fl_type);
		return -ENOTSUPP;
	}

	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the server result into the local VFS lock lists */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* local bookkeeping failed: release the server-side lock again
	 * by enqueueing it in LCK_NL (unlock) mode */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2746
/* Stub ->lock()/->flock() used with -o noflock: always rejects. */
static int
ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2752
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 * \param inode [IN]
 * \param bits [IN,OUT] searched lock bits; cleared bits were matched
 * \param l_req_mode [IN] searched lock mode (LCK_MINMODE matches any of
 *                        CR/CW/PR/PW)
 * \retval boolean, true iff all bits are found
 */
int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
{
	struct lustre_handle lockh;
	ldlm_policy_data_t policy;
	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
	struct lu_fid *fid;
	__u64 flags;
	int i;

	if (!inode)
	       return 0;

	fid = &ll_i2info(inode)->lli_fid;
	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
	       ldlm_lockname[mode]);

	/* probe each requested bit separately; a single cached lock may
	 * cover several bits at once */
	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
		policy.l_inodebits.bits = *bits & (1 << i);
		if (policy.l_inodebits.bits == 0)
			continue;

		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
				  &policy, mode, &lockh)) {
			struct ldlm_lock *lock;

			lock = ldlm_handle2lock(&lockh);
			if (lock) {
				/* clear every bit the matched lock covers,
				 * not just the one that was probed */
				*bits &=
					~(lock->l_policy_data.l_inodebits.bits);
				LDLM_LOCK_PUT(lock);
			} else {
				*bits &= ~policy.l_inodebits.bits;
			}
		}
	}
	return *bits == 0;
}
2802
2803 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2804 struct lustre_handle *lockh, __u64 flags,
2805 ldlm_mode_t mode)
2806 {
2807 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2808 struct lu_fid *fid;
2809 ldlm_mode_t rc;
2810
2811 fid = &ll_i2info(inode)->lli_fid;
2812 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2813
2814 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2815 fid, LDLM_IBITS, &policy, mode, lockh);
2816
2817 return rc;
2818 }
2819
2820 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2821 {
2822 /* Already unlinked. Just update nlink and return success */
2823 if (rc == -ENOENT) {
2824 clear_nlink(inode);
2825 /* This path cannot be hit for regular files unless in
2826 * case of obscure races, so no need to validate size.
2827 */
2828 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2829 return 0;
2830 } else if (rc != 0) {
2831 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2832 "%s: revalidate FID "DFID" error: rc = %d\n",
2833 ll_get_fsname(inode->i_sb, NULL, 0),
2834 PFID(ll_inode2fid(inode)), rc);
2835 }
2836
2837 return rc;
2838 }
2839
/*
 * Revalidate inode attributes from the MDS.
 *
 * If the server supports OBD_CONNECT_ATTRFID, an intent getattr by FID
 * (IT_GETATTR, or IT_LOOKUP for pure lookup bits) is used. Otherwise,
 * when no matching ibits lock is already cached, a plain md_getattr is
 * issued and the inode is refreshed from the reply.
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			GOTO (out, rc);
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			GOTO(out, rc);
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		u64 valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* for regular files also fetch the striping EA so the
		 * reply buffer is sized accordingly */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_default_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2934
/*
 * Revalidate an inode and, for regular files, also bring the size up to
 * date by glimpsing the OSTs (unless an HSM restore is in progress, in
 * which case the MDT already supplied the correct size).
 */
static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	int rc;

	rc = __ll_inode_revalidate(dentry, ibits);
	if (rc != 0)
		return rc;

	/* if object isn't regular file, don't validate size */
	if (!S_ISREG(inode->i_mode)) {
		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
	} else {
		/* In case of restore, the MDT has the right size and has
		 * already send it back without granting the layout lock,
		 * inode is up-to-date so glimpse is useless.
		 * Also to glimpse we need the layout, in case of a running
		 * restore the MDT holds the layout lock so the glimpse will
		 * block up to the end of restore (getattr will block)
		 */
		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
			rc = ll_glimpse_size(inode);
	}
	return rc;
}
2962
/*
 * ->getattr() handler: revalidate UPDATE|LOOKUP ibits from the MDS,
 * then fill *stat from the (now fresh) inode fields.
 */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
	struct inode *inode = de->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int res = 0;

	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
				      MDS_INODELOCK_LOOKUP);
	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);

	if (res)
		return res;

	stat->dev = inode->i_sb->s_dev;
	/* 32-bit userspace gets a squashed inode number built from the FID */
	if (ll_need_32bit_api(sbi))
		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
	else
		stat->ino = inode->i_ino;
	stat->mode = inode->i_mode;
	stat->nlink = inode->i_nlink;
	stat->uid = inode->i_uid;
	stat->gid = inode->i_gid;
	stat->rdev = inode->i_rdev;
	stat->atime = inode->i_atime;
	stat->mtime = inode->i_mtime;
	stat->ctime = inode->i_ctime;
	stat->blksize = 1 << inode->i_blkbits;

	stat->size = i_size_read(inode);
	stat->blocks = inode->i_blocks;

	return 0;
}
2997
/*
 * ->fiemap() inode operation: marshal the VFS fiemap_extent_info into a
 * Lustre ll_user_fiemap, run ll_do_fiemap() and copy the mapped extents
 * back out.
 */
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		     __u64 start, __u64 len)
{
	int rc;
	size_t num_bytes;
	struct ll_user_fiemap *fiemap;
	unsigned int extent_count = fieinfo->fi_extents_max;

	num_bytes = sizeof(*fiemap) + (extent_count *
				       sizeof(struct ll_fiemap_extent));
	OBD_ALLOC_LARGE(fiemap, num_bytes);

	if (fiemap == NULL)
		return -ENOMEM;

	fiemap->fm_flags = fieinfo->fi_flags;
	fiemap->fm_extent_count = fieinfo->fi_extents_max;
	fiemap->fm_start = start;
	fiemap->fm_length = len;
	/* NOTE(review): only the FIRST extent is copied in here —
	 * presumably to seed fm_extents[0] for continued mapping; and
	 * fi_extents_start is a user-space pointer, so a plain memcpy
	 * looks suspect — verify against the fiemap ioctl contract */
	if (extent_count > 0)
		memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
		       sizeof(struct ll_fiemap_extent));

	rc = ll_do_fiemap(inode, fiemap, num_bytes);

	fieinfo->fi_flags = fiemap->fm_flags;
	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
	if (extent_count > 0)
		memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
		       fiemap->fm_mapped_extents *
		       sizeof(struct ll_fiemap_extent));

	OBD_FREE_LARGE(fiemap, num_bytes);
	return rc;
}
3033
3034 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3035 {
3036 struct ll_inode_info *lli = ll_i2info(inode);
3037 struct posix_acl *acl = NULL;
3038
3039 spin_lock(&lli->lli_lock);
3040 /* VFS' acl_permission_check->check_acl will release the refcount */
3041 acl = posix_acl_dup(lli->lli_posix_acl);
3042 spin_unlock(&lli->lli_lock);
3043
3044 return acl;
3045 }
3046
3047
/*
 * ->permission() handler. The root inode is revalidated first (it is
 * not validated during lookup); remote clients are checked against the
 * remote permission cache, everyone else through generic_permission().
 */
int ll_inode_permission(struct inode *inode, int mask)
{
	int rc = 0;

#ifdef MAY_NOT_BLOCK
	/* RCU-walk cannot be served: revalidation below may block */
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#endif

       /* as root inode are NOT getting validated in lookup operation,
	* need to do it before permission check. */

	if (inode == inode->i_sb->s_root->d_inode) {
		rc = __ll_inode_revalidate(inode->i_sb->s_root,
					   MDS_INODELOCK_LOOKUP);
		if (rc)
			return rc;
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
		return lustre_check_remote_perm(inode, mask);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
	rc = generic_permission(inode, mask);

	return rc;
}
3078
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
	.read = new_sync_read,
	.read_iter = ll_file_read_iter,
	.write = new_sync_write,
	.write_iter = ll_file_write_iter,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush
};
3094
/* default mount: cluster-coherent flock/posix locks via ll_file_flock() */
struct file_operations ll_file_operations_flock = {
	.read = new_sync_read,
	.read_iter = ll_file_read_iter,
	.write = new_sync_write,
	.write_iter = ll_file_write_iter,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush,
	.flock = ll_file_flock,
	.lock = ll_file_flock
};
3111
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
	.read = new_sync_read,
	.read_iter = ll_file_read_iter,
	.write = new_sync_write,
	.write_iter = ll_file_write_iter,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush,
	.flock = ll_file_noflock,
	.lock = ll_file_noflock
};
3129
/* inode operations for regular Lustre files */
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.listxattr	= ll_listxattr,
	.removexattr	= ll_removexattr,
	.fiemap		= ll_fiemap,
	.get_acl	= ll_get_acl,
};
3141
/* dynamic ioctl number support routines */
/* registry of dynamically registered ioctl handlers; readers (dispatch)
 * and writers (register/unregister) are serialized by ioc_sem */
static struct llioc_ctl_data {
	struct rw_semaphore	ioc_sem;
	struct list_head	ioc_head;
} llioc = {
	__RWSEM_INITIALIZER(llioc.ioc_sem),
	LIST_HEAD_INIT(llioc.ioc_head)
};
3150
3151
3152 struct llioc_data {
3153 struct list_head iocd_list;
3154 unsigned int iocd_size;
3155 llioc_callback_t iocd_cb;
3156 unsigned int iocd_count;
3157 unsigned int iocd_cmd[0];
3158 };
3159
3160 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3161 {
3162 unsigned int size;
3163 struct llioc_data *in_data = NULL;
3164
3165 if (cb == NULL || cmd == NULL ||
3166 count > LLIOC_MAX_CMD || count < 0)
3167 return NULL;
3168
3169 size = sizeof(*in_data) + count * sizeof(unsigned int);
3170 OBD_ALLOC(in_data, size);
3171 if (in_data == NULL)
3172 return NULL;
3173
3174 memset(in_data, 0, sizeof(*in_data));
3175 in_data->iocd_size = size;
3176 in_data->iocd_cb = cb;
3177 in_data->iocd_count = count;
3178 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3179
3180 down_write(&llioc.ioc_sem);
3181 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3182 up_write(&llioc.ioc_sem);
3183
3184 return in_data;
3185 }
3186
/*
 * Unregister a handler previously registered with
 * ll_iocontrol_register(). @magic is the cookie it returned; an unknown
 * cookie only produces a warning.
 */
void ll_iocontrol_unregister(void *magic)
{
	struct llioc_data *tmp;

	if (magic == NULL)
		return;

	down_write(&llioc.ioc_sem);
	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
		if (tmp == magic) {
			/* save the size before freeing the node */
			unsigned int size = tmp->iocd_size;

			list_del(&tmp->iocd_list);
			up_write(&llioc.ioc_sem);

			OBD_FREE(tmp, size);
			return;
		}
	}
	up_write(&llioc.ioc_sem);

	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
}

EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3213
3214 static enum llioc_iter
3215 ll_iocontrol_call(struct inode *inode, struct file *file,
3216 unsigned int cmd, unsigned long arg, int *rcp)
3217 {
3218 enum llioc_iter ret = LLIOC_CONT;
3219 struct llioc_data *data;
3220 int rc = -EINVAL, i;
3221
3222 down_read(&llioc.ioc_sem);
3223 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3224 for (i = 0; i < data->iocd_count; i++) {
3225 if (cmd != data->iocd_cmd[i])
3226 continue;
3227
3228 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3229 break;
3230 }
3231
3232 if (ret == LLIOC_STOP)
3233 break;
3234 }
3235 up_read(&llioc.ioc_sem);
3236
3237 if (rcp)
3238 *rcp = rc;
3239 return ret;
3240 }
3241
3242 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3243 {
3244 struct ll_inode_info *lli = ll_i2info(inode);
3245 struct cl_env_nest nest;
3246 struct lu_env *env;
3247 int result;
3248
3249 if (lli->lli_clob == NULL)
3250 return 0;
3251
3252 env = cl_env_nested_get(&nest);
3253 if (IS_ERR(env))
3254 return PTR_ERR(env);
3255
3256 result = cl_conf_set(env, lli->lli_clob, conf);
3257 cl_env_nested_put(&nest, env);
3258
3259 if (conf->coc_opc == OBJECT_CONF_SET) {
3260 struct ldlm_lock *lock = conf->coc_lock;
3261
3262 LASSERT(lock != NULL);
3263 LASSERT(ldlm_has_layout(lock));
3264 if (result == 0) {
3265 /* it can only be allowed to match after layout is
3266 * applied to inode otherwise false layout would be
3267 * seen. Applying layout should happen before dropping
3268 * the intent lock. */
3269 ldlm_lock_allow_match(lock);
3270 }
3271 }
3272 return result;
3273 }
3274
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * On success the fetched layout is installed as \a lock's LVB
 * (l_lvb_data/l_lvb_len), replacing and freeing any previous buffer.
 * Returns 0 on success or when nothing needs fetching, negative errno
 * on failure.
 */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* nothing to do: the lock already carries a ready LVB */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		/* fetch the LOV EA (the layout) via a getxattr RPC */
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				 lmmsize, 0, &req);
	capa_put(oc);
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	/* copy into a buffer owned by the lock, so the layout outlives
	 * the RPC reply buffer freed below */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	/* replace any stale LVB under the resource lock */
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3339
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in this function.
 *
 * \param lockh	 handle of the (referenced) layout lock; always decref'd here
 * \param mode	 mode the lock is held in, used for the decref
 * \param inode	 inode the layout belongs to
 * \param gen	 out: layout generation on success
 * \param reconf true to (re)configure the cl_object from the lock's LVB;
 *		 false to only report an already-applied layout
 *
 * \retval 0	    layout applied (or already valid), *gen set
 * \retval -ENODATA lvb not ready and reconf not requested
 * \retval -EAGAIN  layout changed while IO was in flight; caller retries
 * \retval other negative errno on fetch/unpack/configure failure
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
			      struct inode *inode, __u32 *gen, bool reconf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct lustre_md md = { NULL };
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
		   inode, PFID(&lli->lli_fid), reconf);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
	unlock_res_and_lock(lock);
	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	if (lvb_ready || !reconf) {
		rc = -ENODATA;
		if (lvb_ready) {
			/* layout_gen must be valid if layout lock is not
			 * cancelled and stripe has already set */
			*gen = ll_layout_version_get(lli);
			rc = 0;
		}
		GOTO(out, rc);
	}

	/* pull the layout into the lock's LVB if it isn't there yet */
	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for layout lock, lmm is returned in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock. See the description in ldlm_lock_decref_internal()
	 * for the condition to free lvb_data of layout lock */
	if (lock->l_lvb_data != NULL) {
		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
				  lock->l_lvb_data, lock->l_lvb_len);
		if (rc >= 0) {
			/* NULL lsm means an empty (stripe-less) layout */
			*gen = LL_LAYOUT_GEN_EMPTY;
			if (md.lsm != NULL)
				*gen = md.lsm->lsm_layout_gen;
			rc = 0;
		} else {
			CERROR("%s: file "DFID" unpackmd error: %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       PFID(&lli->lli_fid), rc);
		}
	}
	if (rc < 0)
		GOTO(out, rc);

	/* set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof(conf));
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_md = &md;
	rc = ll_layout_conf(inode, &conf);

	if (md.lsm != NULL)
		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

out:
	/* drop the reference from ldlm_handle2lock() and the caller's
	 * lock reference — the layout lock is released here in all paths */
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       inode, PFID(&lli->lli_fid));

		memset(&conf, 0, sizeof(conf));
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			/* in-flight IO drained; tell the caller to retry */
			rc = -EAGAIN;

		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
		       PFID(&lli->lli_fid), rc);
	}
	return rc;
}
3446
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 *
 * \param inode	inode whose layout version to refresh
 * \param gen	out: current layout generation
 * \retval 0 on success, negative errno on failure
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct lustre_handle lockh;
	ldlm_mode_t mode;
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_mode = LCK_CR,
		.ei_cb_bl = ll_md_blocking_ast,
		.ei_cb_cp = ldlm_completion_ast,
	};
	int rc;

	/* fast path: layout lock support disabled, or a valid generation
	 * is already cached — nothing to enqueue */
	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
		return 0;

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

again:
	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		/* -EAGAIN: layout changed under us; retry from the top */
		if (rc == -EAGAIN)
			goto again;

		mutex_unlock(&lli->lli_layout_mutex);
		return rc;
	}

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		mutex_unlock(&lli->lli_layout_mutex);
		return PTR_ERR(op_data);
	}

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
			  ll_get_fsname(inode->i_sb, NULL, 0), inode,
			  PFID(&lli->lli_fid));

	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
			NULL, 0, NULL, 0);
	/* release the RPC reply referenced by the intent, if any */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* take ownership of the lock mode out of the intent so that
	 * ll_intent_drop_lock() below doesn't release the lock itself */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	if (rc == 0) {
		/* set lock data in case this is a new lock */
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;
	}
	mutex_unlock(&lli->lli_layout_mutex);

	return rc;
}
3540
3541 /**
3542 * This function send a restore request to the MDT
3543 */
3544 int ll_layout_restore(struct inode *inode)
3545 {
3546 struct hsm_user_request *hur;
3547 int len, rc;
3548
3549 len = sizeof(struct hsm_user_request) +
3550 sizeof(struct hsm_user_item);
3551 OBD_ALLOC(hur, len);
3552 if (hur == NULL)
3553 return -ENOMEM;
3554
3555 hur->hur_request.hr_action = HUA_RESTORE;
3556 hur->hur_request.hr_archive_id = 0;
3557 hur->hur_request.hr_flags = 0;
3558 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3559 sizeof(hur->hur_user_item[0].hui_fid));
3560 hur->hur_user_item[0].hui_extent.length = -1;
3561 hur->hur_request.hr_itemcount = 1;
3562 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3563 len, hur, NULL);
3564 OBD_FREE(hur, len);
3565 return rc;
3566 }
This page took 0.10083 seconds and 4 git commands to generate.