Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
6a5b99a4 | 18 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 19 | * |
d7e09d03 PT |
20 | * GPL HEADER END |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | * | |
1dc563a6 | 26 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
27 | */ |
28 | /* | |
29 | * This file is part of Lustre, http://www.lustre.org/ | |
30 | * Lustre is a trademark of Sun Microsystems, Inc. | |
31 | */ | |
32 | ||
33 | #include <linux/fs.h> | |
34 | #include <linux/sched.h> | |
35 | #include <linux/mm.h> | |
36 | #include <linux/highmem.h> | |
37 | #include <linux/pagemap.h> | |
38 | ||
39 | #define DEBUG_SUBSYSTEM S_LLITE | |
40 | ||
67a235f5 GKH |
41 | #include "../include/obd_support.h" |
42 | #include "../include/lustre_lite.h" | |
43 | #include "../include/lustre_dlm.h" | |
d7e09d03 PT |
44 | #include "llite_internal.h" |
45 | ||
/*
 * Extra slack, in entries, that is_omitted_entry() adds on top of sai_max
 * before it considers an index to have fallen behind the statahead window.
 */
#define SA_OMITTED_ENTRY_MAX 8ULL
47 | ||
/* State of a statahead entry, kept in ll_sa_entry::se_stat. */
enum se_stat {
	/** negative values are for error cases */
	SA_ENTRY_INIT = 0,      /** init entry: no result recorded yet */
	SA_ENTRY_SUCC = 1,      /** stat succeed */
	SA_ENTRY_INVA = 2,      /** invalid entry */
	SA_ENTRY_DEST = 3,      /** entry to be destroyed */
};
d7e09d03 PT |
55 | |
/*
 * One statahead entry: the state kept for a single name of the directory
 * being scanned.  ll_sa_entry_alloc() allocates the name bytes in the same
 * buffer, directly behind this structure, and points se_qstr.name at them.
 */
struct ll_sa_entry {
	/* link into sai->sai_entries */
	struct list_head	se_link;
	/* link into sai->sai_entries_{received,stated} */
	struct list_head	se_list;
	/* link into sai hash table locally */
	struct list_head	se_hash;
	/* entry reference count */
	atomic_t		se_refcount;
	/* entry index in the sai */
	__u64			se_index;
	/* low layer ldlm lock handle */
	__u64			se_handle;
	/* entry status */
	enum se_stat		se_stat;
	/* entry size, contains name */
	int			se_size;
	/* pointer to async getattr enqueue info */
	struct md_enqueue_info *se_minfo;
	/* pointer to the async getattr request */
	struct ptlrpc_request  *se_req;
	/* pointer to the target inode */
	struct inode	       *se_inode;
	/* entry name */
	struct qstr		se_qstr;
};
82 | ||
/*
 * Most recently issued statahead generation number.  ll_sai_alloc() bumps it
 * under sai_generation_lock and never hands out the value zero.
 */
static unsigned int sai_generation;
static DEFINE_SPINLOCK(sai_generation_lock);
85 | ||
d7e09d03 PT |
86 | /* |
87 | * The entry only can be released by the caller, it is necessary to hold lock. | |
88 | */ | |
89 | static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) | |
90 | { | |
91 | smp_rmb(); | |
92 | return (entry->se_stat != SA_ENTRY_INIT); | |
93 | } | |
94 | ||
95 | static inline int ll_sa_entry_hash(int val) | |
96 | { | |
97 | return val & LL_SA_CACHE_MASK; | |
98 | } | |
99 | ||
100 | /* | |
101 | * Insert entry to hash SA table. | |
102 | */ | |
103 | static inline void | |
104 | ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
105 | { | |
106 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
107 | ||
108 | spin_lock(&sai->sai_cache_lock[i]); | |
109 | list_add_tail(&entry->se_hash, &sai->sai_cache[i]); | |
110 | spin_unlock(&sai->sai_cache_lock[i]); | |
111 | } | |
112 | ||
113 | /* | |
114 | * Remove entry from SA table. | |
115 | */ | |
116 | static inline void | |
117 | ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
118 | { | |
119 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
120 | ||
121 | spin_lock(&sai->sai_cache_lock[i]); | |
122 | list_del_init(&entry->se_hash); | |
123 | spin_unlock(&sai->sai_cache_lock[i]); | |
124 | } | |
125 | ||
126 | static inline int agl_should_run(struct ll_statahead_info *sai, | |
127 | struct inode *inode) | |
128 | { | |
6e16818b | 129 | return (inode && S_ISREG(inode->i_mode) && sai->sai_agl_valid); |
d7e09d03 PT |
130 | } |
131 | ||
d7e09d03 PT |
132 | static inline int sa_sent_full(struct ll_statahead_info *sai) |
133 | { | |
134 | return atomic_read(&sai->sai_cache_count) >= sai->sai_max; | |
135 | } | |
136 | ||
137 | static inline int sa_received_empty(struct ll_statahead_info *sai) | |
138 | { | |
139 | return list_empty(&sai->sai_entries_received); | |
140 | } | |
141 | ||
142 | static inline int agl_list_empty(struct ll_statahead_info *sai) | |
143 | { | |
144 | return list_empty(&sai->sai_entries_agl); | |
145 | } | |
146 | ||
147 | /** | |
148 | * (1) hit ratio less than 80% | |
149 | * or | |
150 | * (2) consecutive miss more than 8 | |
151 | * then means low hit. | |
152 | */ | |
153 | static inline int sa_low_hit(struct ll_statahead_info *sai) | |
154 | { | |
155 | return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || | |
156 | (sai->sai_consecutive_miss > 8)); | |
157 | } | |
158 | ||
159 | /* | |
160 | * If the given index is behind of statahead window more than | |
161 | * SA_OMITTED_ENTRY_MAX, then it is old. | |
162 | */ | |
163 | static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) | |
164 | { | |
165 | return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < | |
166 | sai->sai_index); | |
167 | } | |
168 | ||
169 | /* | |
170 | * Insert it into sai_entries tail when init. | |
171 | */ | |
172 | static struct ll_sa_entry * | |
8387ff25 LT |
173 | ll_sa_entry_alloc(struct dentry *parent, |
174 | struct ll_statahead_info *sai, __u64 index, | |
d7e09d03 PT |
175 | const char *name, int len) |
176 | { | |
177 | struct ll_inode_info *lli; | |
178 | struct ll_sa_entry *entry; | |
179 | int entry_size; | |
180 | char *dname; | |
d7e09d03 PT |
181 | |
182 | entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; | |
496a51bd JL |
183 | entry = kzalloc(entry_size, GFP_NOFS); |
184 | if (unlikely(!entry)) | |
0a3bdb00 | 185 | return ERR_PTR(-ENOMEM); |
d7e09d03 | 186 | |
b0f5aad5 | 187 | CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", |
d7e09d03 PT |
188 | len, name, entry, index); |
189 | ||
190 | entry->se_index = index; | |
191 | ||
192 | /* | |
193 | * Statahead entry reference rules: | |
194 | * | |
195 | * 1) When statahead entry is initialized, its reference is set as 2. | |
196 | * One reference is used by the directory scanner. When the scanner | |
197 | * searches the statahead cache for the given name, it can perform | |
198 | * lockless hash lookup (only the scanner can remove entry from hash | |
199 | * list), and once found, it needn't to call "atomic_inc()" for the | |
200 | * entry reference. So the performance is improved. After using the | |
201 | * statahead entry, the scanner will call "atomic_dec()" to drop the | |
202 | * reference held when initialization. If it is the last reference, | |
203 | * the statahead entry will be freed. | |
204 | * | |
205 | * 2) All other threads, including statahead thread and ptlrpcd thread, | |
206 | * when they process the statahead entry, the reference for target | |
207 | * should be held to guarantee the entry will not be released by the | |
208 | * directory scanner. After processing the entry, these threads will | |
209 | * drop the entry reference. If it is the last reference, the entry | |
210 | * will be freed. | |
211 | * | |
212 | * The second reference when initializes the statahead entry is used | |
213 | * by the statahead thread, following the rule 2). | |
214 | */ | |
215 | atomic_set(&entry->se_refcount, 2); | |
216 | entry->se_stat = SA_ENTRY_INIT; | |
217 | entry->se_size = entry_size; | |
218 | dname = (char *)entry + sizeof(struct ll_sa_entry); | |
219 | memcpy(dname, name, len); | |
220 | dname[len] = 0; | |
8387ff25 LT |
221 | |
222 | entry->se_qstr.hash = full_name_hash(parent, name, len); | |
d7e09d03 PT |
223 | entry->se_qstr.len = len; |
224 | entry->se_qstr.name = dname; | |
225 | ||
226 | lli = ll_i2info(sai->sai_inode); | |
227 | spin_lock(&lli->lli_sa_lock); | |
228 | list_add_tail(&entry->se_link, &sai->sai_entries); | |
229 | INIT_LIST_HEAD(&entry->se_list); | |
230 | ll_sa_entry_enhash(sai, entry); | |
231 | spin_unlock(&lli->lli_sa_lock); | |
232 | ||
233 | atomic_inc(&sai->sai_cache_count); | |
234 | ||
0a3bdb00 | 235 | return entry; |
d7e09d03 PT |
236 | } |
237 | ||
238 | /* | |
239 | * Used by the directory scanner to search entry with name. | |
240 | * | |
241 | * Only the caller can remove the entry from hash, so it is unnecessary to hold | |
242 | * hash lock. It is caller's duty to release the init refcount on the entry, so | |
243 | * it is also unnecessary to increase refcount on the entry. | |
244 | */ | |
245 | static struct ll_sa_entry * | |
246 | ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) | |
247 | { | |
248 | struct ll_sa_entry *entry; | |
249 | int i = ll_sa_entry_hash(qstr->hash); | |
250 | ||
251 | list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { | |
252 | if (entry->se_qstr.hash == qstr->hash && | |
253 | entry->se_qstr.len == qstr->len && | |
254 | memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) | |
255 | return entry; | |
256 | } | |
257 | return NULL; | |
258 | } | |
259 | ||
260 | /* | |
261 | * Used by the async getattr request callback to find entry with index. | |
262 | * | |
263 | * Inside lli_sa_lock to prevent others to change the list during the search. | |
264 | * It needs to increase entry refcount before returning to guarantee that the | |
265 | * entry cannot be freed by others. | |
266 | */ | |
267 | static struct ll_sa_entry * | |
268 | ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) | |
269 | { | |
270 | struct ll_sa_entry *entry; | |
271 | ||
272 | list_for_each_entry(entry, &sai->sai_entries, se_link) { | |
273 | if (entry->se_index == index) { | |
274 | LASSERT(atomic_read(&entry->se_refcount) > 0); | |
275 | atomic_inc(&entry->se_refcount); | |
276 | return entry; | |
277 | } | |
278 | if (entry->se_index > index) | |
279 | break; | |
280 | } | |
281 | return NULL; | |
282 | } | |
283 | ||
284 | static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, | |
e15ba45d | 285 | struct ll_sa_entry *entry) |
d7e09d03 PT |
286 | { |
287 | struct md_enqueue_info *minfo = entry->se_minfo; | |
288 | struct ptlrpc_request *req = entry->se_req; | |
289 | ||
290 | if (minfo) { | |
291 | entry->se_minfo = NULL; | |
292 | ll_intent_release(&minfo->mi_it); | |
293 | iput(minfo->mi_dir); | |
97903a26 | 294 | kfree(minfo); |
d7e09d03 PT |
295 | } |
296 | ||
297 | if (req) { | |
298 | entry->se_req = NULL; | |
299 | ptlrpc_req_finished(req); | |
300 | } | |
301 | } | |
302 | ||
303 | static void ll_sa_entry_put(struct ll_statahead_info *sai, | |
e15ba45d | 304 | struct ll_sa_entry *entry) |
d7e09d03 PT |
305 | { |
306 | if (atomic_dec_and_test(&entry->se_refcount)) { | |
b0f5aad5 | 307 | CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", |
d7e09d03 PT |
308 | entry->se_qstr.len, entry->se_qstr.name, entry, |
309 | entry->se_index); | |
310 | ||
311 | LASSERT(list_empty(&entry->se_link)); | |
312 | LASSERT(list_empty(&entry->se_list)); | |
b0d14255 | 313 | LASSERT(list_empty(&entry->se_hash)); |
d7e09d03 PT |
314 | |
315 | ll_sa_entry_cleanup(sai, entry); | |
13cb076d | 316 | iput(entry->se_inode); |
d7e09d03 | 317 | |
97903a26 | 318 | kfree(entry); |
d7e09d03 PT |
319 | atomic_dec(&sai->sai_cache_count); |
320 | } | |
321 | } | |
322 | ||
323 | static inline void | |
324 | do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
325 | { | |
326 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
327 | ||
b0d14255 | 328 | LASSERT(!list_empty(&entry->se_hash)); |
d7e09d03 PT |
329 | LASSERT(!list_empty(&entry->se_link)); |
330 | ||
331 | ll_sa_entry_unhash(sai, entry); | |
332 | ||
333 | spin_lock(&lli->lli_sa_lock); | |
334 | entry->se_stat = SA_ENTRY_DEST; | |
335 | list_del_init(&entry->se_link); | |
336 | if (likely(!list_empty(&entry->se_list))) | |
337 | list_del_init(&entry->se_list); | |
338 | spin_unlock(&lli->lli_sa_lock); | |
339 | ||
340 | ll_sa_entry_put(sai, entry); | |
341 | } | |
342 | ||
343 | /* | |
344 | * Delete it from sai_entries_stated list when fini. | |
345 | */ | |
346 | static void | |
347 | ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
348 | { | |
349 | struct ll_sa_entry *pos, *next; | |
350 | ||
351 | if (entry) | |
352 | do_sa_entry_fini(sai, entry); | |
353 | ||
354 | /* drop old entry, only 'scanner' process does this, no need to lock */ | |
355 | list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { | |
356 | if (!is_omitted_entry(sai, pos->se_index)) | |
357 | break; | |
358 | do_sa_entry_fini(sai, pos); | |
359 | } | |
360 | } | |
361 | ||
362 | /* | |
363 | * Inside lli_sa_lock. | |
364 | */ | |
365 | static void | |
366 | do_sa_entry_to_stated(struct ll_statahead_info *sai, | |
3f821732 | 367 | struct ll_sa_entry *entry, enum se_stat stat) |
d7e09d03 PT |
368 | { |
369 | struct ll_sa_entry *se; | |
370 | struct list_head *pos = &sai->sai_entries_stated; | |
371 | ||
372 | if (!list_empty(&entry->se_list)) | |
373 | list_del_init(&entry->se_list); | |
374 | ||
375 | list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { | |
376 | if (se->se_index < entry->se_index) { | |
377 | pos = &se->se_list; | |
378 | break; | |
379 | } | |
380 | } | |
381 | ||
382 | list_add(&entry->se_list, pos); | |
383 | entry->se_stat = stat; | |
384 | } | |
385 | ||
386 | /* | |
387 | * Move entry to sai_entries_stated and sort with the index. | |
388 | * \retval 1 -- entry to be destroyed. | |
389 | * \retval 0 -- entry is inserted into stated list. | |
390 | */ | |
391 | static int | |
392 | ll_sa_entry_to_stated(struct ll_statahead_info *sai, | |
3f821732 | 393 | struct ll_sa_entry *entry, enum se_stat stat) |
d7e09d03 PT |
394 | { |
395 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
396 | int ret = 1; | |
397 | ||
398 | ll_sa_entry_cleanup(sai, entry); | |
399 | ||
400 | spin_lock(&lli->lli_sa_lock); | |
401 | if (likely(entry->se_stat != SA_ENTRY_DEST)) { | |
402 | do_sa_entry_to_stated(sai, entry, stat); | |
403 | ret = 0; | |
404 | } | |
405 | spin_unlock(&lli->lli_sa_lock); | |
406 | ||
407 | return ret; | |
408 | } | |
409 | ||
410 | /* | |
411 | * Insert inode into the list of sai_entries_agl. | |
412 | */ | |
413 | static void ll_agl_add(struct ll_statahead_info *sai, | |
414 | struct inode *inode, int index) | |
415 | { | |
416 | struct ll_inode_info *child = ll_i2info(inode); | |
417 | struct ll_inode_info *parent = ll_i2info(sai->sai_inode); | |
418 | int added = 0; | |
419 | ||
420 | spin_lock(&child->lli_agl_lock); | |
421 | if (child->lli_agl_index == 0) { | |
422 | child->lli_agl_index = index; | |
423 | spin_unlock(&child->lli_agl_lock); | |
424 | ||
425 | LASSERT(list_empty(&child->lli_agl_list)); | |
426 | ||
427 | igrab(inode); | |
428 | spin_lock(&parent->lli_agl_lock); | |
24a85e88 | 429 | if (list_empty(&sai->sai_entries_agl)) |
d7e09d03 PT |
430 | added = 1; |
431 | list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); | |
432 | spin_unlock(&parent->lli_agl_lock); | |
433 | } else { | |
434 | spin_unlock(&child->lli_agl_lock); | |
435 | } | |
436 | ||
437 | if (added > 0) | |
438 | wake_up(&sai->sai_agl_thread.t_ctl_waitq); | |
439 | } | |
440 | ||
441 | static struct ll_statahead_info *ll_sai_alloc(void) | |
442 | { | |
443 | struct ll_statahead_info *sai; | |
444 | int i; | |
d7e09d03 | 445 | |
496a51bd | 446 | sai = kzalloc(sizeof(*sai), GFP_NOFS); |
d7e09d03 | 447 | if (!sai) |
0a3bdb00 | 448 | return NULL; |
d7e09d03 PT |
449 | |
450 | atomic_set(&sai->sai_refcount, 1); | |
451 | ||
452 | spin_lock(&sai_generation_lock); | |
453 | sai->sai_generation = ++sai_generation; | |
454 | if (unlikely(sai_generation == 0)) | |
455 | sai->sai_generation = ++sai_generation; | |
456 | spin_unlock(&sai_generation_lock); | |
457 | ||
458 | sai->sai_max = LL_SA_RPC_MIN; | |
459 | sai->sai_index = 1; | |
460 | init_waitqueue_head(&sai->sai_waitq); | |
461 | init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); | |
462 | init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); | |
463 | ||
464 | INIT_LIST_HEAD(&sai->sai_entries); | |
465 | INIT_LIST_HEAD(&sai->sai_entries_received); | |
466 | INIT_LIST_HEAD(&sai->sai_entries_stated); | |
467 | INIT_LIST_HEAD(&sai->sai_entries_agl); | |
468 | ||
469 | for (i = 0; i < LL_SA_CACHE_SIZE; i++) { | |
470 | INIT_LIST_HEAD(&sai->sai_cache[i]); | |
471 | spin_lock_init(&sai->sai_cache_lock[i]); | |
472 | } | |
473 | atomic_set(&sai->sai_cache_count, 0); | |
474 | ||
0a3bdb00 | 475 | return sai; |
d7e09d03 PT |
476 | } |
477 | ||
/* Take one more reference on @sai and hand the same pointer back. */
static inline struct ll_statahead_info *
ll_sai_get(struct ll_statahead_info *sai)
{
	atomic_inc(&sai->sai_refcount);
	return sai;
}
484 | ||
485 | static void ll_sai_put(struct ll_statahead_info *sai) | |
486 | { | |
487 | struct inode *inode = sai->sai_inode; | |
488 | struct ll_inode_info *lli = ll_i2info(inode); | |
d7e09d03 PT |
489 | |
490 | if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { | |
491 | struct ll_sa_entry *entry, *next; | |
492 | ||
493 | if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { | |
494 | /* It is race case, the interpret callback just hold | |
c0894c6c OD |
495 | * a reference count |
496 | */ | |
d7e09d03 | 497 | spin_unlock(&lli->lli_sa_lock); |
e05e02e4 | 498 | return; |
d7e09d03 PT |
499 | } |
500 | ||
6e16818b | 501 | LASSERT(!lli->lli_opendir_key); |
d7e09d03 PT |
502 | LASSERT(thread_is_stopped(&sai->sai_thread)); |
503 | LASSERT(thread_is_stopped(&sai->sai_agl_thread)); | |
504 | ||
505 | lli->lli_sai = NULL; | |
506 | lli->lli_opendir_pid = 0; | |
507 | spin_unlock(&lli->lli_sa_lock); | |
508 | ||
509 | if (sai->sai_sent > sai->sai_replied) | |
1d8cb70c | 510 | CDEBUG(D_READA, "statahead for dir "DFID |
b0f5aad5 | 511 | " does not finish: [sent:%llu] [replied:%llu]\n", |
d7e09d03 PT |
512 | PFID(&lli->lli_fid), |
513 | sai->sai_sent, sai->sai_replied); | |
514 | ||
e15ba45d OD |
515 | list_for_each_entry_safe(entry, next, &sai->sai_entries, |
516 | se_link) | |
d7e09d03 PT |
517 | do_sa_entry_fini(sai, entry); |
518 | ||
519 | LASSERT(list_empty(&sai->sai_entries)); | |
615f9a68 | 520 | LASSERT(list_empty(&sai->sai_entries_received)); |
d7e09d03 PT |
521 | LASSERT(list_empty(&sai->sai_entries_stated)); |
522 | ||
523 | LASSERT(atomic_read(&sai->sai_cache_count) == 0); | |
24a85e88 | 524 | LASSERT(list_empty(&sai->sai_entries_agl)); |
d7e09d03 PT |
525 | |
526 | iput(inode); | |
97903a26 | 527 | kfree(sai); |
d7e09d03 | 528 | } |
d7e09d03 PT |
529 | } |
530 | ||
531 | /* Do NOT forget to drop inode refcount when into sai_entries_agl. */ | |
532 | static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) | |
533 | { | |
534 | struct ll_inode_info *lli = ll_i2info(inode); | |
535 | __u64 index = lli->lli_agl_index; | |
536 | int rc; | |
d7e09d03 PT |
537 | |
538 | LASSERT(list_empty(&lli->lli_agl_list)); | |
539 | ||
540 | /* AGL maybe fall behind statahead with one entry */ | |
541 | if (is_omitted_entry(sai, index + 1)) { | |
542 | lli->lli_agl_index = 0; | |
543 | iput(inode); | |
e05e02e4 | 544 | return; |
d7e09d03 PT |
545 | } |
546 | ||
547 | /* Someone is in glimpse (sync or async), do nothing. */ | |
548 | rc = down_write_trylock(&lli->lli_glimpse_sem); | |
549 | if (rc == 0) { | |
550 | lli->lli_agl_index = 0; | |
551 | iput(inode); | |
e05e02e4 | 552 | return; |
d7e09d03 PT |
553 | } |
554 | ||
555 | /* | |
556 | * Someone triggered glimpse within 1 sec before. | |
557 | * 1) The former glimpse succeeded with glimpse lock granted by OST, and | |
558 | * if the lock is still cached on client, AGL needs to do nothing. If | |
d0a0acc3 | 559 | * it is cancelled by other client, AGL maybe cannot obtain new lock |
d7e09d03 PT |
560 | * for no glimpse callback triggered by AGL. |
561 | * 2) The former glimpse succeeded, but OST did not grant glimpse lock. | |
562 | * Under such case, it is quite possible that the OST will not grant | |
563 | * glimpse lock for AGL also. | |
564 | * 3) The former glimpse failed, compared with other two cases, it is | |
565 | * relative rare. AGL can ignore such case, and it will not muchly | |
566 | * affect the performance. | |
567 | */ | |
568 | if (lli->lli_glimpse_time != 0 && | |
699503bc | 569 | time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { |
d7e09d03 PT |
570 | up_write(&lli->lli_glimpse_sem); |
571 | lli->lli_agl_index = 0; | |
572 | iput(inode); | |
e05e02e4 | 573 | return; |
d7e09d03 PT |
574 | } |
575 | ||
576 | CDEBUG(D_READA, "Handling (init) async glimpse: inode = " | |
b0f5aad5 | 577 | DFID", idx = %llu\n", PFID(&lli->lli_fid), index); |
d7e09d03 PT |
578 | |
579 | cl_agl(inode); | |
580 | lli->lli_agl_index = 0; | |
581 | lli->lli_glimpse_time = cfs_time_current(); | |
582 | up_write(&lli->lli_glimpse_sem); | |
583 | ||
584 | CDEBUG(D_READA, "Handled (init) async glimpse: inode= " | |
b0f5aad5 | 585 | DFID", idx = %llu, rc = %d\n", |
d7e09d03 PT |
586 | PFID(&lli->lli_fid), index, rc); |
587 | ||
588 | iput(inode); | |
d7e09d03 PT |
589 | } |
590 | ||
591 | static void ll_post_statahead(struct ll_statahead_info *sai) | |
592 | { | |
593 | struct inode *dir = sai->sai_inode; | |
594 | struct inode *child; | |
595 | struct ll_inode_info *lli = ll_i2info(dir); | |
596 | struct ll_sa_entry *entry; | |
597 | struct md_enqueue_info *minfo; | |
598 | struct lookup_intent *it; | |
599 | struct ptlrpc_request *req; | |
600 | struct mdt_body *body; | |
601 | int rc = 0; | |
d7e09d03 PT |
602 | |
603 | spin_lock(&lli->lli_sa_lock); | |
615f9a68 | 604 | if (unlikely(list_empty(&sai->sai_entries_received))) { |
d7e09d03 | 605 | spin_unlock(&lli->lli_sa_lock); |
e05e02e4 | 606 | return; |
d7e09d03 | 607 | } |
13ce3246 SB |
608 | entry = list_entry(sai->sai_entries_received.next, |
609 | struct ll_sa_entry, se_list); | |
d7e09d03 PT |
610 | atomic_inc(&entry->se_refcount); |
611 | list_del_init(&entry->se_list); | |
612 | spin_unlock(&lli->lli_sa_lock); | |
613 | ||
614 | LASSERT(entry->se_handle != 0); | |
615 | ||
616 | minfo = entry->se_minfo; | |
617 | it = &minfo->mi_it; | |
618 | req = entry->se_req; | |
619 | body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); | |
6e16818b | 620 | if (!body) { |
34e1f2bb JL |
621 | rc = -EFAULT; |
622 | goto out; | |
623 | } | |
d7e09d03 PT |
624 | |
625 | child = entry->se_inode; | |
6e16818b | 626 | if (!child) { |
d7e09d03 PT |
627 | /* |
628 | * lookup. | |
629 | */ | |
630 | LASSERT(fid_is_zero(&minfo->mi_data.op_fid2)); | |
631 | ||
bef31c78 | 632 | /* XXX: No fid in reply, this is probably cross-ref case. |
c0894c6c OD |
633 | * SA can't handle it yet. |
634 | */ | |
2e1b5b8b | 635 | if (body->mbo_valid & OBD_MD_MDS) { |
34e1f2bb JL |
636 | rc = -EAGAIN; |
637 | goto out; | |
638 | } | |
d7e09d03 PT |
639 | } else { |
640 | /* | |
641 | * revalidate. | |
642 | */ | |
643 | /* unlinked and re-created with the same name */ | |
2e1b5b8b | 644 | if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { |
d7e09d03 PT |
645 | entry->se_inode = NULL; |
646 | iput(child); | |
647 | child = NULL; | |
648 | } | |
649 | } | |
650 | ||
e476f2e5 | 651 | it->it_lock_handle = entry->se_handle; |
d7e09d03 | 652 | rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); |
34e1f2bb JL |
653 | if (rc != 1) { |
654 | rc = -EAGAIN; | |
655 | goto out; | |
656 | } | |
d7e09d03 PT |
657 | |
658 | rc = ll_prep_inode(&child, req, dir->i_sb, it); | |
659 | if (rc) | |
34e1f2bb | 660 | goto out; |
d7e09d03 | 661 | |
97a075cd JN |
662 | CDEBUG(D_DLMTRACE, "%s: setting l_data to inode "DFID"%p\n", |
663 | ll_get_fsname(child->i_sb, NULL, 0), | |
664 | PFID(ll_inode2fid(child)), child); | |
d7e09d03 PT |
665 | ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); |
666 | ||
667 | entry->se_inode = child; | |
668 | ||
669 | if (agl_should_run(sai, child)) | |
670 | ll_agl_add(sai, child, entry->se_index); | |
671 | ||
d7e09d03 PT |
672 | out: |
673 | /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock | |
674 | * reference count by calling "ll_intent_drop_lock()" in spite of the | |
675 | * above operations failed or not. Do not worry about calling | |
c0894c6c OD |
676 | * "ll_intent_drop_lock()" more than once. |
677 | */ | |
d7e09d03 PT |
678 | rc = ll_sa_entry_to_stated(sai, entry, |
679 | rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); | |
680 | if (rc == 0 && entry->se_index == sai->sai_index_wait) | |
681 | wake_up(&sai->sai_waitq); | |
682 | ll_sa_entry_put(sai, entry); | |
683 | } | |
684 | ||
685 | static int ll_statahead_interpret(struct ptlrpc_request *req, | |
686 | struct md_enqueue_info *minfo, int rc) | |
687 | { | |
688 | struct lookup_intent *it = &minfo->mi_it; | |
689 | struct inode *dir = minfo->mi_dir; | |
690 | struct ll_inode_info *lli = ll_i2info(dir); | |
691 | struct ll_statahead_info *sai = NULL; | |
692 | struct ll_sa_entry *entry; | |
aac2e54f | 693 | __u64 handle = 0; |
d7e09d03 | 694 | int wakeup; |
d7e09d03 PT |
695 | |
696 | if (it_disposition(it, DISP_LOOKUP_NEG)) | |
697 | rc = -ENOENT; | |
698 | ||
aac2e54f LS |
699 | if (rc == 0) { |
700 | /* release ibits lock ASAP to avoid deadlock when statahead | |
701 | * thread enqueues lock on parent in readdir and another | |
702 | * process enqueues lock on child with parent lock held, eg. | |
c0894c6c OD |
703 | * unlink. |
704 | */ | |
e476f2e5 | 705 | handle = it->it_lock_handle; |
aac2e54f LS |
706 | ll_intent_drop_lock(it); |
707 | } | |
708 | ||
d7e09d03 PT |
709 | spin_lock(&lli->lli_sa_lock); |
710 | /* stale entry */ | |
6e16818b | 711 | if (unlikely(!lli->lli_sai || |
d7e09d03 PT |
712 | lli->lli_sai->sai_generation != minfo->mi_generation)) { |
713 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
714 | rc = -ESTALE; |
715 | goto out; | |
d7e09d03 PT |
716 | } else { |
717 | sai = ll_sai_get(lli->lli_sai); | |
718 | if (unlikely(!thread_is_running(&sai->sai_thread))) { | |
719 | sai->sai_replied++; | |
720 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
721 | rc = -EBADFD; |
722 | goto out; | |
d7e09d03 PT |
723 | } |
724 | ||
725 | entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); | |
6e16818b | 726 | if (!entry) { |
d7e09d03 PT |
727 | sai->sai_replied++; |
728 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
729 | rc = -EIDRM; |
730 | goto out; | |
d7e09d03 PT |
731 | } |
732 | ||
733 | if (rc != 0) { | |
734 | do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); | |
735 | wakeup = (entry->se_index == sai->sai_index_wait); | |
736 | } else { | |
737 | entry->se_minfo = minfo; | |
738 | entry->se_req = ptlrpc_request_addref(req); | |
739 | /* Release the async ibits lock ASAP to avoid deadlock | |
740 | * when statahead thread tries to enqueue lock on parent | |
741 | * for readpage and other tries to enqueue lock on child | |
c0894c6c OD |
742 | * with parent's lock held, for example: unlink. |
743 | */ | |
aac2e54f | 744 | entry->se_handle = handle; |
615f9a68 | 745 | wakeup = list_empty(&sai->sai_entries_received); |
d7e09d03 | 746 | list_add_tail(&entry->se_list, |
e15ba45d | 747 | &sai->sai_entries_received); |
d7e09d03 PT |
748 | } |
749 | sai->sai_replied++; | |
750 | spin_unlock(&lli->lli_sa_lock); | |
751 | ||
752 | ll_sa_entry_put(sai, entry); | |
753 | if (wakeup) | |
754 | wake_up(&sai->sai_thread.t_ctl_waitq); | |
755 | } | |
756 | ||
d7e09d03 PT |
757 | out: |
758 | if (rc != 0) { | |
759 | ll_intent_release(it); | |
760 | iput(dir); | |
97903a26 | 761 | kfree(minfo); |
d7e09d03 | 762 | } |
6e16818b | 763 | if (sai) |
d7e09d03 PT |
764 | ll_sai_put(sai); |
765 | return rc; | |
766 | } | |
767 | ||
768 | static void sa_args_fini(struct md_enqueue_info *minfo, | |
769 | struct ldlm_enqueue_info *einfo) | |
770 | { | |
771 | LASSERT(minfo && einfo); | |
772 | iput(minfo->mi_dir); | |
97903a26 JL |
773 | kfree(minfo); |
774 | kfree(einfo); | |
d7e09d03 PT |
775 | } |
776 | ||
777 | /** | |
ef2e0f55 | 778 | * prepare arguments for async stat RPC. |
d7e09d03 PT |
779 | */ |
780 | static int sa_args_init(struct inode *dir, struct inode *child, | |
781 | struct ll_sa_entry *entry, struct md_enqueue_info **pmi, | |
ef2e0f55 | 782 | struct ldlm_enqueue_info **pei) |
d7e09d03 | 783 | { |
1e95e9a0 | 784 | const struct qstr *qstr = &entry->se_qstr; |
d7e09d03 PT |
785 | struct ll_inode_info *lli = ll_i2info(dir); |
786 | struct md_enqueue_info *minfo; | |
787 | struct ldlm_enqueue_info *einfo; | |
788 | struct md_op_data *op_data; | |
789 | ||
496a51bd JL |
790 | einfo = kzalloc(sizeof(*einfo), GFP_NOFS); |
791 | if (!einfo) | |
d7e09d03 PT |
792 | return -ENOMEM; |
793 | ||
496a51bd JL |
794 | minfo = kzalloc(sizeof(*minfo), GFP_NOFS); |
795 | if (!minfo) { | |
97903a26 | 796 | kfree(einfo); |
d7e09d03 PT |
797 | return -ENOMEM; |
798 | } | |
799 | ||
800 | op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, | |
801 | qstr->len, 0, LUSTRE_OPC_ANY, NULL); | |
802 | if (IS_ERR(op_data)) { | |
97903a26 JL |
803 | kfree(einfo); |
804 | kfree(minfo); | |
d7e09d03 PT |
805 | return PTR_ERR(op_data); |
806 | } | |
807 | ||
808 | minfo->mi_it.it_op = IT_GETATTR; | |
809 | minfo->mi_dir = igrab(dir); | |
810 | minfo->mi_cb = ll_statahead_interpret; | |
811 | minfo->mi_generation = lli->lli_sai->sai_generation; | |
812 | minfo->mi_cbdata = entry->se_index; | |
813 | ||
814 | einfo->ei_type = LDLM_IBITS; | |
815 | einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); | |
816 | einfo->ei_cb_bl = ll_md_blocking_ast; | |
817 | einfo->ei_cb_cp = ldlm_completion_ast; | |
818 | einfo->ei_cb_gl = NULL; | |
819 | einfo->ei_cbdata = NULL; | |
820 | ||
821 | *pmi = minfo; | |
822 | *pei = einfo; | |
d7e09d03 PT |
823 | |
824 | return 0; | |
825 | } | |
826 | ||
827 | static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) | |
828 | { | |
829 | struct md_enqueue_info *minfo; | |
830 | struct ldlm_enqueue_info *einfo; | |
d7e09d03 | 831 | int rc; |
d7e09d03 | 832 | |
ef2e0f55 | 833 | rc = sa_args_init(dir, NULL, entry, &minfo, &einfo); |
d7e09d03 | 834 | if (rc) |
0a3bdb00 | 835 | return rc; |
d7e09d03 PT |
836 | |
837 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
ef2e0f55 | 838 | if (rc < 0) |
d7e09d03 | 839 | sa_args_fini(minfo, einfo); |
d7e09d03 | 840 | |
0a3bdb00 | 841 | return rc; |
d7e09d03 PT |
842 | } |
843 | ||
844 | /** | |
845 | * similar to ll_revalidate_it(). | |
846 | * \retval 1 -- dentry valid | |
847 | * \retval 0 -- will send stat-ahead request | |
848 | * \retval others -- prepare stat-ahead request failed | |
849 | */ | |
850 | static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, | |
851 | struct dentry *dentry) | |
852 | { | |
2b0143b5 | 853 | struct inode *inode = d_inode(dentry); |
d7e09d03 | 854 | struct lookup_intent it = { .it_op = IT_GETATTR, |
e476f2e5 | 855 | .it_lock_handle = 0 }; |
d7e09d03 PT |
856 | struct md_enqueue_info *minfo; |
857 | struct ldlm_enqueue_info *einfo; | |
d7e09d03 | 858 | int rc; |
d7e09d03 | 859 | |
6e16818b | 860 | if (unlikely(!inode)) |
0a3bdb00 | 861 | return 1; |
d7e09d03 PT |
862 | |
863 | if (d_mountpoint(dentry)) | |
0a3bdb00 | 864 | return 1; |
d7e09d03 | 865 | |
d7e09d03 | 866 | entry->se_inode = igrab(inode); |
1d8cb70c GD |
867 | rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), |
868 | NULL); | |
d7e09d03 | 869 | if (rc == 1) { |
e476f2e5 | 870 | entry->se_handle = it.it_lock_handle; |
d7e09d03 | 871 | ll_intent_release(&it); |
0a3bdb00 | 872 | return 1; |
d7e09d03 PT |
873 | } |
874 | ||
ef2e0f55 | 875 | rc = sa_args_init(dir, inode, entry, &minfo, &einfo); |
d7e09d03 PT |
876 | if (rc) { |
877 | entry->se_inode = NULL; | |
878 | iput(inode); | |
0a3bdb00 | 879 | return rc; |
d7e09d03 PT |
880 | } |
881 | ||
882 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
ef2e0f55 | 883 | if (rc < 0) { |
d7e09d03 PT |
884 | entry->se_inode = NULL; |
885 | iput(inode); | |
886 | sa_args_fini(minfo, einfo); | |
887 | } | |
888 | ||
0a3bdb00 | 889 | return rc; |
d7e09d03 PT |
890 | } |
891 | ||
9c234f6c | 892 | static void ll_statahead_one(struct dentry *parent, const char *entry_name, |
d7e09d03 PT |
893 | int entry_name_len) |
894 | { | |
2b0143b5 | 895 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
896 | struct ll_inode_info *lli = ll_i2info(dir); |
897 | struct ll_statahead_info *sai = lli->lli_sai; | |
898 | struct dentry *dentry = NULL; | |
899 | struct ll_sa_entry *entry; | |
900 | int rc; | |
901 | int rc1; | |
d7e09d03 | 902 | |
8387ff25 | 903 | entry = ll_sa_entry_alloc(parent, sai, sai->sai_index, entry_name, |
d7e09d03 PT |
904 | entry_name_len); |
905 | if (IS_ERR(entry)) | |
e05e02e4 | 906 | return; |
d7e09d03 PT |
907 | |
908 | dentry = d_lookup(parent, &entry->se_qstr); | |
909 | if (!dentry) { | |
910 | rc = do_sa_lookup(dir, entry); | |
911 | } else { | |
912 | rc = do_sa_revalidate(dir, entry, dentry); | |
2b0143b5 DH |
913 | if (rc == 1 && agl_should_run(sai, d_inode(dentry))) |
914 | ll_agl_add(sai, d_inode(dentry), entry->se_index); | |
d7e09d03 | 915 | |
d7e09d03 | 916 | dput(dentry); |
6e16818b | 917 | } |
d7e09d03 PT |
918 | |
919 | if (rc) { | |
920 | rc1 = ll_sa_entry_to_stated(sai, entry, | |
24c198e9 OD |
921 | rc < 0 ? SA_ENTRY_INVA : |
922 | SA_ENTRY_SUCC); | |
d7e09d03 PT |
923 | if (rc1 == 0 && entry->se_index == sai->sai_index_wait) |
924 | wake_up(&sai->sai_waitq); | |
925 | } else { | |
926 | sai->sai_sent++; | |
927 | } | |
928 | ||
929 | sai->sai_index++; | |
930 | /* drop one refcount on entry by ll_sa_entry_alloc */ | |
931 | ll_sa_entry_put(sai, entry); | |
d7e09d03 PT |
932 | } |
933 | ||
934 | static int ll_agl_thread(void *arg) | |
935 | { | |
f9459c0a | 936 | struct dentry *parent = arg; |
2b0143b5 | 937 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
938 | struct ll_inode_info *plli = ll_i2info(dir); |
939 | struct ll_inode_info *clli; | |
940 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
941 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
942 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
943 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 944 | |
9fc3b028 | 945 | thread->t_pid = current_pid(); |
09561a53 AV |
946 | CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", |
947 | sai, parent); | |
d7e09d03 PT |
948 | |
949 | atomic_inc(&sbi->ll_agl_total); | |
950 | spin_lock(&plli->lli_agl_lock); | |
951 | sai->sai_agl_valid = 1; | |
717d1c2e CM |
952 | if (thread_is_init(thread)) |
953 | /* If someone else has changed the thread state | |
954 | * (e.g. already changed to SVC_STOPPING), we can't just | |
c0894c6c OD |
955 | * blindly overwrite that setting. |
956 | */ | |
717d1c2e | 957 | thread_set_flags(thread, SVC_RUNNING); |
d7e09d03 PT |
958 | spin_unlock(&plli->lli_agl_lock); |
959 | wake_up(&thread->t_ctl_waitq); | |
960 | ||
961 | while (1) { | |
962 | l_wait_event(thread->t_ctl_waitq, | |
24a85e88 | 963 | !list_empty(&sai->sai_entries_agl) || |
d7e09d03 PT |
964 | !thread_is_running(thread), |
965 | &lwi); | |
966 | ||
967 | if (!thread_is_running(thread)) | |
968 | break; | |
969 | ||
970 | spin_lock(&plli->lli_agl_lock); | |
971 | /* The statahead thread maybe help to process AGL entries, | |
c0894c6c OD |
972 | * so check whether list empty again. |
973 | */ | |
24a85e88 | 974 | if (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
975 | clli = list_entry(sai->sai_entries_agl.next, |
976 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
977 | list_del_init(&clli->lli_agl_list); |
978 | spin_unlock(&plli->lli_agl_lock); | |
979 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
980 | } else { | |
981 | spin_unlock(&plli->lli_agl_lock); | |
982 | } | |
983 | } | |
984 | ||
985 | spin_lock(&plli->lli_agl_lock); | |
986 | sai->sai_agl_valid = 0; | |
24a85e88 | 987 | while (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
988 | clli = list_entry(sai->sai_entries_agl.next, |
989 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
990 | list_del_init(&clli->lli_agl_list); |
991 | spin_unlock(&plli->lli_agl_lock); | |
992 | clli->lli_agl_index = 0; | |
993 | iput(&clli->lli_vfs_inode); | |
994 | spin_lock(&plli->lli_agl_lock); | |
995 | } | |
996 | thread_set_flags(thread, SVC_STOPPED); | |
997 | spin_unlock(&plli->lli_agl_lock); | |
998 | wake_up(&thread->t_ctl_waitq); | |
999 | ll_sai_put(sai); | |
09561a53 AV |
1000 | CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", |
1001 | sai, parent); | |
0a3bdb00 | 1002 | return 0; |
d7e09d03 PT |
1003 | } |
1004 | ||
1005 | static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) | |
1006 | { | |
1007 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
1008 | struct l_wait_info lwi = { 0 }; | |
1009 | struct ll_inode_info *plli; | |
68b636b6 | 1010 | struct task_struct *task; |
d7e09d03 | 1011 | |
09561a53 AV |
1012 | CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", |
1013 | sai, parent); | |
d7e09d03 | 1014 | |
2b0143b5 | 1015 | plli = ll_i2info(d_inode(parent)); |
e15ba45d OD |
1016 | task = kthread_run(ll_agl_thread, parent, "ll_agl_%u", |
1017 | plli->lli_opendir_pid); | |
d7e09d03 PT |
1018 | if (IS_ERR(task)) { |
1019 | CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); | |
1020 | thread_set_flags(thread, SVC_STOPPED); | |
e05e02e4 | 1021 | return; |
d7e09d03 PT |
1022 | } |
1023 | ||
1024 | l_wait_event(thread->t_ctl_waitq, | |
1025 | thread_is_running(thread) || thread_is_stopped(thread), | |
1026 | &lwi); | |
d7e09d03 PT |
1027 | } |
1028 | ||
1029 | static int ll_statahead_thread(void *arg) | |
1030 | { | |
f9459c0a | 1031 | struct dentry *parent = arg; |
2b0143b5 | 1032 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
1033 | struct ll_inode_info *plli = ll_i2info(dir); |
1034 | struct ll_inode_info *clli; | |
1035 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
1036 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
1037 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1038 | struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; | |
4f76f0ec | 1039 | struct page *page = NULL; |
d7e09d03 PT |
1040 | __u64 pos = 0; |
1041 | int first = 0; | |
1042 | int rc = 0; | |
307bef74 | 1043 | struct md_op_data *op_data; |
d7e09d03 PT |
1044 | struct ll_dir_chain chain; |
1045 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 1046 | |
9fc3b028 | 1047 | thread->t_pid = current_pid(); |
09561a53 AV |
1048 | CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", |
1049 | sai, parent); | |
d7e09d03 | 1050 | |
307bef74 | 1051 | op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, |
1052 | LUSTRE_OPC_ANY, dir); | |
1053 | if (IS_ERR(op_data)) | |
1054 | return PTR_ERR(op_data); | |
1055 | ||
bce1bbf4 | 1056 | op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; |
1057 | ||
d7e09d03 PT |
1058 | if (sbi->ll_flags & LL_SBI_AGL_ENABLED) |
1059 | ll_start_agl(parent, sai); | |
1060 | ||
1061 | atomic_inc(&sbi->ll_sa_total); | |
1062 | spin_lock(&plli->lli_sa_lock); | |
717d1c2e CM |
1063 | if (thread_is_init(thread)) |
1064 | /* If someone else has changed the thread state | |
1065 | * (e.g. already changed to SVC_STOPPING), we can't just | |
c0894c6c OD |
1066 | * blindly overwrite that setting. |
1067 | */ | |
717d1c2e | 1068 | thread_set_flags(thread, SVC_RUNNING); |
d7e09d03 PT |
1069 | spin_unlock(&plli->lli_sa_lock); |
1070 | wake_up(&thread->t_ctl_waitq); | |
1071 | ||
1072 | ll_dir_chain_init(&chain); | |
3978732f | 1073 | page = ll_get_dir_page(dir, op_data, pos, &chain); |
d7e09d03 PT |
1074 | |
1075 | while (1) { | |
1076 | struct lu_dirpage *dp; | |
1077 | struct lu_dirent *ent; | |
1078 | ||
1079 | if (IS_ERR(page)) { | |
1080 | rc = PTR_ERR(page); | |
4f48c52c | 1081 | CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: opendir_pid = %u: rc = %d\n", |
d7e09d03 | 1082 | PFID(ll_inode2fid(dir)), pos, sai->sai_index, |
4f48c52c | 1083 | plli->lli_opendir_pid, rc); |
34e1f2bb | 1084 | goto out; |
d7e09d03 PT |
1085 | } |
1086 | ||
1087 | dp = page_address(page); | |
6e16818b | 1088 | for (ent = lu_dirent_start(dp); ent; |
d7e09d03 PT |
1089 | ent = lu_dirent_next(ent)) { |
1090 | __u64 hash; | |
1091 | int namelen; | |
1092 | char *name; | |
1093 | ||
1094 | hash = le64_to_cpu(ent->lde_hash); | |
1095 | if (unlikely(hash < pos)) | |
1096 | /* | |
1097 | * Skip until we find target hash value. | |
1098 | */ | |
1099 | continue; | |
1100 | ||
1101 | namelen = le16_to_cpu(ent->lde_namelen); | |
1102 | if (unlikely(namelen == 0)) | |
1103 | /* | |
1104 | * Skip dummy record. | |
1105 | */ | |
1106 | continue; | |
1107 | ||
1108 | name = ent->lde_name; | |
1109 | if (name[0] == '.') { | |
1110 | if (namelen == 1) { | |
1111 | /* | |
1112 | * skip "." | |
1113 | */ | |
1114 | continue; | |
1115 | } else if (name[1] == '.' && namelen == 2) { | |
1116 | /* | |
1117 | * skip ".." | |
1118 | */ | |
1119 | continue; | |
1120 | } else if (!sai->sai_ls_all) { | |
1121 | /* | |
1122 | * skip hidden files. | |
1123 | */ | |
1124 | sai->sai_skip_hidden++; | |
1125 | continue; | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | /* | |
1130 | * don't stat-ahead first entry. | |
1131 | */ | |
1132 | if (unlikely(++first == 1)) | |
1133 | continue; | |
1134 | ||
1135 | keep_it: | |
1136 | l_wait_event(thread->t_ctl_waitq, | |
1137 | !sa_sent_full(sai) || | |
615f9a68 | 1138 | !list_empty(&sai->sai_entries_received) || |
24a85e88 | 1139 | !list_empty(&sai->sai_entries_agl) || |
d7e09d03 PT |
1140 | !thread_is_running(thread), |
1141 | &lwi); | |
1142 | ||
1143 | interpret_it: | |
615f9a68 | 1144 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1145 | ll_post_statahead(sai); |
1146 | ||
1147 | if (unlikely(!thread_is_running(thread))) { | |
77a782ab | 1148 | ll_release_page(dir, page, false); |
34e1f2bb JL |
1149 | rc = 0; |
1150 | goto out; | |
d7e09d03 PT |
1151 | } |
1152 | ||
1153 | /* If no window for metadata statahead, but there are | |
1154 | * some AGL entries to be triggered, then try to help | |
c0894c6c OD |
1155 | * to process the AGL entries. |
1156 | */ | |
d7e09d03 PT |
1157 | if (sa_sent_full(sai)) { |
1158 | spin_lock(&plli->lli_agl_lock); | |
24a85e88 | 1159 | while (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
1160 | clli = list_entry(sai->sai_entries_agl.next, |
1161 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
1162 | list_del_init(&clli->lli_agl_list); |
1163 | spin_unlock(&plli->lli_agl_lock); | |
1164 | ll_agl_trigger(&clli->lli_vfs_inode, | |
1165 | sai); | |
1166 | ||
615f9a68 | 1167 | if (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1168 | goto interpret_it; |
1169 | ||
4f76f0ec | 1170 | if (unlikely(!thread_is_running(thread))) { |
77a782ab | 1171 | ll_release_page(dir, page, false); |
34e1f2bb JL |
1172 | rc = 0; |
1173 | goto out; | |
d7e09d03 PT |
1174 | } |
1175 | ||
1176 | if (!sa_sent_full(sai)) | |
1177 | goto do_it; | |
1178 | ||
1179 | spin_lock(&plli->lli_agl_lock); | |
1180 | } | |
1181 | spin_unlock(&plli->lli_agl_lock); | |
1182 | ||
1183 | goto keep_it; | |
1184 | } | |
d7e09d03 PT |
1185 | do_it: |
1186 | ll_statahead_one(parent, name, namelen); | |
1187 | } | |
4f76f0ec | 1188 | |
d7e09d03 PT |
1189 | pos = le64_to_cpu(dp->ldp_hash_end); |
1190 | if (pos == MDS_DIR_END_OFF) { | |
1191 | /* | |
1192 | * End of directory reached. | |
1193 | */ | |
77a782ab | 1194 | ll_release_page(dir, page, false); |
d7e09d03 PT |
1195 | while (1) { |
1196 | l_wait_event(thread->t_ctl_waitq, | |
615f9a68 | 1197 | !list_empty(&sai->sai_entries_received) || |
b2952d62 | 1198 | sai->sai_sent == sai->sai_replied || |
d7e09d03 PT |
1199 | !thread_is_running(thread), |
1200 | &lwi); | |
1201 | ||
615f9a68 | 1202 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1203 | ll_post_statahead(sai); |
1204 | ||
34e1f2bb JL |
1205 | if (unlikely(!thread_is_running(thread))) { |
1206 | rc = 0; | |
1207 | goto out; | |
1208 | } | |
d7e09d03 PT |
1209 | |
1210 | if (sai->sai_sent == sai->sai_replied && | |
615f9a68 | 1211 | list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1212 | break; |
1213 | } | |
1214 | ||
1215 | spin_lock(&plli->lli_agl_lock); | |
24a85e88 | 1216 | while (!list_empty(&sai->sai_entries_agl) && |
d7e09d03 | 1217 | thread_is_running(thread)) { |
6c3d0ea6 SB |
1218 | clli = list_entry(sai->sai_entries_agl.next, |
1219 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
1220 | list_del_init(&clli->lli_agl_list); |
1221 | spin_unlock(&plli->lli_agl_lock); | |
1222 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
1223 | spin_lock(&plli->lli_agl_lock); | |
1224 | } | |
1225 | spin_unlock(&plli->lli_agl_lock); | |
1226 | ||
34e1f2bb JL |
1227 | rc = 0; |
1228 | goto out; | |
26f5c084 | 1229 | } else { |
d7e09d03 PT |
1230 | /* |
1231 | * chain is exhausted. | |
1232 | * Normal case: continue to the next page. | |
1233 | */ | |
006e4dcd | 1234 | ll_release_page(dir, page, |
4f76f0ec | 1235 | le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); |
2afad7fc | 1236 | sai->sai_in_readpage = 1; |
3978732f | 1237 | page = ll_get_dir_page(dir, op_data, pos, &chain); |
2afad7fc | 1238 | sai->sai_in_readpage = 0; |
d7e09d03 PT |
1239 | } |
1240 | } | |
d7e09d03 | 1241 | out: |
307bef74 | 1242 | ll_finish_md_op_data(op_data); |
d7e09d03 PT |
1243 | if (sai->sai_agl_valid) { |
1244 | spin_lock(&plli->lli_agl_lock); | |
1245 | thread_set_flags(agl_thread, SVC_STOPPING); | |
1246 | spin_unlock(&plli->lli_agl_lock); | |
1247 | wake_up(&agl_thread->t_ctl_waitq); | |
1248 | ||
9fc3b028 CM |
1249 | CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", |
1250 | sai, (unsigned int)agl_thread->t_pid); | |
d7e09d03 PT |
1251 | l_wait_event(agl_thread->t_ctl_waitq, |
1252 | thread_is_stopped(agl_thread), | |
1253 | &lwi); | |
1254 | } else { | |
1255 | /* Set agl_thread flags anyway. */ | |
1256 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
1257 | } | |
1258 | ll_dir_chain_fini(&chain); | |
1259 | spin_lock(&plli->lli_sa_lock); | |
615f9a68 | 1260 | if (!list_empty(&sai->sai_entries_received)) { |
d7e09d03 PT |
1261 | thread_set_flags(thread, SVC_STOPPING); |
1262 | spin_unlock(&plli->lli_sa_lock); | |
1263 | ||
1264 | /* To release the resources held by received entries. */ | |
615f9a68 | 1265 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1266 | ll_post_statahead(sai); |
1267 | ||
1268 | spin_lock(&plli->lli_sa_lock); | |
1269 | } | |
1270 | thread_set_flags(thread, SVC_STOPPED); | |
1271 | spin_unlock(&plli->lli_sa_lock); | |
1272 | wake_up(&sai->sai_waitq); | |
1273 | wake_up(&thread->t_ctl_waitq); | |
1274 | ll_sai_put(sai); | |
1275 | dput(parent); | |
09561a53 AV |
1276 | CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", |
1277 | sai, parent); | |
d7e09d03 PT |
1278 | return rc; |
1279 | } | |
1280 | ||
1281 | /** | |
1282 | * called in ll_file_release(). | |
1283 | */ | |
1284 | void ll_stop_statahead(struct inode *dir, void *key) | |
1285 | { | |
1286 | struct ll_inode_info *lli = ll_i2info(dir); | |
1287 | ||
6e16818b | 1288 | if (unlikely(!key)) |
d7e09d03 PT |
1289 | return; |
1290 | ||
1291 | spin_lock(&lli->lli_sa_lock); | |
1292 | if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { | |
1293 | spin_unlock(&lli->lli_sa_lock); | |
1294 | return; | |
1295 | } | |
1296 | ||
1297 | lli->lli_opendir_key = NULL; | |
1298 | ||
1299 | if (lli->lli_sai) { | |
1300 | struct l_wait_info lwi = { 0 }; | |
1301 | struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; | |
1302 | ||
1303 | if (!thread_is_stopped(thread)) { | |
1304 | thread_set_flags(thread, SVC_STOPPING); | |
1305 | spin_unlock(&lli->lli_sa_lock); | |
1306 | wake_up(&thread->t_ctl_waitq); | |
1307 | ||
9fc3b028 CM |
1308 | CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n", |
1309 | lli->lli_sai, (unsigned int)thread->t_pid); | |
d7e09d03 PT |
1310 | l_wait_event(thread->t_ctl_waitq, |
1311 | thread_is_stopped(thread), | |
1312 | &lwi); | |
1313 | } else { | |
1314 | spin_unlock(&lli->lli_sa_lock); | |
1315 | } | |
1316 | ||
1317 | /* | |
1318 | * Put the ref which was held when first statahead_enter. | |
1319 | * It maybe not the last ref for some statahead requests | |
1320 | * maybe inflight. | |
1321 | */ | |
1322 | ll_sai_put(lli->lli_sai); | |
1323 | } else { | |
1324 | lli->lli_opendir_pid = 0; | |
1325 | spin_unlock(&lli->lli_sa_lock); | |
1326 | } | |
1327 | } | |
1328 | ||
/* Results reported by is_first_dirent(). */
enum {
	/**
	 * the target does not match the first candidate dirent (also the
	 * result when no candidate dirent is found)
	 */
	LS_NONE_FIRST_DE = 0,
	/**
	 * the target matches the first candidate dirent and that name does
	 * not start with '.'
	 */
	LS_FIRST_DE = 1,
	/**
	 * the target matches the first candidate dirent and that name
	 * starts with '.' (a hidden entry other than "." and "..")
	 */
	LS_FIRST_DOT_DE = 2
};
1343 | ||
1344 | static int is_first_dirent(struct inode *dir, struct dentry *dentry) | |
1345 | { | |
1346 | struct ll_dir_chain chain; | |
1e95e9a0 | 1347 | const struct qstr *target = &dentry->d_name; |
3978732f | 1348 | struct md_op_data *op_data; |
d7e09d03 PT |
1349 | struct page *page; |
1350 | __u64 pos = 0; | |
1351 | int dot_de; | |
1352 | int rc = LS_NONE_FIRST_DE; | |
d7e09d03 | 1353 | |
3978732f | 1354 | op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, |
1355 | LUSTRE_OPC_ANY, dir); | |
1356 | if (IS_ERR(op_data)) | |
1357 | return PTR_ERR(op_data); | |
bce1bbf4 | 1358 | /** |
1359 | * FIXME choose the start offset of the readdir | |
1360 | */ | |
1361 | op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; | |
3978732f | 1362 | |
d7e09d03 | 1363 | ll_dir_chain_init(&chain); |
3978732f | 1364 | page = ll_get_dir_page(dir, op_data, pos, &chain); |
d7e09d03 PT |
1365 | |
1366 | while (1) { | |
1367 | struct lu_dirpage *dp; | |
1368 | struct lu_dirent *ent; | |
1369 | ||
1370 | if (IS_ERR(page)) { | |
1371 | struct ll_inode_info *lli = ll_i2info(dir); | |
1372 | ||
1373 | rc = PTR_ERR(page); | |
4f48c52c | 1374 | CERROR("%s: error reading dir "DFID" at %llu: opendir_pid = %u : rc = %d\n", |
1375 | ll_get_fsname(dir->i_sb, NULL, 0), | |
d7e09d03 | 1376 | PFID(ll_inode2fid(dir)), pos, |
4f48c52c | 1377 | lli->lli_opendir_pid, rc); |
d7e09d03 PT |
1378 | break; |
1379 | } | |
1380 | ||
1381 | dp = page_address(page); | |
6e16818b | 1382 | for (ent = lu_dirent_start(dp); ent; |
d7e09d03 PT |
1383 | ent = lu_dirent_next(ent)) { |
1384 | __u64 hash; | |
1385 | int namelen; | |
1386 | char *name; | |
1387 | ||
1388 | hash = le64_to_cpu(ent->lde_hash); | |
1389 | /* The ll_get_dir_page() can return any page containing | |
c0894c6c OD |
1390 | * the given hash which may be not the start hash. |
1391 | */ | |
d7e09d03 PT |
1392 | if (unlikely(hash < pos)) |
1393 | continue; | |
1394 | ||
1395 | namelen = le16_to_cpu(ent->lde_namelen); | |
1396 | if (unlikely(namelen == 0)) | |
1397 | /* | |
1398 | * skip dummy record. | |
1399 | */ | |
1400 | continue; | |
1401 | ||
1402 | name = ent->lde_name; | |
1403 | if (name[0] == '.') { | |
1404 | if (namelen == 1) | |
1405 | /* | |
1406 | * skip "." | |
1407 | */ | |
1408 | continue; | |
1409 | else if (name[1] == '.' && namelen == 2) | |
1410 | /* | |
1411 | * skip ".." | |
1412 | */ | |
1413 | continue; | |
1414 | else | |
1415 | dot_de = 1; | |
1416 | } else { | |
1417 | dot_de = 0; | |
1418 | } | |
1419 | ||
1420 | if (dot_de && target->name[0] != '.') { | |
1421 | CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", | |
1422 | target->len, target->name, | |
1423 | namelen, name); | |
1424 | continue; | |
1425 | } | |
1426 | ||
1427 | if (target->len != namelen || | |
1428 | memcmp(target->name, name, namelen) != 0) | |
1429 | rc = LS_NONE_FIRST_DE; | |
1430 | else if (!dot_de) | |
1431 | rc = LS_FIRST_DE; | |
1432 | else | |
1433 | rc = LS_FIRST_DOT_DE; | |
1434 | ||
77a782ab | 1435 | ll_release_page(dir, page, false); |
34e1f2bb | 1436 | goto out; |
d7e09d03 PT |
1437 | } |
1438 | pos = le64_to_cpu(dp->ldp_hash_end); | |
1439 | if (pos == MDS_DIR_END_OFF) { | |
1440 | /* | |
1441 | * End of directory reached. | |
1442 | */ | |
77a782ab | 1443 | ll_release_page(dir, page, false); |
26f5c084 | 1444 | goto out; |
1445 | } else { | |
d7e09d03 PT |
1446 | /* |
1447 | * chain is exhausted | |
1448 | * Normal case: continue to the next page. | |
1449 | */ | |
006e4dcd | 1450 | ll_release_page(dir, page, |
1451 | le32_to_cpu(dp->ldp_flags) & | |
1452 | LDF_COLLIDE); | |
3978732f | 1453 | page = ll_get_dir_page(dir, op_data, pos, &chain); |
d7e09d03 PT |
1454 | } |
1455 | } | |
d7e09d03 PT |
1456 | out: |
1457 | ll_dir_chain_fini(&chain); | |
3978732f | 1458 | ll_finish_md_op_data(op_data); |
d7e09d03 PT |
1459 | return rc; |
1460 | } | |
1461 | ||
1462 | static void | |
1463 | ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
1464 | { | |
1465 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1466 | struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); | |
1467 | int hit; | |
d7e09d03 | 1468 | |
6e16818b | 1469 | if (entry && entry->se_stat == SA_ENTRY_SUCC) |
d7e09d03 PT |
1470 | hit = 1; |
1471 | else | |
1472 | hit = 0; | |
1473 | ||
1474 | ll_sa_entry_fini(sai, entry); | |
1475 | if (hit) { | |
1476 | sai->sai_hit++; | |
1477 | sai->sai_consecutive_miss = 0; | |
1478 | sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); | |
1479 | } else { | |
1480 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
1481 | ||
1482 | sai->sai_miss++; | |
1483 | sai->sai_consecutive_miss++; | |
1484 | if (sa_low_hit(sai) && thread_is_running(thread)) { | |
1485 | atomic_inc(&sbi->ll_sa_wrong); | |
2d00bd17 | 1486 | CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n", |
d7e09d03 PT |
1487 | PFID(&lli->lli_fid), sai->sai_hit, |
1488 | sai->sai_miss, sai->sai_sent, | |
9fc3b028 | 1489 | sai->sai_replied); |
d7e09d03 PT |
1490 | spin_lock(&lli->lli_sa_lock); |
1491 | if (!thread_is_stopped(thread)) | |
1492 | thread_set_flags(thread, SVC_STOPPING); | |
1493 | spin_unlock(&lli->lli_sa_lock); | |
1494 | } | |
1495 | } | |
1496 | ||
1497 | if (!thread_is_stopped(thread)) | |
1498 | wake_up(&thread->t_ctl_waitq); | |
d7e09d03 PT |
1499 | } |
1500 | ||
1501 | /** | |
1502 | * Start statahead thread if this is the first dir entry. | |
1503 | * Otherwise if a thread is started already, wait it until it is ahead of me. | |
1504 | * \retval 1 -- find entry with lock in cache, the caller needs to do | |
1505 | * nothing. | |
1506 | * \retval 0 -- find entry in cache, but without lock, the caller needs | |
1507 | * refresh from MDS. | |
1508 | * \retval others -- the caller need to process as non-statahead. | |
1509 | */ | |
1510 | int do_statahead_enter(struct inode *dir, struct dentry **dentryp, | |
1511 | int only_unplug) | |
1512 | { | |
1513 | struct ll_inode_info *lli = ll_i2info(dir); | |
1514 | struct ll_statahead_info *sai = lli->lli_sai; | |
1515 | struct dentry *parent; | |
1516 | struct ll_sa_entry *entry; | |
1517 | struct ptlrpc_thread *thread; | |
1518 | struct l_wait_info lwi = { 0 }; | |
060c2820 | 1519 | struct task_struct *task; |
d7e09d03 PT |
1520 | int rc = 0; |
1521 | struct ll_inode_info *plli; | |
d7e09d03 PT |
1522 | |
1523 | LASSERT(lli->lli_opendir_pid == current_pid()); | |
1524 | ||
1525 | if (sai) { | |
1526 | thread = &sai->sai_thread; | |
1527 | if (unlikely(thread_is_stopped(thread) && | |
1528 | list_empty(&sai->sai_entries_stated))) { | |
1529 | /* to release resource */ | |
1530 | ll_stop_statahead(dir, lli->lli_opendir_key); | |
0a3bdb00 | 1531 | return -EAGAIN; |
d7e09d03 PT |
1532 | } |
1533 | ||
1534 | if ((*dentryp)->d_name.name[0] == '.') { | |
1535 | if (sai->sai_ls_all || | |
1536 | sai->sai_miss_hidden >= sai->sai_skip_hidden) { | |
1537 | /* | |
1538 | * Hidden dentry is the first one, or statahead | |
1539 | * thread does not skip so many hidden dentries | |
1540 | * before "sai_ls_all" enabled as below. | |
1541 | */ | |
1542 | } else { | |
1543 | if (!sai->sai_ls_all) | |
1544 | /* | |
1545 | * It maybe because hidden dentry is not | |
1546 | * the first one, "sai_ls_all" was not | |
1547 | * set, then "ls -al" missed. Enable | |
1548 | * "sai_ls_all" for such case. | |
1549 | */ | |
1550 | sai->sai_ls_all = 1; | |
1551 | ||
1552 | /* | |
1553 | * Such "getattr" has been skipped before | |
1554 | * "sai_ls_all" enabled as above. | |
1555 | */ | |
1556 | sai->sai_miss_hidden++; | |
0a3bdb00 | 1557 | return -EAGAIN; |
d7e09d03 PT |
1558 | } |
1559 | } | |
1560 | ||
1561 | entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); | |
6e16818b | 1562 | if (!entry || only_unplug) { |
d7e09d03 | 1563 | ll_sai_unplug(sai, entry); |
0a3bdb00 | 1564 | return entry ? 1 : -EAGAIN; |
d7e09d03 PT |
1565 | } |
1566 | ||
2afad7fc | 1567 | /* if statahead is busy in readdir, help it do post-work */ |
1568 | while (!ll_sa_entry_stated(entry) && sai->sai_in_readpage && | |
1569 | !sa_received_empty(sai)) | |
1570 | ll_post_statahead(sai); | |
1571 | ||
d7e09d03 PT |
1572 | if (!ll_sa_entry_stated(entry)) { |
1573 | sai->sai_index_wait = entry->se_index; | |
1574 | lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, | |
1575 | LWI_ON_SIGNAL_NOOP, NULL); | |
1576 | rc = l_wait_event(sai->sai_waitq, | |
1577 | ll_sa_entry_stated(entry) || | |
1578 | thread_is_stopped(thread), | |
1579 | &lwi); | |
1580 | if (rc < 0) { | |
1581 | ll_sai_unplug(sai, entry); | |
0a3bdb00 | 1582 | return -EAGAIN; |
d7e09d03 PT |
1583 | } |
1584 | } | |
1585 | ||
6e16818b | 1586 | if (entry->se_stat == SA_ENTRY_SUCC && entry->se_inode) { |
d7e09d03 PT |
1587 | struct inode *inode = entry->se_inode; |
1588 | struct lookup_intent it = { .it_op = IT_GETATTR, | |
e476f2e5 | 1589 | .it_lock_handle = |
d7e09d03 PT |
1590 | entry->se_handle }; |
1591 | __u64 bits; | |
1592 | ||
1593 | rc = md_revalidate_lock(ll_i2mdexp(dir), &it, | |
1594 | ll_inode2fid(inode), &bits); | |
1595 | if (rc == 1) { | |
6e16818b | 1596 | if (!d_inode(*dentryp)) { |
7486bc06 SP |
1597 | struct dentry *alias; |
1598 | ||
1599 | alias = ll_splice_alias(inode, | |
e15ba45d | 1600 | *dentryp); |
7486bc06 | 1601 | if (IS_ERR(alias)) { |
3ea8f3bc | 1602 | ll_sai_unplug(sai, entry); |
7486bc06 | 1603 | return PTR_ERR(alias); |
3ea8f3bc | 1604 | } |
7486bc06 | 1605 | *dentryp = alias; |
2b0143b5 | 1606 | } else if (d_inode(*dentryp) != inode) { |
d7e09d03 | 1607 | /* revalidate, but inode is recreated */ |
97a075cd JN |
1608 | CDEBUG(D_READA, "%s: stale dentry %pd inode "DFID", statahead inode "DFID"\n", |
1609 | ll_get_fsname(d_inode(*dentryp)->i_sb, NULL, 0), | |
1610 | *dentryp, | |
1611 | PFID(ll_inode2fid(d_inode(*dentryp))), | |
1612 | PFID(ll_inode2fid(inode))); | |
79496845 | 1613 | ll_intent_release(&it); |
d7e09d03 | 1614 | ll_sai_unplug(sai, entry); |
0a3bdb00 | 1615 | return -ESTALE; |
d7e09d03 PT |
1616 | } else { |
1617 | iput(inode); | |
1618 | } | |
1619 | entry->se_inode = NULL; | |
1620 | ||
1621 | if ((bits & MDS_INODELOCK_LOOKUP) && | |
1622 | d_lustre_invalid(*dentryp)) | |
1623 | d_lustre_revalidate(*dentryp); | |
1624 | ll_intent_release(&it); | |
1625 | } | |
1626 | } | |
1627 | ||
1628 | ll_sai_unplug(sai, entry); | |
0a3bdb00 | 1629 | return rc; |
d7e09d03 PT |
1630 | } |
1631 | ||
1632 | /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ | |
1633 | rc = is_first_dirent(dir, *dentryp); | |
34e1f2bb | 1634 | if (rc == LS_NONE_FIRST_DE) { |
d7e09d03 | 1635 | /* It is not "ls -{a}l" operation, no need statahead for it. */ |
34e1f2bb JL |
1636 | rc = -EAGAIN; |
1637 | goto out; | |
1638 | } | |
d7e09d03 PT |
1639 | |
1640 | sai = ll_sai_alloc(); | |
6e16818b | 1641 | if (!sai) { |
34e1f2bb JL |
1642 | rc = -ENOMEM; |
1643 | goto out; | |
1644 | } | |
d7e09d03 PT |
1645 | |
1646 | sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); | |
1647 | sai->sai_inode = igrab(dir); | |
6e16818b | 1648 | if (unlikely(!sai->sai_inode)) { |
d7e09d03 PT |
1649 | CWARN("Do not start stat ahead on dying inode "DFID"\n", |
1650 | PFID(&lli->lli_fid)); | |
34e1f2bb JL |
1651 | rc = -ESTALE; |
1652 | goto out; | |
d7e09d03 PT |
1653 | } |
1654 | ||
1655 | /* get parent reference count here, and put it in ll_statahead_thread */ | |
1656 | parent = dget((*dentryp)->d_parent); | |
2b0143b5 DH |
1657 | if (unlikely(sai->sai_inode != d_inode(parent))) { |
1658 | struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); | |
d7e09d03 | 1659 | |
dab363f9 | 1660 | CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", |
09561a53 | 1661 | *dentryp, |
d7e09d03 PT |
1662 | PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); |
1663 | dput(parent); | |
1664 | iput(sai->sai_inode); | |
34e1f2bb JL |
1665 | rc = -EAGAIN; |
1666 | goto out; | |
d7e09d03 PT |
1667 | } |
1668 | ||
09561a53 AV |
1669 | CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n", |
1670 | sai, parent); | |
d7e09d03 | 1671 | |
717d1c2e CM |
1672 | /* The sai buffer already has one reference taken at allocation time, |
1673 | * but as soon as we expose the sai by attaching it to the lli that | |
1674 | * default reference can be dropped by another thread calling | |
1675 | * ll_stop_statahead. We need to take a local reference to protect | |
c0894c6c OD |
1676 | * the sai buffer while we intend to access it. |
1677 | */ | |
717d1c2e | 1678 | ll_sai_get(sai); |
d7e09d03 PT |
1679 | lli->lli_sai = sai; |
1680 | ||
2b0143b5 | 1681 | plli = ll_i2info(d_inode(parent)); |
060c2820 JH |
1682 | task = kthread_run(ll_statahead_thread, parent, "ll_sa_%u", |
1683 | plli->lli_opendir_pid); | |
d7e09d03 | 1684 | thread = &sai->sai_thread; |
060c2820 JH |
1685 | if (IS_ERR(task)) { |
1686 | rc = PTR_ERR(task); | |
d7e09d03 PT |
1687 | CERROR("can't start ll_sa thread, rc: %d\n", rc); |
1688 | dput(parent); | |
1689 | lli->lli_opendir_key = NULL; | |
1690 | thread_set_flags(thread, SVC_STOPPED); | |
1691 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
717d1c2e | 1692 | /* Drop both our own local reference and the default |
c0894c6c OD |
1693 | * reference from allocation time. |
1694 | */ | |
717d1c2e | 1695 | ll_sai_put(sai); |
d7e09d03 | 1696 | ll_sai_put(sai); |
6e16818b | 1697 | LASSERT(!lli->lli_sai); |
0a3bdb00 | 1698 | return -EAGAIN; |
d7e09d03 PT |
1699 | } |
1700 | ||
1701 | l_wait_event(thread->t_ctl_waitq, | |
1702 | thread_is_running(thread) || thread_is_stopped(thread), | |
1703 | &lwi); | |
717d1c2e | 1704 | ll_sai_put(sai); |
d7e09d03 PT |
1705 | |
1706 | /* | |
1707 | * We don't stat-ahead for the first dirent since we are already in | |
1708 | * lookup. | |
1709 | */ | |
0a3bdb00 | 1710 | return -EAGAIN; |
d7e09d03 PT |
1711 | |
1712 | out: | |
37b5022d | 1713 | kfree(sai); |
d7e09d03 PT |
1714 | spin_lock(&lli->lli_sa_lock); |
1715 | lli->lli_opendir_key = NULL; | |
1716 | lli->lli_opendir_pid = 0; | |
1717 | spin_unlock(&lli->lli_sa_lock); | |
1718 | return rc; | |
1719 | } |