/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  For those familiar with key-value stores,
 * the tmem handle is a three-level hierarchical key, and the value is always
 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
 * referenced in the datastore by the pampd.  The hierarchy is required
 * to ensure that certain invalidation functions can be performed efficiently
 * (i.e. flush all indexes associated with this object_id, or
 * flush all objects associated with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second concurrently across many CPUs (and, if used with KVM, across many
 * vcpus across many guests).  Tmem is tracked with a hierarchy of data
 * structures, organized by the elements in the handle-tuple: pool_id,
 * object_id, and page index.  One or more "clients" (e.g. guests) each
 * provide one or more tmem_pools.  Each pool contains a hash table of
 * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
 * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
 * pointer in this tree points to a pampd, which is accessible only through
 * a small set of callbacks registered by the PAM implementation (see
 * tmem_register_pamops).  Tmem only needs memory allocation for objs
 * and objnodes, and this is done via a set of callbacks that must be
 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
 */
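
/*
 * Illustrative sketch (not part of the driver): conceptually, tmem behaves
 * like a key-value store keyed by the (pool_id, object_id, index) triple.
 * A hypothetical caller, with a pool already created and a pampd built by
 * its own PAM layer (my_pam_create below is made up), might do:
 *
 *	struct tmem_oid oid = { .oid = { 12, 34, 56 } };
 *	void *pampd = my_pam_create(page);
 *	tmem_put(pool, &oid, 0, false, pampd);
 *	...
 *	tmem_get(pool, &oid, 0, data, &size, false, 0);
 *
 * The exact field layout of struct tmem_oid is defined in tmem.h; the
 * initializer above is only an assumption for illustration.
 */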

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#ifdef CONFIG_RAMSTER
#include <linux/delay.h>
#endif

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
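
/*
 * Registration sketch (illustrative; the my_* names are hypothetical): a
 * host such as zcache fills in both ops tables at init time.  The callback
 * slots shown here are the ones this file actually invokes; the full
 * structures are declared in tmem.h.
 *
 *	static struct tmem_hostops my_hostops = {
 *		.obj_alloc = my_obj_alloc,
 *		.obj_free = my_obj_free,
 *		.objnode_alloc = my_objnode_alloc,
 *		.objnode_free = my_objnode_free,
 *	};
 *	tmem_register_hostops(&my_hostops);
 *	tmem_register_pamops(&my_pamops);
 */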

/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access, and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; the vast majority of the time, a recursive call may work,
 * but a deadlock will occur a small fraction of the time due to the
 * hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */
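
/*
 * Caller pattern sketch (illustrative): every externally visible operation
 * below hashes the oid to pick a bucket, takes that bucket's lock, and only
 * then calls these helpers, e.g.:
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	...
 *	spin_unlock(&hb->lock);
 */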

/* Search for object==oid in pool, returns object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp,
					struct rb_node **parent,
					struct rb_node ***link)
{
	struct rb_node *_parent = NULL, **rbnode;
	struct tmem_obj *obj = NULL;

	rbnode = &hb->obj_rb_root.rb_node;
	while (*rbnode) {
		BUG_ON(RB_EMPTY_NODE(*rbnode));
		_parent = *rbnode;
		obj = rb_entry(*rbnode, struct tmem_obj,
			       rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = &(*rbnode)->rb_left;
			break;
		case 1:
			rbnode = &(*rbnode)->rb_right;
			break;
		}
	}

	if (parent)
		*parent = _parent;
	if (link)
		*link = rbnode;
	obj = NULL;
out:
	return obj;
}

static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
				      struct tmem_oid *oidp)
{
	return __tmem_obj_find(hb, oidp, NULL, NULL);
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);

/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj, false);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
			  struct tmem_pool *pool,
			  struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = NULL, *parent = NULL;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.new_obj != NULL)
		(*tmem_pamops.new_obj)(obj);
#endif
	SET_SENTINEL(obj, OBJ);

	if (__tmem_obj_find(hb, oidp, &parent, &new))
		BUG();

	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj, true);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
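
/*
 * Worked example (assuming OBJNODE_TREE_MAP_SHIFT == 6, i.e. 64 slots per
 * objnode, per tmem.h): h2max[0] = 0, h2max[1] = 2^6 - 1 = 63,
 * h2max[2] = 2^12 - 1 = 4095, and so on, so a tree of height h can
 * address indexes 0 .. 2^(6h) - 1.
 */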

static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}
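
/*
 * Descent sketch (illustrative, again assuming 6-bit map levels): for a
 * tree of height 2 and index 0x9c (binary 10 011100), the walk uses bits
 * 11..6 of the index (here 0x2) to pick a slot in the root objnode, then
 * bits 5..0 (here 0x1c) to pick the leaf slot holding the pampd.
 */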

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}

#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
				       void *new_pampd, bool no_free)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;

		*(void **)slot = new_pampd;
		if (!no_free)
			(*tmem_pamops.free)(old_pampd, obj->pool,
					    NULL, 0, false);
		ret = new_pampd;
	}
	return ret;
}
#endif

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
				 void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		while (index > tmem_objnode_tree_h2max[height])
			height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}
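
/*
 * Growth sketch (illustrative, 6-bit levels assumed): inserting index 100
 * into an object whose tree currently has height 1 (max index 63) first
 * pushes the old root down one level under a new root (height becomes 2,
 * max index 4095), then descends as usual: slot 1 of the new root
 * (100 >> 6), then leaf slot 36 (100 & 63) receives the pampd.
 */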

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				    obj->objnode_tree_root->slots_in_use == 1 &&
				    obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}
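
/*
 * Deletion walks the path recorded in path[], clearing the leaf slot and
 * then freeing any ancestor objnode whose slots_in_use drops to zero; if
 * only slot 0 of the root remains in use, the tree is also shrunk one
 * level at a time so its height matches the highest remaining index.
 */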

/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
				      struct tmem_objnode *objnode,
				      unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0, true);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
					  bool pool_destroy)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root,
				    obj->pool, NULL, 0, true);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					  obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.free_obj != NULL)
		(*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */

/*
 * "Put" a page, e.g. associate the passed pampd with the passed handle.
 * Tmem_put is complicated by a corner case: What if a page with matching
 * handle already exists in tmem?  To guarantee coherency, one of two
 * actions is necessary: Either the data for the page must be overwritten,
 * or the page must be "flushed" so that the data is not accessible to a
 * subsequent "get".  Since these "duplicate puts" are relatively rare,
 * this implementation always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		bool raw, void *pampd_to_use)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = pampd_to_use;
	BUG_ON(pampd_to_use == NULL);
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	(*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
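
/*
 * Note on the dup-put path above: if the duplicate was the object's only
 * pampd, the now-empty object is reclassified as newly created (objnew)
 * so that the delete_and_free error path can free it, preserving the
 * invariant that no empty tmem_obj outlives a failed put.
 */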

#ifdef CONFIG_RAMSTER
/*
 * For ramster only: The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd.  Here, we lookup the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine, which handles the replacement
 * and unlocks the hashbucket.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj = NULL;
	void *pampd = NULL;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	BUG_ON(!spin_is_locked(&hb->lock));
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
		(*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}
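
/*
 * Two-step usage sketch (illustrative; my_build_local_pampd is a
 * hypothetical caller-side helper):
 *
 *	void *saved_hb;
 *	struct tmem_obj *obj;
 *	void *old = tmem_localify_get_pampd(pool, &oid, idx, &obj, &saved_hb);
 *	...hashbucket is still locked here...
 *	void *new = my_build_local_pampd(old);
 *	tmem_localify_finish(obj, idx, new, saved_hb, false);
 */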

/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
			   struct tmem_pool *pool, struct tmem_oid *oidp,
			   uint32_t index, bool free, char *data)
{
	void *old_pampd = *ppampd, *new_pampd = NULL;
	bool intransit = false;
	int ret = 0;

	if (!is_ephemeral(pool))
		new_pampd = (*tmem_pamops.repatriate_preload)(
				old_pampd, pool, oidp, index, &intransit);
	if (intransit)
		ret = -EAGAIN;
	else if (new_pampd != NULL)
		*ppampd = new_pampd;
	/* must release the hb->lock else repatriate can't sleep */
	spin_unlock(&hb->lock);
	if (!intransit)
		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
						oidp, index, free, data);
	if (ret == -EAGAIN) {
		/* rare I think, but should cond_resched()??? */
		usleep_range(10, 1000);
	} else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
		ret = -1;
	} else if (ret != 0 && ret != -ENOENT) {
		ret = -1;
	}
	/* note hb->lock has now been unlocked */
	return ret;
}

/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
		 uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
	/* if we bug here, pamops wasn't properly set up for ramster */
	BUG_ON(tmem_pamops.replace_in_obj == NULL);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}
#endif

/*
 * "Get" a page, e.g. if a pampd can be found matching the passed handle,
 * use a pamops callback to recreate the page from the pampd with the
 * matching handle.  By tmem definition, when a "get" is successful on
 * an ephemeral page, the page is "flushed", and when a "get" is successful
 * on a persistent page, the page is retained in tmem.  Note that to preserve
 * coherency, "get" can never be skipped if tmem contains the data.
 * That is, if a get is done with a certain handle and fails, any
 * subsequent "get" must also fail (unless of course there is a
 * "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *sizep, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd = NULL;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;
	void **ppampd;

	do {
		hb = &pool->hashbucket[tmem_oid_hash(oidp)];
		spin_lock(&hb->lock);
		lock_held = true;
		obj = tmem_obj_find(hb, oidp);
		if (obj == NULL)
			goto out;
		ppampd = __tmem_pampd_lookup_in_obj(obj, index);
		if (ppampd == NULL)
			goto out;
#ifdef CONFIG_RAMSTER
		if ((tmem_pamops.is_remote != NULL) &&
		    tmem_pamops.is_remote(*ppampd)) {
			ret = tmem_repatriate(ppampd, hb, pool, oidp,
					      index, free, data);
			/* tmem_repatriate releases hb->lock */
			lock_held = false;
			*sizep = PAGE_SIZE;
			if (ret != -EAGAIN)
				goto out;
		}
#endif
	} while (ret == -EAGAIN);
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, sizep, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, sizep, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}
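
/*
 * get_and_free semantics (derived from the computation of "free" above):
 *	 1 - always remove the pampd from tmem after the get
 *	 0 - remove only if the pool is ephemeral (the default tmem rule)
 *	-1 - never remove, even for ephemeral pools
 */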

/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
		    struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj, false);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, true);
	ret = 0;
out:
	return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a new tmem_pool with the provided flags; the pool id itself
 * is assigned by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
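
/*
 * Pool creation sketch (illustrative; in a real host the pool memory and
 * the pool id both come from the host implementation):
 *
 *	struct tmem_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
 *	if (pool != NULL)
 *		tmem_new_pool(pool, TMEM_POOL_PERSIST);
 *
 * Passing flags == 0 yields an ephemeral, non-shared pool.
 */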