Linux Kernel  3.7.1
tmem.c
1 /*
2  * In-kernel transcendent memory (generic implementation)
3  *
4  * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
5  *
6  * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
7  * "handles" (triples containing a pool id, an object id, and an index) to
8  * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
9  * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
10  * set of functions (pamops). Each pampd contains some representation of
11  * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
12  * the tmem handle is a three-level hierarchical key, and the value is always
13  * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
14  * referenced in the datastore by the pampd. The hierarchy is required
15  * to ensure that certain invalidation functions can be performed efficiently
16  * (i.e. flush all indexes associated with this object_id, or
17  * flush all objects associated with this pool).
18  *
19  * Tmem must support potentially millions of pages and must be able to insert,
20  * find, and delete these pages at a potential frequency of thousands per
21  * second concurrently across many CPUs, (and, if used with KVM, across many
22  * vcpus across many guests). Tmem is tracked with a hierarchy of data
23  * structures, organized by the elements in the handle-tuple: pool_id,
24  * object_id, and page index. One or more "clients" (e.g. guests) each
25  * provide one or more tmem_pools. Each pool contains a hash table of
26  * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree
27  * of pointers, with intermediate nodes called tmem_objnodes. Each leaf
28  * pointer in this tree points to a pampd, which is accessible only through
29  * a small set of callbacks registered by the PAM implementation (see
30  * tmem_register_pamops). Tmem only needs to do memory allocation for objs
31  * and objnodes and this is done via a set of callbacks that must be
32  * registered by the tmem host implementation (e.g. see tmem_register_hostops).
33  */
34 
35 #include <linux/list.h>
36 #include <linux/spinlock.h>
37 #include <linux/atomic.h>
38 #ifdef CONFIG_RAMSTER
39 #include <linux/delay.h>
40 #endif
41 
42 #include "tmem.h"
43 
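/*
 * Illustrative sketch, not part of the original file: forming the
 * three-level handle described in the header comment above. The oid
 * layout (three 64-bit words) is assumed from the struct tmem_oid
 * declaration in tmem.h; the helper name and the inode-based convention
 * shown here are hypothetical.
 */
static inline void tmem_example_fill_oid(struct tmem_oid *oidp, uint64_t ino)
{
	oidp->oid[0] = ino;	/* e.g. an inode number identifies the object */
	oidp->oid[1] = 0;
	oidp->oid[2] = 0;
	/*
	 * The third handle component, the page index, is passed separately
	 * to tmem_put()/tmem_get() as a uint32_t.
	 */
}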
44 /* data structure sentinels used for debugging... see tmem.h */
45 #define POOL_SENTINEL 0x87658765
46 #define OBJ_SENTINEL 0x12345678
47 #define OBJNODE_SENTINEL 0xfedcba09
48 
49 /*
50  * A tmem host implementation must use this function to register callbacks
51  * for memory allocation.
52  */
53 static struct tmem_hostops tmem_hostops;
54 
55 static void tmem_objnode_tree_init(void);
56 
57 void tmem_register_hostops(struct tmem_hostops *m)
58 {
59  tmem_objnode_tree_init();
60  tmem_hostops = *m;
61 }
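/*
 * Illustrative sketch, not part of the original file: how a host (e.g.
 * zcache) might register its allocators. The four callback names are the
 * ones this file invokes through tmem_hostops; the example_* functions are
 * hypothetical, and their prototypes are paraphrased from the call sites
 * below (the authoritative declarations live in tmem.h).
 */
extern struct tmem_obj *example_obj_alloc(struct tmem_pool *pool);
extern void example_obj_free(struct tmem_obj *obj, struct tmem_pool *pool);
extern struct tmem_objnode *example_objnode_alloc(struct tmem_pool *pool);
extern void example_objnode_free(struct tmem_objnode *objnode,
				 struct tmem_pool *pool);

static inline void example_register_host(void)
{
	static struct tmem_hostops example_hostops = {
		.obj_alloc = example_obj_alloc,
		.obj_free = example_obj_free,
		.objnode_alloc = example_objnode_alloc,
		.objnode_free = example_objnode_free,
	};

	tmem_register_hostops(&example_hostops);
}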
62 
63 /*
64  * A tmem host implementation must use this function to register
65  * callbacks for a page-accessible memory (PAM) implementation.
66  */
67 static struct tmem_pamops tmem_pamops;
68 
69 void tmem_register_pamops(struct tmem_pamops *m)
70 {
71  tmem_pamops = *m;
72 }
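/*
 * Illustrative sketch, not part of the original file, and deliberately
 * partial: registering a PAM implementation. Only callbacks that this file
 * invokes are shown, their prototypes are paraphrased from the call sites
 * below, and the full set of fields is declared in tmem.h. The
 * example_pam_* functions are hypothetical.
 */
extern void example_pam_free(void *pampd, struct tmem_pool *pool,
			     struct tmem_oid *oidp, uint32_t index,
			     bool acct);
extern int example_pam_get_data(char *data, size_t *sizep, bool raw,
				void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index);
extern void example_pam_create_finish(void *pampd, bool ephemeral);

static inline void example_register_pam(void)
{
	static struct tmem_pamops example_pamops = {
		.free = example_pam_free,
		.get_data = example_pam_get_data,
		.create_finish = example_pam_create_finish,
		/* .get_data_and_free and the ramster-only fields omitted */
	};

	tmem_register_pamops(&example_pamops);
}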
73 
74 /*
75  * Oids are potentially very sparse and tmem_objs may have an indeterminately
76  * short life, being added and deleted at a relatively high frequency.
77  * So an rb_tree is an ideal data structure to manage tmem_objs. But because
78  * of the potentially huge number of tmem_objs, each pool manages a hashtable
79  * of rb_trees to reduce search, insert, delete, and rebalancing time.
80  * Each hashbucket also has a lock to manage concurrent access and no
81  * searches, inserts, or deletions can be performed unless the lock is held.
82  * As a result, care must be taken to ensure tmem routines are not called
83  * recursively; the vast majority of the time, a recursive call may work
84  * but a deadlock will occur a small fraction of the time due to the
85  * hashbucket lock.
86  *
87  * The following routines manage tmem_objs. In all of these routines,
88  * the hashbucket lock is already held.
89  */
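/*
 * Illustrative sketch, not part of the original file: the hashbucket
 * locking pattern described above, as followed by every tmem_* entry point
 * below. TMEM_HASH_BUCKETS, tmem_oid_hash() and struct tmem_hashbucket come
 * from tmem.h; the helper name is hypothetical.
 */
static inline void example_locked_bucket_access(struct tmem_pool *pool,
						struct tmem_oid *oidp)
{
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	/*
	 * Searches, inserts and deletes on hb->obj_rb_root happen only
	 * while this lock is held, and no tmem entry point may be called
	 * recursively here or a deadlock on the bucket lock can result.
	 */
	spin_unlock(&hb->lock);
}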
90 
91 /* Search for object==oid in pool, returns object if found. */
92 static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
93  struct tmem_oid *oidp,
94  struct rb_node **parent,
95  struct rb_node ***link)
96 {
97  struct rb_node *_parent = NULL, **rbnode;
98  struct tmem_obj *obj = NULL;
99 
100  rbnode = &hb->obj_rb_root.rb_node;
101  while (*rbnode) {
102  BUG_ON(RB_EMPTY_NODE(*rbnode));
103  _parent = *rbnode;
104  obj = rb_entry(*rbnode, struct tmem_obj,
105  rb_tree_node);
106  switch (tmem_oid_compare(oidp, &obj->oid)) {
107  case 0: /* equal */
108  goto out;
109  case -1:
110  rbnode = &(*rbnode)->rb_left;
111  break;
112  case 1:
113  rbnode = &(*rbnode)->rb_right;
114  break;
115  }
116  }
117 
118  if (parent)
119  *parent = _parent;
120  if (link)
121  *link = rbnode;
122  obj = NULL;
123 out:
124  return obj;
125 }
126 
127 static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
128  struct tmem_oid *oidp)
129 {
130  return __tmem_obj_find(hb, oidp, NULL, NULL);
131 }
132 
133 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
134 
135 /* Free an object that has no more pampds in it. */
136 static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
137 {
138  struct tmem_pool *pool;
139 
140  BUG_ON(obj == NULL);
141  ASSERT_SENTINEL(obj, OBJ);
142  BUG_ON(obj->pampd_count > 0);
143  pool = obj->pool;
144  BUG_ON(pool == NULL);
145  if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
146  tmem_pampd_destroy_all_in_obj(obj, false);
147  BUG_ON(obj->objnode_tree_root != NULL);
148  BUG_ON((long)obj->objnode_count != 0);
149  atomic_dec(&pool->obj_count);
150  BUG_ON(atomic_read(&pool->obj_count) < 0);
151  INVERT_SENTINEL(obj, OBJ);
152  obj->pool = NULL;
153  tmem_oid_set_invalid(&obj->oid);
154  rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
155 }
156 
157 /*
158  * Initialize and insert a tmem_object_root (called only if find failed).
159  */
160 static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
161  struct tmem_pool *pool,
162  struct tmem_oid *oidp)
163 {
164  struct rb_root *root = &hb->obj_rb_root;
165  struct rb_node **new = NULL, *parent = NULL;
166 
167  BUG_ON(pool == NULL);
168  atomic_inc(&pool->obj_count);
169  obj->objnode_tree_height = 0;
170  obj->objnode_tree_root = NULL;
171  obj->pool = pool;
172  obj->oid = *oidp;
173  obj->objnode_count = 0;
174  obj->pampd_count = 0;
175 #ifdef CONFIG_RAMSTER
176  if (tmem_pamops.new_obj != NULL)
177  (*tmem_pamops.new_obj)(obj);
178 #endif
179  SET_SENTINEL(obj, OBJ);
180 
181  if (__tmem_obj_find(hb, oidp, &parent, &new))
182  BUG();
183 
184  rb_link_node(&obj->rb_tree_node, parent, new);
185  rb_insert_color(&obj->rb_tree_node, root);
186 }
187 
188 /*
189  * Tmem is managed as a set of tmem_pools with certain attributes, such as
190  * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
191  * and all pampds that belong to a tmem_pool. A tmem_pool is created
192  * or deleted relatively rarely (for example, when a filesystem is
193  * mounted or unmounted).
194  */
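/*
 * Illustrative sketch, not part of the original file: the rare pool
 * create/destroy path described above. The host allocates and owns the
 * struct tmem_pool; tmem_new_pool() and tmem_destroy_pool() are defined
 * later in this file (prototypes in tmem.h), and TMEM_POOL_PERSIST is the
 * flag tmem_new_pool() tests. The helper name is hypothetical.
 */
static inline void example_pool_lifecycle(struct tmem_pool *pool)
{
	tmem_new_pool(pool, TMEM_POOL_PERSIST);	/* e.g. at filesystem mount */
	/* ... puts, gets and flushes against this pool ... */
	tmem_destroy_pool(pool);		/* e.g. at unmount */
}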
195 
196 /* flush all data from a pool and, optionally, free it */
197 static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
198 {
199  struct rb_node *rbnode;
200  struct tmem_obj *obj;
201  struct tmem_hashbucket *hb = &pool->hashbucket[0];
202  int i;
203 
204  BUG_ON(pool == NULL);
205  for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
206  spin_lock(&hb->lock);
207  rbnode = rb_first(&hb->obj_rb_root);
208  while (rbnode != NULL) {
209  obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
210  rbnode = rb_next(rbnode);
211  tmem_pampd_destroy_all_in_obj(obj, true);
212  tmem_obj_free(obj, hb);
213  (*tmem_hostops.obj_free)(obj, pool);
214  }
215  spin_unlock(&hb->lock);
216  }
217  if (destroy)
218  list_del(&pool->pool_list);
219 }
220 
221 /*
222  * A tmem_obj contains a radix-tree-like tree in which the intermediate
223  * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
224  * is very specialized and tuned for specific uses and is not particularly
225  * suited for use from this code, though some code from the core algorithms has
226  * been reused, thus the copyright notices below). Each tmem_objnode contains
227  * a set of pointers which point to either a set of intermediate tmem_objnodes
228  * or a set of pampds.
229  *
230  * Portions Copyright (C) 2001 Momchil Velikov
231  * Portions Copyright (C) 2001 Christoph Hellwig
232  * Portions Copyright (C) 2005 SGI, Christoph Lameter <[email protected]>
233  */
234 
235 struct tmem_objnode_tree_path {
236  struct tmem_objnode *objnode;
237  int offset;
238 };
239 
240 /* objnode height_to_maxindex translation */
241 static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
242 
243 static void tmem_objnode_tree_init(void)
244 {
245  unsigned int ht, tmp;
246 
247  for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
248  tmp = ht * OBJNODE_TREE_MAP_SHIFT;
249  if (tmp >= OBJNODE_TREE_INDEX_BITS)
250  tmem_objnode_tree_h2max[ht] = ~0UL;
251  else
252  tmem_objnode_tree_h2max[ht] =
253  (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
254  }
255 }
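/*
 * Illustrative sketch, not part of the original file: how an index is
 * decomposed into per-level slot offsets when walking the objnode tree,
 * mirroring the arithmetic in the lookup/add/delete routines below.
 * OBJNODE_TREE_MAP_SHIFT and OBJNODE_TREE_MAP_MASK come from tmem.h; the
 * helper name is hypothetical.
 */
static inline unsigned int example_slot_offset(uint32_t index,
					       unsigned int height)
{
	unsigned int shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;

	/* at each level, one "digit" of the index selects a slot */
	return (index >> shift) & OBJNODE_TREE_MAP_MASK;
}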
256 
257 static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
258 {
259  struct tmem_objnode *objnode;
260 
261  ASSERT_SENTINEL(obj, OBJ);
262  BUG_ON(obj->pool == NULL);
263  ASSERT_SENTINEL(obj->pool, POOL);
264  objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
265  if (unlikely(objnode == NULL))
266  goto out;
267  objnode->obj = obj;
268  SET_SENTINEL(objnode, OBJNODE);
269  memset(&objnode->slots, 0, sizeof(objnode->slots));
270  objnode->slots_in_use = 0;
271  obj->objnode_count++;
272 out:
273  return objnode;
274 }
275 
276 static void tmem_objnode_free(struct tmem_objnode *objnode)
277 {
278  struct tmem_pool *pool;
279  int i;
280 
281  BUG_ON(objnode == NULL);
282  for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
283  BUG_ON(objnode->slots[i] != NULL);
284  ASSERT_SENTINEL(objnode, OBJNODE);
285  INVERT_SENTINEL(objnode, OBJNODE);
286  BUG_ON(objnode->obj == NULL);
287  ASSERT_SENTINEL(objnode->obj, OBJ);
288  pool = objnode->obj->pool;
289  BUG_ON(pool == NULL);
290  ASSERT_SENTINEL(pool, POOL);
291  objnode->obj->objnode_count--;
292  objnode->obj = NULL;
293  (*tmem_hostops.objnode_free)(objnode, pool);
294 }
295 
296 /*
297  * Lookup index in object and return associated pampd (or NULL if not found).
298  */
299 static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
300 {
301  unsigned int height, shift;
302  struct tmem_objnode **slot = NULL;
303 
304  BUG_ON(obj == NULL);
305  ASSERT_SENTINEL(obj, OBJ);
306  BUG_ON(obj->pool == NULL);
307  ASSERT_SENTINEL(obj->pool, POOL);
308 
309  height = obj->objnode_tree_height;
310  if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
311  goto out;
312  if (height == 0 && obj->objnode_tree_root) {
313  slot = &obj->objnode_tree_root;
314  goto out;
315  }
316  shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
317  slot = &obj->objnode_tree_root;
318  while (height > 0) {
319  if (*slot == NULL)
320  goto out;
321  slot = (struct tmem_objnode **)
322  ((*slot)->slots +
323  ((index >> shift) & OBJNODE_TREE_MAP_MASK));
324  shift -= OBJNODE_TREE_MAP_SHIFT;
325  height--;
326  }
327 out:
328  return slot != NULL ? (void **)slot : NULL;
329 }
330 
331 static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
332 {
333  struct tmem_objnode **slot;
334 
335  slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
336  return slot != NULL ? *slot : NULL;
337 }
338 
339 #ifdef CONFIG_RAMSTER
340 static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
341  void *new_pampd, bool no_free)
342 {
343  struct tmem_objnode **slot;
344  void *ret = NULL;
345 
346  slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
347  if ((slot != NULL) && (*slot != NULL)) {
348  void *old_pampd = *(void **)slot;
349  *(void **)slot = new_pampd;
350  if (!no_free)
351  (*tmem_pamops.free)(old_pampd, obj->pool,
352  NULL, 0, false);
353  ret = new_pampd;
354  }
355  return ret;
356 }
357 #endif
358 
359 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
360  void *pampd)
361 {
362  int ret = 0;
363  struct tmem_objnode *objnode = NULL, *newnode, *slot;
364  unsigned int height, shift;
365  int offset = 0;
366 
367  /* if necessary, extend the tree to be higher */
368  if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
369  height = obj->objnode_tree_height + 1;
370  if (index > tmem_objnode_tree_h2max[height])
371  while (index > tmem_objnode_tree_h2max[height])
372  height++;
373  if (obj->objnode_tree_root == NULL) {
374  obj->objnode_tree_height = height;
375  goto insert;
376  }
377  do {
378  newnode = tmem_objnode_alloc(obj);
379  if (!newnode) {
380  ret = -ENOMEM;
381  goto out;
382  }
383  newnode->slots[0] = obj->objnode_tree_root;
384  newnode->slots_in_use = 1;
385  obj->objnode_tree_root = newnode;
386  obj->objnode_tree_height++;
387  } while (height > obj->objnode_tree_height);
388  }
389 insert:
390  slot = obj->objnode_tree_root;
391  height = obj->objnode_tree_height;
392  shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
393  while (height > 0) {
394  if (slot == NULL) {
395  /* add a child objnode. */
396  slot = tmem_objnode_alloc(obj);
397  if (!slot) {
398  ret = -ENOMEM;
399  goto out;
400  }
401  if (objnode) {
402 
403  objnode->slots[offset] = slot;
404  objnode->slots_in_use++;
405  } else
406  obj->objnode_tree_root = slot;
407  }
408  /* go down a level */
409  offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
410  objnode = slot;
411  slot = objnode->slots[offset];
412  shift -= OBJNODE_TREE_MAP_SHIFT;
413  height--;
414  }
415  BUG_ON(slot != NULL);
416  if (objnode) {
417  objnode->slots_in_use++;
418  objnode->slots[offset] = pampd;
419  } else
420  obj->objnode_tree_root = pampd;
421  obj->pampd_count++;
422 out:
423  return ret;
424 }
425 
426 static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
427 {
428  struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
429  struct tmem_objnode_tree_path *pathp = path;
430  struct tmem_objnode *slot = NULL;
431  unsigned int height, shift;
432  int offset;
433 
434  BUG_ON(obj == NULL);
435  ASSERT_SENTINEL(obj, OBJ);
436  BUG_ON(obj->pool == NULL);
437  ASSERT_SENTINEL(obj->pool, POOL);
438  height = obj->objnode_tree_height;
439  if (index > tmem_objnode_tree_h2max[height])
440  goto out;
441  slot = obj->objnode_tree_root;
442  if (height == 0 && obj->objnode_tree_root) {
443  obj->objnode_tree_root = NULL;
444  goto out;
445  }
446  shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
447  pathp->objnode = NULL;
448  do {
449  if (slot == NULL)
450  goto out;
451  pathp++;
452  offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
453  pathp->offset = offset;
454  pathp->objnode = slot;
455  slot = slot->slots[offset];
456  shift -= OBJNODE_TREE_MAP_SHIFT;
457  height--;
458  } while (height > 0);
459  if (slot == NULL)
460  goto out;
461  while (pathp->objnode) {
462  pathp->objnode->slots[pathp->offset] = NULL;
463  pathp->objnode->slots_in_use--;
464  if (pathp->objnode->slots_in_use) {
465  if (pathp->objnode == obj->objnode_tree_root) {
466  while (obj->objnode_tree_height > 0 &&
467  obj->objnode_tree_root->slots_in_use == 1 &&
468  obj->objnode_tree_root->slots[0]) {
469  struct tmem_objnode *to_free =
470  obj->objnode_tree_root;
471 
472  obj->objnode_tree_root =
473  to_free->slots[0];
474  obj->objnode_tree_height--;
475  to_free->slots[0] = NULL;
476  to_free->slots_in_use = 0;
477  tmem_objnode_free(to_free);
478  }
479  }
480  goto out;
481  }
482  tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
483  pathp--;
484  }
485  obj->objnode_tree_height = 0;
486  obj->objnode_tree_root = NULL;
487 
488 out:
489  if (slot != NULL)
490  obj->pampd_count--;
491  BUG_ON(obj->pampd_count < 0);
492  return slot;
493 }
494 
495 /* Recursively walk the objnode_tree destroying pampds and objnodes. */
496 static void tmem_objnode_node_destroy(struct tmem_obj *obj,
497  struct tmem_objnode *objnode,
498  unsigned int ht)
499 {
500  int i;
501 
502  if (ht == 0)
503  return;
504  for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
505  if (objnode->slots[i]) {
506  if (ht == 1) {
507  obj->pampd_count--;
508  (*tmem_pamops.free)(objnode->slots[i],
509  obj->pool, NULL, 0, true);
510  objnode->slots[i] = NULL;
511  continue;
512  }
513  tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
514  tmem_objnode_free(objnode->slots[i]);
515  objnode->slots[i] = NULL;
516  }
517  }
518 }
519 
520 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
521  bool pool_destroy)
522 {
523  if (obj->objnode_tree_root == NULL)
524  return;
525  if (obj->objnode_tree_height == 0) {
526  obj->pampd_count--;
527  (*tmem_pamops.free)(obj->objnode_tree_root,
528  obj->pool, NULL, 0, true);
529  } else {
530  tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
531  obj->objnode_tree_height);
532  tmem_objnode_free(obj->objnode_tree_root);
533  obj->objnode_tree_height = 0;
534  }
535  obj->objnode_tree_root = NULL;
536 #ifdef CONFIG_RAMSTER
537  if (tmem_pamops.free_obj != NULL)
538  (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
539 #endif
540 }
541 
542 /*
543  * Tmem is operated on by a set of well-defined actions:
544  * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
545  * (The tmem ABI allows for subpages and exchanges but these operations
546  * are not included in this implementation.)
547  *
548  * These "tmem core" operations are implemented in the following functions.
549  */
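/*
 * Illustrative sketch, not part of the original file: the core operations
 * listed above from a frontend's point of view, using the entry points
 * defined below (prototypes in tmem.h). The pampd and data buffer handling
 * is schematic; a real host builds the pampd through its pamops before the
 * put. The helper name is hypothetical.
 */
static inline int example_core_ops(struct tmem_pool *pool,
				   struct tmem_oid *oidp, uint32_t index,
				   void *pampd, char *data)
{
	size_t size;
	int ret;

	ret = tmem_put(pool, oidp, index, false, pampd);
	if (ret < 0)
		return ret;
	/* get_and_free == 0: free only if the pool is ephemeral */
	ret = tmem_get(pool, oidp, index, data, &size, false, 0);
	if (ret == 0)
		/* harmless if the get already flushed an ephemeral page */
		(void)tmem_flush_page(pool, oidp, index);
	return ret;
}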
550 
551 /*
552  * "Put" a page, e.g. associate the passed pampd with the passed handle.
553  * Tmem_put is complicated by a corner case: What if a page with matching
554  * handle already exists in tmem? To guarantee coherency, one of two
555  * actions is necessary: Either the data for the page must be overwritten,
556  * or the page must be "flushed" so that the data is not accessible to a
557  * subsequent "get". Since these "duplicate puts" are relatively rare,
558  * this implementation always flushes for simplicity.
559  */
560 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
561  bool raw, void *pampd_to_use)
562 {
563  struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
564  void *pampd = NULL, *pampd_del = NULL;
565  int ret = -ENOMEM;
566  struct tmem_hashbucket *hb;
567 
568  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
569  spin_lock(&hb->lock);
570  obj = objfound = tmem_obj_find(hb, oidp);
571  if (obj != NULL) {
572  pampd = tmem_pampd_lookup_in_obj(objfound, index);
573  if (pampd != NULL) {
574  /* if found, is a dup put, flush the old one */
575  pampd_del = tmem_pampd_delete_from_obj(obj, index);
576  BUG_ON(pampd_del != pampd);
577  (*tmem_pamops.free)(pampd, pool, oidp, index, true);
578  if (obj->pampd_count == 0) {
579  objnew = obj;
580  objfound = NULL;
581  }
582  pampd = NULL;
583  }
584  } else {
585  obj = objnew = (*tmem_hostops.obj_alloc)(pool);
586  if (unlikely(obj == NULL)) {
587  ret = -ENOMEM;
588  goto out;
589  }
590  tmem_obj_init(obj, hb, pool, oidp);
591  }
592  BUG_ON(obj == NULL);
593  BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
594  pampd = pampd_to_use;
595  BUG_ON(pampd_to_use == NULL);
596  ret = tmem_pampd_add_to_obj(obj, index, pampd);
597  if (unlikely(ret == -ENOMEM))
598  /* may have partially built objnode tree ("stump") */
599  goto delete_and_free;
600  (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
601  goto out;
602 
603 delete_and_free:
604  (void)tmem_pampd_delete_from_obj(obj, index);
605  if (pampd)
606  (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
607  if (objnew) {
608  tmem_obj_free(objnew, hb);
609  (*tmem_hostops.obj_free)(objnew, pool);
610  }
611 out:
612  spin_unlock(&hb->lock);
613  return ret;
614 }
615 
616 #ifdef CONFIG_RAMSTER
617 /*
618  * For ramster only: The following routines provide a two-step sequence
619  * to allow the caller to replace a pampd in the tmem data structures with
620  * another pampd. Here, we lookup the passed handle and, if found, return the
621  * associated pampd and object, leaving the hashbucket locked and returning
622  * a reference to it. The caller is expected to immediately call the
623  * matching tmem_localify_finish routine, which will handle the replacement
624  * and unlock the hashbucket.
625  */
626 void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
627  uint32_t index, struct tmem_obj **ret_obj,
628  void **saved_hb)
629 {
630  struct tmem_hashbucket *hb;
631  struct tmem_obj *obj = NULL;
632  void *pampd = NULL;
633 
634  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
635  spin_lock(&hb->lock);
636  obj = tmem_obj_find(hb, oidp);
637  if (likely(obj != NULL))
638  pampd = tmem_pampd_lookup_in_obj(obj, index);
639  *ret_obj = obj;
640  *saved_hb = (void *)hb;
641  /* note, hashbucket remains locked */
642  return pampd;
643 }
644 
645 void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
646  void *pampd, void *saved_hb, bool delete)
647 {
648  struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
649 
650  BUG_ON(!spin_is_locked(&hb->lock));
651  if (pampd != NULL) {
652  BUG_ON(obj == NULL);
653  (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
654  (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
655  } else if (delete) {
656  BUG_ON(obj == NULL);
657  (void)tmem_pampd_delete_from_obj(obj, index);
658  }
659  spin_unlock(&hb->lock);
660 }
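/*
 * Illustrative sketch, not part of the original file (ramster-only
 * context): the two-step localify sequence described above, as a
 * remote-completion handler might use it. The handler name and the source
 * of new_pampd are hypothetical.
 */
static inline void example_localify(struct tmem_pool *pool,
				    struct tmem_oid *oidp, uint32_t index,
				    void *new_pampd)
{
	struct tmem_obj *obj;
	void *saved_hb, *old_pampd;

	/* step 1: look up the handle; on return the hashbucket is locked */
	old_pampd = tmem_localify_get_pampd(pool, oidp, index,
					    &obj, &saved_hb);
	if (old_pampd != NULL) {
		/* ... fill new_pampd from the remotified data here ... */
		tmem_localify_finish(obj, index, new_pampd, saved_hb, false);
	} else {
		/* handle not found; passing NULL/false just drops the lock */
		tmem_localify_finish(obj, index, NULL, saved_hb, false);
	}
}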
661 
662 /*
663  * For ramster only. Helper function to support asynchronous tmem_get.
664  */
665 static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
666  struct tmem_pool *pool, struct tmem_oid *oidp,
667  uint32_t index, bool free, char *data)
668 {
669  void *old_pampd = *ppampd, *new_pampd = NULL;
670  bool intransit = false;
671  int ret = 0;
672 
673  if (!is_ephemeral(pool))
674  new_pampd = (*tmem_pamops.repatriate_preload)(
675  old_pampd, pool, oidp, index, &intransit);
676  if (intransit)
677  ret = -EAGAIN;
678  else if (new_pampd != NULL)
679  *ppampd = new_pampd;
680  /* must release the hb->lock else repatriate can't sleep */
681  spin_unlock(&hb->lock);
682  if (!intransit)
683  ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
684  oidp, index, free, data);
685  if (ret == -EAGAIN) {
686  /* rare I think, but should cond_resched()??? */
687  usleep_range(10, 1000);
688  } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
689  ret = -1;
690  } else if (ret != 0 && ret != -ENOENT) {
691  ret = -1;
692  }
693  /* note hb->lock has now been unlocked */
694  return ret;
695 }
696 
697 /*
698  * For ramster only. If a page in tmem matches the handle, replace the
699  * page so that any subsequent "get" gets the new page. Returns 0 if
700  * there was a page to replace, else returns -1.
701  */
702 int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
703  uint32_t index, void *new_pampd)
704 {
705  struct tmem_obj *obj;
706  int ret = -1;
707  struct tmem_hashbucket *hb;
708 
709  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
710  spin_lock(&hb->lock);
711  obj = tmem_obj_find(hb, oidp);
712  if (obj == NULL)
713  goto out;
714  new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
715  /* if we bug here, pamops wasn't properly set up for ramster */
716  BUG_ON(tmem_pamops.replace_in_obj == NULL);
717  ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
718 out:
719  spin_unlock(&hb->lock);
720  return ret;
721 }
722 #endif
723 
724 /*
725  * "Get" a page, e.g. if a pampd can be found matching the passed handle,
726  * use a pamops callback to recreate the page from the pampd with the
727  * matching handle. By tmem definition, when a "get" is successful on
728  * an ephemeral page, the page is "flushed", and when a "get" is successful
729  * on a persistent page, the page is retained in tmem. Note that to preserve
730  * coherency, "get" can never be skipped if tmem contains the data.
731  * That is, if a get is done with a certain handle and fails, any
732  * subsequent "get" must also fail (unless of course there is a
733  * "put" done with the same handle).
734  */
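/*
 * Illustrative sketch, not part of the original file: how the get_and_free
 * argument of tmem_get() below interacts with pool ephemerality. This
 * helper simply mirrors the "free" computation at the top of tmem_get();
 * its name is hypothetical.
 */
static inline bool example_get_will_free(struct tmem_pool *pool,
					 int get_and_free)
{
	return (get_and_free == 1) ||
	       ((get_and_free == 0) && is_ephemeral(pool));
}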
735 int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
736  char *data, size_t *sizep, bool raw, int get_and_free)
737 {
738  struct tmem_obj *obj;
739  void *pampd = NULL;
740  bool ephemeral = is_ephemeral(pool);
741  int ret = -1;
742  struct tmem_hashbucket *hb;
743  bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
744  bool lock_held = false;
745  void **ppampd;
746 
747  do {
748  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
749  spin_lock(&hb->lock);
750  lock_held = true;
751  obj = tmem_obj_find(hb, oidp);
752  if (obj == NULL)
753  goto out;
754  ppampd = __tmem_pampd_lookup_in_obj(obj, index);
755  if (ppampd == NULL)
756  goto out;
757 #ifdef CONFIG_RAMSTER
758  if ((tmem_pamops.is_remote != NULL) &&
759  tmem_pamops.is_remote(*ppampd)) {
760  ret = tmem_repatriate(ppampd, hb, pool, oidp,
761  index, free, data);
762  /* tmem_repatriate releases hb->lock */
763  lock_held = false;
764  *sizep = PAGE_SIZE;
765  if (ret != -EAGAIN)
766  goto out;
767  }
768 #endif
769  } while (ret == -EAGAIN);
770  if (free)
771  pampd = tmem_pampd_delete_from_obj(obj, index);
772  else
773  pampd = tmem_pampd_lookup_in_obj(obj, index);
774  if (pampd == NULL)
775  goto out;
776  if (free) {
777  if (obj->pampd_count == 0) {
778  tmem_obj_free(obj, hb);
779  (*tmem_hostops.obj_free)(obj, pool);
780  obj = NULL;
781  }
782  }
783  if (free)
784  ret = (*tmem_pamops.get_data_and_free)(
785  data, sizep, raw, pampd, pool, oidp, index);
786  else
787  ret = (*tmem_pamops.get_data)(
788  data, sizep, raw, pampd, pool, oidp, index);
789  if (ret < 0)
790  goto out;
791  ret = 0;
792 out:
793  if (lock_held)
794  spin_unlock(&hb->lock);
795  return ret;
796 }
797 
798 /*
799  * If a page in tmem matches the handle, "flush" this page from tmem such
800  * that any subsequent "get" does not succeed (unless, of course, there
801  * was another "put" with the same handle).
802  */
803 int tmem_flush_page(struct tmem_pool *pool,
804  struct tmem_oid *oidp, uint32_t index)
805 {
806  struct tmem_obj *obj;
807  void *pampd;
808  int ret = -1;
809  struct tmem_hashbucket *hb;
810 
811  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
812  spin_lock(&hb->lock);
813  obj = tmem_obj_find(hb, oidp);
814  if (obj == NULL)
815  goto out;
816  pampd = tmem_pampd_delete_from_obj(obj, index);
817  if (pampd == NULL)
818  goto out;
819  (*tmem_pamops.free)(pampd, pool, oidp, index, true);
820  if (obj->pampd_count == 0) {
821  tmem_obj_free(obj, hb);
822  (*tmem_hostops.obj_free)(obj, pool);
823  }
824  ret = 0;
825 
826 out:
827  spin_unlock(&hb->lock);
828  return ret;
829 }
830 
831 /*
832  * "Flush" all pages in tmem matching this oid.
833  */
834 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
835 {
836  struct tmem_obj *obj;
837  struct tmem_hashbucket *hb;
838  int ret = -1;
839 
840  hb = &pool->hashbucket[tmem_oid_hash(oidp)];
841  spin_lock(&hb->lock);
842  obj = tmem_obj_find(hb, oidp);
843  if (obj == NULL)
844  goto out;
845  tmem_pampd_destroy_all_in_obj(obj, false);
846  tmem_obj_free(obj, hb);
847  (*tmem_hostops.obj_free)(obj, pool);
848  ret = 0;
849 
850 out:
851  spin_unlock(&hb->lock);
852  return ret;
853 }
854 
855 /*
856  * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
857  * all subsequent access to this tmem_pool.
858  */
859 int tmem_destroy_pool(struct tmem_pool *pool)
860 {
861  int ret = -1;
862 
863  if (pool == NULL)
864  goto out;
865  tmem_pool_flush(pool, 1);
866  ret = 0;
867 out:
868  return ret;
869 }
870 
871 static LIST_HEAD(tmem_global_pool_list);
872 
873 /*
874  * Initialize a new tmem_pool with the provided flags. The pool id is
875  * assigned and tracked by the tmem host implementation.
876  */
877 void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
878 {
879  int persistent = flags & TMEM_POOL_PERSIST;
880  int shared = flags & TMEM_POOL_SHARED;
881  struct tmem_hashbucket *hb = &pool->hashbucket[0];
882  int i;
883 
884  for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
885  hb->obj_rb_root = RB_ROOT;
886  spin_lock_init(&hb->lock);
887  }
888  INIT_LIST_HEAD(&pool->pool_list);
889  atomic_set(&pool->obj_count, 0);
890  SET_SENTINEL(pool, POOL);
891  list_add_tail(&pool->pool_list, &tmem_global_pool_list);
892  pool->persistent = persistent;
893  pool->shared = shared;
894 }