Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
3bc52c45 | 16 | #include <linux/mount.h> |
ab68f262 | 17 | #include <linux/pfn_t.h> |
3bc52c45 | 18 | #include <linux/hash.h> |
ba09c01d | 19 | #include <linux/cdev.h> |
ab68f262 DW |
20 | #include <linux/slab.h> |
21 | #include <linux/dax.h> | |
22 | #include <linux/fs.h> | |
23 | #include <linux/mm.h> | |
ccdb07f6 | 24 | #include "dax.h" |
ab68f262 | 25 | |
ba09c01d | 26 | static dev_t dax_devt; |
ab68f262 DW |
27 | static struct class *dax_class; |
28 | static DEFINE_IDA(dax_minor_ida); | |
ba09c01d DW |
29 | static int nr_dax = CONFIG_NR_DEV_DAX; |
30 | module_param(nr_dax, int, S_IRUGO); | |
3bc52c45 DW |
31 | static struct vfsmount *dax_mnt; |
32 | static struct kmem_cache *dax_cache __read_mostly; | |
33 | static struct super_block *dax_superblock __read_mostly; | |
ba09c01d | 34 | MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); |
ab68f262 DW |
35 | |
36 | /** | |
37 | * struct dax_region - mapping infrastructure for dax devices | |
38 | * @id: kernel-wide unique region for a memory range | |
39 | * @base: linear address corresponding to @res | |
40 | * @kref: to pin while other agents have a need to do lookups | |
70f93a37 | 41 | * @lock: synchronize changes / consistent-access to the resource tree (@res) |
ab68f262 | 42 | * @dev: parent device backing this region |
d4c6b777 | 43 | * @seed: next device for dynamic allocation / configuration |
ab68f262 DW |
44 | * @align: allocation and mapping alignment for child dax devices |
45 | * @res: physical address range of the region | |
d4c6b777 | 46 | * @child_count: number of registered dax device instances |
ab68f262 DW |
47 | * @pfn_flags: identify whether the pfns are paged back or not |
48 | */ | |
49 | struct dax_region { | |
50 | int id; | |
51 | struct ida ida; | |
52 | void *base; | |
53 | struct kref kref; | |
70f93a37 | 54 | struct mutex lock; |
ab68f262 | 55 | struct device *dev; |
d4c6b777 | 56 | struct device *seed; |
ab68f262 DW |
57 | unsigned int align; |
58 | struct resource res; | |
d4c6b777 | 59 | atomic_t child_count; |
ab68f262 DW |
60 | unsigned long pfn_flags; |
61 | }; | |
62 | ||
63 | /** | |
64 | * struct dax_dev - subdivision of a dax region | |
65 | * @region - parent region | |
66 | * @dev - device backing the character device | |
ba09c01d | 67 | * @cdev - core chardev data |
dee41079 | 68 | * @alive - !alive + rcu grace period == no new mappings can be established |
ab68f262 DW |
69 | * @id - child id in the region |
70 | * @num_resources - number of physical address extents in this device | |
71 | * @res - array of physical address ranges | |
72 | */ | |
73 | struct dax_dev { | |
74 | struct dax_region *region; | |
3bc52c45 | 75 | struct inode *inode; |
ebd84d72 | 76 | struct device dev; |
ba09c01d | 77 | struct cdev cdev; |
dee41079 | 78 | bool alive; |
ab68f262 DW |
79 | int id; |
80 | int num_resources; | |
70f93a37 | 81 | struct resource **res; |
ab68f262 DW |
82 | }; |
83 | ||
70f93a37 DW |
84 | #define for_each_dax_region_resource(dax_region, res) \ |
85 | for (res = (dax_region)->res.child; res; res = res->sibling) | |
ab68f262 | 86 | |
70f93a37 DW |
87 | static unsigned long long dax_region_avail_size( |
88 | struct dax_region *dax_region) | |
ab68f262 | 89 | { |
70f93a37 DW |
90 | unsigned long long size; |
91 | struct resource *res; | |
ab68f262 | 92 | |
70f93a37 DW |
93 | mutex_lock(&dax_region->lock); |
94 | size = resource_size(&dax_region->res); | |
95 | for_each_dax_region_resource(dax_region, res) | |
96 | size -= resource_size(res); | |
97 | mutex_unlock(&dax_region->lock); | |
ab68f262 | 98 | |
70f93a37 | 99 | return size; |
ab68f262 DW |
100 | } |
101 | ||
70f93a37 DW |
102 | static ssize_t available_size_show(struct device *dev, |
103 | struct device_attribute *attr, char *buf) | |
ab68f262 DW |
104 | { |
105 | struct dax_region *dax_region; | |
70f93a37 | 106 | ssize_t rc = -ENXIO; |
ab68f262 | 107 | |
70f93a37 DW |
108 | device_lock(dev); |
109 | dax_region = dev_get_drvdata(dev); | |
110 | if (dax_region) | |
111 | rc = sprintf(buf, "%llu\n", dax_region_avail_size(dax_region)); | |
112 | device_unlock(dev); | |
ab68f262 | 113 | |
70f93a37 | 114 | return rc; |
ab68f262 | 115 | } |
70f93a37 | 116 | static DEVICE_ATTR_RO(available_size); |
ab68f262 | 117 | |
d4c6b777 | 118 | static ssize_t seed_show(struct device *dev, |
ab68f262 DW |
119 | struct device_attribute *attr, char *buf) |
120 | { | |
d4c6b777 DW |
121 | struct dax_region *dax_region; |
122 | ssize_t rc = -ENXIO; | |
ab68f262 | 123 | |
d4c6b777 DW |
124 | device_lock(dev); |
125 | dax_region = dev_get_drvdata(dev); | |
126 | if (dax_region) { | |
127 | mutex_lock(&dax_region->lock); | |
128 | if (dax_region->seed) | |
129 | rc = sprintf(buf, "%s\n", dev_name(dax_region->seed)); | |
130 | mutex_unlock(&dax_region->lock); | |
131 | } | |
132 | device_unlock(dev); | |
ab68f262 | 133 | |
d4c6b777 | 134 | return rc; |
ab68f262 | 135 | } |
d4c6b777 | 136 | static DEVICE_ATTR_RO(seed); |
ab68f262 | 137 | |
70f93a37 DW |
138 | static struct attribute *dax_region_attributes[] = { |
139 | &dev_attr_available_size.attr, | |
d4c6b777 | 140 | &dev_attr_seed.attr, |
ab68f262 DW |
141 | NULL, |
142 | }; | |
143 | ||
70f93a37 DW |
144 | static const struct attribute_group dax_region_attribute_group = { |
145 | .name = "dax_region", | |
146 | .attrs = dax_region_attributes, | |
ab68f262 DW |
147 | }; |
148 | ||
70f93a37 DW |
149 | static const struct attribute_group *dax_region_attribute_groups[] = { |
150 | &dax_region_attribute_group, | |
ab68f262 DW |
151 | NULL, |
152 | }; | |
153 | ||
3bc52c45 | 154 | static struct inode *dax_alloc_inode(struct super_block *sb) |
ab68f262 | 155 | { |
3bc52c45 DW |
156 | return kmem_cache_alloc(dax_cache, GFP_KERNEL); |
157 | } | |
ab68f262 | 158 | |
3bc52c45 DW |
159 | static void dax_i_callback(struct rcu_head *head) |
160 | { | |
161 | struct inode *inode = container_of(head, struct inode, i_rcu); | |
ab68f262 | 162 | |
3bc52c45 DW |
163 | kmem_cache_free(dax_cache, inode); |
164 | } | |
dee41079 | 165 | |
3bc52c45 DW |
166 | static void dax_destroy_inode(struct inode *inode) |
167 | { | |
168 | call_rcu(&inode->i_rcu, dax_i_callback); | |
ab68f262 DW |
169 | } |
170 | ||
3bc52c45 DW |
171 | static const struct super_operations dax_sops = { |
172 | .statfs = simple_statfs, | |
173 | .alloc_inode = dax_alloc_inode, | |
174 | .destroy_inode = dax_destroy_inode, | |
175 | .drop_inode = generic_delete_inode, | |
176 | }; | |
177 | ||
178 | static struct dentry *dax_mount(struct file_system_type *fs_type, | |
179 | int flags, const char *dev_name, void *data) | |
ab68f262 | 180 | { |
3bc52c45 DW |
181 | return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); |
182 | } | |
ab68f262 | 183 | |
3bc52c45 DW |
184 | static struct file_system_type dax_type = { |
185 | .name = "dax", | |
186 | .mount = dax_mount, | |
187 | .kill_sb = kill_anon_super, | |
188 | }; | |
ab68f262 | 189 | |
3bc52c45 DW |
190 | static int dax_test(struct inode *inode, void *data) |
191 | { | |
192 | return inode->i_cdev == data; | |
193 | } | |
ab68f262 | 194 | |
3bc52c45 DW |
195 | static int dax_set(struct inode *inode, void *data) |
196 | { | |
197 | inode->i_cdev = data; | |
198 | return 0; | |
199 | } | |
ab68f262 | 200 | |
3bc52c45 DW |
201 | static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) |
202 | { | |
203 | struct inode *inode; | |
ab68f262 | 204 | |
3bc52c45 DW |
205 | inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), |
206 | dax_test, dax_set, cdev); | |
ab68f262 | 207 | |
3bc52c45 DW |
208 | if (!inode) |
209 | return NULL; | |
ab68f262 | 210 | |
3bc52c45 DW |
211 | if (inode->i_state & I_NEW) { |
212 | inode->i_mode = S_IFCHR; | |
213 | inode->i_flags = S_DAX; | |
214 | inode->i_rdev = devt; | |
215 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); | |
216 | unlock_new_inode(inode); | |
217 | } | |
218 | return inode; | |
219 | } | |
ab68f262 | 220 | |
3bc52c45 DW |
static void init_once(void *inode)
{
	/* slab constructor: runs once per object, not on every allocation */
	inode_init_once(inode);
}
ab68f262 | 225 | |
3bc52c45 | 226 | static int dax_inode_init(void) |
dee41079 | 227 | { |
3bc52c45 | 228 | int rc; |
dee41079 | 229 | |
3bc52c45 DW |
230 | dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, |
231 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | |
232 | SLAB_MEM_SPREAD|SLAB_ACCOUNT), | |
233 | init_once); | |
234 | if (!dax_cache) | |
235 | return -ENOMEM; | |
dee41079 | 236 | |
3bc52c45 DW |
237 | rc = register_filesystem(&dax_type); |
238 | if (rc) | |
239 | goto err_register_fs; | |
dee41079 | 240 | |
3bc52c45 DW |
241 | dax_mnt = kern_mount(&dax_type); |
242 | if (IS_ERR(dax_mnt)) { | |
243 | rc = PTR_ERR(dax_mnt); | |
244 | goto err_mount; | |
245 | } | |
246 | dax_superblock = dax_mnt->mnt_sb; | |
dee41079 | 247 | |
3bc52c45 | 248 | return 0; |
dee41079 | 249 | |
3bc52c45 DW |
250 | err_mount: |
251 | unregister_filesystem(&dax_type); | |
252 | err_register_fs: | |
253 | kmem_cache_destroy(dax_cache); | |
254 | ||
255 | return rc; | |
dee41079 DW |
256 | } |
257 | ||
3bc52c45 | 258 | static void dax_inode_exit(void) |
dee41079 | 259 | { |
3bc52c45 DW |
260 | kern_unmount(dax_mnt); |
261 | unregister_filesystem(&dax_type); | |
262 | kmem_cache_destroy(dax_cache); | |
263 | } | |
dee41079 | 264 | |
ab68f262 DW |
265 | static void dax_region_free(struct kref *kref) |
266 | { | |
267 | struct dax_region *dax_region; | |
268 | ||
269 | dax_region = container_of(kref, struct dax_region, kref); | |
d4c6b777 DW |
270 | WARN(atomic_read(&dax_region->child_count), |
271 | "%s: child count not zero\n", | |
272 | dev_name(dax_region->dev)); | |
ab68f262 | 273 | kfree(dax_region); |
dee41079 DW |
274 | } |
275 | ||
ab68f262 | 276 | void dax_region_put(struct dax_region *dax_region) |
dee41079 | 277 | { |
ab68f262 | 278 | kref_put(&dax_region->kref, dax_region_free); |
dee41079 | 279 | } |
ab68f262 DW |
280 | EXPORT_SYMBOL_GPL(dax_region_put); |
281 | ||
dee41079 | 282 | |
70f93a37 | 283 | static void dax_region_unregister(void *region) |
dee41079 | 284 | { |
70f93a37 | 285 | struct dax_region *dax_region = region; |
dee41079 | 286 | |
70f93a37 DW |
287 | sysfs_remove_groups(&dax_region->dev->kobj, |
288 | dax_region_attribute_groups); | |
289 | dax_region_put(dax_region); | |
290 | } | |
dee41079 | 291 | |
ab68f262 DW |
292 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, |
293 | struct resource *res, unsigned int align, void *addr, | |
294 | unsigned long pfn_flags) | |
295 | { | |
296 | struct dax_region *dax_region; | |
297 | ||
70f93a37 DW |
298 | if (dev_get_drvdata(parent)) { |
299 | dev_WARN(parent, "dax core found drvdata already in use\n"); | |
300 | return NULL; | |
dee41079 | 301 | } |
dee41079 | 302 | |
9d2d01a0 DW |
303 | if (!IS_ALIGNED(res->start, align) |
304 | || !IS_ALIGNED(resource_size(res), align)) | |
305 | return NULL; | |
ab68f262 | 306 | |
9d2d01a0 | 307 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); |
ab68f262 DW |
308 | if (!dax_region) |
309 | return NULL; | |
70f93a37 DW |
310 | dev_set_drvdata(parent, dax_region); |
311 | dax_region->res.name = dev_name(parent); | |
312 | dax_region->res.start = res->start; | |
313 | dax_region->res.end = res->end; | |
ab68f262 | 314 | dax_region->pfn_flags = pfn_flags; |
70f93a37 | 315 | mutex_init(&dax_region->lock); |
ab68f262 DW |
316 | kref_init(&dax_region->kref); |
317 | dax_region->id = region_id; | |
318 | ida_init(&dax_region->ida); | |
319 | dax_region->align = align; | |
320 | dax_region->dev = parent; | |
321 | dax_region->base = addr; | |
70f93a37 DW |
322 | if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { |
323 | kfree(dax_region); | |
324 | return NULL;; | |
dee41079 | 325 | } |
ab68f262 | 326 | |
70f93a37 DW |
327 | kref_get(&dax_region->kref); |
328 | if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) | |
329 | return NULL; | |
ab68f262 DW |
330 | return dax_region; |
331 | } | |
332 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
333 | ||
ebd84d72 DW |
334 | static struct dax_dev *to_dax_dev(struct device *dev) |
335 | { | |
336 | return container_of(dev, struct dax_dev, dev); | |
dee41079 DW |
337 | } |
338 | ||
ab68f262 DW |
339 | static ssize_t size_show(struct device *dev, |
340 | struct device_attribute *attr, char *buf) | |
dee41079 | 341 | { |
ebd84d72 | 342 | struct dax_dev *dax_dev = to_dax_dev(dev); |
ab68f262 DW |
343 | unsigned long long size = 0; |
344 | int i; | |
dee41079 | 345 | |
ab68f262 | 346 | for (i = 0; i < dax_dev->num_resources; i++) |
70f93a37 | 347 | size += resource_size(dax_dev->res[i]); |
dee41079 | 348 | |
ab68f262 | 349 | return sprintf(buf, "%llu\n", size); |
dee41079 | 350 | } |
ab68f262 DW |
351 | static DEVICE_ATTR_RO(size); |
352 | ||
353 | static struct attribute *dax_device_attributes[] = { | |
354 | &dev_attr_size.attr, | |
355 | NULL, | |
356 | }; | |
357 | ||
358 | static const struct attribute_group dax_device_attribute_group = { | |
359 | .attrs = dax_device_attributes, | |
360 | }; | |
361 | ||
362 | static const struct attribute_group *dax_attribute_groups[] = { | |
363 | &dax_device_attribute_group, | |
364 | NULL, | |
365 | }; | |
dee41079 DW |
366 | |
367 | static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
368 | const char *func) | |
369 | { | |
370 | struct dax_region *dax_region = dax_dev->region; | |
ebd84d72 | 371 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
372 | unsigned long mask; |
373 | ||
374 | if (!dax_dev->alive) | |
375 | return -ENXIO; | |
376 | ||
377 | /* prevent private / writable mappings from being established */ | |
378 | if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { | |
379 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", | |
380 | current->comm, func); | |
381 | return -EINVAL; | |
382 | } | |
383 | ||
384 | mask = dax_region->align - 1; | |
385 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
386 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
387 | current->comm, func, vma->vm_start, vma->vm_end, | |
388 | mask); | |
389 | return -EINVAL; | |
390 | } | |
391 | ||
392 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
393 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
394 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
395 | current->comm, func); | |
396 | return -EINVAL; | |
397 | } | |
398 | ||
399 | if (!vma_is_dax(vma)) { | |
400 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
401 | current->comm, func); | |
402 | return -EINVAL; | |
403 | } | |
404 | ||
405 | return 0; | |
406 | } | |
407 | ||
408 | static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |
409 | unsigned long size) | |
410 | { | |
411 | struct resource *res; | |
412 | phys_addr_t phys; | |
413 | int i; | |
414 | ||
415 | for (i = 0; i < dax_dev->num_resources; i++) { | |
70f93a37 | 416 | res = dax_dev->res[i]; |
dee41079 DW |
417 | phys = pgoff * PAGE_SIZE + res->start; |
418 | if (phys >= res->start && phys <= res->end) | |
419 | break; | |
420 | pgoff -= PHYS_PFN(resource_size(res)); | |
421 | } | |
422 | ||
423 | if (i < dax_dev->num_resources) { | |
70f93a37 | 424 | res = dax_dev->res[i]; |
dee41079 DW |
425 | if (phys + size - 1 <= res->end) |
426 | return phys; | |
427 | } | |
428 | ||
429 | return -1; | |
430 | } | |
431 | ||
432 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
433 | struct vm_fault *vmf) | |
434 | { | |
435 | unsigned long vaddr = (unsigned long) vmf->virtual_address; | |
ebd84d72 | 436 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
437 | struct dax_region *dax_region; |
438 | int rc = VM_FAULT_SIGBUS; | |
439 | phys_addr_t phys; | |
440 | pfn_t pfn; | |
441 | ||
442 | if (check_vma(dax_dev, vma, __func__)) | |
443 | return VM_FAULT_SIGBUS; | |
444 | ||
445 | dax_region = dax_dev->region; | |
446 | if (dax_region->align > PAGE_SIZE) { | |
447 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
448 | return VM_FAULT_SIGBUS; | |
449 | } | |
450 | ||
451 | phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); | |
452 | if (phys == -1) { | |
453 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
454 | vmf->pgoff); | |
455 | return VM_FAULT_SIGBUS; | |
456 | } | |
457 | ||
458 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
459 | ||
460 | rc = vm_insert_mixed(vma, vaddr, pfn); | |
461 | ||
462 | if (rc == -ENOMEM) | |
463 | return VM_FAULT_OOM; | |
464 | if (rc < 0 && rc != -EBUSY) | |
465 | return VM_FAULT_SIGBUS; | |
466 | ||
467 | return VM_FAULT_NOPAGE; | |
468 | } | |
469 | ||
470 | static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |
471 | { | |
472 | int rc; | |
473 | struct file *filp = vma->vm_file; | |
474 | struct dax_dev *dax_dev = filp->private_data; | |
475 | ||
ebd84d72 | 476 | dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, |
dee41079 DW |
477 | current->comm, (vmf->flags & FAULT_FLAG_WRITE) |
478 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
479 | rcu_read_lock(); | |
480 | rc = __dax_dev_fault(dax_dev, vma, vmf); | |
481 | rcu_read_unlock(); | |
482 | ||
483 | return rc; | |
484 | } | |
485 | ||
486 | static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, | |
487 | struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, | |
488 | unsigned int flags) | |
489 | { | |
490 | unsigned long pmd_addr = addr & PMD_MASK; | |
ebd84d72 | 491 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
492 | struct dax_region *dax_region; |
493 | phys_addr_t phys; | |
494 | pgoff_t pgoff; | |
495 | pfn_t pfn; | |
496 | ||
497 | if (check_vma(dax_dev, vma, __func__)) | |
498 | return VM_FAULT_SIGBUS; | |
499 | ||
500 | dax_region = dax_dev->region; | |
501 | if (dax_region->align > PMD_SIZE) { | |
502 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
503 | return VM_FAULT_SIGBUS; | |
504 | } | |
505 | ||
506 | /* dax pmd mappings require pfn_t_devmap() */ | |
507 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
508 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
509 | return VM_FAULT_SIGBUS; | |
510 | } | |
511 | ||
512 | pgoff = linear_page_index(vma, pmd_addr); | |
4c3cb6e9 | 513 | phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); |
dee41079 DW |
514 | if (phys == -1) { |
515 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
516 | pgoff); | |
517 | return VM_FAULT_SIGBUS; | |
518 | } | |
519 | ||
520 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
521 | ||
522 | return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, | |
523 | flags & FAULT_FLAG_WRITE); | |
524 | } | |
525 | ||
526 | static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |
527 | pmd_t *pmd, unsigned int flags) | |
528 | { | |
529 | int rc; | |
530 | struct file *filp = vma->vm_file; | |
531 | struct dax_dev *dax_dev = filp->private_data; | |
532 | ||
ebd84d72 | 533 | dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, |
dee41079 DW |
534 | current->comm, (flags & FAULT_FLAG_WRITE) |
535 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
536 | ||
537 | rcu_read_lock(); | |
538 | rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); | |
539 | rcu_read_unlock(); | |
540 | ||
541 | return rc; | |
542 | } | |
543 | ||
dee41079 DW |
544 | static const struct vm_operations_struct dax_dev_vm_ops = { |
545 | .fault = dax_dev_fault, | |
546 | .pmd_fault = dax_dev_pmd_fault, | |
dee41079 DW |
547 | }; |
548 | ||
af69f51e | 549 | static int dax_mmap(struct file *filp, struct vm_area_struct *vma) |
dee41079 DW |
550 | { |
551 | struct dax_dev *dax_dev = filp->private_data; | |
552 | int rc; | |
553 | ||
ebd84d72 | 554 | dev_dbg(&dax_dev->dev, "%s\n", __func__); |
dee41079 DW |
555 | |
556 | rc = check_vma(dax_dev, vma, __func__); | |
557 | if (rc) | |
558 | return rc; | |
559 | ||
dee41079 DW |
560 | vma->vm_ops = &dax_dev_vm_ops; |
561 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; | |
562 | return 0; | |
043a9255 DW |
563 | } |
564 | ||
565 | /* return an unmapped area aligned to the dax region specified alignment */ | |
af69f51e | 566 | static unsigned long dax_get_unmapped_area(struct file *filp, |
043a9255 DW |
567 | unsigned long addr, unsigned long len, unsigned long pgoff, |
568 | unsigned long flags) | |
569 | { | |
570 | unsigned long off, off_end, off_align, len_align, addr_align, align; | |
571 | struct dax_dev *dax_dev = filp ? filp->private_data : NULL; | |
572 | struct dax_region *dax_region; | |
573 | ||
574 | if (!dax_dev || addr) | |
575 | goto out; | |
576 | ||
577 | dax_region = dax_dev->region; | |
578 | align = dax_region->align; | |
579 | off = pgoff << PAGE_SHIFT; | |
580 | off_end = off + len; | |
581 | off_align = round_up(off, align); | |
582 | ||
583 | if ((off_end <= off_align) || ((off_end - off_align) < align)) | |
584 | goto out; | |
585 | ||
586 | len_align = len + align; | |
587 | if ((off + len_align) < off) | |
588 | goto out; | |
dee41079 | 589 | |
043a9255 DW |
590 | addr_align = current->mm->get_unmapped_area(filp, addr, len_align, |
591 | pgoff, flags); | |
592 | if (!IS_ERR_VALUE(addr_align)) { | |
593 | addr_align += (off - addr_align) & (align - 1); | |
594 | return addr_align; | |
595 | } | |
596 | out: | |
597 | return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); | |
598 | } | |
599 | ||
af69f51e | 600 | static int dax_open(struct inode *inode, struct file *filp) |
043a9255 | 601 | { |
ba09c01d | 602 | struct dax_dev *dax_dev; |
043a9255 | 603 | |
ba09c01d DW |
604 | dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); |
605 | dev_dbg(&dax_dev->dev, "%s\n", __func__); | |
3bc52c45 DW |
606 | inode->i_mapping = dax_dev->inode->i_mapping; |
607 | inode->i_mapping->host = dax_dev->inode; | |
608 | filp->f_mapping = inode->i_mapping; | |
ebd84d72 DW |
609 | filp->private_data = dax_dev; |
610 | inode->i_flags = S_DAX; | |
043a9255 | 611 | |
043a9255 DW |
612 | return 0; |
613 | } | |
dee41079 | 614 | |
af69f51e | 615 | static int dax_release(struct inode *inode, struct file *filp) |
043a9255 DW |
616 | { |
617 | struct dax_dev *dax_dev = filp->private_data; | |
043a9255 | 618 | |
ba09c01d | 619 | dev_dbg(&dax_dev->dev, "%s\n", __func__); |
043a9255 | 620 | return 0; |
dee41079 DW |
621 | } |
622 | ||
ab68f262 DW |
623 | static const struct file_operations dax_fops = { |
624 | .llseek = noop_llseek, | |
625 | .owner = THIS_MODULE, | |
af69f51e DW |
626 | .open = dax_open, |
627 | .release = dax_release, | |
628 | .get_unmapped_area = dax_get_unmapped_area, | |
629 | .mmap = dax_mmap, | |
ab68f262 DW |
630 | }; |
631 | ||
ebd84d72 | 632 | static void dax_dev_release(struct device *dev) |
043a9255 | 633 | { |
ebd84d72 | 634 | struct dax_dev *dax_dev = to_dax_dev(dev); |
043a9255 DW |
635 | struct dax_region *dax_region = dax_dev->region; |
636 | ||
ebd84d72 DW |
637 | ida_simple_remove(&dax_region->ida, dax_dev->id); |
638 | ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); | |
639 | dax_region_put(dax_region); | |
3bc52c45 | 640 | iput(dax_dev->inode); |
70f93a37 | 641 | kfree(dax_dev->res); |
ebd84d72 DW |
642 | kfree(dax_dev); |
643 | } | |
644 | ||
645 | static void unregister_dax_dev(void *dev) | |
646 | { | |
647 | struct dax_dev *dax_dev = to_dax_dev(dev); | |
70f93a37 | 648 | struct dax_region *dax_region = dax_dev->region; |
ba09c01d | 649 | struct cdev *cdev = &dax_dev->cdev; |
70f93a37 | 650 | int i; |
ebd84d72 | 651 | |
043a9255 DW |
652 | dev_dbg(dev, "%s\n", __func__); |
653 | ||
654 | /* | |
655 | * Note, rcu is not protecting the liveness of dax_dev, rcu is | |
656 | * ensuring that any fault handlers that might have seen | |
657 | * dax_dev->alive == true, have completed. Any fault handlers | |
658 | * that start after synchronize_rcu() has started will abort | |
659 | * upon seeing dax_dev->alive == false. | |
660 | */ | |
661 | dax_dev->alive = false; | |
662 | synchronize_rcu(); | |
9dc1e492 | 663 | unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); |
70f93a37 DW |
664 | |
665 | mutex_lock(&dax_region->lock); | |
666 | for (i = 0; i < dax_dev->num_resources; i++) | |
667 | __release_region(&dax_region->res, dax_dev->res[i]->start, | |
668 | resource_size(dax_dev->res[i])); | |
d4c6b777 DW |
669 | if (dax_region->seed == dev) |
670 | dax_region->seed = NULL; | |
70f93a37 | 671 | mutex_unlock(&dax_region->lock); |
d4c6b777 | 672 | atomic_dec(&dax_region->child_count); |
70f93a37 | 673 | |
ba09c01d | 674 | cdev_del(cdev); |
043a9255 | 675 | device_unregister(dev); |
043a9255 DW |
676 | } |
677 | ||
5662d52d DW |
678 | struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, |
679 | struct resource *res, int count) | |
043a9255 DW |
680 | { |
681 | struct device *parent = dax_region->dev; | |
682 | struct dax_dev *dax_dev; | |
9d2d01a0 | 683 | int rc = 0, minor, i; |
043a9255 | 684 | struct device *dev; |
ba09c01d | 685 | struct cdev *cdev; |
043a9255 DW |
686 | dev_t dev_t; |
687 | ||
70f93a37 | 688 | dax_dev = kzalloc(sizeof(*dax_dev), GFP_KERNEL); |
043a9255 | 689 | if (!dax_dev) |
5662d52d | 690 | return ERR_PTR(-ENOMEM); |
043a9255 | 691 | |
70f93a37 DW |
692 | dax_dev->res = kzalloc(sizeof(res) * count, GFP_KERNEL); |
693 | if (!dax_dev->res) | |
694 | goto err_res; | |
695 | ||
9d2d01a0 | 696 | for (i = 0; i < count; i++) { |
70f93a37 DW |
697 | struct resource *dax_res; |
698 | ||
9d2d01a0 DW |
699 | if (!IS_ALIGNED(res[i].start, dax_region->align) |
700 | || !IS_ALIGNED(resource_size(&res[i]), | |
701 | dax_region->align)) { | |
702 | rc = -EINVAL; | |
703 | break; | |
704 | } | |
70f93a37 DW |
705 | |
706 | mutex_lock(&dax_region->lock); | |
707 | dax_res = __request_region(&dax_region->res, res[i].start, | |
708 | resource_size(&res[i]), NULL, 0); | |
709 | mutex_unlock(&dax_region->lock); | |
710 | if (!dax_res) { | |
711 | rc = -EBUSY; | |
712 | break; | |
713 | } | |
714 | dax_dev->res[i] = dax_res; | |
9d2d01a0 DW |
715 | } |
716 | ||
717 | if (i < count) | |
70f93a37 | 718 | goto err_request_region; |
9d2d01a0 | 719 | |
043a9255 DW |
720 | dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); |
721 | if (dax_dev->id < 0) { | |
722 | rc = dax_dev->id; | |
70f93a37 | 723 | goto err_request_region; |
043a9255 DW |
724 | } |
725 | ||
726 | minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); | |
727 | if (minor < 0) { | |
728 | rc = minor; | |
729 | goto err_minor; | |
730 | } | |
731 | ||
72ffa4d2 AB |
732 | dev_t = MKDEV(MAJOR(dax_devt), minor); |
733 | dev = &dax_dev->dev; | |
3bc52c45 DW |
734 | dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); |
735 | if (!dax_dev->inode) { | |
736 | rc = -ENOMEM; | |
737 | goto err_inode; | |
738 | } | |
739 | ||
ba09c01d | 740 | /* device_initialize() so cdev can reference kobj parent */ |
ebd84d72 | 741 | device_initialize(dev); |
ba09c01d DW |
742 | |
743 | cdev = &dax_dev->cdev; | |
744 | cdev_init(cdev, &dax_fops); | |
745 | cdev->owner = parent->driver->owner; | |
746 | cdev->kobj.parent = &dev->kobj; | |
747 | rc = cdev_add(&dax_dev->cdev, dev_t, 1); | |
748 | if (rc) | |
749 | goto err_cdev; | |
750 | ||
751 | /* from here on we're committed to teardown via dax_dev_release() */ | |
ba09c01d DW |
752 | dax_dev->num_resources = count; |
753 | dax_dev->alive = true; | |
754 | dax_dev->region = dax_region; | |
755 | kref_get(&dax_region->kref); | |
756 | ||
ebd84d72 DW |
757 | dev->devt = dev_t; |
758 | dev->class = dax_class; | |
759 | dev->parent = parent; | |
760 | dev->groups = dax_attribute_groups; | |
761 | dev->release = dax_dev_release; | |
762 | dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); | |
70f93a37 DW |
763 | /* update resource names now that the owner device is named */ |
764 | for (i = 0; i < dax_dev->num_resources; i++) | |
765 | dax_dev->res[i]->name = dev_name(dev); | |
766 | ||
ebd84d72 DW |
767 | rc = device_add(dev); |
768 | if (rc) { | |
769 | put_device(dev); | |
5662d52d | 770 | return ERR_PTR(rc); |
ebd84d72 | 771 | } |
043a9255 | 772 | |
5662d52d DW |
773 | rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); |
774 | if (rc) | |
775 | return ERR_PTR(rc); | |
776 | ||
d4c6b777 DW |
777 | if (atomic_inc_return(&dax_region->child_count) == 1) { |
778 | struct dax_dev *seed; | |
779 | ||
780 | seed = devm_create_dax_dev(dax_region, NULL, 0); | |
781 | if (IS_ERR(seed)) | |
782 | dev_warn(parent, "failed to create region seed\n"); | |
783 | else | |
784 | dax_region->seed = &seed->dev; | |
785 | } | |
786 | ||
5662d52d | 787 | return dax_dev; |
043a9255 | 788 | |
ba09c01d | 789 | err_cdev: |
3bc52c45 DW |
790 | iput(dax_dev->inode); |
791 | err_inode: | |
ba09c01d | 792 | ida_simple_remove(&dax_minor_ida, minor); |
043a9255 DW |
793 | err_minor: |
794 | ida_simple_remove(&dax_region->ida, dax_dev->id); | |
70f93a37 DW |
795 | err_request_region: |
796 | mutex_lock(&dax_region->lock); | |
797 | for (i--; i >= 0; i--) | |
798 | __release_region(&dax_region->res, dax_dev->res[i]->start, | |
799 | resource_size(dax_dev->res[i])); | |
800 | mutex_unlock(&dax_region->lock); | |
801 | kfree(dax_dev->res); | |
802 | err_res: | |
ebd84d72 | 803 | kfree(dax_dev); |
043a9255 | 804 | |
5662d52d | 805 | return ERR_PTR(rc); |
043a9255 DW |
806 | } |
807 | EXPORT_SYMBOL_GPL(devm_create_dax_dev); | |
808 | ||
ab68f262 DW |
809 | static int __init dax_init(void) |
810 | { | |
811 | int rc; | |
812 | ||
3bc52c45 DW |
813 | rc = dax_inode_init(); |
814 | if (rc) | |
ab68f262 | 815 | return rc; |
3bc52c45 | 816 | |
ba09c01d DW |
817 | nr_dax = max(nr_dax, 256); |
818 | rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); | |
819 | if (rc) | |
3bc52c45 | 820 | goto err_chrdev; |
ab68f262 DW |
821 | |
822 | dax_class = class_create(THIS_MODULE, "dax"); | |
823 | if (IS_ERR(dax_class)) { | |
3bc52c45 DW |
824 | rc = PTR_ERR(dax_class); |
825 | goto err_class; | |
ab68f262 DW |
826 | } |
827 | ||
828 | return 0; | |
3bc52c45 DW |
829 | |
830 | err_class: | |
831 | unregister_chrdev_region(dax_devt, nr_dax); | |
832 | err_chrdev: | |
833 | dax_inode_exit(); | |
834 | return rc; | |
ab68f262 DW |
835 | } |
836 | ||
837 | static void __exit dax_exit(void) | |
838 | { | |
839 | class_destroy(dax_class); | |
ba09c01d | 840 | unregister_chrdev_region(dax_devt, nr_dax); |
ab68f262 | 841 | ida_destroy(&dax_minor_ida); |
3bc52c45 | 842 | dax_inode_exit(); |
ab68f262 DW |
843 | } |
844 | ||
845 | MODULE_AUTHOR("Intel Corporation"); | |
846 | MODULE_LICENSE("GPL v2"); | |
847 | subsys_initcall(dax_init); | |
848 | module_exit(dax_exit); |