#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#define NVME_Q_DEPTH		1024
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
#define NVME_MINORS		64
#define NVME_IO_TIMEOUT		(5 * HZ)
#define ADMIN_TIMEOUT		(60 * HZ)
static int nvme_major;
static int use_threaded_interrupts;
static inline void _nvme_check_size(void)
static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
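/*
 * Sentinel context values, offset from POISON_POINTER_DELTA so they can
 * never collide with a real ctx pointer.  They replace the original ctx
 * when a command id is cancelled, completed or found invalid, which lets
 * special_completion() recognise such completions later.
 */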
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
			"completed id %d twice on queue %d\n",
			"invalid id %d completed on queue %d\n",
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
		*fn = special_completion;
	info[cmdid].fn = special_completion;
static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
	info[cmdid].fn = special_completion;
static void put_nvmeq(struct nvme_queue *nvmeq)
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
	return ((void *)iod) + iod->offset;
static int nvme_npages(unsigned size)
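/*
 * The I/O descriptor (iod), its per-page PRP list pointers and the
 * scatterlist are carved out of a single allocation; nvme_npages() gives
 * the number of PRP list pages needed to describe nbytes of data.
 */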
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
				sizeof(__le64 *) * nvme_npages(nbytes) +
	for (i = 0; i < iod->npages; i++) {
		prp_dma = next_prp_dma;
static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
	if (bio_list_empty(&nvmeq->sq_cong))
	bio_list_add(&nvmeq->sq_cong, bio);
static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
	struct bio *bio = iod->private;
	nvme_free_iod(dev, iod);
	} else if (bio->bi_vcnt > bio->bi_idx) {
		requeue_bio(dev, bio);
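/*
 * PRP setup.  The first PRP entry may start at any offset within a page;
 * every later entry must be page aligned.  One or two PRPs fit directly in
 * the command, longer transfers use PRP lists allocated from a dma_pool
 * (lists of up to 32 entries come from the small 256-byte pool).  When a
 * list page fills up, its last slot is reused as a chain pointer to the
 * next list page, hence prp_list[0] = old_prp_list[i - 1] below.
 */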
static int nvme_setup_prps(struct nvme_dev *dev,
			struct nvme_common_command *cmd, struct nvme_iod *iod,
			int total_len, gfp_t gfp)
	__le64 **list = iod_list(iod);
	if (nprps <= (256 / 8)) {
			__le64 *old_prp_list = prp_list;
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
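/*
 * Two biovecs can share one PRP entry only if the first ends exactly on a
 * page boundary and the second starts at offset zero; anything else breaks
 * the PRP alignment rules above.
 */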
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
	struct bio_vec *bvec, *bvprv = NULL;
	int i, old_idx, length = 0, nsegs = 0;

	old_idx = bio->bi_idx;
	bio_for_each_segment(bvec, bio, i) {
		if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
			sg->length += bvec->bv_len;
			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec->bv_page, bvec->bv_len,
		length += bvec->bv_len;
	bio->bi_idx = old_idx;
	memset(cmnd, 0, sizeof(*cmnd));
	return nvme_submit_flush(nvmeq, ns, cmdid);
		result = nvme_submit_flush_data(nvmeq, ns);
	iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
		return nvme_submit_flush(nvmeq, ns, cmdid);
	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
	result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
	bio->bi_sector += length >> 9;
	nvme_free_iod(nvmeq->dev, iod);
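/*
 * Block layer entry point.  Grab the per-CPU I/O queue and try to submit
 * the bio directly; if the queue is congested or out of command ids, the
 * bio is parked on sq_cong and resubmitted later by nvme_kthread().
 */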
static void nvme_make_request(struct request_queue *q, struct bio *bio)
	struct nvme_ns *ns = q->queuedata;
	spin_lock_irq(&nvmeq->q_lock);
	if (bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
		if (bio_list_empty(&nvmeq->sq_cong))
		bio_list_add(&nvmeq->sq_cong, bio);
	spin_unlock_irq(&nvmeq->q_lock);
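/*
 * Completion handling: walk the completion queue, wrapping the head index
 * at q_depth and flipping the expected phase tag on each wrap, and invoke
 * the completion callback that was saved when the cmdid was allocated.
 */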
		if (++head == nvmeq->q_depth) {
		fn(nvmeq->dev, ctx, &cqe);
	spin_lock(&nvmeq->q_lock);
	result = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->q_lock);
static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
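/*
 * Synchronous commands: sync_completion() copies the result and status
 * into the submitter's cmdinfo and wakes it up.  If the submitter is
 * interrupted by a signal while waiting (status still -EINTR), the
 * outstanding command id is aborted before returning.
 */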
static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
			struct nvme_command *cmd, u32 *result, unsigned timeout)
	cmdinfo.status = -EINTR;
	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
	nvme_submit_cmd(nvmeq, cmd);
	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		*result = cmdinfo.result;
	return cmdinfo.status;
	c.delete_queue.opcode = opcode;
	status = nvme_submit_admin_cmd(dev, &c, NULL);
	status = nvme_submit_admin_cmd(dev, &c, NULL);
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
	status = nvme_submit_admin_cmd(dev, &c, NULL);
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
	return nvme_submit_admin_cmd(dev, &c, NULL);
static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
	return nvme_submit_admin_cmd(dev, &c, NULL);
static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
	return nvme_submit_admin_cmd(dev, &c, result);
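/*
 * Walk every possible command id on the queue and force-complete the ones
 * that have timed out; with timeout == false (queue teardown) every
 * outstanding command is cancelled.  Each saved callback is fed a
 * synthetic "aborted" completion.
 */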
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
	int depth = nvmeq->q_depth - 1;
		if (timeout && !time_after(now, info[cmdid].timeout))
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
static void nvme_free_queue(struct nvme_dev *dev, int qid)
	spin_lock_irq(&nvmeq->q_lock);
	nvme_cancel_ios(nvmeq, false);
	spin_unlock_irq(&nvmeq->q_lock);
	irq_set_affinity_hint(vector, NULL);
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	nvme_free_queue_mem(nvmeq);
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
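	/*
	 * With use_threaded_interrupts set, the hard handler nvme_irq_check
	 * only peeks at the phase tag of the current CQ entry and wakes the
	 * IRQ thread (nvme_irq) when there is real work to do.
	 */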
	if (use_threaded_interrupts)
				nvme_irq_check, nvme_irq,
static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
					int cq_size, int vector)
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
	result = adapter_alloc_cq(dev, qid, nvmeq);
	result = adapter_alloc_sq(dev, qid, nvmeq);
	result = queue_request_irq(dev, nvmeq, "nvme");
	adapter_delete_sq(dev, qid);
	adapter_delete_cq(dev, qid);
	return ERR_PTR(result);
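/*
 * Admin queue bring-up: allocate a 64-entry queue for qid 0, enable the
 * controller, then poll the controller status register until it reports
 * ready, giving up on timeout or on a fatal signal.
 */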
	unsigned long timeout;
	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
		if (fatal_signal_pending(current))
				"Device not ready; aborting initialisation\n");
	nvme_free_queue_mem(nvmeq);
	result = queue_request_irq(dev, nvmeq, "nvme admin");
static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
	sg_mark_end(&sg[i - 1]);
	for (i = 0; i < count; i++)
	return ERR_PTR(err);
static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
				struct nvme_iod *iod)
	for (i = 0; i < iod->nents; i++)
	switch (io.opcode) {
	iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		return PTR_ERR(iod);
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.reftag = io.reftag;
	c.rw.apptag = io.apptag;
	c.rw.appmask = io.appmask;
	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
	nvmeq = get_nvmeq(dev);
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);
static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	length = cmd.data_len;
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
	if (length != cmd.data_len)
		status = nvme_submit_admin_cmd(dev, &c, NULL);
	nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
	nvme_free_iod(dev, iod);
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
		return nvme_submit_io(ns, (void __user *)arg);
static const struct block_device_operations nvme_fops = {
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_ioctl,
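/*
 * Congested bios sit on sq_cong.  nvme_resubmit_bios() retries them in
 * order, putting a bio back at the head on failure, while nvme_kthread()
 * periodically visits every queue to reap completions, cancel timed-out
 * I/Os and drain this list.
 */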
static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			bio_list_add_head(&nvmeq->sq_cong, bio);
			if (bio_list_empty(&nvmeq->sq_cong))
static int nvme_kthread(void *data)
		spin_lock(&dev_list_lock);
				spin_lock_irq(&nvmeq->q_lock);
				if (nvme_process_cq(nvmeq))
					printk("process_cq did something\n");
				nvme_cancel_ios(nvmeq, true);
				nvme_resubmit_bios(nvmeq);
				spin_unlock_irq(&nvmeq->q_lock);
		spin_unlock(&dev_list_lock);
static int nvme_get_ns_idx(void)
		spin_lock(&dev_list_lock);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);
static void nvme_put_ns_idx(int index)
	spin_lock(&dev_list_lock);
	spin_unlock(&dev_list_lock);
	struct gendisk *disk;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
		goto out_free_queue;
	lbaf = id->flbas & 0xf;
	disk->major = nvme_major;
	disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
static void nvme_ns_free(struct nvme_ns *ns)
	nvme_put_ns_idx(index);
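/*
 * The Number of Queues feature takes zero-based submission and completion
 * queue counts packed into one dword; the controller replies with what it
 * actually granted and we use the smaller of the two halves.
 */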
static int set_queue_count(struct nvme_dev *dev, int count)
	u32 q_count = (count - 1) | ((count - 1) << 16);
	return min(result & 0xffff, result >> 16) + 1;
	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
	result = set_queue_count(dev, nr_io_queues);
	if (result < nr_io_queues)
	db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
	if (db_bar_size > 8192) {
	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	} else if (result > 0) {
	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
		if (IS_ERR(dev->queues[i + 1]))
			return PTR_ERR(dev->queues[i + 1]);
static void nvme_free_queues(struct nvme_dev *dev)
		nvme_free_queue(dev, i);
	res = nvme_setup_io_queues(dev);
	for (i = 1; i <= nn; i++) {
		if (id_ns->ncap == 0)
		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
static int nvme_dev_remove(struct nvme_dev *dev)
	spin_lock(&dev_list_lock);
	spin_unlock(&dev_list_lock);
	nvme_free_queues(dev);
static int nvme_setup_prp_pools(struct nvme_dev *dev)
static void nvme_release_prp_pools(struct nvme_dev *dev)
static int nvme_set_instance(struct nvme_dev *dev)
	int instance, error;
		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);
static void nvme_release_instance(struct nvme_dev *dev)
	spin_lock(&dev_list_lock);
	spin_unlock(&dev_list_lock);
	int bars, result = -ENOMEM;
	pci_set_drvdata(pdev, dev);
	result = nvme_set_instance(dev);
	result = nvme_setup_prp_pools(dev);
	result = nvme_configure_admin_queue(dev);
	spin_lock(&dev_list_lock);
	spin_unlock(&dev_list_lock);
	result = nvme_dev_add(dev);
	spin_lock(&dev_list_lock);
	spin_unlock(&dev_list_lock);
	nvme_free_queues(dev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

#define PCI_CLASS_STORAGE_EXPRESS	0x010802
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.err_handler	= &nvme_err_handler,
static int __init nvme_init(void)
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);
	else if (result > 0)
	result = pci_register_driver(&nvme_driver);
static void __exit nvme_exit(void)