Linux Kernel  3.7.1
nvme.c
1 /*
2  * NVM Express device driver
3  * Copyright (c) 2011, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12  * more details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17  */
18 
19 #include <linux/nvme.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/delay.h>
24 #include <linux/errno.h>
25 #include <linux/fs.h>
26 #include <linux/genhd.h>
27 #include <linux/idr.h>
28 #include <linux/init.h>
29 #include <linux/interrupt.h>
30 #include <linux/io.h>
31 #include <linux/kdev_t.h>
32 #include <linux/kthread.h>
33 #include <linux/kernel.h>
34 #include <linux/mm.h>
35 #include <linux/module.h>
36 #include <linux/moduleparam.h>
37 #include <linux/pci.h>
38 #include <linux/poison.h>
39 #include <linux/sched.h>
40 #include <linux/slab.h>
41 #include <linux/types.h>
42 
43 #include <asm-generic/io-64-nonatomic-lo-hi.h>
44 
45 #define NVME_Q_DEPTH 1024
46 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
47 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
48 #define NVME_MINORS 64
49 #define NVME_IO_TIMEOUT (5 * HZ)
50 #define ADMIN_TIMEOUT (60 * HZ)
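/*
 * For example, with the default NVME_Q_DEPTH of 1024, SQ_SIZE(1024) works
 * out to 1024 * 64 bytes = 64KB of submission entries per queue, and
 * CQ_SIZE(1024) to 1024 * 16 bytes = 16KB of completion entries.
 */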
51 
52 static int nvme_major;
53 module_param(nvme_major, int, 0);
54 
55 static int use_threaded_interrupts;
56 module_param(use_threaded_interrupts, int, 0);
57 
58 static DEFINE_SPINLOCK(dev_list_lock);
59 static LIST_HEAD(dev_list);
60 static struct task_struct *nvme_thread;
61 
62 /*
63  * Represents an NVM Express device. Each nvme_dev is a PCI function.
64  */
65 struct nvme_dev {
66  struct list_head node;
67  struct nvme_queue **queues;
68  u32 __iomem *dbs;
69  struct pci_dev *pci_dev;
70  struct dma_pool *prp_page_pool;
71  struct dma_pool *prp_small_pool;
72  int instance;
73  int queue_count;
74  int db_stride;
75  u32 ctrl_config;
76  struct msix_entry *entry;
77  struct nvme_bar __iomem *bar;
78  struct list_head namespaces;
79  char serial[20];
80  char model[40];
81  char firmware_rev[8];
82  u32 max_hw_sectors;
83 };
84 
85 /*
86  * An NVM Express namespace is equivalent to a SCSI LUN
87  */
88 struct nvme_ns {
89  struct list_head list;
90 
91  struct nvme_dev *dev;
92  struct request_queue *queue;
93  struct gendisk *disk;
94 
95  int ns_id;
96  int lba_shift;
97 };
98 
99 /*
100  * An NVM Express queue. Each device has at least two (one for admin
101  * commands and one for I/O commands).
102  */
103 struct nvme_queue {
104  struct device *q_dmadev;
105  struct nvme_dev *dev;
106  spinlock_t q_lock;
107  struct nvme_command *sq_cmds;
108  volatile struct nvme_completion *cqes;
109  dma_addr_t sq_dma_addr;
110  dma_addr_t cq_dma_addr;
111  wait_queue_head_t sq_full;
112  wait_queue_t sq_cong_wait;
113  struct bio_list sq_cong;
114  u32 __iomem *q_db;
115  u16 q_depth;
116  u16 cq_vector;
117  u16 sq_head;
118  u16 sq_tail;
119  u16 cq_head;
120  u16 cq_phase;
121  unsigned long cmdid_data[];
122 };
123 
124 /*
125  * Check we didn't inadvertently grow the command struct
126  */
127 static inline void _nvme_check_size(void)
128 {
129  BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
130  BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
131  BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
132  BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
133  BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
134  BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
135  BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
136  BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
137  BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
138 }
139 
140 typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
141  struct nvme_completion *);
142 
143 struct nvme_cmd_info {
144  nvme_completion_fn fn;
145  void *ctx;
146  unsigned long timeout;
147 };
148 
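/*
 * The cmdid_data[] array at the tail of each nvme_queue holds a bitmap of
 * q_depth bits tracking which command IDs are in use, followed by an array
 * of q_depth nvme_cmd_info entries. nvme_cmd_info() below simply skips
 * over the bitmap to reach that array.
 */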
149 static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
150 {
151  return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
152 }
153 
169 static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
170  nvme_completion_fn handler, unsigned timeout)
171 {
172  int depth = nvmeq->q_depth - 1;
173  struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
174  int cmdid;
175 
176  do {
177  cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
178  if (cmdid >= depth)
179  return -EBUSY;
180  } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
181 
182  info[cmdid].fn = handler;
183  info[cmdid].ctx = ctx;
184  info[cmdid].timeout = jiffies + timeout;
185  return cmdid;
186 }
187 
188 static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
189  nvme_completion_fn handler, unsigned timeout)
190 {
191  int cmdid;
192  wait_event_killable(nvmeq->sq_full,
193  (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
194  return (cmdid < 0) ? -EINTR : cmdid;
195 }
196 
197 /* Special values must be less than 0x1000 */
198 #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
199 #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
200 #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
201 #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
202 #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
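/*
 * Roughly speaking, these sentinels take the place of a real ctx pointer in
 * the nvme_cmd_info array; deriving them from POISON_POINTER_DELTA keeps
 * them outside the range of valid pointers, and special_completion() below
 * handles each of them.
 */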
203 
204 static void special_completion(struct nvme_dev *dev, void *ctx,
205  struct nvme_completion *cqe)
206 {
207  if (ctx == CMD_CTX_CANCELLED)
208  return;
209  if (ctx == CMD_CTX_FLUSH)
210  return;
211  if (ctx == CMD_CTX_COMPLETED) {
212  dev_warn(&dev->pci_dev->dev,
213  "completed id %d twice on queue %d\n",
214  cqe->command_id, le16_to_cpup(&cqe->sq_id));
215  return;
216  }
217  if (ctx == CMD_CTX_INVALID) {
218  dev_warn(&dev->pci_dev->dev,
219  "invalid id %d completed on queue %d\n",
220  cqe->command_id, le16_to_cpup(&cqe->sq_id));
221  return;
222  }
223 
224  dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
225 }
226 
227 /*
228  * Called with local interrupts disabled and the q_lock held. May not sleep.
229  */
230 static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
231  nvme_completion_fn *fn)
232 {
233  void *ctx;
234  struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
235 
236  if (cmdid >= nvmeq->q_depth) {
237  *fn = special_completion;
238  return CMD_CTX_INVALID;
239  }
240  *fn = info[cmdid].fn;
241  ctx = info[cmdid].ctx;
242  info[cmdid].fn = special_completion;
243  info[cmdid].ctx = CMD_CTX_COMPLETED;
244  clear_bit(cmdid, nvmeq->cmdid_data);
245  wake_up(&nvmeq->sq_full);
246  return ctx;
247 }
248 
249 static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
250  nvme_completion_fn *fn)
251 {
252  void *ctx;
253  struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
254  if (fn)
255  *fn = info[cmdid].fn;
256  ctx = info[cmdid].ctx;
257  info[cmdid].fn = special_completion;
258  info[cmdid].ctx = CMD_CTX_CANCELLED;
259  return ctx;
260 }
261 
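/*
 * Queue 0 is the admin queue; I/O queues start at index 1. get_nvmeq() maps
 * the calling CPU to one of the I/O queues, using get_cpu() so preemption
 * stays disabled while the queue is in use; put_nvmeq() re-enables it. The
 * usual pattern, as in nvme_make_request() below, is simply:
 *
 *	struct nvme_queue *nvmeq = get_nvmeq(dev);
 *	... submit or queue the bio under nvmeq->q_lock ...
 *	put_nvmeq(nvmeq);
 */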
262 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
263 {
264  return dev->queues[get_cpu() + 1];
265 }
266 
267 static void put_nvmeq(struct nvme_queue *nvmeq)
268 {
269  put_cpu();
270 }
271 
279 static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
280 {
281  unsigned long flags;
282  u16 tail;
283  spin_lock_irqsave(&nvmeq->q_lock, flags);
284  tail = nvmeq->sq_tail;
285  memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
286  if (++tail == nvmeq->q_depth)
287  tail = 0;
288  writel(tail, nvmeq->q_db);
289  nvmeq->sq_tail = tail;
290  spin_unlock_irqrestore(&nvmeq->q_lock, flags);
291 
292  return 0;
293 }
294 
295 /*
296  * The nvme_iod describes the data in an I/O, including the list of PRP
297  * entries. You can't see it in this data structure because C doesn't let
298  * me express that. Use nvme_alloc_iod to ensure there's enough space
299  * allocated to store the PRP list.
300  */
301 struct nvme_iod {
302  void *private; /* For the use of the submitter of the I/O */
303  int npages; /* In the PRP list. 0 means small pool in use */
304  int offset; /* Of PRP list */
305  int nents; /* Used in scatterlist */
306  int length; /* Of data, in bytes */
307  dma_addr_t first_dma;
308  struct scatterlist sg[0];
309 };
310 
311 static __le64 **iod_list(struct nvme_iod *iod)
312 {
313  return ((void *)iod) + iod->offset;
314 }
315 
316 /*
317  * Will slightly overestimate the number of pages needed. This is OK
318  * as it only leads to a small amount of wasted memory for the lifetime of
319  * the I/O.
320  */
321 static int nvme_npages(unsigned size)
322 {
323  unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
324  return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
325 }
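/*
 * As a rough worked example with 4KB pages: a 64KB request yields
 * nprps = DIV_ROUND_UP(64KB + 4KB, 4KB) = 17 entries, i.e. 136 bytes of
 * list space, which rounds up to a single PRP-list page; only requests of
 * roughly 2MB or more need a second, chained page.
 */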
326 
327 static struct nvme_iod *
328 nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
329 {
330  struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
331  sizeof(__le64 *) * nvme_npages(nbytes) +
332  sizeof(struct scatterlist) * nseg, gfp);
333 
334  if (iod) {
335  iod->offset = offsetof(struct nvme_iod, sg[nseg]);
336  iod->npages = -1;
337  iod->length = nbytes;
338  }
339 
340  return iod;
341 }
342 
343 static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
344 {
345  const int last_prp = PAGE_SIZE / 8 - 1;
346  int i;
347  __le64 **list = iod_list(iod);
348  dma_addr_t prp_dma = iod->first_dma;
349 
350  if (iod->npages == 0)
351  dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
352  for (i = 0; i < iod->npages; i++) {
353  __le64 *prp_list = list[i];
354  dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
355  dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
356  prp_dma = next_prp_dma;
357  }
358  kfree(iod);
359 }
360 
361 static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
362 {
363  struct nvme_queue *nvmeq = get_nvmeq(dev);
364  if (bio_list_empty(&nvmeq->sq_cong))
365  add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
366  bio_list_add(&nvmeq->sq_cong, bio);
367  put_nvmeq(nvmeq);
368  wake_up_process(nvme_thread);
369 }
370 
371 static void bio_completion(struct nvme_dev *dev, void *ctx,
372  struct nvme_completion *cqe)
373 {
374  struct nvme_iod *iod = ctx;
375  struct bio *bio = iod->private;
376  u16 status = le16_to_cpup(&cqe->status) >> 1;
377 
378  dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
379  bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
380  nvme_free_iod(dev, iod);
381  if (status) {
382  bio_endio(bio, -EIO);
383  } else if (bio->bi_vcnt > bio->bi_idx) {
384  requeue_bio(dev, bio);
385  } else {
386  bio_endio(bio, 0);
387  }
388 }
389 
390 /* length is in bytes. gfp flags indicates whether we may sleep. */
391 static int nvme_setup_prps(struct nvme_dev *dev,
392  struct nvme_common_command *cmd, struct nvme_iod *iod,
393  int total_len, gfp_t gfp)
394 {
395  struct dma_pool *pool;
396  int length = total_len;
397  struct scatterlist *sg = iod->sg;
398  int dma_len = sg_dma_len(sg);
399  u64 dma_addr = sg_dma_address(sg);
400  int offset = offset_in_page(dma_addr);
401  __le64 *prp_list;
402  __le64 **list = iod_list(iod);
403  dma_addr_t prp_dma;
404  int nprps, i;
405 
406  cmd->prp1 = cpu_to_le64(dma_addr);
407  length -= (PAGE_SIZE - offset);
408  if (length <= 0)
409  return total_len;
410 
411  dma_len -= (PAGE_SIZE - offset);
412  if (dma_len) {
413  dma_addr += (PAGE_SIZE - offset);
414  } else {
415  sg = sg_next(sg);
416  dma_addr = sg_dma_address(sg);
417  dma_len = sg_dma_len(sg);
418  }
419 
420  if (length <= PAGE_SIZE) {
421  cmd->prp2 = cpu_to_le64(dma_addr);
422  return total_len;
423  }
424 
425  nprps = DIV_ROUND_UP(length, PAGE_SIZE);
426  if (nprps <= (256 / 8)) {
427  pool = dev->prp_small_pool;
428  iod->npages = 0;
429  } else {
430  pool = dev->prp_page_pool;
431  iod->npages = 1;
432  }
433 
434  prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
435  if (!prp_list) {
436  cmd->prp2 = cpu_to_le64(dma_addr);
437  iod->npages = -1;
438  return (total_len - length) + PAGE_SIZE;
439  }
440  list[0] = prp_list;
441  iod->first_dma = prp_dma;
442  cmd->prp2 = cpu_to_le64(prp_dma);
443  i = 0;
444  for (;;) {
445  if (i == PAGE_SIZE / 8) {
446  __le64 *old_prp_list = prp_list;
447  prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
448  if (!prp_list)
449  return total_len - length;
450  list[iod->npages++] = prp_list;
451  prp_list[0] = old_prp_list[i - 1];
452  old_prp_list[i - 1] = cpu_to_le64(prp_dma);
453  i = 1;
454  }
455  prp_list[i++] = cpu_to_le64(dma_addr);
456  dma_len -= PAGE_SIZE;
457  dma_addr += PAGE_SIZE;
458  length -= PAGE_SIZE;
459  if (length <= 0)
460  break;
461  if (dma_len > 0)
462  continue;
463  BUG_ON(dma_len < 0);
464  sg = sg_next(sg);
465  dma_addr = sg_dma_address(sg);
466  dma_len = sg_dma_len(sg);
467  }
468 
469  return total_len;
470 }
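/*
 * Note on the list built above: when more than one PRP-list page is needed,
 * the last slot of a full page becomes a chain pointer. The entry that
 * occupied it is copied into slot 0 of the freshly allocated page and the
 * old slot is overwritten with that page's DMA address; nvme_free_iod()
 * walks the same chain through prp_list[last_prp] when tearing it down.
 */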
471 
472 /* NVMe scatterlists require no holes in the virtual address */
473 #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
474  (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
475 
476 static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
477  struct bio *bio, enum dma_data_direction dma_dir, int psegs)
478 {
479  struct bio_vec *bvec, *bvprv = NULL;
480  struct scatterlist *sg = NULL;
481  int i, old_idx, length = 0, nsegs = 0;
482 
483  sg_init_table(iod->sg, psegs);
484  old_idx = bio->bi_idx;
485  bio_for_each_segment(bvec, bio, i) {
486  if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
487  sg->length += bvec->bv_len;
488  } else {
489  if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
490  break;
491  sg = sg ? sg + 1 : iod->sg;
492  sg_set_page(sg, bvec->bv_page, bvec->bv_len,
493  bvec->bv_offset);
494  nsegs++;
495  }
496  length += bvec->bv_len;
497  bvprv = bvec;
498  }
499  bio->bi_idx = i;
500  iod->nents = nsegs;
501  sg_mark_end(sg);
502  if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
503  bio->bi_idx = old_idx;
504  return -ENOMEM;
505  }
506  return length;
507 }
508 
509 static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
510  int cmdid)
511 {
512  struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
513 
514  memset(cmnd, 0, sizeof(*cmnd));
515  cmnd->common.opcode = nvme_cmd_flush;
516  cmnd->common.command_id = cmdid;
517  cmnd->common.nsid = cpu_to_le32(ns->ns_id);
518 
519  if (++nvmeq->sq_tail == nvmeq->q_depth)
520  nvmeq->sq_tail = 0;
521  writel(nvmeq->sq_tail, nvmeq->q_db);
522 
523  return 0;
524 }
525 
526 static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
527 {
528  int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
529  special_completion, NVME_IO_TIMEOUT);
530  if (unlikely(cmdid < 0))
531  return cmdid;
532 
533  return nvme_submit_flush(nvmeq, ns, cmdid);
534 }
535 
536 /*
537  * Called with local interrupts disabled and the q_lock held. May not sleep.
538  */
539 static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
540  struct bio *bio)
541 {
542  struct nvme_command *cmnd;
543  struct nvme_iod *iod;
544  enum dma_data_direction dma_dir;
545  int cmdid, length, result = -ENOMEM;
546  u16 control;
547  u32 dsmgmt;
548  int psegs = bio_phys_segments(ns->queue, bio);
549 
550  if ((bio->bi_rw & REQ_FLUSH) && psegs) {
551  result = nvme_submit_flush_data(nvmeq, ns);
552  if (result)
553  return result;
554  }
555 
556  iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC);
557  if (!iod)
558  goto nomem;
559  iod->private = bio;
560 
561  result = -EBUSY;
562  cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
563  if (unlikely(cmdid < 0))
564  goto free_iod;
565 
566  if ((bio->bi_rw & REQ_FLUSH) && !psegs)
567  return nvme_submit_flush(nvmeq, ns, cmdid);
568 
569  control = 0;
570  if (bio->bi_rw & REQ_FUA)
571  control |= NVME_RW_FUA;
572  if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
573  control |= NVME_RW_LR;
574 
575  dsmgmt = 0;
576  if (bio->bi_rw & REQ_RAHEAD)
577  dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
578 
579  cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
580 
581  memset(cmnd, 0, sizeof(*cmnd));
582  if (bio_data_dir(bio)) {
583  cmnd->rw.opcode = nvme_cmd_write;
584  dma_dir = DMA_TO_DEVICE;
585  } else {
586  cmnd->rw.opcode = nvme_cmd_read;
587  dma_dir = DMA_FROM_DEVICE;
588  }
589 
590  result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
591  if (result < 0)
592  goto free_iod;
593  length = result;
594 
595  cmnd->rw.command_id = cmdid;
596  cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
597  length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
598  GFP_ATOMIC);
599  cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
600  cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
601  cmnd->rw.control = cpu_to_le16(control);
602  cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
603 
604  bio->bi_sector += length >> 9;
605 
606  if (++nvmeq->sq_tail == nvmeq->q_depth)
607  nvmeq->sq_tail = 0;
608  writel(nvmeq->sq_tail, nvmeq->q_db);
609 
610  return 0;
611 
612  free_iod:
613  nvme_free_iod(nvmeq->dev, iod);
614  nomem:
615  return result;
616 }
617 
618 static void nvme_make_request(struct request_queue *q, struct bio *bio)
619 {
620  struct nvme_ns *ns = q->queuedata;
621  struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
622  int result = -EBUSY;
623 
624  spin_lock_irq(&nvmeq->q_lock);
625  if (bio_list_empty(&nvmeq->sq_cong))
626  result = nvme_submit_bio_queue(nvmeq, ns, bio);
627  if (unlikely(result)) {
628  if (bio_list_empty(&nvmeq->sq_cong))
629  add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
630  bio_list_add(&nvmeq->sq_cong, bio);
631  }
632 
633  spin_unlock_irq(&nvmeq->q_lock);
634  put_nvmeq(nvmeq);
635 }
636 
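/*
 * Completion entries are validated with the phase tag: bit 0 of the status
 * field must match cq_phase for the entry to be new. The controller flips
 * the phase it writes on every pass through the ring, so cq_phase is
 * inverted each time the head wraps back to zero.
 */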
637 static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
638 {
639  u16 head, phase;
640 
641  head = nvmeq->cq_head;
642  phase = nvmeq->cq_phase;
643 
644  for (;;) {
645  void *ctx;
646  nvme_completion_fn fn;
647  struct nvme_completion cqe = nvmeq->cqes[head];
648  if ((le16_to_cpu(cqe.status) & 1) != phase)
649  break;
650  nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
651  if (++head == nvmeq->q_depth) {
652  head = 0;
653  phase = !phase;
654  }
655 
656  ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
657  fn(nvmeq->dev, ctx, &cqe);
658  }
659 
660  /* If the controller ignores the cq head doorbell and continuously
661  * writes to the queue, it is theoretically possible to wrap around
662  * the queue twice and mistakenly return IRQ_NONE. Linux only
663  * requires that 0.1% of your interrupts are handled, so this isn't
664  * a big problem.
665  */
666  if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
667  return IRQ_NONE;
668 
669  writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
670  nvmeq->cq_head = head;
671  nvmeq->cq_phase = phase;
672 
673  return IRQ_HANDLED;
674 }
675 
676 static irqreturn_t nvme_irq(int irq, void *data)
677 {
678  irqreturn_t result;
679  struct nvme_queue *nvmeq = data;
680  spin_lock(&nvmeq->q_lock);
681  result = nvme_process_cq(nvmeq);
682  spin_unlock(&nvmeq->q_lock);
683  return result;
684 }
685 
686 static irqreturn_t nvme_irq_check(int irq, void *data)
687 {
688  struct nvme_queue *nvmeq = data;
689  struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
690  if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
691  return IRQ_NONE;
692  return IRQ_WAKE_THREAD;
693 }
694 
695 static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
696 {
697  spin_lock_irq(&nvmeq->q_lock);
698  cancel_cmdid(nvmeq, cmdid, NULL);
699  spin_unlock_irq(&nvmeq->q_lock);
700 }
701 
702 struct sync_cmd_info {
703  struct task_struct *task;
704  u32 result;
705  int status;
706 };
707 
708 static void sync_completion(struct nvme_dev *dev, void *ctx,
709  struct nvme_completion *cqe)
710 {
711  struct sync_cmd_info *cmdinfo = ctx;
712  cmdinfo->result = le32_to_cpup(&cqe->result);
713  cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
714  wake_up_process(cmdinfo->task);
715 }
716 
717 /*
718  * Returns 0 on success. If the result is negative, it's a Linux error code;
719  * if the result is positive, it's an NVM Express status code
720  */
721 static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
722  struct nvme_command *cmd, u32 *result, unsigned timeout)
723 {
724  int cmdid;
725  struct sync_cmd_info cmdinfo;
726 
727  cmdinfo.task = current;
728  cmdinfo.status = -EINTR;
729 
730  cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
731  timeout);
732  if (cmdid < 0)
733  return cmdid;
734  cmd->common.command_id = cmdid;
735 
736  set_current_state(TASK_KILLABLE);
737  nvme_submit_cmd(nvmeq, cmd);
738  schedule();
739 
740  if (cmdinfo.status == -EINTR) {
741  nvme_abort_command(nvmeq, cmdid);
742  return -EINTR;
743  }
744 
745  if (result)
746  *result = cmdinfo.result;
747 
748  return cmdinfo.status;
749 }
750 
751 static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
752  u32 *result)
753 {
754  return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
755 }
756 
757 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
758 {
759  int status;
760  struct nvme_command c;
761 
762  memset(&c, 0, sizeof(c));
763  c.delete_queue.opcode = opcode;
764  c.delete_queue.qid = cpu_to_le16(id);
765 
766  status = nvme_submit_admin_cmd(dev, &c, NULL);
767  if (status)
768  return -EIO;
769  return 0;
770 }
771 
772 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
773  struct nvme_queue *nvmeq)
774 {
775  int status;
776  struct nvme_command c;
777  int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
778 
779  memset(&c, 0, sizeof(c));
780  c.create_cq.opcode = nvme_admin_create_cq;
781  c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
782  c.create_cq.cqid = cpu_to_le16(qid);
783  c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
784  c.create_cq.cq_flags = cpu_to_le16(flags);
785  c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
786 
787  status = nvme_submit_admin_cmd(dev, &c, NULL);
788  if (status)
789  return -EIO;
790  return 0;
791 }
792 
793 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
794  struct nvme_queue *nvmeq)
795 {
796  int status;
797  struct nvme_command c;
798  int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
799 
800  memset(&c, 0, sizeof(c));
801  c.create_sq.opcode = nvme_admin_create_sq;
802  c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
803  c.create_sq.sqid = cpu_to_le16(qid);
804  c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
805  c.create_sq.sq_flags = cpu_to_le16(flags);
806  c.create_sq.cqid = cpu_to_le16(qid);
807 
808  status = nvme_submit_admin_cmd(dev, &c, NULL);
809  if (status)
810  return -EIO;
811  return 0;
812 }
813 
814 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
815 {
816  return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
817 }
818 
819 static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
820 {
821  return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
822 }
823 
824 static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
825  dma_addr_t dma_addr)
826 {
827  struct nvme_command c;
828 
829  memset(&c, 0, sizeof(c));
830  c.identify.opcode = nvme_admin_identify;
831  c.identify.nsid = cpu_to_le32(nsid);
832  c.identify.prp1 = cpu_to_le64(dma_addr);
833  c.identify.cns = cpu_to_le32(cns);
834 
835  return nvme_submit_admin_cmd(dev, &c, NULL);
836 }
837 
838 static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
839  unsigned nsid, dma_addr_t dma_addr)
840 {
841  struct nvme_command c;
842 
843  memset(&c, 0, sizeof(c));
844  c.features.opcode = nvme_admin_get_features;
845  c.features.nsid = cpu_to_le32(nsid);
846  c.features.prp1 = cpu_to_le64(dma_addr);
847  c.features.fid = cpu_to_le32(fid);
848 
849  return nvme_submit_admin_cmd(dev, &c, NULL);
850 }
851 
852 static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
853  unsigned dword11, dma_addr_t dma_addr, u32 *result)
854 {
855  struct nvme_command c;
856 
857  memset(&c, 0, sizeof(c));
858  c.features.opcode = nvme_admin_set_features;
859  c.features.prp1 = cpu_to_le64(dma_addr);
860  c.features.fid = cpu_to_le32(fid);
861  c.features.dword11 = cpu_to_le32(dword11);
862 
863  return nvme_submit_admin_cmd(dev, &c, result);
864 }
865 
871 static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
872 {
873  int depth = nvmeq->q_depth - 1;
874  struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
875  unsigned long now = jiffies;
876  int cmdid;
877 
878  for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
879  void *ctx;
880  nvme_completion_fn fn;
881  static struct nvme_completion cqe = {
882  .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
883  };
884 
885  if (timeout && !time_after(now, info[cmdid].timeout))
886  continue;
887  dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
888  ctx = cancel_cmdid(nvmeq, cmdid, &fn);
889  fn(nvmeq->dev, ctx, &cqe);
890  }
891 }
892 
893 static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
894 {
895  dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
896  (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
897  dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
898  nvmeq->sq_cmds, nvmeq->sq_dma_addr);
899  kfree(nvmeq);
900 }
901 
902 static void nvme_free_queue(struct nvme_dev *dev, int qid)
903 {
904  struct nvme_queue *nvmeq = dev->queues[qid];
905  int vector = dev->entry[nvmeq->cq_vector].vector;
906 
907  spin_lock_irq(&nvmeq->q_lock);
908  nvme_cancel_ios(nvmeq, false);
909  spin_unlock_irq(&nvmeq->q_lock);
910 
911  irq_set_affinity_hint(vector, NULL);
912  free_irq(vector, nvmeq);
913 
914  /* Don't tell the adapter to delete the admin queue */
915  if (qid) {
916  adapter_delete_sq(dev, qid);
917  adapter_delete_cq(dev, qid);
918  }
919 
920  nvme_free_queue_mem(nvmeq);
921 }
922 
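/*
 * Each queue owns a pair of doorbells in the BAR: the submission queue tail
 * at dbs[qid << (db_stride + 1)] and the completion queue head one stride
 * (1 << db_stride 32-bit registers) beyond it, which is the offset used
 * when nvme_process_cq() writes the head back.
 */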
923 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
924  int depth, int vector)
925 {
926  struct device *dmadev = &dev->pci_dev->dev;
927  unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
928  sizeof(struct nvme_cmd_info));
929  struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
930  if (!nvmeq)
931  return NULL;
932 
933  nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
934  &nvmeq->cq_dma_addr, GFP_KERNEL);
935  if (!nvmeq->cqes)
936  goto free_nvmeq;
937  memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
938 
939  nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
940  &nvmeq->sq_dma_addr, GFP_KERNEL);
941  if (!nvmeq->sq_cmds)
942  goto free_cqdma;
943 
944  nvmeq->q_dmadev = dmadev;
945  nvmeq->dev = dev;
946  spin_lock_init(&nvmeq->q_lock);
947  nvmeq->cq_head = 0;
948  nvmeq->cq_phase = 1;
949  init_waitqueue_head(&nvmeq->sq_full);
950  init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
951  bio_list_init(&nvmeq->sq_cong);
952  nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
953  nvmeq->q_depth = depth;
954  nvmeq->cq_vector = vector;
955 
956  return nvmeq;
957 
958  free_cqdma:
959  dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
960  nvmeq->cq_dma_addr);
961  free_nvmeq:
962  kfree(nvmeq);
963  return NULL;
964 }
965 
966 static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
967  const char *name)
968 {
969  if (use_threaded_interrupts)
970  return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
971  nvme_irq_check, nvme_irq,
972  IRQF_DISABLED | IRQF_SHARED,
973  name, nvmeq);
974  return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
975  IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
976 }
977 
978 static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
979  int qid, int cq_size, int vector)
980 {
981  int result;
982  struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
983 
984  if (!nvmeq)
985  return ERR_PTR(-ENOMEM);
986 
987  result = adapter_alloc_cq(dev, qid, nvmeq);
988  if (result < 0)
989  goto free_nvmeq;
990 
991  result = adapter_alloc_sq(dev, qid, nvmeq);
992  if (result < 0)
993  goto release_cq;
994 
995  result = queue_request_irq(dev, nvmeq, "nvme");
996  if (result < 0)
997  goto release_sq;
998 
999  return nvmeq;
1000 
1001  release_sq:
1002  adapter_delete_sq(dev, qid);
1003  release_cq:
1004  adapter_delete_cq(dev, qid);
1005  free_nvmeq:
1006  dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1007  (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1008  dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1009  nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1010  kfree(nvmeq);
1011  return ERR_PTR(result);
1012 }
1013 
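/*
 * The admin queue is created with a fixed depth of 64 entries. The AQA
 * register takes the submission and completion queue sizes as zeroes-based
 * values in its low and high halves, which is why q_depth - 1 is ORed into
 * both halves of aqa below.
 */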
1014 static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
1015 {
1016  int result = 0;
1017  u32 aqa;
1018  u64 cap;
1019  unsigned long timeout;
1020  struct nvme_queue *nvmeq;
1021 
1022  dev->dbs = ((void __iomem *)dev->bar) + 4096;
1023 
1024  nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
1025  if (!nvmeq)
1026  return -ENOMEM;
1027 
1028  aqa = nvmeq->q_depth - 1;
1029  aqa |= aqa << 16;
1030 
1031  dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
1032  dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
1033  dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1034  dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1035 
1036  writel(0, &dev->bar->cc);
1037  writel(aqa, &dev->bar->aqa);
1038  writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1039  writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1040  writel(dev->ctrl_config, &dev->bar->cc);
1041 
1042  cap = readq(&dev->bar->cap);
1043  timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1044  dev->db_stride = NVME_CAP_STRIDE(cap);
1045 
1046  while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
1047  msleep(100);
1048  if (fatal_signal_pending(current))
1049  result = -EINTR;
1050  if (time_after(jiffies, timeout)) {
1051  dev_err(&dev->pci_dev->dev,
1052  "Device not ready; aborting initialisation\n");
1053  result = -ENODEV;
1054  }
1055  }
1056 
1057  if (result) {
1058  nvme_free_queue_mem(nvmeq);
1059  return result;
1060  }
1061 
1062  result = queue_request_irq(dev, nvmeq, "nvme admin");
1063  dev->queues[0] = nvmeq;
1064  return result;
1065 }
1066 
1067 static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1068  unsigned long addr, unsigned length)
1069 {
1070  int i, err, count, nents, offset;
1071  struct scatterlist *sg;
1072  struct page **pages;
1073  struct nvme_iod *iod;
1074 
1075  if (addr & 3)
1076  return ERR_PTR(-EINVAL);
1077  if (!length)
1078  return ERR_PTR(-EINVAL);
1079 
1080  offset = offset_in_page(addr);
1081  count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
1082  pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
1083  if (!pages)
1084  return ERR_PTR(-ENOMEM);
1085 
1086  err = get_user_pages_fast(addr, count, 1, pages);
1087  if (err < count) {
1088  count = err;
1089  err = -EFAULT;
1090  goto put_pages;
1091  }
1092 
1093  iod = nvme_alloc_iod(count, length, GFP_KERNEL);
1094  sg = iod->sg;
1095  sg_init_table(sg, count);
1096  for (i = 0; i < count; i++) {
1097  sg_set_page(&sg[i], pages[i],
1098  min_t(int, length, PAGE_SIZE - offset), offset);
1099  length -= (PAGE_SIZE - offset);
1100  offset = 0;
1101  }
1102  sg_mark_end(&sg[i - 1]);
1103  iod->nents = count;
1104 
1105  err = -ENOMEM;
1106  nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
1107  write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1108  if (!nents)
1109  goto free_iod;
1110 
1111  kfree(pages);
1112  return iod;
1113 
1114  free_iod:
1115  kfree(iod);
1116  put_pages:
1117  for (i = 0; i < count; i++)
1118  put_page(pages[i]);
1119  kfree(pages);
1120  return ERR_PTR(err);
1121 }
1122 
1123 static void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
1124  struct nvme_iod *iod)
1125 {
1126  int i;
1127 
1128  dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
1129  write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1130 
1131  for (i = 0; i < iod->nents; i++)
1132  put_page(sg_page(&iod->sg[i]));
1133 }
1134 
1135 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1136 {
1137  struct nvme_dev *dev = ns->dev;
1138  struct nvme_queue *nvmeq;
1139  struct nvme_user_io io;
1140  struct nvme_command c;
1141  unsigned length;
1142  int status;
1143  struct nvme_iod *iod;
1144 
1145  if (copy_from_user(&io, uio, sizeof(io)))
1146  return -EFAULT;
1147  length = (io.nblocks + 1) << ns->lba_shift;
1148 
1149  switch (io.opcode) {
1150  case nvme_cmd_write:
1151  case nvme_cmd_read:
1152  case nvme_cmd_compare:
1153  iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
1154  break;
1155  default:
1156  return -EINVAL;
1157  }
1158 
1159  if (IS_ERR(iod))
1160  return PTR_ERR(iod);
1161 
1162  memset(&c, 0, sizeof(c));
1163  c.rw.opcode = io.opcode;
1164  c.rw.flags = io.flags;
1165  c.rw.nsid = cpu_to_le32(ns->ns_id);
1166  c.rw.slba = cpu_to_le64(io.slba);
1167  c.rw.length = cpu_to_le16(io.nblocks);
1168  c.rw.control = cpu_to_le16(io.control);
1169  c.rw.dsmgmt = cpu_to_le16(io.dsmgmt);
1170  c.rw.reftag = io.reftag;
1171  c.rw.apptag = io.apptag;
1172  c.rw.appmask = io.appmask;
1173  /* XXX: metadata */
1174  length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
1175 
1176  nvmeq = get_nvmeq(dev);
1177  /*
1178  * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
1179  * disabled. We may be preempted at any point, and be rescheduled
1180  * to a different CPU. That will cause cacheline bouncing, but no
1181  * additional races since q_lock already protects against other CPUs.
1182  */
1183  put_nvmeq(nvmeq);
1184  if (length != (io.nblocks + 1) << ns->lba_shift)
1185  status = -ENOMEM;
1186  else
1187  status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
1188 
1189  nvme_unmap_user_pages(dev, io.opcode & 1, iod);
1190  nvme_free_iod(dev, iod);
1191  return status;
1192 }
1193 
1194 static int nvme_user_admin_cmd(struct nvme_dev *dev,
1195  struct nvme_admin_cmd __user *ucmd)
1196 {
1197  struct nvme_admin_cmd cmd;
1198  struct nvme_command c;
1199  int status, length;
1200  struct nvme_iod *uninitialized_var(iod);
1201 
1202  if (!capable(CAP_SYS_ADMIN))
1203  return -EACCES;
1204  if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1205  return -EFAULT;
1206 
1207  memset(&c, 0, sizeof(c));
1208  c.common.opcode = cmd.opcode;
1209  c.common.flags = cmd.flags;
1210  c.common.nsid = cpu_to_le32(cmd.nsid);
1211  c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1212  c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1213  c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1214  c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1215  c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1216  c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1217  c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1218  c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1219 
1220  length = cmd.data_len;
1221  if (cmd.data_len) {
1222  iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
1223  length);
1224  if (IS_ERR(iod))
1225  return PTR_ERR(iod);
1226  length = nvme_setup_prps(dev, &c.common, iod, length,
1227  GFP_KERNEL);
1228  }
1229 
1230  if (length != cmd.data_len)
1231  status = -ENOMEM;
1232  else
1233  status = nvme_submit_admin_cmd(dev, &c, NULL);
1234 
1235  if (cmd.data_len) {
1236  nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
1237  nvme_free_iod(dev, iod);
1238  }
1239  return status;
1240 }
1241 
1242 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1243  unsigned long arg)
1244 {
1245  struct nvme_ns *ns = bdev->bd_disk->private_data;
1246 
1247  switch (cmd) {
1248  case NVME_IOCTL_ID:
1249  return ns->ns_id;
1250  case NVME_IOCTL_ADMIN_CMD:
1251  return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
1252  case NVME_IOCTL_SUBMIT_IO:
1253  return nvme_submit_io(ns, (void __user *)arg);
1254  default:
1255  return -ENOTTY;
1256  }
1257 }
1258 
1259 static const struct block_device_operations nvme_fops = {
1260  .owner = THIS_MODULE,
1261  .ioctl = nvme_ioctl,
1262  .compat_ioctl = nvme_ioctl,
1263 };
1264 
1265 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1266 {
1267  while (bio_list_peek(&nvmeq->sq_cong)) {
1268  struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1269  struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
1270  if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
1271  bio_list_add_head(&nvmeq->sq_cong, bio);
1272  break;
1273  }
1274  if (bio_list_empty(&nvmeq->sq_cong))
1275  remove_wait_queue(&nvmeq->sq_full,
1276  &nvmeq->sq_cong_wait);
1277  }
1278 }
1279 
1280 static int nvme_kthread(void *data)
1281 {
1282  struct nvme_dev *dev;
1283 
1284  while (!kthread_should_stop()) {
1285  set_current_state(TASK_INTERRUPTIBLE);
1286  spin_lock(&dev_list_lock);
1287  list_for_each_entry(dev, &dev_list, node) {
1288  int i;
1289  for (i = 0; i < dev->queue_count; i++) {
1290  struct nvme_queue *nvmeq = dev->queues[i];
1291  if (!nvmeq)
1292  continue;
1293  spin_lock_irq(&nvmeq->q_lock);
1294  if (nvme_process_cq(nvmeq))
1295  printk("process_cq did something\n");
1296  nvme_cancel_ios(nvmeq, true);
1297  nvme_resubmit_bios(nvmeq);
1298  spin_unlock_irq(&nvmeq->q_lock);
1299  }
1300  }
1301  spin_unlock(&dev_list_lock);
1302 
1303  schedule_timeout(round_jiffies_relative(HZ));
1304  }
1305  return 0;
1306 }
1307 
1308 static DEFINE_IDA(nvme_index_ida);
1309 
1310 static int nvme_get_ns_idx(void)
1311 {
1312  int index, error;
1313 
1314  do {
1315  if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
1316  return -1;
1317 
1318  spin_lock(&dev_list_lock);
1319  error = ida_get_new(&nvme_index_ida, &index);
1320  spin_unlock(&dev_list_lock);
1321  } while (error == -EAGAIN);
1322 
1323  if (error)
1324  index = -1;
1325  return index;
1326 }
1327 
1328 static void nvme_put_ns_idx(int index)
1329 {
1330  spin_lock(&dev_list_lock);
1331  ida_remove(&nvme_index_ida, index);
1332  spin_unlock(&dev_list_lock);
1333 }
1334 
1335 static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
1336  struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
1337 {
1338  struct nvme_ns *ns;
1339  struct gendisk *disk;
1340  int lbaf;
1341 
1342  if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
1343  return NULL;
1344 
1345  ns = kzalloc(sizeof(*ns), GFP_KERNEL);
1346  if (!ns)
1347  return NULL;
1348  ns->queue = blk_alloc_queue(GFP_KERNEL);
1349  if (!ns->queue)
1350  goto out_free_ns;
1351  ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
1352  queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1353  queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1354 /* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */
1355  blk_queue_make_request(ns->queue, nvme_make_request);
1356  ns->dev = dev;
1357  ns->queue->queuedata = ns;
1358 
1359  disk = alloc_disk(NVME_MINORS);
1360  if (!disk)
1361  goto out_free_queue;
1362  ns->ns_id = nsid;
1363  ns->disk = disk;
1364  lbaf = id->flbas & 0xf;
1365  ns->lba_shift = id->lbaf[lbaf].ds;
1366  blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1367  if (dev->max_hw_sectors)
1368  blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
1369 
1370  disk->major = nvme_major;
1371  disk->minors = NVME_MINORS;
1372  disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
1373  disk->fops = &nvme_fops;
1374  disk->private_data = ns;
1375  disk->queue = ns->queue;
1376  disk->driverfs_dev = &dev->pci_dev->dev;
1377  sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
1378  set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1379 
1380  return ns;
1381 
1382  out_free_queue:
1383  blk_cleanup_queue(ns->queue);
1384  out_free_ns:
1385  kfree(ns);
1386  return NULL;
1387 }
1388 
1389 static void nvme_ns_free(struct nvme_ns *ns)
1390 {
1391  int index = ns->disk->first_minor / NVME_MINORS;
1392  put_disk(ns->disk);
1393  nvme_put_ns_idx(index);
1394  blk_cleanup_queue(ns->queue);
1395  kfree(ns);
1396 }
1397 
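/*
 * The queue count is negotiated with a Set Features command: dword11 packs
 * the requested submission and completion queue counts, each zeroes-based,
 * into its low and high 16 bits. Asking for 8 queues, for instance, sends
 * 0x00070007; if the controller grants only 4 of each, the result is
 * 0x00030003 and set_queue_count() returns 4.
 */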
1398 static int set_queue_count(struct nvme_dev *dev, int count)
1399 {
1400  int status;
1401  u32 result;
1402  u32 q_count = (count - 1) | ((count - 1) << 16);
1403 
1404  status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
1405  &result);
1406  if (status)
1407  return -EIO;
1408  return min(result & 0xffff, result >> 16) + 1;
1409 }
1410 
1411 static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
1412 {
1413  int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
1414 
1415  nr_io_queues = num_online_cpus();
1416  result = set_queue_count(dev, nr_io_queues);
1417  if (result < 0)
1418  return result;
1419  if (result < nr_io_queues)
1420  nr_io_queues = result;
1421 
1422  /* Deregister the admin queue's interrupt */
1423  free_irq(dev->entry[0].vector, dev->queues[0]);
1424 
1425  db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
1426  if (db_bar_size > 8192) {
1427  iounmap(dev->bar);
1428  dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0),
1429  db_bar_size);
1430  dev->dbs = ((void __iomem *)dev->bar) + 4096;
1431  dev->queues[0]->q_db = dev->dbs;
1432  }
1433 
1434  for (i = 0; i < nr_io_queues; i++)
1435  dev->entry[i].entry = i;
1436  for (;;) {
1437  result = pci_enable_msix(dev->pci_dev, dev->entry,
1438  nr_io_queues);
1439  if (result == 0) {
1440  break;
1441  } else if (result > 0) {
1442  nr_io_queues = result;
1443  continue;
1444  } else {
1445  nr_io_queues = 1;
1446  break;
1447  }
1448  }
1449 
1450  result = queue_request_irq(dev, dev->queues[0], "nvme admin");
1451  /* XXX: handle failure here */
1452 
1453  cpu = cpumask_first(cpu_online_mask);
1454  for (i = 0; i < nr_io_queues; i++) {
1455  irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
1456  cpu = cpumask_next(cpu, cpu_online_mask);
1457  }
1458 
1459  q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
1460  NVME_Q_DEPTH);
1461  for (i = 0; i < nr_io_queues; i++) {
1462  dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
1463  if (IS_ERR(dev->queues[i + 1]))
1464  return PTR_ERR(dev->queues[i + 1]);
1465  dev->queue_count++;
1466  }
1467 
1468  for (; i < num_possible_cpus(); i++) {
1469  int target = i % rounddown_pow_of_two(dev->queue_count - 1);
1470  dev->queues[i + 1] = dev->queues[target + 1];
1471  }
1472 
1473  return 0;
1474 }
1475 
1476 static void nvme_free_queues(struct nvme_dev *dev)
1477 {
1478  int i;
1479 
1480  for (i = dev->queue_count - 1; i >= 0; i--)
1481  nvme_free_queue(dev, i);
1482 }
1483 
1484 static int __devinit nvme_dev_add(struct nvme_dev *dev)
1485 {
1486  int res, nn, i;
1487  struct nvme_ns *ns, *next;
1488  struct nvme_id_ctrl *ctrl;
1489  struct nvme_id_ns *id_ns;
1490  void *mem;
1491  dma_addr_t dma_addr;
1492 
1493  res = nvme_setup_io_queues(dev);
1494  if (res)
1495  return res;
1496 
1497  mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
1498  GFP_KERNEL);
1499 
1500  res = nvme_identify(dev, 0, 1, dma_addr);
1501  if (res) {
1502  res = -EIO;
1503  goto out_free;
1504  }
1505 
1506  ctrl = mem;
1507  nn = le32_to_cpup(&ctrl->nn);
1508  memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
1509  memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
1510  memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
1511  if (ctrl->mdts) {
1512  int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
1513  dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
1514  }
1515 
1516  id_ns = mem;
1517  for (i = 1; i <= nn; i++) {
1518  res = nvme_identify(dev, i, 0, dma_addr);
1519  if (res)
1520  continue;
1521 
1522  if (id_ns->ncap == 0)
1523  continue;
1524 
1525  res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
1526  dma_addr + 4096);
1527  if (res)
1528  continue;
1529 
1530  ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
1531  if (ns)
1532  list_add_tail(&ns->list, &dev->namespaces);
1533  }
1534  list_for_each_entry(ns, &dev->namespaces, list)
1535  add_disk(ns->disk);
1536 
1537  goto out;
1538 
1539  out_free:
1540  list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1541  list_del(&ns->list);
1542  nvme_ns_free(ns);
1543  }
1544 
1545  out:
1546  dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
1547  return res;
1548 }
1549 
1550 static int nvme_dev_remove(struct nvme_dev *dev)
1551 {
1552  struct nvme_ns *ns, *next;
1553 
1554  spin_lock(&dev_list_lock);
1555  list_del(&dev->node);
1556  spin_unlock(&dev_list_lock);
1557 
1558  list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
1559  list_del(&ns->list);
1560  del_gendisk(ns->disk);
1561  nvme_ns_free(ns);
1562  }
1563 
1564  nvme_free_queues(dev);
1565 
1566  return 0;
1567 }
1568 
1569 static int nvme_setup_prp_pools(struct nvme_dev *dev)
1570 {
1571  struct device *dmadev = &dev->pci_dev->dev;
1572  dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
1573  PAGE_SIZE, PAGE_SIZE, 0);
1574  if (!dev->prp_page_pool)
1575  return -ENOMEM;
1576 
1577  /* Optimisation for I/Os between 4k and 128k */
1578  dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
1579  256, 256, 0);
1580  if (!dev->prp_small_pool) {
1581  dma_pool_destroy(dev->prp_page_pool);
1582  return -ENOMEM;
1583  }
1584  return 0;
1585 }
1586 
1587 static void nvme_release_prp_pools(struct nvme_dev *dev)
1588 {
1589  dma_pool_destroy(dev->prp_page_pool);
1590  dma_pool_destroy(dev->prp_small_pool);
1591 }
1592 
1593 static DEFINE_IDA(nvme_instance_ida);
1594 
1595 static int nvme_set_instance(struct nvme_dev *dev)
1596 {
1597  int instance, error;
1598 
1599  do {
1600  if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
1601  return -ENODEV;
1602 
1603  spin_lock(&dev_list_lock);
1604  error = ida_get_new(&nvme_instance_ida, &instance);
1605  spin_unlock(&dev_list_lock);
1606  } while (error == -EAGAIN);
1607 
1608  if (error)
1609  return -ENODEV;
1610 
1611  dev->instance = instance;
1612  return 0;
1613 }
1614 
1615 static void nvme_release_instance(struct nvme_dev *dev)
1616 {
1617  spin_lock(&dev_list_lock);
1618  ida_remove(&nvme_instance_ida, dev->instance);
1619  spin_unlock(&dev_list_lock);
1620 }
1621 
1622 static int __devinit nvme_probe(struct pci_dev *pdev,
1623  const struct pci_device_id *id)
1624 {
1625  int bars, result = -ENOMEM;
1626  struct nvme_dev *dev;
1627 
1628  dev = kzalloc(sizeof(*dev), GFP_KERNEL);
1629  if (!dev)
1630  return -ENOMEM;
1631  dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
1632  GFP_KERNEL);
1633  if (!dev->entry)
1634  goto free;
1635  dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
1636  GFP_KERNEL);
1637  if (!dev->queues)
1638  goto free;
1639 
1640  if (pci_enable_device_mem(pdev))
1641  goto free;
1642  pci_set_master(pdev);
1643  bars = pci_select_bars(pdev, IORESOURCE_MEM);
1644  if (pci_request_selected_regions(pdev, bars, "nvme"))
1645  goto disable;
1646 
1647  INIT_LIST_HEAD(&dev->namespaces);
1648  dev->pci_dev = pdev;
1649  pci_set_drvdata(pdev, dev);
1650  dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
1651  dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
1652  result = nvme_set_instance(dev);
1653  if (result)
1654  goto disable;
1655 
1656  dev->entry[0].vector = pdev->irq;
1657 
1658  result = nvme_setup_prp_pools(dev);
1659  if (result)
1660  goto disable_msix;
1661 
1662  dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
1663  if (!dev->bar) {
1664  result = -ENOMEM;
1665  goto disable_msix;
1666  }
1667 
1668  result = nvme_configure_admin_queue(dev);
1669  if (result)
1670  goto unmap;
1671  dev->queue_count++;
1672 
1673  spin_lock(&dev_list_lock);
1674  list_add(&dev->node, &dev_list);
1675  spin_unlock(&dev_list_lock);
1676 
1677  result = nvme_dev_add(dev);
1678  if (result)
1679  goto delete;
1680 
1681  return 0;
1682 
1683  delete:
1684  spin_lock(&dev_list_lock);
1685  list_del(&dev->node);
1686  spin_unlock(&dev_list_lock);
1687 
1688  nvme_free_queues(dev);
1689  unmap:
1690  iounmap(dev->bar);
1691  disable_msix:
1692  pci_disable_msix(pdev);
1693  nvme_release_instance(dev);
1694  nvme_release_prp_pools(dev);
1695  disable:
1696  pci_disable_device(pdev);
1697  pci_release_regions(pdev);
1698  free:
1699  kfree(dev->queues);
1700  kfree(dev->entry);
1701  kfree(dev);
1702  return result;
1703 }
1704 
1705 static void __devexit nvme_remove(struct pci_dev *pdev)
1706 {
1707  struct nvme_dev *dev = pci_get_drvdata(pdev);
1708  nvme_dev_remove(dev);
1709  pci_disable_msix(pdev);
1710  iounmap(dev->bar);
1711  nvme_release_instance(dev);
1712  nvme_release_prp_pools(dev);
1713  pci_disable_device(pdev);
1714  pci_release_regions(pdev);
1715  kfree(dev->queues);
1716  kfree(dev->entry);
1717  kfree(dev);
1718 }
1719 
1720 /* These functions are yet to be implemented */
1721 #define nvme_error_detected NULL
1722 #define nvme_dump_registers NULL
1723 #define nvme_link_reset NULL
1724 #define nvme_slot_reset NULL
1725 #define nvme_error_resume NULL
1726 #define nvme_suspend NULL
1727 #define nvme_resume NULL
1728 
1729 static const struct pci_error_handlers nvme_err_handler = {
1730  .error_detected = nvme_error_detected,
1731  .mmio_enabled = nvme_dump_registers,
1732  .link_reset = nvme_link_reset,
1733  .slot_reset = nvme_slot_reset,
1734  .resume = nvme_error_resume,
1735 };
1736 
1737 /* Move to pci_ids.h later */
1738 #define PCI_CLASS_STORAGE_EXPRESS 0x010802
1739 
1740 static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
1741  { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
1742  { 0, }
1743 };
1744 MODULE_DEVICE_TABLE(pci, nvme_id_table);
1745 
1746 static struct pci_driver nvme_driver = {
1747  .name = "nvme",
1748  .id_table = nvme_id_table,
1749  .probe = nvme_probe,
1750  .remove = __devexit_p(nvme_remove),
1751  .suspend = nvme_suspend,
1752  .resume = nvme_resume,
1753  .err_handler = &nvme_err_handler,
1754 };
1755 
1756 static int __init nvme_init(void)
1757 {
1758  int result;
1759 
1760  nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
1761  if (IS_ERR(nvme_thread))
1762  return PTR_ERR(nvme_thread);
1763 
1764  result = register_blkdev(nvme_major, "nvme");
1765  if (result < 0)
1766  goto kill_kthread;
1767  else if (result > 0)
1768  nvme_major = result;
1769 
1770  result = pci_register_driver(&nvme_driver);
1771  if (result)
1772  goto unregister_blkdev;
1773  return 0;
1774 
1775  unregister_blkdev:
1776  unregister_blkdev(nvme_major, "nvme");
1777  kill_kthread:
1778  kthread_stop(nvme_thread);
1779  return result;
1780 }
1781 
1782 static void __exit nvme_exit(void)
1783 {
1784  pci_unregister_driver(&nvme_driver);
1785  unregister_blkdev(nvme_major, "nvme");
1786  kthread_stop(nvme_thread);
1787 }
1788 
1789 MODULE_AUTHOR("Matthew Wilcox <[email protected]>");
1790 MODULE_LICENSE("GPL");
1791 MODULE_VERSION("0.8");
1792 module_init(nvme_init);
1793 module_exit(nvme_exit);