Linux Kernel  3.7.1
intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <[email protected]>
19  * Author: Shaohua Li <[email protected]>
20  * Author: Anil S Keshavamurthy <[email protected]>
21  * Author: Fenghua Yu <[email protected]>
22  */
23 
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
51 
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55 
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
59 
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61 
62 #define MAX_AGAW_WIDTH 64
63 
64 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66 
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68  to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
70  __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72 
73 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
76 
77 /* page table handling */
78 #define LEVEL_STRIDE (9)
79 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
80 
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
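/*
 * Illustrative reading of the bitmap above: bit N set means a page size of
 * 2^N bytes is advertised, so ~0xFFFUL (every bit from 12 upward) claims
 * support for all power-of-two sizes >= 4KiB, e.g. 4KiB, 8KiB, 2MiB, 1GiB.
 */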
98 
99 static inline int agaw_to_level(int agaw)
100 {
101  return agaw + 2;
102 }
103 
104 static inline int agaw_to_width(int agaw)
105 {
106  return 30 + agaw * LEVEL_STRIDE;
107 }
108 
109 static inline int width_to_agaw(int width)
110 {
111  return (width - 30) / LEVEL_STRIDE;
112 }
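/*
 * Worked example for the helpers above: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48,
 * so width_to_agaw(48) == (48 - 30) / 9 == 2, agaw_to_width(2) == 48 and
 * agaw_to_level(2) == 4, i.e. a 48-bit guest address width walks a 4-level
 * page table; a 39-bit width maps to agaw 1 and a 3-level table.
 */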
113 
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116  return (level - 1) * LEVEL_STRIDE;
117 }
118 
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121  return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123 
124 static inline unsigned long level_mask(int level)
125 {
126  return -1UL << level_to_offset_bits(level);
127 }
128 
129 static inline unsigned long level_size(int level)
130 {
131  return 1UL << level_to_offset_bits(level);
132 }
133 
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136  return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138 
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141  return 1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
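/*
 * Illustrative numbers for the level helpers: each level indexes 9 bits of
 * the PFN, so level_to_offset_bits(2) == 9, level_size(2) == 512 and
 * lvl_to_nr_pages(2) == 512, i.e. one level-2 entry spans 512 x 4KiB = 2MiB;
 * pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff picks the index into that
 * level's table.
 */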
143 
144 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
145  are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148  return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150 
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153  return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157  return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161  return page_to_dma_pfn(virt_to_page(p));
162 }
163 
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166 
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169 
170 /*
171  * set to 1 to panic kernel if can't successfully enable VT-d
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175 
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183  u64 val;
184  u64 rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189  return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193  root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197  root->val |= value & VTD_PAGE_MASK;
198 }
199 
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203  return (struct context_entry *)
204  (root_present(root)?phys_to_virt(
205  root->val & VTD_PAGE_MASK) :
206  NULL);
207 }
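/*
 * Sketch of the lookup implied above (assuming the usual VT-d layout): the
 * root table has ROOT_ENTRY_NR == 4096 / 16 == 256 entries indexed by PCI
 * bus number, and each present root entry points to a 256-entry context
 * table indexed by devfn, as used by device_to_context_entry() below.
 */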
208 
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221  u64 lo;
222  u64 hi;
223 };
224 
225 static inline bool context_present(struct context_entry *context)
226 {
227  return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231  context->lo |= 1;
232 }
233 
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236  context->lo &= (((u64)-1) << 2) | 1;
237 }
238 
239 static inline void context_set_translation_type(struct context_entry *context,
240  unsigned long value)
241 {
242  context->lo &= (((u64)-1) << 4) | 3;
243  context->lo |= (value & 3) << 2;
244 }
245 
246 static inline void context_set_address_root(struct context_entry *context,
247  unsigned long value)
248 {
249  context->lo |= value & VTD_PAGE_MASK;
250 }
251 
252 static inline void context_set_address_width(struct context_entry *context,
253  unsigned long value)
254 {
255  context->hi |= value & 7;
256 }
257 
258 static inline void context_set_domain_id(struct context_entry *context,
259  unsigned long value)
260 {
261  context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263 
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266  context->lo = 0;
267  context->hi = 0;
268 }
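/*
 * Minimal sketch of how the setters above compose (values are illustrative):
 *
 *	context_set_domain_id(ce, 5);
 *	context_set_address_width(ce, domain->agaw);
 *	context_set_address_root(ce, virt_to_phys(domain->pgd));
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * which is essentially what domain_context_mapping_one() does further down.
 */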
269 
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280  u64 val;
281 };
282 
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285  pte->val = 0;
286 }
287 
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290  pte->val |= DMA_PTE_READ;
291 }
292 
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295  pte->val |= DMA_PTE_WRITE;
296 }
297 
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300  pte->val |= DMA_PTE_SNP;
301 }
302 
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305  pte->val = (pte->val & ~3) | (prot & 3);
306 }
307 
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311  return pte->val & VTD_PAGE_MASK;
312 #else
313  /* Must have a full atomic 64-bit read */
314  return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317 
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320  pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322 
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325  return (pte->val & 3) != 0;
326 }
327 
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330  return (pte->val & (1 << 7));
331 }
332 
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335  return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337 
338 /*
339  * This domain is a statically identity mapping domain.
340  * 1. This domain creates a static 1:1 mapping to all usable memory.
341  * 2. It maps to each iommu if successful.
342  * 3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346 
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349 
350 /* domain represents a virtual machine; more than one device
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
354 
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
357 
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef CONFIG_X86
360 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
361 #else
362 # define IOMMU_UNITS_SUPPORTED 64
363 #endif
364 
365 struct dmar_domain {
366  int id; /* domain id */
367  int nid; /* node id */
368  unsigned long iommu_bmp[BITS_TO_LONGS(IOMMU_UNITS_SUPPORTED)];
369  /* bitmap of iommus this domain uses*/
370 
371  struct list_head devices; /* all devices' list */
372  struct iova_domain iovad; /* iova's that belong to this domain */
373 
374  struct dma_pte *pgd; /* virtual address */
375  int gaw; /* max guest address width */
376 
377  /* adjusted guest address width, 0 is level 2 30-bit */
378  int agaw;
379 
380  int flags; /* flags to find out type of domain */
381 
382  int iommu_coherency;/* indicate coherency of iommu access */
383  int iommu_snooping; /* indicate snooping control feature*/
384  int iommu_count; /* reference count of iommu */
385  int iommu_superpage;/* Level of superpages supported:
386  0 == 4KiB (no superpages), 1 == 2MiB,
387  2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388  spinlock_t iommu_lock; /* protect iommu set in domain */
389  u64 max_addr; /* maximum mapped address */
390 };
391 
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394  struct list_head link; /* link to domain siblings */
395  struct list_head global; /* link to global list */
396  int segment; /* PCI domain */
397  u8 bus; /* PCI bus number */
398  u8 devfn; /* PCI devfn number */
399  struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400  struct intel_iommu *iommu; /* IOMMU used by this device */
401  struct dmar_domain *domain; /* pointer to domain */
402 };
403 
404 static void flush_unmaps_timeout(unsigned long data);
405 
406 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
407 
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410  int next;
411  struct iova *iova[HIGH_WATER_MARK];
412  struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414 
415 static struct deferred_flush_tables *deferred_flush;
416 
417 /* bitmap for indexing intel_iommus */
418 static int g_num_of_iommus;
419 
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422 
423 static int timer_on;
424 static long list_size;
425 
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427 
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433 
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436 
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441 
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444 
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448 
449 static struct iommu_ops intel_iommu_ops;
450 
451 static int __init intel_iommu_setup(char *str)
452 {
453  if (!str)
454  return -EINVAL;
455  while (*str) {
456  if (!strncmp(str, "on", 2)) {
457  dmar_disabled = 0;
458  printk(KERN_INFO "Intel-IOMMU: enabled\n");
459  } else if (!strncmp(str, "off", 3)) {
460  dmar_disabled = 1;
461  printk(KERN_INFO "Intel-IOMMU: disabled\n");
462  } else if (!strncmp(str, "igfx_off", 8)) {
463  dmar_map_gfx = 0;
464  printk(KERN_INFO
465  "Intel-IOMMU: disable GFX device mapping\n");
466  } else if (!strncmp(str, "forcedac", 8)) {
467  printk(KERN_INFO
468  "Intel-IOMMU: Forcing DAC for PCI devices\n");
469  dmar_forcedac = 1;
470  } else if (!strncmp(str, "strict", 6)) {
471  printk(KERN_INFO
472  "Intel-IOMMU: disable batched IOTLB flush\n");
473  intel_iommu_strict = 1;
474  } else if (!strncmp(str, "sp_off", 6)) {
475  printk(KERN_INFO
476  "Intel-IOMMU: disable supported super page\n");
477  intel_iommu_superpage = 0;
478  }
479 
480  str += strcspn(str, ",");
481  while (*str == ',')
482  str++;
483  }
484  return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
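/*
 * Boot-time usage example for the option parsed above (comma-separated
 * tokens on the kernel command line):
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support.
 */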
487 
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491 
492 static inline void *alloc_pgtable_page(int node)
493 {
494  struct page *page;
495  void *vaddr = NULL;
496 
497  page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498  if (page)
499  vaddr = page_address(page);
500  return vaddr;
501 }
502 
503 static inline void free_pgtable_page(void *vaddr)
504 {
505  free_page((unsigned long)vaddr);
506 }
507 
508 static inline void *alloc_domain_mem(void)
509 {
510  return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512 
513 static void free_domain_mem(void *vaddr)
514 {
515  kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517 
518 static inline void * alloc_devinfo_mem(void)
519 {
520  return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522 
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525  kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527 
528 struct iova *alloc_iova_mem(void)
529 {
530  return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532 
533 void free_iova_mem(struct iova *iova)
534 {
535  kmem_cache_free(iommu_iova_cache, iova);
536 }
537 
538 
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541  unsigned long sagaw;
542  int agaw = -1;
543 
544  sagaw = cap_sagaw(iommu->cap);
545  for (agaw = width_to_agaw(max_gaw);
546  agaw >= 0; agaw--) {
547  if (test_bit(agaw, &sagaw))
548  break;
549  }
550 
551  return agaw;
552 }
553 
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559  return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561 
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * get a supported smaller agaw for iommus that don't support the default agaw.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569  return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
571 
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575  int iommu_id;
576 
577  /* si_domain and vm domain should not get here. */
578  BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579  BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580 
581  iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582  if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583  return NULL;
584 
585  return g_iommus[iommu_id];
586 }
587 
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590  int i;
591 
592  i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
593 
594  domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
595 
596  for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
597  if (!ecap_coherent(g_iommus[i]->ecap)) {
598  domain->iommu_coherency = 0;
599  break;
600  }
601  }
602 }
603 
604 static void domain_update_iommu_snooping(struct dmar_domain *domain)
605 {
606  int i;
607 
608  domain->iommu_snooping = 1;
609 
610  for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
611  if (!ecap_sc_support(g_iommus[i]->ecap)) {
612  domain->iommu_snooping = 0;
613  break;
614  }
615  }
616 }
617 
618 static void domain_update_iommu_superpage(struct dmar_domain *domain)
619 {
620  struct dmar_drhd_unit *drhd;
621  struct intel_iommu *iommu = NULL;
622  int mask = 0xf;
623 
624  if (!intel_iommu_superpage) {
625  domain->iommu_superpage = 0;
626  return;
627  }
628 
629  /* set iommu_superpage to the smallest common denominator */
630  for_each_active_iommu(iommu, drhd) {
631  mask &= cap_super_page_val(iommu->cap);
632  if (!mask) {
633  break;
634  }
635  }
636  domain->iommu_superpage = fls(mask);
637 }
638 
639 /* Some capabilities may be different across iommus */
640 static void domain_update_iommu_cap(struct dmar_domain *domain)
641 {
642  domain_update_iommu_coherency(domain);
643  domain_update_iommu_snooping(domain);
644  domain_update_iommu_superpage(domain);
645 }
646 
647 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
648 {
649  struct dmar_drhd_unit *drhd = NULL;
650  int i;
651 
652  for_each_drhd_unit(drhd) {
653  if (drhd->ignored)
654  continue;
655  if (segment != drhd->segment)
656  continue;
657 
658  for (i = 0; i < drhd->devices_cnt; i++) {
659  if (drhd->devices[i] &&
660  drhd->devices[i]->bus->number == bus &&
661  drhd->devices[i]->devfn == devfn)
662  return drhd->iommu;
663  if (drhd->devices[i] &&
664  drhd->devices[i]->subordinate &&
665  drhd->devices[i]->subordinate->number <= bus &&
666  drhd->devices[i]->subordinate->busn_res.end >= bus)
667  return drhd->iommu;
668  }
669 
670  if (drhd->include_all)
671  return drhd->iommu;
672  }
673 
674  return NULL;
675 }
676 
677 static void domain_flush_cache(struct dmar_domain *domain,
678  void *addr, int size)
679 {
680  if (!domain->iommu_coherency)
681  clflush_cache_range(addr, size);
682 }
683 
684 /* Gets context entry for a given bus and devfn */
685 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
686  u8 bus, u8 devfn)
687 {
688  struct root_entry *root;
689  struct context_entry *context;
690  unsigned long phy_addr;
691  unsigned long flags;
692 
693  spin_lock_irqsave(&iommu->lock, flags);
694  root = &iommu->root_entry[bus];
695  context = get_context_addr_from_root(root);
696  if (!context) {
697  context = (struct context_entry *)
698  alloc_pgtable_page(iommu->node);
699  if (!context) {
700  spin_unlock_irqrestore(&iommu->lock, flags);
701  return NULL;
702  }
703  __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704  phy_addr = virt_to_phys((void *)context);
705  set_root_value(root, phy_addr);
706  set_root_present(root);
707  __iommu_flush_cache(iommu, root, sizeof(*root));
708  }
709  spin_unlock_irqrestore(&iommu->lock, flags);
710  return &context[devfn];
711 }
712 
713 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
714 {
715  struct root_entry *root;
716  struct context_entry *context;
717  int ret;
718  unsigned long flags;
719 
720  spin_lock_irqsave(&iommu->lock, flags);
721  root = &iommu->root_entry[bus];
722  context = get_context_addr_from_root(root);
723  if (!context) {
724  ret = 0;
725  goto out;
726  }
727  ret = context_present(&context[devfn]);
728 out:
729  spin_unlock_irqrestore(&iommu->lock, flags);
730  return ret;
731 }
732 
733 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
734 {
735  struct root_entry *root;
736  struct context_entry *context;
737  unsigned long flags;
738 
739  spin_lock_irqsave(&iommu->lock, flags);
740  root = &iommu->root_entry[bus];
741  context = get_context_addr_from_root(root);
742  if (context) {
743  context_clear_entry(&context[devfn]);
744  __iommu_flush_cache(iommu, &context[devfn], \
745  sizeof(*context));
746  }
747  spin_unlock_irqrestore(&iommu->lock, flags);
748 }
749 
750 static void free_context_table(struct intel_iommu *iommu)
751 {
752  struct root_entry *root;
753  int i;
754  unsigned long flags;
755  struct context_entry *context;
756 
757  spin_lock_irqsave(&iommu->lock, flags);
758  if (!iommu->root_entry) {
759  goto out;
760  }
761  for (i = 0; i < ROOT_ENTRY_NR; i++) {
762  root = &iommu->root_entry[i];
763  context = get_context_addr_from_root(root);
764  if (context)
765  free_pgtable_page(context);
766  }
767  free_pgtable_page(iommu->root_entry);
768  iommu->root_entry = NULL;
769 out:
770  spin_unlock_irqrestore(&iommu->lock, flags);
771 }
772 
773 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
774  unsigned long pfn, int target_level)
775 {
776  int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
777  struct dma_pte *parent, *pte = NULL;
778  int level = agaw_to_level(domain->agaw);
779  int offset;
780 
781  BUG_ON(!domain->pgd);
782  BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
783  parent = domain->pgd;
784 
785  while (level > 0) {
786  void *tmp_page;
787 
788  offset = pfn_level_offset(pfn, level);
789  pte = &parent[offset];
790  if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
791  break;
792  if (level == target_level)
793  break;
794 
795  if (!dma_pte_present(pte)) {
796  uint64_t pteval;
797 
798  tmp_page = alloc_pgtable_page(domain->nid);
799 
800  if (!tmp_page)
801  return NULL;
802 
803  domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804  pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805  if (cmpxchg64(&pte->val, 0ULL, pteval)) {
806  /* Someone else set it while we were thinking; use theirs. */
807  free_pgtable_page(tmp_page);
808  } else {
809  dma_pte_addr(pte);
810  domain_flush_cache(domain, pte, sizeof(*pte));
811  }
812  }
813  parent = phys_to_virt(dma_pte_addr(pte));
814  level--;
815  }
816 
817  return pte;
818 }
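/*
 * Illustrative walk: with agaw 2 (4 levels), pfn_to_dma_pte() starts at
 * domain->pgd, takes pfn_level_offset(pfn, 4), then 3, 2, 1, allocating a
 * missing table page at each step; the cmpxchg64() guards against a
 * concurrent walker installing the same level first.
 */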
819 
820 
821 /* return address's pte at specific level */
822 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
823  unsigned long pfn,
824  int level, int *large_page)
825 {
826  struct dma_pte *parent, *pte = NULL;
827  int total = agaw_to_level(domain->agaw);
828  int offset;
829 
830  parent = domain->pgd;
831  while (level <= total) {
832  offset = pfn_level_offset(pfn, total);
833  pte = &parent[offset];
834  if (level == total)
835  return pte;
836 
837  if (!dma_pte_present(pte)) {
838  *large_page = total;
839  break;
840  }
841 
842  if (pte->val & DMA_PTE_LARGE_PAGE) {
843  *large_page = total;
844  return pte;
845  }
846 
847  parent = phys_to_virt(dma_pte_addr(pte));
848  total--;
849  }
850  return NULL;
851 }
852 
853 /* clear last level pte, a tlb flush should be followed */
854 static int dma_pte_clear_range(struct dmar_domain *domain,
855  unsigned long start_pfn,
856  unsigned long last_pfn)
857 {
858  int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
859  unsigned int large_page = 1;
860  struct dma_pte *first_pte, *pte;
861  int order;
862 
863  BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
864  BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
865  BUG_ON(start_pfn > last_pfn);
866 
867  /* we don't need lock here; nobody else touches the iova range */
868  do {
869  large_page = 1;
870  first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
871  if (!pte) {
872  start_pfn = align_to_level(start_pfn + 1, large_page + 1);
873  continue;
874  }
875  do {
876  dma_clear_pte(pte);
877  start_pfn += lvl_to_nr_pages(large_page);
878  pte++;
879  } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
880 
881  domain_flush_cache(domain, first_pte,
882  (void *)pte - (void *)first_pte);
883 
884  } while (start_pfn && start_pfn <= last_pfn);
885 
886  order = (large_page - 1) * 9;
887  return order;
888 }
889 
890 /* free page table pages. last level pte should already be cleared */
891 static void dma_pte_free_pagetable(struct dmar_domain *domain,
892  unsigned long start_pfn,
893  unsigned long last_pfn)
894 {
895  int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
896  struct dma_pte *first_pte, *pte;
897  int total = agaw_to_level(domain->agaw);
898  int level;
899  unsigned long tmp;
900  int large_page = 2;
901 
902  BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
903  BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
904  BUG_ON(start_pfn > last_pfn);
905 
906  /* We don't need lock here; nobody else touches the iova range */
907  level = 2;
908  while (level <= total) {
909  tmp = align_to_level(start_pfn, level);
910 
911  /* If we can't even clear one PTE at this level, we're done */
912  if (tmp + level_size(level) - 1 > last_pfn)
913  return;
914 
915  do {
916  large_page = level;
917  first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
918  if (large_page > level)
919  level = large_page + 1;
920  if (!pte) {
921  tmp = align_to_level(tmp + 1, level + 1);
922  continue;
923  }
924  do {
925  if (dma_pte_present(pte)) {
926  free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
927  dma_clear_pte(pte);
928  }
929  pte++;
930  tmp += level_size(level);
931  } while (!first_pte_in_page(pte) &&
932  tmp + level_size(level) - 1 <= last_pfn);
933 
934  domain_flush_cache(domain, first_pte,
935  (void *)pte - (void *)first_pte);
936 
937  } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
938  level++;
939  }
940  /* free pgd */
941  if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
942  free_pgtable_page(domain->pgd);
943  domain->pgd = NULL;
944  }
945 }
946 
947 /* iommu handling */
948 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
949 {
950  struct root_entry *root;
951  unsigned long flags;
952 
953  root = (struct root_entry *)alloc_pgtable_page(iommu->node);
954  if (!root)
955  return -ENOMEM;
956 
957  __iommu_flush_cache(iommu, root, ROOT_SIZE);
958 
959  spin_lock_irqsave(&iommu->lock, flags);
960  iommu->root_entry = root;
961  spin_unlock_irqrestore(&iommu->lock, flags);
962 
963  return 0;
964 }
965 
966 static void iommu_set_root_entry(struct intel_iommu *iommu)
967 {
968  void *addr;
969  u32 sts;
970  unsigned long flag;
971 
972  addr = iommu->root_entry;
973 
974  raw_spin_lock_irqsave(&iommu->register_lock, flag);
975  dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
976 
977  writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
978 
979  /* Make sure hardware complete it */
980  IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981  readl, (sts & DMA_GSTS_RTPS), sts);
982 
983  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
984 }
985 
986 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
987 {
988  u32 val;
989  unsigned long flag;
990 
991  if (!rwbf_quirk && !cap_rwbf(iommu->cap))
992  return;
993 
994  raw_spin_lock_irqsave(&iommu->register_lock, flag);
995  writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
996 
997  /* Make sure hardware complete it */
998  IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999  readl, (!(val & DMA_GSTS_WBFS)), val);
1000 
1001  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002 }
1003 
1004 /* return value determines if we need a write buffer flush */
1005 static void __iommu_flush_context(struct intel_iommu *iommu,
1006  u16 did, u16 source_id, u8 function_mask,
1007  u64 type)
1008 {
1009  u64 val = 0;
1010  unsigned long flag;
1011 
1012  switch (type) {
1013  case DMA_CCMD_GLOBAL_INVL:
1014  val = DMA_CCMD_GLOBAL_INVL;
1015  break;
1016  case DMA_CCMD_DOMAIN_INVL:
1017  val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018  break;
1019  case DMA_CCMD_DEVICE_INVL:
1020  val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021  | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022  break;
1023  default:
1024  BUG();
1025  }
1026  val |= DMA_CCMD_ICC;
1027 
1028  raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029  dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030 
1031  /* Make sure hardware complete it */
1032  IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033  dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034 
1035  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 }
1037 
1038 /* return value determines if we need a write buffer flush */
1039 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040  u64 addr, unsigned int size_order, u64 type)
1041 {
1042  int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043  u64 val = 0, val_iva = 0;
1044  unsigned long flag;
1045 
1046  switch (type) {
1047  case DMA_TLB_GLOBAL_FLUSH:
1048  /* global flush doesn't need to set IVA_REG */
1049  val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050  break;
1051  case DMA_TLB_DSI_FLUSH:
1052  val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053  break;
1054  case DMA_TLB_PSI_FLUSH:
1055  val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056  /* Note: always flush non-leaf currently */
1057  val_iva = size_order | addr;
1058  break;
1059  default:
1060  BUG();
1061  }
1062  /* Note: set drain read/write */
1063 #if 0
1064  /*
1065  * This is probably to be super secure.. Looks like we can
1066  * ignore it without any impact.
1067  */
1068  if (cap_read_drain(iommu->cap))
1069  val |= DMA_TLB_READ_DRAIN;
1070 #endif
1071  if (cap_write_drain(iommu->cap))
1072  val |= DMA_TLB_WRITE_DRAIN;
1073 
1074  raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075  /* Note: Only uses first TLB reg currently */
1076  if (val_iva)
1077  dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078  dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079 
1080  /* Make sure hardware complete it */
1081  IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082  dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083 
1085 
1086  /* check IOTLB invalidation granularity */
1087  if (DMA_TLB_IAIG(val) == 0)
1088  printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089  if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090  pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091  (unsigned long long)DMA_TLB_IIRG(type),
1092  (unsigned long long)DMA_TLB_IAIG(val));
1093 }
1094 
1095 static struct device_domain_info *iommu_support_dev_iotlb(
1096  struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097 {
1098  int found = 0;
1099  unsigned long flags;
1100  struct device_domain_info *info;
1101  struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102 
1103  if (!ecap_dev_iotlb_support(iommu->ecap))
1104  return NULL;
1105 
1106  if (!iommu->qi)
1107  return NULL;
1108 
1109  spin_lock_irqsave(&device_domain_lock, flags);
1110  list_for_each_entry(info, &domain->devices, link)
1111  if (info->bus == bus && info->devfn == devfn) {
1112  found = 1;
1113  break;
1114  }
1115  spin_unlock_irqrestore(&device_domain_lock, flags);
1116 
1117  if (!found || !info->dev)
1118  return NULL;
1119 
1120  if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121  return NULL;
1122 
1123  if (!dmar_find_matched_atsr_unit(info->dev))
1124  return NULL;
1125 
1126  info->iommu = iommu;
1127 
1128  return info;
1129 }
1130 
1131 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132 {
1133  if (!info)
1134  return;
1135 
1136  pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137 }
1138 
1139 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140 {
1141  if (!info->dev || !pci_ats_enabled(info->dev))
1142  return;
1143 
1144  pci_disable_ats(info->dev);
1145 }
1146 
1147 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148  u64 addr, unsigned mask)
1149 {
1150  u16 sid, qdep;
1151  unsigned long flags;
1152  struct device_domain_info *info;
1153 
1154  spin_lock_irqsave(&device_domain_lock, flags);
1155  list_for_each_entry(info, &domain->devices, link) {
1156  if (!info->dev || !pci_ats_enabled(info->dev))
1157  continue;
1158 
1159  sid = info->bus << 8 | info->devfn;
1160  qdep = pci_ats_queue_depth(info->dev);
1161  qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162  }
1163  spin_unlock_irqrestore(&device_domain_lock, flags);
1164 }
1165 
1166 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167  unsigned long pfn, unsigned int pages, int map)
1168 {
1169  unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170  uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171 
1172  BUG_ON(pages == 0);
1173 
1174  /*
1175  * Fallback to domain selective flush if no PSI support or the size is
1176  * too big.
1177  * PSI requires page size to be 2 ^ x, and the base address is naturally
1178  * aligned to the size
1179  */
1180  if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181  iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182  DMA_TLB_DSI_FLUSH);
1183  else
1184  iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185  DMA_TLB_PSI_FLUSH);
1186 
1187  /*
1188  * In caching mode, changes of pages from non-present to present require
1189  * flush. However, device IOTLB doesn't need to be flushed in this case.
1190  */
1191  if (!cap_caching_mode(iommu->cap) || !map)
1192  iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193 }
1194 
1195 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196 {
1197  u32 pmen;
1198  unsigned long flags;
1199 
1200  raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201  pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202  pmen &= ~DMA_PMEN_EPM;
1203  writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204 
1205  /* wait for the protected region status bit to clear */
1206  IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207  readl, !(pmen & DMA_PMEN_PRS), pmen);
1208 
1209  raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210 }
1211 
1212 static int iommu_enable_translation(struct intel_iommu *iommu)
1213 {
1214  u32 sts;
1215  unsigned long flags;
1216 
1217  raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218  iommu->gcmd |= DMA_GCMD_TE;
1219  writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220 
1221  /* Make sure hardware complete it */
1222  IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223  readl, (sts & DMA_GSTS_TES), sts);
1224 
1225  raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226  return 0;
1227 }
1228 
1229 static int iommu_disable_translation(struct intel_iommu *iommu)
1230 {
1231  u32 sts;
1232  unsigned long flag;
1233 
1234  raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235  iommu->gcmd &= ~DMA_GCMD_TE;
1236  writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237 
1238  /* Make sure hardware complete it */
1239  IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240  readl, (!(sts & DMA_GSTS_TES)), sts);
1241 
1242  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243  return 0;
1244 }
1245 
1246 
1247 static int iommu_init_domains(struct intel_iommu *iommu)
1248 {
1249  unsigned long ndomains;
1250  unsigned long nlongs;
1251 
1252  ndomains = cap_ndoms(iommu->cap);
1253  pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254  ndomains);
1255  nlongs = BITS_TO_LONGS(ndomains);
1256 
1257  spin_lock_init(&iommu->lock);
1258 
1259  /* TBD: there might be 64K domains,
1260  * consider other allocation for future chip
1261  */
1262  iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263  if (!iommu->domain_ids) {
1264  printk(KERN_ERR "Allocating domain id array failed\n");
1265  return -ENOMEM;
1266  }
1267  iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268  GFP_KERNEL);
1269  if (!iommu->domains) {
1270  printk(KERN_ERR "Allocating domain array failed\n");
1271  return -ENOMEM;
1272  }
1273 
1274  /*
1275  * if Caching mode is set, then invalid translations are tagged
1276  * with domainid 0. Hence we need to pre-allocate it.
1277  */
1278  if (cap_caching_mode(iommu->cap))
1279  set_bit(0, iommu->domain_ids);
1280  return 0;
1281 }
1282 
1283 
1284 static void domain_exit(struct dmar_domain *domain);
1285 static void vm_domain_exit(struct dmar_domain *domain);
1286 
1287 void free_dmar_iommu(struct intel_iommu *iommu)
1288 {
1289  struct dmar_domain *domain;
1290  int i;
1291  unsigned long flags;
1292 
1293  if ((iommu->domains) && (iommu->domain_ids)) {
1294  for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295  domain = iommu->domains[i];
1296  clear_bit(i, iommu->domain_ids);
1297 
1298  spin_lock_irqsave(&domain->iommu_lock, flags);
1299  if (--domain->iommu_count == 0) {
1300  if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301  vm_domain_exit(domain);
1302  else
1303  domain_exit(domain);
1304  }
1305  spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306  }
1307  }
1308 
1309  if (iommu->gcmd & DMA_GCMD_TE)
1310  iommu_disable_translation(iommu);
1311 
1312  if (iommu->irq) {
1313  irq_set_handler_data(iommu->irq, NULL);
1314  /* This will mask the irq */
1315  free_irq(iommu->irq, iommu);
1316  destroy_irq(iommu->irq);
1317  }
1318 
1319  kfree(iommu->domains);
1320  kfree(iommu->domain_ids);
1321 
1322  g_iommus[iommu->seq_id] = NULL;
1323 
1324  /* if all iommus are freed, free g_iommus */
1325  for (i = 0; i < g_num_of_iommus; i++) {
1326  if (g_iommus[i])
1327  break;
1328  }
1329 
1330  if (i == g_num_of_iommus)
1331  kfree(g_iommus);
1332 
1333  /* free context mapping */
1334  free_context_table(iommu);
1335 }
1336 
1337 static struct dmar_domain *alloc_domain(void)
1338 {
1339  struct dmar_domain *domain;
1340 
1341  domain = alloc_domain_mem();
1342  if (!domain)
1343  return NULL;
1344 
1345  domain->nid = -1;
1346  memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347  domain->flags = 0;
1348 
1349  return domain;
1350 }
1351 
1352 static int iommu_attach_domain(struct dmar_domain *domain,
1353  struct intel_iommu *iommu)
1354 {
1355  int num;
1356  unsigned long ndomains;
1357  unsigned long flags;
1358 
1359  ndomains = cap_ndoms(iommu->cap);
1360 
1361  spin_lock_irqsave(&iommu->lock, flags);
1362 
1363  num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364  if (num >= ndomains) {
1365  spin_unlock_irqrestore(&iommu->lock, flags);
1366  printk(KERN_ERR "IOMMU: no free domain ids\n");
1367  return -ENOMEM;
1368  }
1369 
1370  domain->id = num;
1371  set_bit(num, iommu->domain_ids);
1372  set_bit(iommu->seq_id, domain->iommu_bmp);
1373  iommu->domains[num] = domain;
1374  spin_unlock_irqrestore(&iommu->lock, flags);
1375 
1376  return 0;
1377 }
1378 
1379 static void iommu_detach_domain(struct dmar_domain *domain,
1380  struct intel_iommu *iommu)
1381 {
1382  unsigned long flags;
1383  int num, ndomains;
1384  int found = 0;
1385 
1386  spin_lock_irqsave(&iommu->lock, flags);
1387  ndomains = cap_ndoms(iommu->cap);
1388  for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389  if (iommu->domains[num] == domain) {
1390  found = 1;
1391  break;
1392  }
1393  }
1394 
1395  if (found) {
1396  clear_bit(num, iommu->domain_ids);
1397  clear_bit(iommu->seq_id, domain->iommu_bmp);
1398  iommu->domains[num] = NULL;
1399  }
1400  spin_unlock_irqrestore(&iommu->lock, flags);
1401 }
1402 
1403 static struct iova_domain reserved_iova_list;
1404 static struct lock_class_key reserved_rbtree_key;
1405 
1406 static int dmar_init_reserved_ranges(void)
1407 {
1408  struct pci_dev *pdev = NULL;
1409  struct iova *iova;
1410  int i;
1411 
1412  init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413 
1414  lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415  &reserved_rbtree_key);
1416 
1417  /* IOAPIC ranges shouldn't be accessed by DMA */
1418  iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419  IOVA_PFN(IOAPIC_RANGE_END));
1420  if (!iova) {
1421  printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422  return -ENODEV;
1423  }
1424 
1425  /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426  for_each_pci_dev(pdev) {
1427  struct resource *r;
1428 
1429  for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430  r = &pdev->resource[i];
1431  if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432  continue;
1433  iova = reserve_iova(&reserved_iova_list,
1434  IOVA_PFN(r->start),
1435  IOVA_PFN(r->end));
1436  if (!iova) {
1437  printk(KERN_ERR "Reserve iova failed\n");
1438  return -ENODEV;
1439  }
1440  }
1441  }
1442  return 0;
1443 }
1444 
1445 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446 {
1447  copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448 }
1449 
1450 static inline int guestwidth_to_adjustwidth(int gaw)
1451 {
1452  int agaw;
1453  int r = (gaw - 12) % 9;
1454 
1455  if (r == 0)
1456  agaw = gaw;
1457  else
1458  agaw = gaw + 9 - r;
1459  if (agaw > 64)
1460  agaw = 64;
1461  return agaw;
1462 }
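/*
 * Worked examples for guestwidth_to_adjustwidth(): gaw 48 gives r == 0 and
 * agaw 48; gaw 40 gives r == (40 - 12) % 9 == 1 and agaw 40 + 9 - 1 == 48;
 * gaw 36 gives r == 6 and agaw 39, i.e. the width is rounded up to the next
 * value a 9-bit-per-level page table can express.
 */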
1463 
1464 static int domain_init(struct dmar_domain *domain, int guest_width)
1465 {
1466  struct intel_iommu *iommu;
1467  int adjust_width, agaw;
1468  unsigned long sagaw;
1469 
1470  init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471  spin_lock_init(&domain->iommu_lock);
1472 
1473  domain_reserve_special_ranges(domain);
1474 
1475  /* calculate AGAW */
1476  iommu = domain_get_iommu(domain);
1477  if (guest_width > cap_mgaw(iommu->cap))
1478  guest_width = cap_mgaw(iommu->cap);
1479  domain->gaw = guest_width;
1480  adjust_width = guestwidth_to_adjustwidth(guest_width);
1481  agaw = width_to_agaw(adjust_width);
1482  sagaw = cap_sagaw(iommu->cap);
1483  if (!test_bit(agaw, &sagaw)) {
1484  /* hardware doesn't support it, choose a bigger one */
1485  pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486  agaw = find_next_bit(&sagaw, 5, agaw);
1487  if (agaw >= 5)
1488  return -ENODEV;
1489  }
1490  domain->agaw = agaw;
1491  INIT_LIST_HEAD(&domain->devices);
1492 
1493  if (ecap_coherent(iommu->ecap))
1494  domain->iommu_coherency = 1;
1495  else
1496  domain->iommu_coherency = 0;
1497 
1498  if (ecap_sc_support(iommu->ecap))
1499  domain->iommu_snooping = 1;
1500  else
1501  domain->iommu_snooping = 0;
1502 
1503  domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504  domain->iommu_count = 1;
1505  domain->nid = iommu->node;
1506 
1507  /* always allocate the top pgd */
1508  domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509  if (!domain->pgd)
1510  return -ENOMEM;
1511  __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512  return 0;
1513 }
1514 
1515 static void domain_exit(struct dmar_domain *domain)
1516 {
1517  struct dmar_drhd_unit *drhd;
1518  struct intel_iommu *iommu;
1519 
1520  /* Domain 0 is reserved, so don't process it */
1521  if (!domain)
1522  return;
1523 
1524  /* Flush any lazy unmaps that may reference this domain */
1525  if (!intel_iommu_strict)
1526  flush_unmaps_timeout(0);
1527 
1528  domain_remove_dev_info(domain);
1529  /* destroy iovas */
1530  put_iova_domain(&domain->iovad);
1531 
1532  /* clear ptes */
1533  dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534 
1535  /* free page tables */
1536  dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537 
1538  for_each_active_iommu(iommu, drhd)
1539  if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540  iommu_detach_domain(domain, iommu);
1541 
1542  free_domain_mem(domain);
1543 }
1544 
1545 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546  u8 bus, u8 devfn, int translation)
1547 {
1548  struct context_entry *context;
1549  unsigned long flags;
1550  struct intel_iommu *iommu;
1551  struct dma_pte *pgd;
1552  unsigned long num;
1553  unsigned long ndomains;
1554  int id;
1555  int agaw;
1556  struct device_domain_info *info = NULL;
1557 
1558  pr_debug("Set context mapping for %02x:%02x.%d\n",
1559  bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560 
1561  BUG_ON(!domain->pgd);
1562  BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563  translation != CONTEXT_TT_MULTI_LEVEL);
1564 
1565  iommu = device_to_iommu(segment, bus, devfn);
1566  if (!iommu)
1567  return -ENODEV;
1568 
1569  context = device_to_context_entry(iommu, bus, devfn);
1570  if (!context)
1571  return -ENOMEM;
1572  spin_lock_irqsave(&iommu->lock, flags);
1573  if (context_present(context)) {
1574  spin_unlock_irqrestore(&iommu->lock, flags);
1575  return 0;
1576  }
1577 
1578  id = domain->id;
1579  pgd = domain->pgd;
1580 
1581  if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582  domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583  int found = 0;
1584 
1585  /* find an available domain id for this device in iommu */
1586  ndomains = cap_ndoms(iommu->cap);
1587  for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588  if (iommu->domains[num] == domain) {
1589  id = num;
1590  found = 1;
1591  break;
1592  }
1593  }
1594 
1595  if (found == 0) {
1596  num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597  if (num >= ndomains) {
1598  spin_unlock_irqrestore(&iommu->lock, flags);
1599  printk(KERN_ERR "IOMMU: no free domain ids\n");
1600  return -EFAULT;
1601  }
1602 
1603  set_bit(num, iommu->domain_ids);
1604  iommu->domains[num] = domain;
1605  id = num;
1606  }
1607 
1608  /* Skip top levels of page tables for
1609  * iommu which has less agaw than default.
1610  * Unnecessary for PT mode.
1611  */
1612  if (translation != CONTEXT_TT_PASS_THROUGH) {
1613  for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614  pgd = phys_to_virt(dma_pte_addr(pgd));
1615  if (!dma_pte_present(pgd)) {
1616  spin_unlock_irqrestore(&iommu->lock, flags);
1617  return -ENOMEM;
1618  }
1619  }
1620  }
1621  }
1622 
1623  context_set_domain_id(context, id);
1624 
1625  if (translation != CONTEXT_TT_PASS_THROUGH) {
1626  info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627  translation = info ? CONTEXT_TT_DEV_IOTLB :
1628  CONTEXT_TT_MULTI_LEVEL;
1629  }
1630  /*
1631  * In pass through mode, AW must be programmed to indicate the largest
1632  * AGAW value supported by hardware. And ASR is ignored by hardware.
1633  */
1634  if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635  context_set_address_width(context, iommu->msagaw);
1636  else {
1637  context_set_address_root(context, virt_to_phys(pgd));
1638  context_set_address_width(context, iommu->agaw);
1639  }
1640 
1641  context_set_translation_type(context, translation);
1642  context_set_fault_enable(context);
1643  context_set_present(context);
1644  domain_flush_cache(domain, context, sizeof(*context));
1645 
1646  /*
1647  * It's a non-present to present mapping. If hardware doesn't cache
1648  * non-present entries we only need to flush the write-buffer. If it
1649  * _does_ cache non-present entries, then it does so in the special
1650  * domain #0, which we have to flush:
1651  */
1652  if (cap_caching_mode(iommu->cap)) {
1653  iommu->flush.flush_context(iommu, 0,
1654  (((u16)bus) << 8) | devfn,
1655  DMA_CCMD_MASK_NOBIT,
1656  DMA_CCMD_DEVICE_INVL);
1657  iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658  } else {
1659  iommu_flush_write_buffer(iommu);
1660  }
1661  iommu_enable_dev_iotlb(info);
1662  spin_unlock_irqrestore(&iommu->lock, flags);
1663 
1664  spin_lock_irqsave(&domain->iommu_lock, flags);
1665  if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666  domain->iommu_count++;
1667  if (domain->iommu_count == 1)
1668  domain->nid = iommu->node;
1669  domain_update_iommu_cap(domain);
1670  }
1671  spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672  return 0;
1673 }
1674 
1675 static int
1676 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677  int translation)
1678 {
1679  int ret;
1680  struct pci_dev *tmp, *parent;
1681 
1682  ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683  pdev->bus->number, pdev->devfn,
1684  translation);
1685  if (ret)
1686  return ret;
1687 
1688  /* dependent device mapping */
1689  tmp = pci_find_upstream_pcie_bridge(pdev);
1690  if (!tmp)
1691  return 0;
1692  /* Secondary interface's bus number and devfn 0 */
1693  parent = pdev->bus->self;
1694  while (parent != tmp) {
1695  ret = domain_context_mapping_one(domain,
1696  pci_domain_nr(parent->bus),
1697  parent->bus->number,
1698  parent->devfn, translation);
1699  if (ret)
1700  return ret;
1701  parent = parent->bus->self;
1702  }
1703  if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704  return domain_context_mapping_one(domain,
1705  pci_domain_nr(tmp->subordinate),
1706  tmp->subordinate->number, 0,
1707  translation);
1708  else /* this is a legacy PCI bridge */
1709  return domain_context_mapping_one(domain,
1710  pci_domain_nr(tmp->bus),
1711  tmp->bus->number,
1712  tmp->devfn,
1713  translation);
1714 }
1715 
1716 static int domain_context_mapped(struct pci_dev *pdev)
1717 {
1718  int ret;
1719  struct pci_dev *tmp, *parent;
1720  struct intel_iommu *iommu;
1721 
1722  iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723  pdev->devfn);
1724  if (!iommu)
1725  return -ENODEV;
1726 
1727  ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728  if (!ret)
1729  return ret;
1730  /* dependent device mapping */
1731  tmp = pci_find_upstream_pcie_bridge(pdev);
1732  if (!tmp)
1733  return ret;
1734  /* Secondary interface's bus number and devfn 0 */
1735  parent = pdev->bus->self;
1736  while (parent != tmp) {
1737  ret = device_context_mapped(iommu, parent->bus->number,
1738  parent->devfn);
1739  if (!ret)
1740  return ret;
1741  parent = parent->bus->self;
1742  }
1743  if (pci_is_pcie(tmp))
1744  return device_context_mapped(iommu, tmp->subordinate->number,
1745  0);
1746  else
1747  return device_context_mapped(iommu, tmp->bus->number,
1748  tmp->devfn);
1749 }
1750 
1751 /* Returns a number of VTD pages, but aligned to MM page size */
1752 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753  size_t size)
1754 {
1755  host_addr &= ~PAGE_MASK;
1756  return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757 }
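/*
 * Worked example (assuming 4KiB MM pages): host_addr == 0x1100 and
 * size == 0x2000 leave an offset of 0x100, PAGE_ALIGN(0x100 + 0x2000) ==
 * 0x3000, so aligned_nrpages() returns 3 VT-d pages.
 */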
1758 
1759 /* Return largest possible superpage level for a given mapping */
1760 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761  unsigned long iov_pfn,
1762  unsigned long phy_pfn,
1763  unsigned long pages)
1764 {
1765  int support, level = 1;
1766  unsigned long pfnmerge;
1767 
1768  support = domain->iommu_superpage;
1769 
1770  /* To use a large page, the virtual *and* physical addresses
1771  must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772  of them will mean we have to use smaller pages. So just
1773  merge them and check both at once. */
1774  pfnmerge = iov_pfn | phy_pfn;
1775 
1776  while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777  pages >>= VTD_STRIDE_SHIFT;
1778  if (!pages)
1779  break;
1780  pfnmerge >>= VTD_STRIDE_SHIFT;
1781  level++;
1782  support--;
1783  }
1784  return level;
1785 }
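/*
 * Illustrative case for hardware_largepage_caps(): with iommu_superpage == 1
 * (2MiB pages supported), iov_pfn and phy_pfn both multiples of 512 and at
 * least 512 pages to map, the loop runs once and returns level 2, so the
 * caller can install a single 2MiB superpage PTE.
 */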
1786 
1787 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788  struct scatterlist *sg, unsigned long phys_pfn,
1789  unsigned long nr_pages, int prot)
1790 {
1791  struct dma_pte *first_pte = NULL, *pte = NULL;
1792  phys_addr_t uninitialized_var(pteval);
1793  int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794  unsigned long sg_res;
1795  unsigned int largepage_lvl = 0;
1796  unsigned long lvl_pages = 0;
1797 
1798  BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799 
1800  if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801  return -EINVAL;
1802 
1803  prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804 
1805  if (sg)
1806  sg_res = 0;
1807  else {
1808  sg_res = nr_pages + 1;
1809  pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810  }
1811 
1812  while (nr_pages > 0) {
1813  uint64_t tmp;
1814 
1815  if (!sg_res) {
1816  sg_res = aligned_nrpages(sg->offset, sg->length);
1817  sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818  sg->dma_length = sg->length;
1819  pteval = page_to_phys(sg_page(sg)) | prot;
1820  phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821  }
1822 
1823  if (!pte) {
1824  largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825 
1826  first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827  if (!pte)
1828  return -ENOMEM;
1829  /* It is a large page */
1830  if (largepage_lvl > 1)
1831  pteval |= DMA_PTE_LARGE_PAGE;
1832  else
1833  pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1834 
1835  }
1836  /* We don't need lock here, nobody else
1837  * touches the iova range
1838  */
1839  tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1840  if (tmp) {
1841  static int dumps = 5;
1842  printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1843  iov_pfn, tmp, (unsigned long long)pteval);
1844  if (dumps) {
1845  dumps--;
1846  debug_dma_dump_mappings(NULL);
1847  }
1848  WARN_ON(1);
1849  }
1850 
1851  lvl_pages = lvl_to_nr_pages(largepage_lvl);
1852 
1853  BUG_ON(nr_pages < lvl_pages);
1854  BUG_ON(sg_res < lvl_pages);
1855 
1856  nr_pages -= lvl_pages;
1857  iov_pfn += lvl_pages;
1858  phys_pfn += lvl_pages;
1859  pteval += lvl_pages * VTD_PAGE_SIZE;
1860  sg_res -= lvl_pages;
1861 
1862  /* If the next PTE would be the first in a new page, then we
1863  need to flush the cache on the entries we've just written.
1864  And then we'll need to recalculate 'pte', so clear it and
1865  let it get set again in the if (!pte) block above.
1866 
1867  If we're done (!nr_pages) we need to flush the cache too.
1868 
1869  Also if we've been setting superpages, we may need to
1870  recalculate 'pte' and switch back to smaller pages for the
1871  end of the mapping, if the trailing size is not enough to
1872  use another superpage (i.e. sg_res < lvl_pages). */
1873  pte++;
1874  if (!nr_pages || first_pte_in_page(pte) ||
1875  (largepage_lvl > 1 && sg_res < lvl_pages)) {
1876  domain_flush_cache(domain, first_pte,
1877  (void *)pte - (void *)first_pte);
1878  pte = NULL;
1879  }
1880 
1881  if (!sg_res && nr_pages)
1882  sg = sg_next(sg);
1883  }
1884  return 0;
1885 }
1886 
1887 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1888  struct scatterlist *sg, unsigned long nr_pages,
1889  int prot)
1890 {
1891  return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1892 }
1893 
1894 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895  unsigned long phys_pfn, unsigned long nr_pages,
1896  int prot)
1897 {
1898  return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1899 }
1900 
1901 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1902 {
1903  if (!iommu)
1904  return;
1905 
1906  clear_context_table(iommu, bus, devfn);
1907  iommu->flush.flush_context(iommu, 0, 0, 0,
1908  DMA_CCMD_GLOBAL_INVL);
1909  iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1910 }
1911 
1912 static inline void unlink_domain_info(struct device_domain_info *info)
1913 {
1914  assert_spin_locked(&device_domain_lock);
1915  list_del(&info->link);
1916  list_del(&info->global);
1917  if (info->dev)
1918  info->dev->dev.archdata.iommu = NULL;
1919 }
1920 
1921 static void domain_remove_dev_info(struct dmar_domain *domain)
1922 {
1923  struct device_domain_info *info;
1924  unsigned long flags;
1925  struct intel_iommu *iommu;
1926 
1927  spin_lock_irqsave(&device_domain_lock, flags);
1928  while (!list_empty(&domain->devices)) {
1929  info = list_entry(domain->devices.next,
1930  struct device_domain_info, link);
1931  unlink_domain_info(info);
1932  spin_unlock_irqrestore(&device_domain_lock, flags);
1933 
1934  iommu_disable_dev_iotlb(info);
1935  iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1936  iommu_detach_dev(iommu, info->bus, info->devfn);
1937  free_devinfo_mem(info);
1938 
1939  spin_lock_irqsave(&device_domain_lock, flags);
1940  }
1941  spin_unlock_irqrestore(&device_domain_lock, flags);
1942 }
1943 
1944 /*
1945  * find_domain
1946  * Note: struct pci_dev->dev.archdata.iommu stores the info
1947  */
1948 static struct dmar_domain *
1949 find_domain(struct pci_dev *pdev)
1950 {
1951  struct device_domain_info *info;
1952 
1953  /* No lock here, assumes no domain exit in normal case */
1954  info = pdev->dev.archdata.iommu;
1955  if (info)
1956  return info->domain;
1957  return NULL;
1958 }
1959 
1960 /* domain is initialized */
1961 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1962 {
1963  struct dmar_domain *domain, *found = NULL;
1964  struct intel_iommu *iommu;
1965  struct dmar_drhd_unit *drhd;
1966  struct device_domain_info *info, *tmp;
1967  struct pci_dev *dev_tmp;
1968  unsigned long flags;
1969  int bus = 0, devfn = 0;
1970  int segment;
1971  int ret;
1972 
1973  domain = find_domain(pdev);
1974  if (domain)
1975  return domain;
1976 
1977  segment = pci_domain_nr(pdev->bus);
1978 
1979  dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1980  if (dev_tmp) {
1981  if (pci_is_pcie(dev_tmp)) {
1982  bus = dev_tmp->subordinate->number;
1983  devfn = 0;
1984  } else {
1985  bus = dev_tmp->bus->number;
1986  devfn = dev_tmp->devfn;
1987  }
1988  spin_lock_irqsave(&device_domain_lock, flags);
1989  list_for_each_entry(info, &device_domain_list, global) {
1990  if (info->segment == segment &&
1991  info->bus == bus && info->devfn == devfn) {
1992  found = info->domain;
1993  break;
1994  }
1995  }
1996  spin_unlock_irqrestore(&device_domain_lock, flags);
1997  /* pcie-pci bridge already has a domain, use it */
1998  if (found) {
1999  domain = found;
2000  goto found_domain;
2001  }
2002  }
2003 
2004  domain = alloc_domain();
2005  if (!domain)
2006  goto error;
2007 
2008  /* Allocate new domain for the device */
2009  drhd = dmar_find_matched_drhd_unit(pdev);
2010  if (!drhd) {
2011  printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2012  pci_name(pdev));
2013  free_domain_mem(domain);
2014  return NULL;
2015  }
2016  iommu = drhd->iommu;
2017 
2018  ret = iommu_attach_domain(domain, iommu);
2019  if (ret) {
2020  free_domain_mem(domain);
2021  goto error;
2022  }
2023 
2024  if (domain_init(domain, gaw)) {
2025  domain_exit(domain);
2026  goto error;
2027  }
2028 
2029  /* register pcie-to-pci device */
2030  if (dev_tmp) {
2031  info = alloc_devinfo_mem();
2032  if (!info) {
2033  domain_exit(domain);
2034  goto error;
2035  }
2036  info->segment = segment;
2037  info->bus = bus;
2038  info->devfn = devfn;
2039  info->dev = NULL;
2040  info->domain = domain;
2041  /* This domain is shared by devices under p2p bridge */
2042  domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2043 
2044  /* pcie-to-pci bridge already has a domain, use it */
2045  found = NULL;
2046  spin_lock_irqsave(&device_domain_lock, flags);
2047  list_for_each_entry(tmp, &device_domain_list, global) {
2048  if (tmp->segment == segment &&
2049  tmp->bus == bus && tmp->devfn == devfn) {
2050  found = tmp->domain;
2051  break;
2052  }
2053  }
2054  if (found) {
2055  spin_unlock_irqrestore(&device_domain_lock, flags);
2056  free_devinfo_mem(info);
2057  domain_exit(domain);
2058  domain = found;
2059  } else {
2060  list_add(&info->link, &domain->devices);
2061  list_add(&info->global, &device_domain_list);
2062  spin_unlock_irqrestore(&device_domain_lock, flags);
2063  }
2064  }
2065 
2066 found_domain:
2067  info = alloc_devinfo_mem();
2068  if (!info)
2069  goto error;
2070  info->segment = segment;
2071  info->bus = pdev->bus->number;
2072  info->devfn = pdev->devfn;
2073  info->dev = pdev;
2074  info->domain = domain;
2075  spin_lock_irqsave(&device_domain_lock, flags);
2076  /* somebody is fast */
2077  found = find_domain(pdev);
2078  if (found != NULL) {
2079  spin_unlock_irqrestore(&device_domain_lock, flags);
2080  if (found != domain) {
2081  domain_exit(domain);
2082  domain = found;
2083  }
2084  free_devinfo_mem(info);
2085  return domain;
2086  }
2087  list_add(&info->link, &domain->devices);
2088  list_add(&info->global, &device_domain_list);
2089  pdev->dev.archdata.iommu = info;
2090  spin_unlock_irqrestore(&device_domain_lock, flags);
2091  return domain;
2092 error:
2093  /* recheck it here, maybe others set it */
2094  return find_domain(pdev);
2095 }
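/*
 * Note (editorial annotation): get_domain_for_dev() resolves a domain in this
 * order, as implemented above: reuse the domain already cached in
 * pdev->dev.archdata.iommu; otherwise share the domain of the upstream
 * PCIe-to-PCI bridge (all devices behind it alias to one source-id);
 * otherwise allocate a fresh domain, attach it to the IOMMU of the matching
 * DRHD unit, and publish it under device_domain_lock, rechecking for a
 * racing allocation before doing so.
 */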
2096 
2097 static int iommu_identity_mapping;
2098 #define IDENTMAP_ALL 1
2099 #define IDENTMAP_GFX 2
2100 #define IDENTMAP_AZALIA 4
2101 
2102 static int iommu_domain_identity_map(struct dmar_domain *domain,
2103  unsigned long long start,
2104  unsigned long long end)
2105 {
2106  unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2107  unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2108 
2109  if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2110  dma_to_mm_pfn(last_vpfn))) {
2111  printk(KERN_ERR "IOMMU: reserve iova failed\n");
2112  return -ENOMEM;
2113  }
2114 
2115  pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2116  start, end, domain->id);
2117  /*
2118  * RMRR range might have overlap with physical memory range,
2119  * clear it first
2120  */
2121  dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2122 
2123  return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2124  last_vpfn - first_vpfn + 1,
2125  DMA_PTE_READ|DMA_PTE_WRITE);
2126 }
2127 
2128 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2129  unsigned long long start,
2130  unsigned long long end)
2131 {
2132  struct dmar_domain *domain;
2133  int ret;
2134 
2135  domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2136  if (!domain)
2137  return -ENOMEM;
2138 
2139  /* For _hardware_ passthrough, don't bother. But for software
2140  passthrough, we do it anyway -- it may indicate a memory
2142  range which is reserved in E820 and thus didn't get set
2142  up to start with in si_domain */
2143  if (domain == si_domain && hw_pass_through) {
2144  printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2145  pci_name(pdev), start, end);
2146  return 0;
2147  }
2148 
2149  printk(KERN_INFO
2150  "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2151  pci_name(pdev), start, end);
2152 
2153  if (end < start) {
2154  WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2155  "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156  dmi_get_system_info(DMI_BIOS_VENDOR),
2157  dmi_get_system_info(DMI_BIOS_VERSION),
2158  dmi_get_system_info(DMI_PRODUCT_VERSION));
2159  ret = -EIO;
2160  goto error;
2161  }
2162 
2163  if (end >> agaw_to_width(domain->agaw)) {
2164  WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2165  "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2166  agaw_to_width(domain->agaw),
2167  dmi_get_system_info(DMI_BIOS_VENDOR),
2168  dmi_get_system_info(DMI_BIOS_VERSION),
2169  dmi_get_system_info(DMI_PRODUCT_VERSION));
2170  ret = -EIO;
2171  goto error;
2172  }
2173 
2174  ret = iommu_domain_identity_map(domain, start, end);
2175  if (ret)
2176  goto error;
2177 
2178  /* context entry init */
2179  ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2180  if (ret)
2181  goto error;
2182 
2183  return 0;
2184 
2185  error:
2186  domain_exit(domain);
2187  return ret;
2188 }
2189 
2190 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2191  struct pci_dev *pdev)
2192 {
2193  if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194  return 0;
2195  return iommu_prepare_identity_map(pdev, rmrr->base_address,
2196  rmrr->end_address);
2197 }
2198 
2199 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2200 static inline void iommu_prepare_isa(void)
2201 {
2202  struct pci_dev *pdev;
2203  int ret;
2204 
2205  pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2206  if (!pdev)
2207  return;
2208 
2209  printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2210  ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2211 
2212  if (ret)
2213  printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2214  "floppy might not work\n");
2215 
2216 }
2217 #else
2218 static inline void iommu_prepare_isa(void)
2219 {
2220  return;
2221 }
2222 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2223 
2224 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2225 
2226 static int __init si_domain_init(int hw)
2227 {
2228  struct dmar_drhd_unit *drhd;
2229  struct intel_iommu *iommu;
2230  int nid, ret = 0;
2231 
2232  si_domain = alloc_domain();
2233  if (!si_domain)
2234  return -EFAULT;
2235 
2236  pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2237 
2238  for_each_active_iommu(iommu, drhd) {
2239  ret = iommu_attach_domain(si_domain, iommu);
2240  if (ret) {
2241  domain_exit(si_domain);
2242  return -EFAULT;
2243  }
2244  }
2245 
2246  if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2247  domain_exit(si_domain);
2248  return -EFAULT;
2249  }
2250 
2251  si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2252 
2253  if (hw)
2254  return 0;
2255 
2256  for_each_online_node(nid) {
2257  unsigned long start_pfn, end_pfn;
2258  int i;
2259 
2260  for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2261  ret = iommu_domain_identity_map(si_domain,
2262  PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2263  if (ret)
2264  return ret;
2265  }
2266  }
2267 
2268  return 0;
2269 }
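/*
 * Note (editorial annotation): the si_domain built above is the single
 * "static identity" domain. Unless hardware pass-through is in use (hw != 0),
 * every usable RAM range reported by memblock is 1:1 mapped into it, so a
 * device attached to si_domain can DMA to any physical address without
 * per-buffer mappings.
 */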
2270 
2271 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2272  struct pci_dev *pdev);
2273 static int identity_mapping(struct pci_dev *pdev)
2274 {
2275  struct device_domain_info *info;
2276 
2277  if (likely(!iommu_identity_mapping))
2278  return 0;
2279 
2280  info = pdev->dev.archdata.iommu;
2281  if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2282  return (info->domain == si_domain);
2283 
2284  return 0;
2285 }
2286 
2287 static int domain_add_dev_info(struct dmar_domain *domain,
2288  struct pci_dev *pdev,
2289  int translation)
2290 {
2291  struct device_domain_info *info;
2292  unsigned long flags;
2293  int ret;
2294 
2295  info = alloc_devinfo_mem();
2296  if (!info)
2297  return -ENOMEM;
2298 
2299  info->segment = pci_domain_nr(pdev->bus);
2300  info->bus = pdev->bus->number;
2301  info->devfn = pdev->devfn;
2302  info->dev = pdev;
2303  info->domain = domain;
2304 
2305  spin_lock_irqsave(&device_domain_lock, flags);
2306  list_add(&info->link, &domain->devices);
2307  list_add(&info->global, &device_domain_list);
2308  pdev->dev.archdata.iommu = info;
2309  spin_unlock_irqrestore(&device_domain_lock, flags);
2310 
2311  ret = domain_context_mapping(domain, pdev, translation);
2312  if (ret) {
2313  spin_lock_irqsave(&device_domain_lock, flags);
2314  unlink_domain_info(info);
2315  spin_unlock_irqrestore(&device_domain_lock, flags);
2316  free_devinfo_mem(info);
2317  return ret;
2318  }
2319 
2320  return 0;
2321 }
2322 
2323 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2324 {
2325  if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2326  return 1;
2327 
2328  if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2329  return 1;
2330 
2331  if (!(iommu_identity_mapping & IDENTMAP_ALL))
2332  return 0;
2333 
2334  /*
2335  * We want to start off with all devices in the 1:1 domain, and
2336  * take them out later if we find they can't access all of memory.
2337  *
2338  * However, we can't do this for PCI devices behind bridges,
2339  * because all PCI devices behind the same bridge will end up
2340  * with the same source-id on their transactions.
2341  *
2342  * Practically speaking, we can't change things around for these
2343  * devices at run-time, because we can't be sure there'll be no
2344  * DMA transactions in flight for any of their siblings.
2345  *
2346  * So PCI devices (unless they're on the root bus) as well as
2347  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2348  * the 1:1 domain, just in _case_ one of their siblings turns out
2349  * not to be able to map all of memory.
2350  */
2351  if (!pci_is_pcie(pdev)) {
2352  if (!pci_is_root_bus(pdev->bus))
2353  return 0;
2354  if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2355  return 0;
2356  } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2357  return 0;
2358 
2359  /*
2360  * At boot time, we don't yet know if devices will be 64-bit capable.
2361  * Assume that they will -- if they turn out not to be, then we can
2362  * take them out of the 1:1 domain later.
2363  */
2364  if (!startup) {
2365  /*
2366  * If the device's dma_mask is less than the system's memory
2367  * size then this is not a candidate for identity mapping.
2368  */
2369  u64 dma_mask = pdev->dma_mask;
2370 
2371  if (pdev->dev.coherent_dma_mask &&
2372  pdev->dev.coherent_dma_mask < dma_mask)
2373  dma_mask = pdev->dev.coherent_dma_mask;
2374 
2375  return dma_mask >= dma_get_required_mask(&pdev->dev);
2376  }
2377 
2378  return 1;
2379 }
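/*
 * Note (editorial annotation): summary of the policy implemented above.
 * Azalia audio and graphics devices are identity-mapped when the matching
 * IDENTMAP_* bit is set; with IDENTMAP_ALL, PCIe endpoints and devices on the
 * root bus qualify, but legacy PCI devices behind bridges (and PCIe-to-PCI
 * bridges themselves) do not, because they share a source-id. After boot, a
 * device must also be able to address all of memory (dma_mask check) to stay
 * in the 1:1 domain.
 */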
2380 
2381 static int __init iommu_prepare_static_identity_mapping(int hw)
2382 {
2383  struct pci_dev *pdev = NULL;
2384  int ret;
2385 
2386  ret = si_domain_init(hw);
2387  if (ret)
2388  return -EFAULT;
2389 
2390  for_each_pci_dev(pdev) {
2391  if (iommu_should_identity_map(pdev, 1)) {
2392  ret = domain_add_dev_info(si_domain, pdev,
2393  hw ? CONTEXT_TT_PASS_THROUGH :
2394  CONTEXT_TT_MULTI_LEVEL);
2395  if (ret) {
2396  /* device not associated with an iommu */
2397  if (ret == -ENODEV)
2398  continue;
2399  return ret;
2400  }
2401  pr_info("IOMMU: %s identity mapping for device %s\n",
2402  hw ? "hardware" : "software", pci_name(pdev));
2403  }
2404  }
2405 
2406  return 0;
2407 }
2408 
2409 static int __init init_dmars(void)
2410 {
2411  struct dmar_drhd_unit *drhd;
2412  struct dmar_rmrr_unit *rmrr;
2413  struct pci_dev *pdev;
2414  struct intel_iommu *iommu;
2415  int i, ret;
2416 
2417  /*
2418  * for each drhd
2419  * allocate root
2420  * initialize and program root entry to not present
2421  * endfor
2422  */
2423  for_each_drhd_unit(drhd) {
2424  /*
2425  * lock not needed as this is only incremented in the single
2426  * threaded kernel __init code path all other access are read
2427  * only
2428  */
2429  if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2430  g_num_of_iommus++;
2431  continue;
2432  }
2433  printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2434  IOMMU_UNITS_SUPPORTED);
2435  }
2436 
2437  g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2438  GFP_KERNEL);
2439  if (!g_iommus) {
2440  printk(KERN_ERR "Allocating global iommu array failed\n");
2441  ret = -ENOMEM;
2442  goto error;
2443  }
2444 
2445  deferred_flush = kzalloc(g_num_of_iommus *
2446  sizeof(struct deferred_flush_tables), GFP_KERNEL);
2447  if (!deferred_flush) {
2448  ret = -ENOMEM;
2449  goto error;
2450  }
2451 
2452  for_each_drhd_unit(drhd) {
2453  if (drhd->ignored)
2454  continue;
2455 
2456  iommu = drhd->iommu;
2457  g_iommus[iommu->seq_id] = iommu;
2458 
2459  ret = iommu_init_domains(iommu);
2460  if (ret)
2461  goto error;
2462 
2463  /*
2464  * TBD:
2465  * we could share the same root & context tables
2466  * among all IOMMU's. Need to Split it later.
2467  */
2468  ret = iommu_alloc_root_entry(iommu);
2469  if (ret) {
2470  printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2471  goto error;
2472  }
2473  if (!ecap_pass_through(iommu->ecap))
2474  hw_pass_through = 0;
2475  }
2476 
2477  /*
2478  * Start from the sane iommu hardware state.
2479  */
2480  for_each_drhd_unit(drhd) {
2481  if (drhd->ignored)
2482  continue;
2483 
2484  iommu = drhd->iommu;
2485 
2486  /*
2487  * If the queued invalidation is already initialized by us
2488  * (for example, while enabling interrupt-remapping) then
2489  * we got the things already rolling from a sane state.
2490  */
2491  if (iommu->qi)
2492  continue;
2493 
2494  /*
2495  * Clear any previous faults.
2496  */
2497  dmar_fault(-1, iommu);
2498  /*
2499  * Disable queued invalidation if supported and already enabled
2500  * before OS handover.
2501  */
2502  dmar_disable_qi(iommu);
2503  }
2504 
2505  for_each_drhd_unit(drhd) {
2506  if (drhd->ignored)
2507  continue;
2508 
2509  iommu = drhd->iommu;
2510 
2511  if (dmar_enable_qi(iommu)) {
2512  /*
2513  * Queued Invalidate not enabled, use Register Based
2514  * Invalidate
2515  */
2516  iommu->flush.flush_context = __iommu_flush_context;
2517  iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2518  printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2519  "invalidation\n",
2520  iommu->seq_id,
2521  (unsigned long long)drhd->reg_base_addr);
2522  } else {
2523  iommu->flush.flush_context = qi_flush_context;
2524  iommu->flush.flush_iotlb = qi_flush_iotlb;
2525  printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2526  "invalidation\n",
2527  iommu->seq_id,
2528  (unsigned long long)drhd->reg_base_addr);
2529  }
2530  }
2531 
2532  if (iommu_pass_through)
2533  iommu_identity_mapping |= IDENTMAP_ALL;
2534 
2535 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2536  iommu_identity_mapping |= IDENTMAP_GFX;
2537 #endif
2538 
2539  check_tylersburg_isoch();
2540 
2541  /*
2542  * If pass through is not set or not enabled, setup context entries for
2543  * identity mappings for rmrr, gfx, and isa and may fall back to static
2544  * identity mapping if iommu_identity_mapping is set.
2545  */
2546  if (iommu_identity_mapping) {
2547  ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2548  if (ret) {
2549  printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2550  goto error;
2551  }
2552  }
2553  /*
2554  * For each rmrr
2555  * for each dev attached to rmrr
2556  * do
2557  * locate drhd for dev, alloc domain for dev
2558  * allocate free domain
2559  * allocate page table entries for rmrr
2560  * if context not allocated for bus
2561  * allocate and init context
2562  * set present in root table for this bus
2563  * init context with domain, translation etc
2564  * endfor
2565  * endfor
2566  */
2567  printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2568  for_each_rmrr_units(rmrr) {
2569  for (i = 0; i < rmrr->devices_cnt; i++) {
2570  pdev = rmrr->devices[i];
2571  /*
2572  * some BIOSes list non-existent devices in the DMAR
2573  * table.
2574  */
2575  if (!pdev)
2576  continue;
2577  ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2578  if (ret)
2579  printk(KERN_ERR
2580  "IOMMU: mapping reserved region failed\n");
2581  }
2582  }
2583 
2584  iommu_prepare_isa();
2585 
2586  /*
2587  * for each drhd
2588  * enable fault log
2589  * global invalidate context cache
2590  * global invalidate iotlb
2591  * enable translation
2592  */
2593  for_each_drhd_unit(drhd) {
2594  if (drhd->ignored) {
2595  /*
2596  * we always have to disable PMRs or DMA may fail on
2597  * this device
2598  */
2599  if (force_on)
2600  iommu_disable_protect_mem_regions(drhd->iommu);
2601  continue;
2602  }
2603  iommu = drhd->iommu;
2604 
2605  iommu_flush_write_buffer(iommu);
2606 
2607  ret = dmar_set_interrupt(iommu);
2608  if (ret)
2609  goto error;
2610 
2611  iommu_set_root_entry(iommu);
2612 
2613  iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2614  iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2615 
2616  ret = iommu_enable_translation(iommu);
2617  if (ret)
2618  goto error;
2619 
2620  iommu_disable_protect_mem_regions(iommu);
2621  }
2622 
2623  return 0;
2624 error:
2625  for_each_drhd_unit(drhd) {
2626  if (drhd->ignored)
2627  continue;
2628  iommu = drhd->iommu;
2629  free_iommu(iommu);
2630  }
2631  kfree(g_iommus);
2632  return ret;
2633 }
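/*
 * Note (editorial annotation): init_dmars() in outline, per the code above:
 * count the DRHD units and allocate the global g_iommus[] and deferred_flush[]
 * arrays; give each IOMMU its domain-id bitmap and root entry; reset to a sane
 * state (clear pending faults, disable any firmware-enabled queued
 * invalidation) and then select queued or register-based invalidation; set up
 * static identity, RMRR and ISA mappings as configured; finally program the
 * root entry, flush the context and IOTLB caches, and enable translation on
 * every unit.
 */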
2634 
2635 /* This takes a number of _MM_ pages, not VTD pages */
2636 static struct iova *intel_alloc_iova(struct device *dev,
2637  struct dmar_domain *domain,
2638  unsigned long nrpages, uint64_t dma_mask)
2639 {
2640  struct pci_dev *pdev = to_pci_dev(dev);
2641  struct iova *iova = NULL;
2642 
2643  /* Restrict dma_mask to the width that the iommu can handle */
2644  dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2645 
2646  if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2647  /*
2648  * First try to allocate an io virtual address in
2649  * DMA_BIT_MASK(32) and if that fails then try allocating
2650  * from higher range
2651  */
2652  iova = alloc_iova(&domain->iovad, nrpages,
2653  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2654  if (iova)
2655  return iova;
2656  }
2657  iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2658  if (unlikely(!iova)) {
2659  printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2660  nrpages, pci_name(pdev));
2661  return NULL;
2662  }
2663 
2664  return iova;
2665 }
2666 
2667 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2668 {
2669  struct dmar_domain *domain;
2670  int ret;
2671 
2672  domain = get_domain_for_dev(pdev,
2673  DEFAULT_DOMAIN_ADDRESS_WIDTH);
2674  if (!domain) {
2675  printk(KERN_ERR
2676  "Allocating domain for %s failed", pci_name(pdev));
2677  return NULL;
2678  }
2679 
2680  /* make sure context mapping is ok */
2681  if (unlikely(!domain_context_mapped(pdev))) {
2682  ret = domain_context_mapping(domain, pdev,
2683  CONTEXT_TT_MULTI_LEVEL);
2684  if (ret) {
2685  printk(KERN_ERR
2686  "Domain context map for %s failed",
2687  pci_name(pdev));
2688  return NULL;
2689  }
2690  }
2691 
2692  return domain;
2693 }
2694 
2695 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2696 {
2697  struct device_domain_info *info;
2698 
2699  /* No lock here, assumes no domain exit in normal case */
2700  info = dev->dev.archdata.iommu;
2701  if (likely(info))
2702  return info->domain;
2703 
2704  return __get_valid_domain_for_dev(dev);
2705 }
2706 
2707 static int iommu_dummy(struct pci_dev *pdev)
2708 {
2709  return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2710 }
2711 
2712 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2713 static int iommu_no_mapping(struct device *dev)
2714 {
2715  struct pci_dev *pdev;
2716  int found;
2717 
2718  if (unlikely(dev->bus != &pci_bus_type))
2719  return 1;
2720 
2721  pdev = to_pci_dev(dev);
2722  if (iommu_dummy(pdev))
2723  return 1;
2724 
2725  if (!iommu_identity_mapping)
2726  return 0;
2727 
2728  found = identity_mapping(pdev);
2729  if (found) {
2730  if (iommu_should_identity_map(pdev, 0))
2731  return 1;
2732  else {
2733  /*
2734  * 32 bit DMA is removed from si_domain and fall back
2735  * to non-identity mapping.
2736  */
2737  domain_remove_one_dev_info(si_domain, pdev);
2738  printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2739  pci_name(pdev));
2740  return 0;
2741  }
2742  } else {
2743  /*
2744  * In case of a detached 64 bit DMA device from vm, the device
2745  * is put into si_domain for identity mapping.
2746  */
2747  if (iommu_should_identity_map(pdev, 0)) {
2748  int ret;
2749  ret = domain_add_dev_info(si_domain, pdev,
2750  hw_pass_through ?
2751  CONTEXT_TT_PASS_THROUGH :
2752  CONTEXT_TT_MULTI_LEVEL);
2753  if (!ret) {
2754  printk(KERN_INFO "64bit %s uses identity mapping\n",
2755  pci_name(pdev));
2756  return 1;
2757  }
2758  }
2759  }
2760 
2761  return 0;
2762 }
2763 
2764 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2765  size_t size, int dir, u64 dma_mask)
2766 {
2767  struct pci_dev *pdev = to_pci_dev(hwdev);
2768  struct dmar_domain *domain;
2769  phys_addr_t start_paddr;
2770  struct iova *iova;
2771  int prot = 0;
2772  int ret;
2773  struct intel_iommu *iommu;
2774  unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2775 
2776  BUG_ON(dir == DMA_NONE);
2777 
2778  if (iommu_no_mapping(hwdev))
2779  return paddr;
2780 
2781  domain = get_valid_domain_for_dev(pdev);
2782  if (!domain)
2783  return 0;
2784 
2785  iommu = domain_get_iommu(domain);
2786  size = aligned_nrpages(paddr, size);
2787 
2788  iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2789  if (!iova)
2790  goto error;
2791 
2792  /*
2793  * Check if DMAR supports zero-length reads on write only
2794  * mappings..
2795  */
2796  if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2797  !cap_zlr(iommu->cap))
2798  prot |= DMA_PTE_READ;
2799  if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2800  prot |= DMA_PTE_WRITE;
2801  /*
2802  * paddr - (paddr + size) might be partial page, we should map the whole
2803  * page. Note: if two part of one page are separately mapped, we
2804  * might have two guest_addr mapping to the same host paddr, but this
2805  * is not a big problem
2806  */
2807  ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2808  mm_to_dma_pfn(paddr_pfn), size, prot);
2809  if (ret)
2810  goto error;
2811 
2812  /* it's a non-present to present mapping. Only flush if caching mode */
2813  if (cap_caching_mode(iommu->cap))
2814  iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2815  else
2816  iommu_flush_write_buffer(iommu);
2817 
2818  start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2819  start_paddr += paddr & ~PAGE_MASK;
2820  return start_paddr;
2821 
2822 error:
2823  if (iova)
2824  __free_iova(&domain->iovad, iova);
2825  printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2826  pci_name(pdev), size, (unsigned long long)paddr, dir);
2827  return 0;
2828 }
2829 
2830 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2831  unsigned long offset, size_t size,
2832  enum dma_data_direction dir,
2833  struct dma_attrs *attrs)
2834 {
2835  return __intel_map_single(dev, page_to_phys(page) + offset, size,
2836  dir, to_pci_dev(dev)->dma_mask);
2837 }
2838 
2839 static void flush_unmaps(void)
2840 {
2841  int i, j;
2842 
2843  timer_on = 0;
2844 
2845  /* just flush them all */
2846  for (i = 0; i < g_num_of_iommus; i++) {
2847  struct intel_iommu *iommu = g_iommus[i];
2848  if (!iommu)
2849  continue;
2850 
2851  if (!deferred_flush[i].next)
2852  continue;
2853 
2854  /* In caching mode, global flushes turn emulation expensive */
2855  if (!cap_caching_mode(iommu->cap))
2856  iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2857  DMA_TLB_GLOBAL_FLUSH);
2858  for (j = 0; j < deferred_flush[i].next; j++) {
2859  unsigned long mask;
2860  struct iova *iova = deferred_flush[i].iova[j];
2861  struct dmar_domain *domain = deferred_flush[i].domain[j];
2862 
2863  /* On real hardware multiple invalidations are expensive */
2864  if (cap_caching_mode(iommu->cap))
2865  iommu_flush_iotlb_psi(iommu, domain->id,
2866  iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2867  else {
2868  mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2869  iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2870  (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2871  }
2872  __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2873  }
2874  deferred_flush[i].next = 0;
2875  }
2876 
2877  list_size = 0;
2878 }
2879 
2880 static void flush_unmaps_timeout(unsigned long data)
2881 {
2882  unsigned long flags;
2883 
2884  spin_lock_irqsave(&async_umap_flush_lock, flags);
2885  flush_unmaps();
2886  spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2887 }
2888 
2889 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2890 {
2891  unsigned long flags;
2892  int next, iommu_id;
2893  struct intel_iommu *iommu;
2894 
2895  spin_lock_irqsave(&async_umap_flush_lock, flags);
2896  if (list_size == HIGH_WATER_MARK)
2897  flush_unmaps();
2898 
2899  iommu = domain_get_iommu(dom);
2900  iommu_id = iommu->seq_id;
2901 
2902  next = deferred_flush[iommu_id].next;
2903  deferred_flush[iommu_id].domain[next] = dom;
2904  deferred_flush[iommu_id].iova[next] = iova;
2905  deferred_flush[iommu_id].next++;
2906 
2907  if (!timer_on) {
2908  mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2909  timer_on = 1;
2910  }
2911  list_size++;
2912  spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2913 }
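/*
 * Note (editorial annotation): deferred unmap batching. Unless
 * intel_iommu_strict is set, intel_unmap_page() does not flush the IOTLB
 * synchronously. add_unmap() queues the IOVA in the per-IOMMU
 * deferred_flush[] table and arms a 10ms timer; flush_unmaps() then performs
 * one global invalidation (or per-domain PSI flushes in caching mode) for the
 * whole batch and frees the IOVAs, triggered either by the timer or once
 * HIGH_WATER_MARK entries have accumulated.
 */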
2914 
2915 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2916  size_t size, enum dma_data_direction dir,
2917  struct dma_attrs *attrs)
2918 {
2919  struct pci_dev *pdev = to_pci_dev(dev);
2920  struct dmar_domain *domain;
2921  unsigned long start_pfn, last_pfn;
2922  struct iova *iova;
2923  struct intel_iommu *iommu;
2924 
2925  if (iommu_no_mapping(dev))
2926  return;
2927 
2928  domain = find_domain(pdev);
2929  BUG_ON(!domain);
2930 
2931  iommu = domain_get_iommu(domain);
2932 
2933  iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2934  if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2935  (unsigned long long)dev_addr))
2936  return;
2937 
2938  start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2939  last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2940 
2941  pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2942  pci_name(pdev), start_pfn, last_pfn);
2943 
2944  /* clear the whole page */
2945  dma_pte_clear_range(domain, start_pfn, last_pfn);
2946 
2947  /* free page tables */
2948  dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2949 
2950  if (intel_iommu_strict) {
2951  iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2952  last_pfn - start_pfn + 1, 0);
2953  /* free iova */
2954  __free_iova(&domain->iovad, iova);
2955  } else {
2956  add_unmap(domain, iova);
2957  /*
2958  * queue up the release of the unmap to save the 1/6th of the
2959  * cpu used up by the iotlb flush operation...
2960  */
2961  }
2962 }
2963 
2964 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2965  dma_addr_t *dma_handle, gfp_t flags,
2966  struct dma_attrs *attrs)
2967 {
2968  void *vaddr;
2969  int order;
2970 
2971  size = PAGE_ALIGN(size);
2972  order = get_order(size);
2973 
2974  if (!iommu_no_mapping(hwdev))
2975  flags &= ~(GFP_DMA | GFP_DMA32);
2976  else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2977  if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2978  flags |= GFP_DMA;
2979  else
2980  flags |= GFP_DMA32;
2981  }
2982 
2983  vaddr = (void *)__get_free_pages(flags, order);
2984  if (!vaddr)
2985  return NULL;
2986  memset(vaddr, 0, size);
2987 
2988  *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2989  DMA_BIDIRECTIONAL,
2990  hwdev->coherent_dma_mask);
2991  if (*dma_handle)
2992  return vaddr;
2993  free_pages((unsigned long)vaddr, order);
2994  return NULL;
2995 }
2996 
2997 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2998  dma_addr_t dma_handle, struct dma_attrs *attrs)
2999 {
3000  int order;
3001 
3002  size = PAGE_ALIGN(size);
3003  order = get_order(size);
3004 
3005  intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3006  free_pages((unsigned long)vaddr, order);
3007 }
3008 
3009 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3010  int nelems, enum dma_data_direction dir,
3011  struct dma_attrs *attrs)
3012 {
3013  struct pci_dev *pdev = to_pci_dev(hwdev);
3014  struct dmar_domain *domain;
3015  unsigned long start_pfn, last_pfn;
3016  struct iova *iova;
3017  struct intel_iommu *iommu;
3018 
3019  if (iommu_no_mapping(hwdev))
3020  return;
3021 
3022  domain = find_domain(pdev);
3023  BUG_ON(!domain);
3024 
3025  iommu = domain_get_iommu(domain);
3026 
3027  iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3028  if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3029  (unsigned long long)sglist[0].dma_address))
3030  return;
3031 
3032  start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3033  last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3034 
3035  /* clear the whole page */
3036  dma_pte_clear_range(domain, start_pfn, last_pfn);
3037 
3038  /* free page tables */
3039  dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3040 
3041  if (intel_iommu_strict) {
3042  iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3043  last_pfn - start_pfn + 1, 0);
3044  /* free iova */
3045  __free_iova(&domain->iovad, iova);
3046  } else {
3047  add_unmap(domain, iova);
3048  /*
3049  * queue up the release of the unmap to save the 1/6th of the
3050  * cpu used up by the iotlb flush operation...
3051  */
3052  }
3053 }
3054 
3055 static int intel_nontranslate_map_sg(struct device *hddev,
3056  struct scatterlist *sglist, int nelems, int dir)
3057 {
3058  int i;
3059  struct scatterlist *sg;
3060 
3061  for_each_sg(sglist, sg, nelems, i) {
3062  BUG_ON(!sg_page(sg));
3063  sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3064  sg->dma_length = sg->length;
3065  }
3066  return nelems;
3067 }
3068 
3069 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3070  enum dma_data_direction dir, struct dma_attrs *attrs)
3071 {
3072  int i;
3073  struct pci_dev *pdev = to_pci_dev(hwdev);
3074  struct dmar_domain *domain;
3075  size_t size = 0;
3076  int prot = 0;
3077  struct iova *iova = NULL;
3078  int ret;
3079  struct scatterlist *sg;
3080  unsigned long start_vpfn;
3081  struct intel_iommu *iommu;
3082 
3083  BUG_ON(dir == DMA_NONE);
3084  if (iommu_no_mapping(hwdev))
3085  return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3086 
3087  domain = get_valid_domain_for_dev(pdev);
3088  if (!domain)
3089  return 0;
3090 
3091  iommu = domain_get_iommu(domain);
3092 
3093  for_each_sg(sglist, sg, nelems, i)
3094  size += aligned_nrpages(sg->offset, sg->length);
3095 
3096  iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3097  pdev->dma_mask);
3098  if (!iova) {
3099  sglist->dma_length = 0;
3100  return 0;
3101  }
3102 
3103  /*
3104  * Check if DMAR supports zero-length reads on write only
3105  * mappings..
3106  */
3107  if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3108  !cap_zlr(iommu->cap))
3109  prot |= DMA_PTE_READ;
3110  if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3111  prot |= DMA_PTE_WRITE;
3112 
3113  start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3114 
3115  ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3116  if (unlikely(ret)) {
3117  /* clear the page */
3118  dma_pte_clear_range(domain, start_vpfn,
3119  start_vpfn + size - 1);
3120  /* free page tables */
3121  dma_pte_free_pagetable(domain, start_vpfn,
3122  start_vpfn + size - 1);
3123  /* free iova */
3124  __free_iova(&domain->iovad, iova);
3125  return 0;
3126  }
3127 
3128  /* it's a non-present to present mapping. Only flush if caching mode */
3129  if (cap_caching_mode(iommu->cap))
3130  iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3131  else
3132  iommu_flush_write_buffer(iommu);
3133 
3134  return nelems;
3135 }
3136 
3137 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3138 {
3139  return !dma_addr;
3140 }
3141 
3142 struct dma_map_ops intel_dma_ops = {
3143  .alloc = intel_alloc_coherent,
3144  .free = intel_free_coherent,
3145  .map_sg = intel_map_sg,
3146  .unmap_sg = intel_unmap_sg,
3147  .map_page = intel_map_page,
3148  .unmap_page = intel_unmap_page,
3149  .mapping_error = intel_mapping_error,
3150 };
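/*
 * A minimal usage sketch (editorial, not part of intel-iommu.c): once
 * intel_iommu_init() installs intel_dma_ops as the global dma_ops, an
 * ordinary PCI driver's streaming DMA calls are dispatched to
 * intel_map_page()/intel_unmap_page() above. The driver function, device and
 * buffer below are hypothetical.
 */
static int example_do_dma(struct pci_dev *pdev, void *buf, size_t len)
{
	dma_addr_t handle;

	/* dispatches to intel_map_page() -> __intel_map_single() */
	handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(&pdev->dev, handle))
		return -ENOMEM;

	/* ... program 'handle' into the device and wait for completion ... */

	/* dispatches to intel_unmap_page(); the IOTLB flush may be deferred */
	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
	return 0;
}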
3151 
3152 static inline int iommu_domain_cache_init(void)
3153 {
3154  int ret = 0;
3155 
3156  iommu_domain_cache = kmem_cache_create("iommu_domain",
3157  sizeof(struct dmar_domain),
3158  0,
3159  SLAB_HWCACHE_ALIGN,
3160 
3161  NULL);
3162  if (!iommu_domain_cache) {
3163  printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3164  ret = -ENOMEM;
3165  }
3166 
3167  return ret;
3168 }
3169 
3170 static inline int iommu_devinfo_cache_init(void)
3171 {
3172  int ret = 0;
3173 
3174  iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3175  sizeof(struct device_domain_info),
3176  0,
3177  SLAB_HWCACHE_ALIGN,
3178  NULL);
3179  if (!iommu_devinfo_cache) {
3180  printk(KERN_ERR "Couldn't create devinfo cache\n");
3181  ret = -ENOMEM;
3182  }
3183 
3184  return ret;
3185 }
3186 
3187 static inline int iommu_iova_cache_init(void)
3188 {
3189  int ret = 0;
3190 
3191  iommu_iova_cache = kmem_cache_create("iommu_iova",
3192  sizeof(struct iova),
3193  0,
3194  SLAB_HWCACHE_ALIGN,
3195  NULL);
3196  if (!iommu_iova_cache) {
3197  printk(KERN_ERR "Couldn't create iova cache\n");
3198  ret = -ENOMEM;
3199  }
3200 
3201  return ret;
3202 }
3203 
3204 static int __init iommu_init_mempool(void)
3205 {
3206  int ret;
3207  ret = iommu_iova_cache_init();
3208  if (ret)
3209  return ret;
3210 
3211  ret = iommu_domain_cache_init();
3212  if (ret)
3213  goto domain_error;
3214 
3215  ret = iommu_devinfo_cache_init();
3216  if (!ret)
3217  return ret;
3218 
3219  kmem_cache_destroy(iommu_domain_cache);
3220 domain_error:
3221  kmem_cache_destroy(iommu_iova_cache);
3222 
3223  return -ENOMEM;
3224 }
3225 
3226 static void __init iommu_exit_mempool(void)
3227 {
3228  kmem_cache_destroy(iommu_devinfo_cache);
3229  kmem_cache_destroy(iommu_domain_cache);
3230  kmem_cache_destroy(iommu_iova_cache);
3231 
3232 }
3233 
3234 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3235 {
3236  struct dmar_drhd_unit *drhd;
3237  u32 vtbar;
3238  int rc;
3239 
3240  /* We know that this device on this chipset has its own IOMMU.
3241  * If we find it under a different IOMMU, then the BIOS is lying
3242  * to us. Hope that the IOMMU for this device is actually
3243  * disabled, and it needs no translation...
3244  */
3245  rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3246  if (rc) {
3247  /* "can't" happen */
3248  dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3249  return;
3250  }
3251  vtbar &= 0xffff0000;
3252 
3253  /* we know that this iommu should be at offset 0xa000 from vtbar */
3254  drhd = dmar_find_matched_drhd_unit(pdev);
3255  if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3256  TAINT_FIRMWARE_WORKAROUND,
3257  "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3258  pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259 }
3260 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3261 
3262 static void __init init_no_remapping_devices(void)
3263 {
3264  struct dmar_drhd_unit *drhd;
3265 
3266  for_each_drhd_unit(drhd) {
3267  if (!drhd->include_all) {
3268  int i;
3269  for (i = 0; i < drhd->devices_cnt; i++)
3270  if (drhd->devices[i] != NULL)
3271  break;
3272  /* ignore DMAR unit if no pci devices exist */
3273  if (i == drhd->devices_cnt)
3274  drhd->ignored = 1;
3275  }
3276  }
3277 
3278  for_each_drhd_unit(drhd) {
3279  int i;
3280  if (drhd->ignored || drhd->include_all)
3281  continue;
3282 
3283  for (i = 0; i < drhd->devices_cnt; i++)
3284  if (drhd->devices[i] &&
3285  !IS_GFX_DEVICE(drhd->devices[i]))
3286  break;
3287 
3288  if (i < drhd->devices_cnt)
3289  continue;
3290 
3291  /* This IOMMU has *only* gfx devices. Either bypass it or
3292  set the gfx_mapped flag, as appropriate */
3293  if (dmar_map_gfx) {
3294  intel_iommu_gfx_mapped = 1;
3295  } else {
3296  drhd->ignored = 1;
3297  for (i = 0; i < drhd->devices_cnt; i++) {
3298  if (!drhd->devices[i])
3299  continue;
3300  drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3301  }
3302  }
3303  }
3304 }
3305 
3306 #ifdef CONFIG_SUSPEND
3307 static int init_iommu_hw(void)
3308 {
3309  struct dmar_drhd_unit *drhd;
3310  struct intel_iommu *iommu = NULL;
3311 
3312  for_each_active_iommu(iommu, drhd)
3313  if (iommu->qi)
3314  dmar_reenable_qi(iommu);
3315 
3316  for_each_iommu(iommu, drhd) {
3317  if (drhd->ignored) {
3318  /*
3319  * we always have to disable PMRs or DMA may fail on
3320  * this device
3321  */
3322  if (force_on)
3323  iommu_disable_protect_mem_regions(iommu);
3324  continue;
3325  }
3326 
3327  iommu_flush_write_buffer(iommu);
3328 
3329  iommu_set_root_entry(iommu);
3330 
3331  iommu->flush.flush_context(iommu, 0, 0, 0,
3332  DMA_CCMD_GLOBAL_INVL);
3333  iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3334  DMA_TLB_GLOBAL_FLUSH);
3335  if (iommu_enable_translation(iommu))
3336  return 1;
3337  iommu_disable_protect_mem_regions(iommu);
3338  }
3339 
3340  return 0;
3341 }
3342 
3343 static void iommu_flush_all(void)
3344 {
3345  struct dmar_drhd_unit *drhd;
3346  struct intel_iommu *iommu;
3347 
3348  for_each_active_iommu(iommu, drhd) {
3349  iommu->flush.flush_context(iommu, 0, 0, 0,
3350  DMA_CCMD_GLOBAL_INVL);
3351  iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3352  DMA_TLB_GLOBAL_FLUSH);
3353  }
3354 }
3355 
3356 static int iommu_suspend(void)
3357 {
3358  struct dmar_drhd_unit *drhd;
3359  struct intel_iommu *iommu = NULL;
3360  unsigned long flag;
3361 
3362  for_each_active_iommu(iommu, drhd) {
3363  iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3364  GFP_ATOMIC);
3365  if (!iommu->iommu_state)
3366  goto nomem;
3367  }
3368 
3369  iommu_flush_all();
3370 
3371  for_each_active_iommu(iommu, drhd) {
3372  iommu_disable_translation(iommu);
3373 
3374  raw_spin_lock_irqsave(&iommu->register_lock, flag);
3375 
3376  iommu->iommu_state[SR_DMAR_FECTL_REG] =
3377  readl(iommu->reg + DMAR_FECTL_REG);
3378  iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3379  readl(iommu->reg + DMAR_FEDATA_REG);
3380  iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3381  readl(iommu->reg + DMAR_FEADDR_REG);
3382  iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3383  readl(iommu->reg + DMAR_FEUADDR_REG);
3384 
3385  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3386  }
3387  return 0;
3388 
3389 nomem:
3390  for_each_active_iommu(iommu, drhd)
3391  kfree(iommu->iommu_state);
3392 
3393  return -ENOMEM;
3394 }
3395 
3396 static void iommu_resume(void)
3397 {
3398  struct dmar_drhd_unit *drhd;
3399  struct intel_iommu *iommu = NULL;
3400  unsigned long flag;
3401 
3402  if (init_iommu_hw()) {
3403  if (force_on)
3404  panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3405  else
3406  WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3407  return;
3408  }
3409 
3410  for_each_active_iommu(iommu, drhd) {
3411 
3412  raw_spin_lock_irqsave(&iommu->register_lock, flag);
3413 
3414  writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3415  iommu->reg + DMAR_FECTL_REG);
3416  writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3417  iommu->reg + DMAR_FEDATA_REG);
3418  writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3419  iommu->reg + DMAR_FEADDR_REG);
3420  writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3421  iommu->reg + DMAR_FEUADDR_REG);
3422 
3423  raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3424  }
3425 
3426  for_each_active_iommu(iommu, drhd)
3427  kfree(iommu->iommu_state);
3428 }
3429 
3430 static struct syscore_ops iommu_syscore_ops = {
3431  .resume = iommu_resume,
3432  .suspend = iommu_suspend,
3433 };
3434 
3435 static void __init init_iommu_pm_ops(void)
3436 {
3437  register_syscore_ops(&iommu_syscore_ops);
3438 }
3439 
3440 #else
3441 static inline void init_iommu_pm_ops(void) {}
3442 #endif /* CONFIG_SUSPEND */
3443 
3444 LIST_HEAD(dmar_rmrr_units);
3445 
3446 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3447 {
3448  list_add(&rmrr->list, &dmar_rmrr_units);
3449 }
3450 
3451 
3452 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3453 {
3454  struct acpi_dmar_reserved_memory *rmrr;
3455  struct dmar_rmrr_unit *rmrru;
3456 
3457  rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3458  if (!rmrru)
3459  return -ENOMEM;
3460 
3461  rmrru->hdr = header;
3462  rmrr = (struct acpi_dmar_reserved_memory *)header;
3463  rmrru->base_address = rmrr->base_address;
3464  rmrru->end_address = rmrr->end_address;
3465 
3466  dmar_register_rmrr_unit(rmrru);
3467  return 0;
3468 }
3469 
3470 static int __init
3471 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3472 {
3473  struct acpi_dmar_reserved_memory *rmrr;
3474  int ret;
3475 
3476  rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3477  ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3478  ((void *)rmrr) + rmrr->header.length,
3479  &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3480 
3481  if (ret || (rmrru->devices_cnt == 0)) {
3482  list_del(&rmrru->list);
3483  kfree(rmrru);
3484  }
3485  return ret;
3486 }
3487 
3488 static LIST_HEAD(dmar_atsr_units);
3489 
3490 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3491 {
3492  struct acpi_dmar_atsr *atsr;
3493  struct dmar_atsr_unit *atsru;
3494 
3495  atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3496  atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3497  if (!atsru)
3498  return -ENOMEM;
3499 
3500  atsru->hdr = hdr;
3501  atsru->include_all = atsr->flags & 0x1;
3502 
3503  list_add(&atsru->list, &dmar_atsr_units);
3504 
3505  return 0;
3506 }
3507 
3508 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3509 {
3510  int rc;
3511  struct acpi_dmar_atsr *atsr;
3512 
3513  if (atsru->include_all)
3514  return 0;
3515 
3516  atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3517  rc = dmar_parse_dev_scope((void *)(atsr + 1),
3518  (void *)atsr + atsr->header.length,
3519  &atsru->devices_cnt, &atsru->devices,
3520  atsr->segment);
3521  if (rc || !atsru->devices_cnt) {
3522  list_del(&atsru->list);
3523  kfree(atsru);
3524  }
3525 
3526  return rc;
3527 }
3528 
3529 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3530 {
3531  int i;
3532  struct pci_bus *bus;
3533  struct acpi_dmar_atsr *atsr;
3534  struct dmar_atsr_unit *atsru;
3535 
3536  dev = pci_physfn(dev);
3537 
3538  list_for_each_entry(atsru, &dmar_atsr_units, list) {
3539  atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3540  if (atsr->segment == pci_domain_nr(dev->bus))
3541  goto found;
3542  }
3543 
3544  return 0;
3545 
3546 found:
3547  for (bus = dev->bus; bus; bus = bus->parent) {
3548  struct pci_dev *bridge = bus->self;
3549 
3550  if (!bridge || !pci_is_pcie(bridge) ||
3551  pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3552  return 0;
3553 
3554  if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3555  for (i = 0; i < atsru->devices_cnt; i++)
3556  if (atsru->devices[i] == bridge)
3557  return 1;
3558  break;
3559  }
3560  }
3561 
3562  if (atsru->include_all)
3563  return 1;
3564 
3565  return 0;
3566 }
3567 
3568 int __init dmar_parse_rmrr_atsr_dev(void)
3569 {
3570  struct dmar_rmrr_unit *rmrr, *rmrr_n;
3571  struct dmar_atsr_unit *atsr, *atsr_n;
3572  int ret = 0;
3573 
3574  list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3575  ret = rmrr_parse_dev(rmrr);
3576  if (ret)
3577  return ret;
3578  }
3579 
3580  list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3581  ret = atsr_parse_dev(atsr);
3582  if (ret)
3583  return ret;
3584  }
3585 
3586  return ret;
3587 }
3588 
3589 /*
3590  * Here we only respond to action of unbound device from driver.
3591  *
3592  * Added device is not attached to its DMAR domain here yet. That will happen
3593  * when mapping the device to iova.
3594  */
3595 static int device_notifier(struct notifier_block *nb,
3596  unsigned long action, void *data)
3597 {
3598  struct device *dev = data;
3599  struct pci_dev *pdev = to_pci_dev(dev);
3600  struct dmar_domain *domain;
3601 
3602  if (iommu_no_mapping(dev))
3603  return 0;
3604 
3605  domain = find_domain(pdev);
3606  if (!domain)
3607  return 0;
3608 
3609  if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3610  domain_remove_one_dev_info(domain, pdev);
3611 
3612  if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3613  !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3614  list_empty(&domain->devices))
3615  domain_exit(domain);
3616  }
3617 
3618  return 0;
3619 }
3620 
3621 static struct notifier_block device_nb = {
3622  .notifier_call = device_notifier,
3623 };
3624 
3625 int __init intel_iommu_init(void)
3626 {
3627  int ret = 0;
3628 
3629  /* VT-d is required for a TXT/tboot launch, so enforce that */
3630  force_on = tboot_force_iommu();
3631 
3632  if (dmar_table_init()) {
3633  if (force_on)
3634  panic("tboot: Failed to initialize DMAR table\n");
3635  return -ENODEV;
3636  }
3637 
3638  if (dmar_dev_scope_init() < 0) {
3639  if (force_on)
3640  panic("tboot: Failed to initialize DMAR device scope\n");
3641  return -ENODEV;
3642  }
3643 
3644  if (no_iommu || dmar_disabled)
3645  return -ENODEV;
3646 
3647  if (iommu_init_mempool()) {
3648  if (force_on)
3649  panic("tboot: Failed to initialize iommu memory\n");
3650  return -ENODEV;
3651  }
3652 
3653  if (list_empty(&dmar_rmrr_units))
3654  printk(KERN_INFO "DMAR: No RMRR found\n");
3655 
3656  if (list_empty(&dmar_atsr_units))
3657  printk(KERN_INFO "DMAR: No ATSR found\n");
3658 
3659  if (dmar_init_reserved_ranges()) {
3660  if (force_on)
3661  panic("tboot: Failed to reserve iommu ranges\n");
3662  return -ENODEV;
3663  }
3664 
3665  init_no_remapping_devices();
3666 
3667  ret = init_dmars();
3668  if (ret) {
3669  if (force_on)
3670  panic("tboot: Failed to initialize DMARs\n");
3671  printk(KERN_ERR "IOMMU: dmar init failed\n");
3672  put_iova_domain(&reserved_iova_list);
3673  iommu_exit_mempool();
3674  return ret;
3675  }
3676  printk(KERN_INFO
3677  "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3678 
3679  init_timer(&unmap_timer);
3680 #ifdef CONFIG_SWIOTLB
3681  swiotlb = 0;
3682 #endif
3683  dma_ops = &intel_dma_ops;
3684 
3685  init_iommu_pm_ops();
3686 
3687  bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3688 
3689  bus_register_notifier(&pci_bus_type, &device_nb);
3690 
3691  intel_iommu_enabled = 1;
3692 
3693  return 0;
3694 }
3695 
3696 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3697  struct pci_dev *pdev)
3698 {
3699  struct pci_dev *tmp, *parent;
3700 
3701  if (!iommu || !pdev)
3702  return;
3703 
3704  /* dependent device detach */
3705  tmp = pci_find_upstream_pcie_bridge(pdev);
3706  /* Secondary interface's bus number and devfn 0 */
3707  if (tmp) {
3708  parent = pdev->bus->self;
3709  while (parent != tmp) {
3710  iommu_detach_dev(iommu, parent->bus->number,
3711  parent->devfn);
3712  parent = parent->bus->self;
3713  }
3714  if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3715  iommu_detach_dev(iommu,
3716  tmp->subordinate->number, 0);
3717  else /* this is a legacy PCI bridge */
3718  iommu_detach_dev(iommu, tmp->bus->number,
3719  tmp->devfn);
3720  }
3721 }
3722 
3723 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3724  struct pci_dev *pdev)
3725 {
3726  struct device_domain_info *info;
3727  struct intel_iommu *iommu;
3728  unsigned long flags;
3729  int found = 0;
3730  struct list_head *entry, *tmp;
3731 
3732  iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3733  pdev->devfn);
3734  if (!iommu)
3735  return;
3736 
3737  spin_lock_irqsave(&device_domain_lock, flags);
3738  list_for_each_safe(entry, tmp, &domain->devices) {
3739  info = list_entry(entry, struct device_domain_info, link);
3740  if (info->segment == pci_domain_nr(pdev->bus) &&
3741  info->bus == pdev->bus->number &&
3742  info->devfn == pdev->devfn) {
3743  unlink_domain_info(info);
3744  spin_unlock_irqrestore(&device_domain_lock, flags);
3745 
3746  iommu_disable_dev_iotlb(info);
3747  iommu_detach_dev(iommu, info->bus, info->devfn);
3748  iommu_detach_dependent_devices(iommu, pdev);
3749  free_devinfo_mem(info);
3750 
3751  spin_lock_irqsave(&device_domain_lock, flags);
3752 
3753  if (found)
3754  break;
3755  else
3756  continue;
3757  }
3758 
3759  /* if there is no other devices under the same iommu
3760  * owned by this domain, clear this iommu in iommu_bmp
3761  * update iommu count and coherency
3762  */
3763  if (iommu == device_to_iommu(info->segment, info->bus,
3764  info->devfn))
3765  found = 1;
3766  }
3767 
3768  spin_unlock_irqrestore(&device_domain_lock, flags);
3769 
3770  if (found == 0) {
3771  unsigned long tmp_flags;
3772  spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3773  clear_bit(iommu->seq_id, domain->iommu_bmp);
3774  domain->iommu_count--;
3775  domain_update_iommu_cap(domain);
3776  spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3777 
3778  if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3779  !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3780  spin_lock_irqsave(&iommu->lock, tmp_flags);
3781  clear_bit(domain->id, iommu->domain_ids);
3782  iommu->domains[domain->id] = NULL;
3783  spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3784  }
3785  }
3786 }
3787 
3788 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3789 {
3790  struct device_domain_info *info;
3791  struct intel_iommu *iommu;
3792  unsigned long flags1, flags2;
3793 
3794  spin_lock_irqsave(&device_domain_lock, flags1);
3795  while (!list_empty(&domain->devices)) {
3796  info = list_entry(domain->devices.next,
3797  struct device_domain_info, link);
3798  unlink_domain_info(info);
3799  spin_unlock_irqrestore(&device_domain_lock, flags1);
3800 
3801  iommu_disable_dev_iotlb(info);
3802  iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3803  iommu_detach_dev(iommu, info->bus, info->devfn);
3804  iommu_detach_dependent_devices(iommu, info->dev);
3805 
3806  /* clear this iommu in iommu_bmp, update iommu count
3807  * and capabilities
3808  */
3809  spin_lock_irqsave(&domain->iommu_lock, flags2);
3810  if (test_and_clear_bit(iommu->seq_id,
3811  domain->iommu_bmp)) {
3812  domain->iommu_count--;
3813  domain_update_iommu_cap(domain);
3814  }
3815  spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3816 
3817  free_devinfo_mem(info);
3818  spin_lock_irqsave(&device_domain_lock, flags1);
3819  }
3820  spin_unlock_irqrestore(&device_domain_lock, flags1);
3821 }
3822 
3823 /* domain id for virtual machine, it won't be set in context */
3824 static unsigned long vm_domid;
3825 
3826 static struct dmar_domain *iommu_alloc_vm_domain(void)
3827 {
3828  struct dmar_domain *domain;
3829 
3830  domain = alloc_domain_mem();
3831  if (!domain)
3832  return NULL;
3833 
3834  domain->id = vm_domid++;
3835  domain->nid = -1;
3836  memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3837  domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3838 
3839  return domain;
3840 }
3841 
3842 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3843 {
3844  int adjust_width;
3845 
3846  init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3847  spin_lock_init(&domain->iommu_lock);
3848 
3849  domain_reserve_special_ranges(domain);
3850 
3851  /* calculate AGAW */
3852  domain->gaw = guest_width;
3853  adjust_width = guestwidth_to_adjustwidth(guest_width);
3854  domain->agaw = width_to_agaw(adjust_width);
3855 
3856  INIT_LIST_HEAD(&domain->devices);
3857 
3858  domain->iommu_count = 0;
3859  domain->iommu_coherency = 0;
3860  domain->iommu_snooping = 0;
3861  domain->iommu_superpage = 0;
3862  domain->max_addr = 0;
3863  domain->nid = -1;
3864 
3865  /* always allocate the top pgd */
3866  domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3867  if (!domain->pgd)
3868  return -ENOMEM;
3869  domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3870  return 0;
3871 }
3872 
3873 static void iommu_free_vm_domain(struct dmar_domain *domain)
3874 {
3875  unsigned long flags;
3876  struct dmar_drhd_unit *drhd;
3877  struct intel_iommu *iommu;
3878  unsigned long i;
3879  unsigned long ndomains;
3880 
3881  for_each_drhd_unit(drhd) {
3882  if (drhd->ignored)
3883  continue;
3884  iommu = drhd->iommu;
3885 
3886  ndomains = cap_ndoms(iommu->cap);
3887  for_each_set_bit(i, iommu->domain_ids, ndomains) {
3888  if (iommu->domains[i] == domain) {
3889  spin_lock_irqsave(&iommu->lock, flags);
3890  clear_bit(i, iommu->domain_ids);
3891  iommu->domains[i] = NULL;
3892  spin_unlock_irqrestore(&iommu->lock, flags);
3893  break;
3894  }
3895  }
3896  }
3897 }
3898 
3899 static void vm_domain_exit(struct dmar_domain *domain)
3900 {
3901  /* Domain 0 is reserved, so don't process it */
3902  if (!domain)
3903  return;
3904 
3905  vm_domain_remove_all_dev_info(domain);
3906  /* destroy iovas */
3907  put_iova_domain(&domain->iovad);
3908 
3909  /* clear ptes */
3910  dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3911 
3912  /* free page tables */
3913  dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3914 
3915  iommu_free_vm_domain(domain);
3916  free_domain_mem(domain);
3917 }
3918 
3919 static int intel_iommu_domain_init(struct iommu_domain *domain)
3920 {
3921  struct dmar_domain *dmar_domain;
3922 
3923  dmar_domain = iommu_alloc_vm_domain();
3924  if (!dmar_domain) {
3925  printk(KERN_ERR
3926  "intel_iommu_domain_init: dmar_domain == NULL\n");
3927  return -ENOMEM;
3928  }
3929  if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3930  printk(KERN_ERR
3931  "intel_iommu_domain_init() failed\n");
3932  vm_domain_exit(dmar_domain);
3933  return -ENOMEM;
3934  }
3935  domain_update_iommu_cap(dmar_domain);
3936  domain->priv = dmar_domain;
3937 
3938  domain->geometry.aperture_start = 0;
3939  domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3940  domain->geometry.force_aperture = true;
3941 
3942  return 0;
3943 }
3944 
3945 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3946 {
3947  struct dmar_domain *dmar_domain = domain->priv;
3948 
3949  domain->priv = NULL;
3950  vm_domain_exit(dmar_domain);
3951 }
3952 
3953 static int intel_iommu_attach_device(struct iommu_domain *domain,
3954  struct device *dev)
3955 {
3956  struct dmar_domain *dmar_domain = domain->priv;
3957  struct pci_dev *pdev = to_pci_dev(dev);
3958  struct intel_iommu *iommu;
3959  int addr_width;
3960 
3961  /* normally pdev is not mapped */
3962  if (unlikely(domain_context_mapped(pdev))) {
3963  struct dmar_domain *old_domain;
3964 
3965  old_domain = find_domain(pdev);
3966  if (old_domain) {
3967  if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3968  dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3969  domain_remove_one_dev_info(old_domain, pdev);
3970  else
3971  domain_remove_dev_info(old_domain);
3972  }
3973  }
3974 
3975  iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3976  pdev->devfn);
3977  if (!iommu)
3978  return -ENODEV;
3979 
3980  /* check if this iommu agaw is sufficient for max mapped address */
3981  addr_width = agaw_to_width(iommu->agaw);
3982  if (addr_width > cap_mgaw(iommu->cap))
3983  addr_width = cap_mgaw(iommu->cap);
3984 
3985  if (dmar_domain->max_addr > (1LL << addr_width)) {
3986  printk(KERN_ERR "%s: iommu width (%d) is not "
3987  "sufficient for the mapped address (%llx)\n",
3988  __func__, addr_width, dmar_domain->max_addr);
3989  return -EFAULT;
3990  }
3991  dmar_domain->gaw = addr_width;
3992 
3993  /*
3994  * Knock out extra levels of page tables if necessary
3995  */
3996  while (iommu->agaw < dmar_domain->agaw) {
3997  struct dma_pte *pte;
3998 
3999  pte = dmar_domain->pgd;
4000  if (dma_pte_present(pte)) {
4001  dmar_domain->pgd = (struct dma_pte *)
4002  phys_to_virt(dma_pte_addr(pte));
4003  free_pgtable_page(pte);
4004  }
4005  dmar_domain->agaw--;
4006  }
4007 
4008  return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4009 }
4010 
4011 static void intel_iommu_detach_device(struct iommu_domain *domain,
4012  struct device *dev)
4013 {
4014  struct dmar_domain *dmar_domain = domain->priv;
4015  struct pci_dev *pdev = to_pci_dev(dev);
4016 
4017  domain_remove_one_dev_info(dmar_domain, pdev);
4018 }
4019 
4020 static int intel_iommu_map(struct iommu_domain *domain,
4021  unsigned long iova, phys_addr_t hpa,
4022  size_t size, int iommu_prot)
4023 {
4024  struct dmar_domain *dmar_domain = domain->priv;
4025  u64 max_addr;
4026  int prot = 0;
4027  int ret;
4028 
4029  if (iommu_prot & IOMMU_READ)
4030  prot |= DMA_PTE_READ;
4031  if (iommu_prot & IOMMU_WRITE)
4032  prot |= DMA_PTE_WRITE;
4033  if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4034  prot |= DMA_PTE_SNP;
4035 
4036  max_addr = iova + size;
4037  if (dmar_domain->max_addr < max_addr) {
4038  u64 end;
4039 
4040  /* check if minimum agaw is sufficient for mapped address */
4041  end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4042  if (end < max_addr) {
4043  printk(KERN_ERR "%s: iommu width (%d) is not "
4044  "sufficient for the mapped address (%llx)\n",
4045  __func__, dmar_domain->gaw, max_addr);
4046  return -EFAULT;
4047  }
4048  dmar_domain->max_addr = max_addr;
4049  }
4050  /* Round up size to next multiple of PAGE_SIZE, if it and
4051  the low bits of hpa would take us onto the next page */
4052  size = aligned_nrpages(hpa, size);
4053  ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4054  hpa >> VTD_PAGE_SHIFT, size, prot);
4055  return ret;
4056 }
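/*
 * A quick worked example of the rounding above, assuming 4KiB VT-d pages:
 * for hpa = 0x12345234 and size = 0x2000, the mapping starts 0x234 bytes
 * into a page, so the region touches three 4KiB pages even though the
 * requested size is only two pages long.  aligned_nrpages() therefore
 * yields 3, and domain_pfn_mapping() is asked to map three pages starting
 * at pfn hpa >> VTD_PAGE_SHIFT = 0x12345.
 */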
4057 
4058 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4059  unsigned long iova, size_t size)
4060 {
4061  struct dmar_domain *dmar_domain = domain->priv;
4062  int order;
4063 
4064  order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4065  (iova + size - 1) >> VTD_PAGE_SHIFT);
4066 
4067  if (dmar_domain->max_addr == iova + size)
4068  dmar_domain->max_addr = iova;
4069 
4070  return PAGE_SIZE << order;
4071 }
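/*
 * Usage note: the size returned here is what the generic iommu_unmap()
 * loop in drivers/iommu/iommu.c uses to advance through the region, so
 * returning PAGE_SIZE << order allows the core to account for the case
 * where dma_pte_clear_range() ends up clearing a large-page PTE (e.g. a
 * 2MiB superpage) rather than a single 4KiB entry.
 */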
4072 
4073 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4074  unsigned long iova)
4075 {
4076  struct dmar_domain *dmar_domain = domain->priv;
4077  struct dma_pte *pte;
4078  u64 phys = 0;
4079 
4080  pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4081  if (pte)
4082  phys = dma_pte_addr(pte);
4083 
4084  return phys;
4085 }
4086 
4087 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4088  unsigned long cap)
4089 {
4090  struct dmar_domain *dmar_domain = domain->priv;
4091 
4092  if (cap == IOMMU_CAP_CACHE_COHERENCY)
4093  return dmar_domain->iommu_snooping;
4094  if (cap == IOMMU_CAP_INTR_REMAP)
4095  return irq_remapping_enabled;
4096 
4097  return 0;
4098 }
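/*
 * Usage note: consumers query these capabilities via
 * iommu_domain_has_cap().  KVM device assignment, for instance, checks
 * IOMMU_CAP_CACHE_COHERENCY before adding IOMMU_CACHE to its mapping
 * flags (which is what makes the DMA_PTE_SNP path in intel_iommu_map()
 * reachable), while interrupt remapping is reported only if the
 * irq_remapping code actually enabled it.
 */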
4099 
4100 static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4101 {
4102  pci_dev_put(*from);
4103  *from = to;
4104 }
4105 
4106 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4107 
4108 static int intel_iommu_add_device(struct device *dev)
4109 {
4110  struct pci_dev *pdev = to_pci_dev(dev);
4111  struct pci_dev *bridge, *dma_pdev = NULL;
4112  struct iommu_group *group;
4113  int ret;
4114 
4115  if (!device_to_iommu(pci_domain_nr(pdev->bus),
4116  pdev->bus->number, pdev->devfn))
4117  return -ENODEV;
4118 
4119  bridge = pci_find_upstream_pcie_bridge(pdev);
4120  if (bridge) {
4121  if (pci_is_pcie(bridge))
4122  dma_pdev = pci_get_domain_bus_and_slot(
4123  pci_domain_nr(pdev->bus),
4124  bridge->subordinate->number, 0);
4125  if (!dma_pdev)
4126  dma_pdev = pci_dev_get(bridge);
4127  } else
4128  dma_pdev = pci_dev_get(pdev);
4129 
4130  /* Account for quirked devices */
4131  swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4132 
4133  /*
4134  * If it's a multifunction device that does not support our
4135  * required ACS flags, add to the same group as function 0.
4136  */
4137  if (dma_pdev->multifunction &&
4138  !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4139  swap_pci_ref(&dma_pdev,
4140  pci_get_slot(dma_pdev->bus,
4141  PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4142  0)));
4143 
4144  /*
4145  * Devices on the root bus go through the iommu. If that's not us,
4146  * find the next upstream device and test ACS up to the root bus.
4147  * Finding the next device may require skipping virtual buses.
4148  */
4149  while (!pci_is_root_bus(dma_pdev->bus)) {
4150  struct pci_bus *bus = dma_pdev->bus;
4151 
4152  while (!bus->self) {
4153  if (!pci_is_root_bus(bus))
4154  bus = bus->parent;
4155  else
4156  goto root_bus;
4157  }
4158 
4160  break;
4159  if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4160  break;
4161 
4162  swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4163  }
4164 
4165 root_bus:
4166  group = iommu_group_get(&dma_pdev->dev);
4167  pci_dev_put(dma_pdev);
4168  if (!group) {
4169  group = iommu_group_alloc();
4170  if (IS_ERR(group))
4171  return PTR_ERR(group);
4172  }
4173 
4174  ret = iommu_group_add_device(group, dev);
4175 
4176  iommu_group_put(group);
4177  return ret;
4178 }
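/*
 * Two illustrative outcomes of the walk above, for hypothetical
 * topologies: a multifunction device whose functions lack the required
 * ACS capabilities resolves dma_pdev to function 0 via pci_get_slot(),
 * so all of its functions end up in one iommu_group; a conventional PCI
 * device behind a PCIe-to-PCI bridge is attributed to the bridge (or to
 * devfn 0 on the bridge's secondary bus), so it shares a group with
 * every other device whose DMA is aliased through that bridge.
 */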
4179 
4180 static void intel_iommu_remove_device(struct device *dev)
4181 {
4182  iommu_group_remove_device(dev);
4183 }
4184 
4185 static struct iommu_ops intel_iommu_ops = {
4186  .domain_init = intel_iommu_domain_init,
4187  .domain_destroy = intel_iommu_domain_destroy,
4188  .attach_dev = intel_iommu_attach_device,
4189  .detach_dev = intel_iommu_detach_device,
4190  .map = intel_iommu_map,
4191  .unmap = intel_iommu_unmap,
4192  .iova_to_phys = intel_iommu_iova_to_phys,
4193  .domain_has_cap = intel_iommu_domain_has_cap,
4194  .add_device = intel_iommu_add_device,
4195  .remove_device = intel_iommu_remove_device,
4196  .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4197 };
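/*
 * A minimal sketch (not part of this driver) of how a consumer such as
 * VFIO or KVM device assignment would exercise the callbacks above
 * through the generic API in <linux/iommu.h>.  The IOVA 0x100000, the
 * physical address 0x42000 and the function name are placeholders.
 */
static int __maybe_unused example_intel_iommu_usage(struct device *dev)
{
	struct iommu_domain *domain;
	phys_addr_t phys;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* ->domain_init */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);		/* ->attach_dev */
	if (ret)
		goto out_free;

	/* ->map: one 4KiB page, readable and writable */
	ret = iommu_map(domain, 0x100000, 0x42000, PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	phys = iommu_iova_to_phys(domain, 0x100000);	/* ->iova_to_phys */
	pr_debug("iova 0x100000 -> phys %llx\n", (unsigned long long)phys);

	iommu_unmap(domain, 0x100000, PAGE_SIZE);	/* ->unmap */

out_detach:
	iommu_detach_device(domain, dev);		/* ->detach_dev */
out_free:
	iommu_domain_free(domain);			/* ->domain_destroy */
	return ret;
}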
4198 
4199 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4200 {
4201  /*
4202  * Mobile 4 Series Chipset neglects to set RWBF capability,
4203  * but needs it:
4204  */
4205  printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4206  rwbf_quirk = 1;
4207 
4208  /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4209  if (dev->revision == 0x07) {
4210  printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4211  dmar_map_gfx = 0;
4212  }
4213 }
4214 
4215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4216 
4217 #define GGC 0x52
4218 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4219 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4220 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4221 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4222 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4223 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4224 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4225 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4226 
4227 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4228 {
4229  unsigned short ggc;
4230 
4231  if (pci_read_config_word(dev, GGC, &ggc))
4232  return;
4233 
4234  if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4235  printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4236  dmar_map_gfx = 0;
4237  } else if (dmar_map_gfx) {
4238  /* we have to ensure the gfx device is idle before we flush */
4239  printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4240  intel_iommu_strict = 1;
4241  }
4242 }
4243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
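/*
 * Illustrative GGC decodes for the quirk above (hypothetical register
 * values): ggc = 0x0100 (GGC_MEMORY_SIZE_1M, VT bit clear) takes the
 * first branch and turns off the graphics IOMMU, while ggc = 0x0b00
 * (GGC_MEMORY_SIZE_4M_VT, which includes GGC_MEMORY_VT_ENABLED) leaves
 * dmar_map_gfx alone but switches to strict IOTLB flushing.
 */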
4247 
4248 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4249  ISOCH DMAR unit for the Azalia sound device, but not give it any
4250  TLB entries, which causes it to deadlock. Check for that. We do
4251  this in a function called from init_dmars(), instead of in a PCI
4252  quirk, because we don't want to print the obnoxious "BIOS broken"
4253  message if VT-d is actually disabled.
4254 */
4255 static void __init check_tylersburg_isoch(void)
4256 {
4257  struct pci_dev *pdev;
4258  uint32_t vtisochctrl;
4259 
4260  /* If there's no Azalia in the system anyway, forget it. */
4261  pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4262  if (!pdev)
4263  return;
4264  pci_dev_put(pdev);
4265 
4266  /* System Management Registers. Might be hidden, in which case
4267  we can't do the sanity check. But that's OK, because the
4268  known-broken BIOSes _don't_ actually hide it, so far. */
4269  pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4270  if (!pdev)
4271  return;
4272 
4273  if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4274  pci_dev_put(pdev);
4275  return;
4276  }
4277 
4278  pci_dev_put(pdev);
4279 
4280  /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4281  if (vtisochctrl & 1)
4282  return;
4283 
4284  /* Drop all bits other than the number of TLB entries */
4285  vtisochctrl &= 0x1c;
4286 
4287  /* If we have the recommended number of TLB entries (16), fine. */
4288  if (vtisochctrl == 0x10)
4289  return;
4290 
4291  /* Zero TLB entries? You get to ride the short bus to school. */
4292  if (!vtisochctrl) {
4293  WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4294  "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4295  dmi_get_system_info(DMI_BIOS_VENDOR),
4296  dmi_get_system_info(DMI_BIOS_VERSION),
4297  dmi_get_system_info(DMI_PRODUCT_VERSION));
4298  iommu_identity_mapping |= IDENTMAP_AZALIA;
4299  return;
4300  }
4301 
4302  printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4303  vtisochctrl);
4304 }