Linux Kernel  3.7.1
setup.c
1 /*
2  * Copyright 2010 Tilera Corporation. All Rights Reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation, version 2.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11  * NON INFRINGEMENT. See the GNU General Public License for
12  * more details.
13  */
14 
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/mmzone.h>
18 #include <linux/bootmem.h>
19 #include <linux/module.h>
20 #include <linux/node.h>
21 #include <linux/cpu.h>
22 #include <linux/ioport.h>
23 #include <linux/irq.h>
24 #include <linux/kexec.h>
25 #include <linux/pci.h>
26 #include <linux/swiotlb.h>
27 #include <linux/initrd.h>
28 #include <linux/io.h>
29 #include <linux/highmem.h>
30 #include <linux/smp.h>
31 #include <linux/timex.h>
32 #include <linux/hugetlb.h>
33 #include <linux/start_kernel.h>
34 #include <asm/setup.h>
35 #include <asm/sections.h>
36 #include <asm/cacheflush.h>
37 #include <asm/pgalloc.h>
38 #include <asm/mmu_context.h>
39 #include <hv/hypervisor.h>
40 #include <arch/interrupts.h>
41 
42 /* <linux/smp.h> doesn't provide this definition. */
43 #ifndef CONFIG_SMP
44 #define setup_max_cpus 1
45 #endif
46 
47 static inline int ABS(int x) { return x >= 0 ? x : -x; }
48 
49 /* Chip information */
50 char chip_model[64] __write_once;
51 
52 struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
53 EXPORT_SYMBOL(node_data);
54 
55 /* Information on the NUMA nodes that we compute early */
56 unsigned long node_start_pfn[MAX_NUMNODES];
57 unsigned long node_end_pfn[MAX_NUMNODES];
58 unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
59 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
60 unsigned long __initdata node_free_pfn[MAX_NUMNODES];
61 
62 static unsigned long __initdata node_percpu[MAX_NUMNODES];
63 
64 /*
65  * per-CPU stack and boot info.
66  */
67 DEFINE_PER_CPU(unsigned long, boot_sp) =
68  (unsigned long)init_stack + THREAD_SIZE;
69 
70 #ifdef CONFIG_SMP
71 DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel;
72 #else
73 /*
74  * The variable must be __initdata since it references __init code.
75  * With CONFIG_SMP it is per-cpu data, which is exempt from validation.
76  */
77 unsigned long __initdata boot_pc = (unsigned long)start_kernel;
78 #endif
79 
80 #ifdef CONFIG_HIGHMEM
81 /* Page frame index of end of lowmem on each controller. */
82 unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
83 
84 /* Number of pages that can be mapped into lowmem. */
85 static unsigned long __initdata mappable_physpages;
86 #endif
87 
88 /* Data on which physical memory controller corresponds to which NUMA node */
89 int node_controller[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = -1 };
90 
91 #ifdef CONFIG_HIGHMEM
92 /* Map information from VAs to PAs */
93 unsigned long pbase_map[1 << (32 - HPAGE_SHIFT)]
94  __write_once __attribute__((aligned(L2_CACHE_BYTES)));
95 EXPORT_SYMBOL(pbase_map);
96 
97 /* Map information from PAs to VAs */
98 void *vbase_map[NR_PA_HIGHBIT_VALUES]
99  __write_once __attribute__((aligned(L2_CACHE_BYTES)));
100 EXPORT_SYMBOL(vbase_map);
101 #endif
102 
103 /* Node number as a function of the high PA bits */
104 int highbits_to_node[NR_PA_HIGHBIT_VALUES] __write_once;
105 EXPORT_SYMBOL(highbits_to_node);
106 
107 static unsigned int __initdata maxmem_pfn = -1U;
108 static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = {
109  [0 ... MAX_NUMNODES-1] = -1U
110 };
111 static nodemask_t __initdata isolnodes;
112 
113 #if defined(CONFIG_PCI) && !defined(__tilegx__)
114 enum { DEFAULT_PCI_RESERVE_MB = 64 };
115 static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB;
116 unsigned long __initdata pci_reserve_start_pfn = -1U;
117 unsigned long __initdata pci_reserve_end_pfn = -1U;
118 #endif
119 
120 static int __init setup_maxmem(char *str)
121 {
122  unsigned long long maxmem;
123  if (str == NULL || (maxmem = memparse(str, NULL)) == 0)
124  return -EINVAL;
125 
126  maxmem_pfn = (maxmem >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT);
127  pr_info("Forcing RAM used to no more than %dMB\n",
128  maxmem_pfn >> (20 - PAGE_SHIFT));
129  return 0;
130 }
131 early_param("maxmem", setup_maxmem);
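/*
 * Worked example for the rounding above (illustrative, not part of
 * the original file; assumes 4 KB pages, PAGE_SHIFT = 12, and 16 MB
 * huge pages, HPAGE_SHIFT = 24): booting with "maxmem=520M" makes
 * memparse() return 545259520 bytes; 545259520 >> 24 = 32 huge pages,
 * and 32 << (24 - 12) = 131072 small pages, so the limit is rounded
 * down to 512MB (always a whole number of huge pages) and the
 * pr_info() reports "no more than 512MB".
 */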
132 
133 static int __init setup_maxnodemem(char *str)
134 {
135  char *endp;
136  unsigned long long maxnodemem;
137  long node;
138 
139  node = str ? simple_strtoul(str, &endp, 0) : INT_MAX;
140  if (node >= MAX_NUMNODES || *endp != ':')
141  return -EINVAL;
142 
143  maxnodemem = memparse(endp+1, NULL);
144  maxnodemem_pfn[node] = (maxnodemem >> HPAGE_SHIFT) <<
145  (HPAGE_SHIFT - PAGE_SHIFT);
146  pr_info("Forcing RAM used on node %ld to no more than %dMB\n",
147  node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT));
148  return 0;
149 }
150 early_param("maxnodemem", setup_maxnodemem);
151 
152 static int __init setup_isolnodes(char *str)
153 {
154  char buf[MAX_NUMNODES * 5];
155  if (str == NULL || nodelist_parse(str, isolnodes) != 0)
156  return -EINVAL;
157 
158  nodelist_scnprintf(buf, sizeof(buf), isolnodes);
159  pr_info("Set isolnodes value to '%s'\n", buf);
160  return 0;
161 }
162 early_param("isolnodes", setup_isolnodes);
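/*
 * Example command line combining the three early parameters above
 * (values illustrative only):
 *
 *   maxmem=768M maxnodemem=1:512M isolnodes=2-3
 *
 * caps total RAM at 768MB, caps controller 1 at 512MB, and keeps
 * nodes 2 and 3 out of the default set used when assigning cpus
 * and lowmem to NUMA nodes.
 */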
163 
164 #if defined(CONFIG_PCI) && !defined(__tilegx__)
165 static int __init setup_pci_reserve(char* str)
166 {
167  unsigned long mb;
168 
169  if (str == NULL || strict_strtoul(str, 0, &mb) != 0 ||
170  mb > 3 * 1024)
171  return -EINVAL;
172 
173  pci_reserve_mb = mb;
174  pr_info("Reserving %dMB for PCIE root complex mappings\n",
175  pci_reserve_mb);
176  return 0;
177 }
178 early_param("pci_reserve", setup_pci_reserve);
179 #endif
180 
181 #ifndef __tilegx__
182 /*
183  * vmalloc=size forces the vmalloc area to be exactly 'size' bytes.
184  * This can be used to increase (or decrease) the vmalloc area.
185  */
186 static int __init parse_vmalloc(char *arg)
187 {
188  if (!arg)
189  return -EINVAL;
190 
191  VMALLOC_RESERVE = (memparse(arg, &arg) + PGDIR_SIZE - 1) & PGDIR_MASK;
192 
193  /* See validate_va() for more on this test. */
194  if ((long)_VMALLOC_START >= 0)
195  early_panic("\"vmalloc=%#lx\" value too large: maximum %#lx\n",
196  VMALLOC_RESERVE, _VMALLOC_END - 0x80000000UL);
197 
198  return 0;
199 }
200 early_param("vmalloc", parse_vmalloc);
201 #endif
202 
203 #ifdef CONFIG_HIGHMEM
204 /*
205  * Determine for each controller where its lowmem is mapped and how much of
206  * it is mapped there. On controller zero, the first few megabytes are
207  * already mapped in as code at MEM_SV_INTRPT, so in principle we could
208  * start our data mappings higher up, but for now we don't bother, to avoid
209  * additional confusion.
210  *
211  * One question is whether, on systems with more than 768 Mb and
212  * controllers of different sizes, to map in a proportionate amount of
213  * each one, or to try to map the same amount from each controller.
214  * (E.g. if we have three controllers with 256MB, 1GB, and 256MB
215  * respectively, do we map 256MB from each, or do we map 128 MB, 512
216  * MB, and 128 MB respectively?) For now we use a proportionate
217  * solution like the latter.
218  *
219  * The VA/PA mapping demands that we align our decisions at 16 MB
220  * boundaries so that we can rapidly convert VA to PA.
221  */
222 static void *__init setup_pa_va_mapping(void)
223 {
224  unsigned long curr_pages = 0;
225  unsigned long vaddr = PAGE_OFFSET;
226  nodemask_t highonlynodes = isolnodes;
227  int i, j;
228 
229  memset(pbase_map, -1, sizeof(pbase_map));
230  memset(vbase_map, -1, sizeof(vbase_map));
231 
232  /* Node zero cannot be isolated for LOWMEM purposes. */
233  node_clear(0, highonlynodes);
234 
235  /* Count up the number of pages on non-highonlynodes controllers. */
236  mappable_physpages = 0;
237  for_each_online_node(i) {
238  if (!node_isset(i, highonlynodes))
239  mappable_physpages +=
240  node_end_pfn[i] - node_start_pfn[i];
241  }
242 
243  for_each_online_node(i) {
244  unsigned long start = node_start_pfn[i];
245  unsigned long end = node_end_pfn[i];
246  unsigned long size = end - start;
247  unsigned long vaddr_end;
248 
249  if (node_isset(i, highonlynodes)) {
250  /* Mark this controller as having no lowmem. */
251  node_lowmem_end_pfn[i] = start;
252  continue;
253  }
254 
255  curr_pages += size;
256  if (mappable_physpages > MAXMEM_PFN) {
257  vaddr_end = PAGE_OFFSET +
258  (((u64)curr_pages * MAXMEM_PFN /
259  mappable_physpages)
260  << PAGE_SHIFT);
261  } else {
262  vaddr_end = PAGE_OFFSET + (curr_pages << PAGE_SHIFT);
263  }
264  for (j = 0; vaddr < vaddr_end; vaddr += HPAGE_SIZE, ++j) {
265  unsigned long this_pfn =
266  start + (j << HUGETLB_PAGE_ORDER);
267  pbase_map[vaddr >> HPAGE_SHIFT] = this_pfn;
268  if (vbase_map[__pfn_to_highbits(this_pfn)] ==
269  (void *)-1)
270  vbase_map[__pfn_to_highbits(this_pfn)] =
271  (void *)(vaddr & HPAGE_MASK);
272  }
273  node_lowmem_end_pfn[i] = start + (j << HUGETLB_PAGE_ORDER);
274  BUG_ON(node_lowmem_end_pfn[i] > end);
275  }
276 
277  /* Return highest address of any mapped memory. */
278  return (void *)vaddr;
279 }
280 #endif /* CONFIG_HIGHMEM */
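/*
 * Worked example of the proportional split described above
 * (illustrative figures): with controllers of 256MB, 1GB and 256MB
 * (1.5GB mappable) and a 768MB lowmem budget, each controller gets
 * size * 768 / 1536 of lowmem, i.e. 128MB, 512MB and 128MB; the
 * loop in setup_pa_va_mapping() then advances vaddr in HPAGE_SIZE
 * (16MB) steps while filling pbase_map/vbase_map, and anything
 * beyond a controller's share remains highmem.
 */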
281 
282 /*
283  * Register our most important memory mappings with the debug stub.
284  *
285  * This is up to 4 mappings for lowmem, one mapping per memory
286  * controller, plus one for our text segment.
287  */
288 static void __cpuinit store_permanent_mappings(void)
289 {
290  int i;
291 
292  for_each_online_node(i) {
293  HV_PhysAddr pa = ((HV_PhysAddr)node_start_pfn[i]) << PAGE_SHIFT;
294 #ifdef CONFIG_HIGHMEM
295  HV_PhysAddr high_mapped_pa = node_lowmem_end_pfn[i];
296 #else
297  HV_PhysAddr high_mapped_pa = node_end_pfn[i];
298 #endif
299 
300  unsigned long pages = high_mapped_pa - node_start_pfn[i];
301  HV_VirtAddr addr = (HV_VirtAddr) __va(pa);
302  hv_store_mapping(addr, pages << PAGE_SHIFT, pa);
303  }
304 
305  hv_store_mapping((HV_VirtAddr)_stext,
306  (uint32_t)(_einittext - _stext), 0);
307 }
308 
309 /*
310  * Use hv_inquire_physical() to populate node_{start,end}_pfn[]
311  * and node_online_map, doing suitable sanity-checking.
312  * Also set min_low_pfn, max_low_pfn, and max_pfn.
313  */
314 static void __init setup_memory(void)
315 {
316  int i, j;
317  int highbits_seen[NR_PA_HIGHBIT_VALUES] = { 0 };
318 #ifdef CONFIG_HIGHMEM
319  long highmem_pages;
320 #endif
321 #ifndef __tilegx__
322  int cap;
323 #endif
324 #if defined(CONFIG_HIGHMEM) || defined(__tilegx__)
325  long lowmem_pages;
326 #endif
327 
328  /* We are using a char to hold the cpu_2_node[] mapping */
329  BUILD_BUG_ON(MAX_NUMNODES > 127);
330 
331  /* Discover the ranges of memory available to us */
332  for (i = 0; ; ++i) {
333  unsigned long start, size, end, highbits;
334  HV_PhysAddrRange range = hv_inquire_physical(i);
335  if (range.size == 0)
336  break;
337 #ifdef CONFIG_FLATMEM
338  if (i > 0) {
339  pr_err("Can't use discontiguous PAs: %#llx..%#llx\n",
340  range.size, range.start + range.size);
341  continue;
342  }
343 #endif
344 #ifndef __tilegx__
345  if ((unsigned long)range.start) {
346  pr_err("Range not at 4GB multiple: %#llx..%#llx\n",
347  range.start, range.start + range.size);
348  continue;
349  }
350 #endif
351  if ((range.start & (HPAGE_SIZE-1)) != 0 ||
352  (range.size & (HPAGE_SIZE-1)) != 0) {
353  unsigned long long start_pa = range.start;
354  unsigned long long orig_size = range.size;
355  range.start = (start_pa + HPAGE_SIZE - 1) & HPAGE_MASK;
356  range.size -= (range.start - start_pa);
357  range.size &= HPAGE_MASK;
358  pr_err("Range not hugepage-aligned: %#llx..%#llx:"
359  " now %#llx-%#llx\n",
360  start_pa, start_pa + orig_size,
361  range.start, range.start + range.size);
362  }
363  highbits = __pa_to_highbits(range.start);
364  if (highbits >= NR_PA_HIGHBIT_VALUES) {
365  pr_err("PA high bits too high: %#llx..%#llx\n",
366  range.start, range.start + range.size);
367  continue;
368  }
369  if (highbits_seen[highbits]) {
370  pr_err("Range overlaps in high bits: %#llx..%#llx\n",
371  range.start, range.start + range.size);
372  continue;
373  }
374  highbits_seen[highbits] = 1;
375  if (PFN_DOWN(range.size) > maxnodemem_pfn[i]) {
376  int max_size = maxnodemem_pfn[i];
377  if (max_size > 0) {
378  pr_err("Maxnodemem reduced node %d to"
379  " %d pages\n", i, max_size);
380  range.size = PFN_PHYS(max_size);
381  } else {
382  pr_err("Maxnodemem disabled node %d\n", i);
383  continue;
384  }
385  }
386  if (num_physpages + PFN_DOWN(range.size) > maxmem_pfn) {
387  int max_size = maxmem_pfn - num_physpages;
388  if (max_size > 0) {
389  pr_err("Maxmem reduced node %d to %d pages\n",
390  i, max_size);
391  range.size = PFN_PHYS(max_size);
392  } else {
393  pr_err("Maxmem disabled node %d\n", i);
394  continue;
395  }
396  }
397  if (i >= MAX_NUMNODES) {
398  pr_err("Too many PA nodes (#%d): %#llx...%#llx\n",
399  i, range.size, range.size + range.start);
400  continue;
401  }
402 
403  start = range.start >> PAGE_SHIFT;
404  size = range.size >> PAGE_SHIFT;
405  end = start + size;
406 
407 #ifndef __tilegx__
408  if (((HV_PhysAddr)end << PAGE_SHIFT) !=
409  (range.start + range.size)) {
410  pr_err("PAs too high to represent: %#llx..%#llx\n",
411  range.start, range.start + range.size);
412  continue;
413  }
414 #endif
415 #if defined(CONFIG_PCI) && !defined(__tilegx__)
416  /*
417  * Blocks that overlap the pci reserved region must
418  * have enough space to hold the maximum percpu data
419  * region at the top of the range. If there isn't
420  * enough space above the reserved region, just
421  * truncate the node.
422  */
423  if (start <= pci_reserve_start_pfn &&
424  end > pci_reserve_start_pfn) {
425  unsigned int per_cpu_size =
426  __per_cpu_end - __per_cpu_start;
427  unsigned int percpu_pages =
428  NR_CPUS * (PFN_UP(per_cpu_size) >> PAGE_SHIFT);
429  if (end < pci_reserve_end_pfn + percpu_pages) {
430  end = pci_reserve_start_pfn;
431  pr_err("PCI mapping region reduced node %d to"
432  " %ld pages\n", i, end - start);
433  }
434  }
435 #endif
436 
437  for (j = __pfn_to_highbits(start);
438  j <= __pfn_to_highbits(end - 1); j++)
439  highbits_to_node[j] = i;
440 
441  node_start_pfn[i] = start;
442  node_end_pfn[i] = end;
443  node_controller[i] = range.controller;
444  num_physpages += size;
445  max_pfn = end;
446 
447  /* Mark node as online */
448  node_set(i, node_online_map);
449  node_set(i, node_possible_map);
450  }
451 
452 #ifndef __tilegx__
453  /*
454  * For 4KB pages, mem_map "struct page" data is 1% of the size
455  * of the physical memory, so can be quite big (640 MB for
456  * four 16G zones). These structures must be mapped in
457  * lowmem, and since we currently cap out at about 768 MB,
458  * it's impractical to try to use this much address space.
459  * For now, arbitrarily cap the amount of physical memory
460  * we're willing to use at 8 million pages (32GB of 4KB pages).
461  */
462  cap = 8 * 1024 * 1024; /* 8 million pages */
463  if (num_physpages > cap) {
464  int num_nodes = num_online_nodes();
465  int cap_each = cap / num_nodes;
466  unsigned long dropped_pages = 0;
467  for (i = 0; i < num_nodes; ++i) {
468  int size = node_end_pfn[i] - node_start_pfn[i];
469  if (size > cap_each) {
470  dropped_pages += (size - cap_each);
471  node_end_pfn[i] = node_start_pfn[i] + cap_each;
472  }
473  }
474  num_physpages -= dropped_pages;
475  pr_warning("Only using %ldMB memory;"
476  " ignoring %ldMB.\n",
477  num_physpages >> (20 - PAGE_SHIFT),
478  dropped_pages >> (20 - PAGE_SHIFT));
479  pr_warning("Consider using a larger page size.\n");
480  }
481 #endif
482 
483  /* Heap starts just above the last loaded address. */
484  min_low_pfn = PFN_UP((unsigned long)_end - PAGE_OFFSET);
485 
486 #ifdef CONFIG_HIGHMEM
487  /* Find where we map lowmem from each controller. */
488  high_memory = setup_pa_va_mapping();
489 
490  /* Set max_low_pfn based on what node 0 can directly address. */
491  max_low_pfn = node_lowmem_end_pfn[0];
492 
493  lowmem_pages = (mappable_physpages > MAXMEM_PFN) ?
494  MAXMEM_PFN : mappable_physpages;
495  highmem_pages = (long) (num_physpages - lowmem_pages);
496 
497  pr_notice("%ldMB HIGHMEM available.\n",
498  pages_to_mb(highmem_pages > 0 ? highmem_pages : 0));
499  pr_notice("%ldMB LOWMEM available.\n",
500  pages_to_mb(lowmem_pages));
501 #else
502  /* Set max_low_pfn based on what node 0 can directly address. */
503  max_low_pfn = node_end_pfn[0];
504 
505 #ifndef __tilegx__
506  if (node_end_pfn[0] > MAXMEM_PFN) {
507  pr_warning("Only using %ldMB LOWMEM.\n",
508  MAXMEM>>20);
509  pr_warning("Use a HIGHMEM enabled kernel.\n");
510  max_low_pfn = MAXMEM_PFN;
511  max_pfn = MAXMEM_PFN;
512  num_physpages = MAXMEM_PFN;
513  node_end_pfn[0] = MAXMEM_PFN;
514  } else {
515  pr_notice("%ldMB memory available.\n",
516  pages_to_mb(node_end_pfn[0]));
517  }
518  for (i = 1; i < MAX_NUMNODES; ++i) {
519  node_start_pfn[i] = 0;
520  node_end_pfn[i] = 0;
521  }
523 #else
524  lowmem_pages = 0;
525  for (i = 0; i < MAX_NUMNODES; ++i) {
526  int pages = node_end_pfn[i] - node_start_pfn[i];
527  lowmem_pages += pages;
528  if (pages)
529  high_memory = pfn_to_kaddr(node_end_pfn[i]);
530  }
531  pr_notice("%ldMB memory available.\n",
532  pages_to_mb(lowmem_pages));
533 #endif
534 #endif
535 }
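/*
 * Worked example of the hugepage alignment fix-up in setup_memory()
 * (illustrative numbers, 16MB huge pages assumed): a hypervisor
 * range of 0x00300000..0x40100000 is not hugepage-aligned, so its
 * start is rounded up to 0x01000000 and its size is trimmed to a
 * multiple of HPAGE_SIZE, leaving 0x01000000..0x40000000; the
 * pr_err() in that path reports both the original and the clamped
 * range.
 */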
536 
537 /*
538  * On 32-bit machines, we only put bootmem on the low controller,
539  * since PAs > 4GB can't be used in bootmem. In principle one could
540  * imagine, e.g., multiple 1 GB controllers all of which could support
541  * bootmem, but in practice using controllers this small isn't a
542  * particularly interesting scenario, so we just keep it simple and
543  * use only the first controller for bootmem on 32-bit machines.
544  */
545 static inline int node_has_bootmem(int nid)
546 {
547 #ifdef CONFIG_64BIT
548  return 1;
549 #else
550  return nid == 0;
551 #endif
552 }
553 
554 static inline unsigned long alloc_bootmem_pfn(int nid,
555  unsigned long size,
556  unsigned long goal)
557 {
558  void *kva = __alloc_bootmem_node(NODE_DATA(nid), size,
559  PAGE_SIZE, goal);
560  unsigned long pfn = kaddr_to_pfn(kva);
561  BUG_ON(goal && PFN_PHYS(pfn) != goal);
562  return pfn;
563 }
564 
565 static void __init setup_bootmem_allocator_node(int i)
566 {
567  unsigned long start, end, mapsize, mapstart;
568 
569  if (node_has_bootmem(i)) {
570  NODE_DATA(i)->bdata = &bootmem_node_data[i];
571  } else {
572  /* Share controller zero's bdata for now. */
573  NODE_DATA(i)->bdata = &bootmem_node_data[0];
574  return;
575  }
576 
577  /* Skip up to after the bss in node 0. */
578  start = (i == 0) ? min_low_pfn : node_start_pfn[i];
579 
580  /* Only lowmem, if we're a HIGHMEM build. */
581 #ifdef CONFIG_HIGHMEM
582  end = node_lowmem_end_pfn[i];
583 #else
584  end = node_end_pfn[i];
585 #endif
586 
587  /* No memory here. */
588  if (end == start)
589  return;
590 
591  /* Figure out where the bootmem bitmap is located. */
592  mapsize = bootmem_bootmap_pages(end - start);
593  if (i == 0) {
594  /* Use some space right before the heap on node 0. */
595  mapstart = start;
596  start += mapsize;
597  } else {
598  /* Allocate bitmap on node 0 to avoid page table issues. */
599  mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0);
600  }
601 
602  /* Initialize a node. */
603  init_bootmem_node(NODE_DATA(i), mapstart, start, end);
604 
605  /* Free all the space back into the allocator. */
606  free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start));
607 
608 #if defined(CONFIG_PCI) && !defined(__tilegx__)
609  /*
610  * Throw away any memory aliased by the PCI region.
611  */
612  if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
613  reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
614  PFN_PHYS(pci_reserve_end_pfn -
615  pci_reserve_start_pfn),
616  BOOTMEM_EXCLUSIVE);
617 #endif
618 }
619 
620 static void __init setup_bootmem_allocator(void)
621 {
622  int i;
623  for (i = 0; i < MAX_NUMNODES; ++i)
624  setup_bootmem_allocator_node(i);
625 
626 #ifdef CONFIG_KEXEC
627  if (crashk_res.start != crashk_res.end)
628  reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0);
629 #endif
630 }
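/*
 * Bitmap sizing sketch for the allocator above (illustrative, 4KB
 * pages assumed): bootmem keeps one bit per page, so a 768MB node
 * spans 196608 pages and needs a 24576-byte bitmap, i.e.
 * bootmem_bootmap_pages() returns 6 pages.  Node 0 places those
 * pages at "mapstart", right after the kernel image, before the
 * remainder is returned with free_bootmem(); other nodes allocate
 * their bitmap on node 0 instead.
 */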
631 
632 void *__init alloc_remap(int nid, unsigned long size)
633 {
634  int pages = node_end_pfn[nid] - node_start_pfn[nid];
635  void *map = pfn_to_kaddr(node_memmap_pfn[nid]);
636  BUG_ON(size != pages * sizeof(struct page));
637  memset(map, 0, size);
638  return map;
639 }
640 
641 static int __init percpu_size(void)
642 {
643  int size = __per_cpu_end - __per_cpu_start;
644  size += PERCPU_MODULE_RESERVE;
645  size += PERCPU_DYNAMIC_EARLY_SIZE;
646  if (size < PCPU_MIN_UNIT_SIZE)
647  size = PCPU_MIN_UNIT_SIZE;
648  size = roundup(size, PAGE_SIZE);
649 
650  /* In several places we assume the per-cpu data fits on a huge page. */
651  BUG_ON(kdata_huge && size > HPAGE_SIZE);
652  return size;
653 }
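/*
 * Rounding example for percpu_size() (illustrative sizes): if the
 * static per-cpu section plus the module and dynamic reserves comes
 * to 176KB, that already exceeds PCPU_MIN_UNIT_SIZE and is simply
 * rounded up to a whole number of pages, e.g. 192KB per cpu with
 * 64KB pages, and must still fit within one huge page when
 * kdata_huge is enabled.
 */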
654 
655 static void __init zone_sizes_init(void)
656 {
657  unsigned long zones_size[MAX_NR_ZONES] = { 0 };
658  int size = percpu_size();
659  int num_cpus = smp_height * smp_width;
660  const unsigned long dma_end = (1UL << (32 - PAGE_SHIFT));
661 
662  int i;
663 
664  for (i = 0; i < num_cpus; ++i)
665  node_percpu[cpu_to_node(i)] += size;
666 
667  for_each_online_node(i) {
668  unsigned long start = node_start_pfn[i];
669  unsigned long end = node_end_pfn[i];
670 #ifdef CONFIG_HIGHMEM
671  unsigned long lowmem_end = node_lowmem_end_pfn[i];
672 #else
673  unsigned long lowmem_end = end;
674 #endif
675  int memmap_size = (end - start) * sizeof(struct page);
676  node_free_pfn[i] = start;
677 
678  /*
679  * Set aside pages for per-cpu data and the mem_map array.
680  *
681  * Since the per-cpu data requires special homecaching,
682  * if we are in kdata_huge mode, we put it at the end of
683  * the lowmem region. If we're not in kdata_huge mode,
684  * we take the per-cpu pages from the bottom of the
685  * controller, since that avoids fragmenting a huge page
686  * that users might want. We always take the memmap
687  * from the bottom of the controller, since with
688  * kdata_huge that lets it be under a huge TLB entry.
689  *
690  * If the user has requested isolnodes for a controller,
691  * though, there'll be no lowmem, so we just alloc_bootmem
692  * the memmap. There will be no percpu memory either.
693  */
694  if (i != 0 && cpu_isset(i, isolnodes)) {
695  node_memmap_pfn[i] =
696  alloc_bootmem_pfn(0, memmap_size, 0);
697  BUG_ON(node_percpu[i] != 0);
698  } else if (node_has_bootmem(start)) {
699  unsigned long goal = 0;
700  node_memmap_pfn[i] =
701  alloc_bootmem_pfn(i, memmap_size, 0);
702  if (kdata_huge)
703  goal = PFN_PHYS(lowmem_end) - node_percpu[i];
704  if (node_percpu[i])
705  node_percpu_pfn[i] =
706  alloc_bootmem_pfn(i, node_percpu[i],
707  goal);
708  } else {
709  /* In non-bootmem zones, just reserve some pages. */
710  node_memmap_pfn[i] = node_free_pfn[i];
711  node_free_pfn[i] += PFN_UP(memmap_size);
712  if (!kdata_huge) {
713  node_percpu_pfn[i] = node_free_pfn[i];
714  node_free_pfn[i] += PFN_UP(node_percpu[i]);
715  } else {
716  node_percpu_pfn[i] =
717  lowmem_end - PFN_UP(node_percpu[i]);
718  }
719  }
720 
721 #ifdef CONFIG_HIGHMEM
722  if (start > lowmem_end) {
723  zones_size[ZONE_NORMAL] = 0;
724  zones_size[ZONE_HIGHMEM] = end - start;
725  } else {
726  zones_size[ZONE_NORMAL] = lowmem_end - start;
727  zones_size[ZONE_HIGHMEM] = end - lowmem_end;
728  }
729 #else
730  zones_size[ZONE_NORMAL] = end - start;
731 #endif
732 
733  if (start < dma_end) {
734  zones_size[ZONE_DMA] = min(zones_size[ZONE_NORMAL],
735  dma_end - start);
736  zones_size[ZONE_NORMAL] -= zones_size[ZONE_DMA];
737  } else {
738  zones_size[ZONE_DMA] = 0;
739  }
740 
741  /* Take zone metadata from controller 0 if we're isolnode. */
742  if (node_isset(i, isolnodes))
743  NODE_DATA(i)->bdata = &bootmem_node_data[0];
744 
745  free_area_init_node(i, zones_size, start, NULL);
746  printk(KERN_DEBUG " Normal zone: %ld per-cpu pages\n",
747  PFN_UP(node_percpu[i]));
748 
749  /* Track the type of memory on each node */
750  if (zones_size[ZONE_NORMAL] || zones_size[ZONE_DMA])
751  node_set_state(i, N_NORMAL_MEMORY);
752 #ifdef CONFIG_HIGHMEM
753  if (end != start)
754  node_set_state(i, N_HIGH_MEMORY);
755 #endif
756 
757  node_set_online(i);
758  }
759 }
760 
761 #ifdef CONFIG_NUMA
762 
763 /* which logical CPUs are on which nodes */
764 struct cpumask node_2_cpu_mask[MAX_NUMNODES] __write_once;
765 EXPORT_SYMBOL(node_2_cpu_mask);
766 
767 /* which node each logical CPU is on */
768 char cpu_2_node[NR_CPUS] __write_once __attribute__((aligned(L2_CACHE_BYTES)));
769 EXPORT_SYMBOL(cpu_2_node);
770 
771 /* Return cpu_to_node() except for cpus not yet assigned, which return -1 */
772 static int __init cpu_to_bound_node(int cpu, struct cpumask* unbound_cpus)
773 {
774  if (!cpu_possible(cpu) || cpumask_test_cpu(cpu, unbound_cpus))
775  return -1;
776  else
777  return cpu_to_node(cpu);
778 }
779 
780 /* Return number of immediately-adjacent tiles sharing the same NUMA node. */
781 static int __init node_neighbors(int node, int cpu,
782  struct cpumask *unbound_cpus)
783 {
784  int neighbors = 0;
785  int w = smp_width;
786  int h = smp_height;
787  int x = cpu % w;
788  int y = cpu / w;
789  if (x > 0 && cpu_to_bound_node(cpu-1, unbound_cpus) == node)
790  ++neighbors;
791  if (x < w-1 && cpu_to_bound_node(cpu+1, unbound_cpus) == node)
792  ++neighbors;
793  if (y > 0 && cpu_to_bound_node(cpu-w, unbound_cpus) == node)
794  ++neighbors;
795  if (y < h-1 && cpu_to_bound_node(cpu+w, unbound_cpus) == node)
796  ++neighbors;
797  return neighbors;
798 }
799 
800 static void __init setup_numa_mapping(void)
801 {
802  int distance[MAX_NUMNODES][NR_CPUS];
803  HV_Coord coord;
804  int cpu, node, cpus, i, x, y;
805  int num_nodes = num_online_nodes();
806  struct cpumask unbound_cpus;
807  nodemask_t default_nodes;
808 
809  cpumask_clear(&unbound_cpus);
810 
811  /* Get set of nodes we will use for defaults */
812  nodes_andnot(default_nodes, node_online_map, isolnodes);
813  if (nodes_empty(default_nodes)) {
815  pr_err("Forcing NUMA node zero available as a default node\n");
816  node_set(0, default_nodes);
817  }
818 
819  /* Populate the distance[] array */
820  memset(distance, -1, sizeof(distance));
821  cpu = 0;
822  for (coord.y = 0; coord.y < smp_height; ++coord.y) {
823  for (coord.x = 0; coord.x < smp_width;
824  ++coord.x, ++cpu) {
825  BUG_ON(cpu >= nr_cpu_ids);
826  if (!cpu_possible(cpu)) {
827  cpu_2_node[cpu] = -1;
828  continue;
829  }
830  for_each_node_mask(node, default_nodes) {
831  HV_MemoryControllerInfo info =
832  hv_inquire_memory_controller(
833  coord, node_controller[node]);
834  distance[node][cpu] =
835  ABS(info.coord.x) + ABS(info.coord.y);
836  }
837  cpumask_set_cpu(cpu, &unbound_cpus);
838  }
839  }
840  cpus = cpu;
841 
842  /*
843  * Round-robin through the NUMA nodes until all the cpus are
844  * assigned. We could be more clever here (e.g. create four
845  * sorted linked lists on the same set of cpu nodes, and pull
846  * off them in round-robin sequence, removing from all four
847  * lists each time) but given the relatively small numbers
848  * involved, O(n^2) seems OK for a one-time cost.
849  */
850  node = first_node(default_nodes);
851  while (!cpumask_empty(&unbound_cpus)) {
852  int best_cpu = -1;
853  int best_distance = INT_MAX;
854  for (cpu = 0; cpu < cpus; ++cpu) {
855  if (cpumask_test_cpu(cpu, &unbound_cpus)) {
856  /*
857  * Compute metric, which is how much
858  * closer the cpu is to this memory
859  * controller than the others, shifted
860  * up, and then the number of
861  * neighbors already in the node as an
862  * epsilon adjustment to try to keep
863  * the nodes compact.
864  */
865  int d = distance[node][cpu] * num_nodes;
866  for_each_node_mask(i, default_nodes) {
867  if (i != node)
868  d -= distance[i][cpu];
869  }
870  d *= 8; /* allow space for epsilon */
871  d -= node_neighbors(node, cpu, &unbound_cpus);
872  if (d < best_distance) {
873  best_cpu = cpu;
874  best_distance = d;
875  }
876  }
877  }
878  BUG_ON(best_cpu < 0);
879  cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
880  cpu_2_node[best_cpu] = node;
881  cpumask_clear_cpu(best_cpu, &unbound_cpus);
882  node = next_node(node, default_nodes);
883  if (node == MAX_NUMNODES)
884  node = first_node(default_nodes);
885  }
886 
887  /* Print out node assignments and set defaults for disabled cpus */
888  cpu = 0;
889  for (y = 0; y < smp_height; ++y) {
890  printk(KERN_DEBUG "NUMA cpu-to-node row %d:", y);
891  for (x = 0; x < smp_width; ++x, ++cpu) {
892  if (cpu_to_node(cpu) < 0) {
893  pr_cont(" -");
894  cpu_2_node[cpu] = first_node(default_nodes);
895  } else {
896  pr_cont(" %d", cpu_to_node(cpu));
897  }
898  }
899  pr_cont("\n");
900  }
901 }
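/*
 * Worked example of the assignment metric above (illustrative
 * two-node case): a candidate cpu at distance 2 from the node being
 * filled and distance 5 from the other node scores
 * (2 * 2 - 5) * 8 = -8, minus one for each neighboring tile already
 * bound to this node; a cpu at distances 3 and 4 scores
 * (3 * 2 - 4) * 8 = 16, so the first cpu wins even before the
 * neighbor bonus, since lower scores are preferred.
 */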
902 
903 static struct cpu cpu_devices[NR_CPUS];
904 
905 static int __init topology_init(void)
906 {
907  int i;
908 
909  for_each_online_node(i)
910  register_one_node(i);
911 
912  for (i = 0; i < smp_height * smp_width; ++i)
913  register_cpu(&cpu_devices[i], i);
914 
915  return 0;
916 }
917 
918 subsys_initcall(topology_init);
919 
920 #else /* !CONFIG_NUMA */
921 
922 #define setup_numa_mapping() do { } while (0)
923 
924 #endif /* CONFIG_NUMA */
925 
926 /*
927  * Initialize hugepage support on this cpu. We do this on all cores
928  * early in boot: before argument parsing for the boot cpu, and after
929  * argument parsing but before the init functions run on the secondaries.
930  * So the values we set up here in the hypervisor may be overridden on
931  * the boot cpu as arguments are parsed.
932  */
933 static __cpuinit void init_super_pages(void)
934 {
935 #ifdef CONFIG_HUGETLB_SUPER_PAGES
936  int i;
937  for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i)
938  hv_set_pte_super_shift(i, huge_shift[i]);
939 #endif
940 }
941 
948 void __cpuinit setup_cpu(int boot)
949 {
950  /* The boot cpu sets up its permanent mappings much earlier. */
951  if (!boot)
952  store_permanent_mappings();
953 
954  /* Allow asynchronous TLB interrupts. */
955 #if CHIP_HAS_TILE_DMA()
956  arch_local_irq_unmask(INT_DMATLB_MISS);
957  arch_local_irq_unmask(INT_DMATLB_ACCESS);
958 #endif
959 #if CHIP_HAS_SN_PROC()
960  arch_local_irq_unmask(INT_SNITLB_MISS);
961 #endif
962 #ifdef __tilegx__
963  arch_local_irq_unmask(INT_SINGLE_STEP_K);
964 #endif
965 
966  /*
967  * Allow user access to many generic SPRs, like the cycle
968  * counter, PASS/FAIL/DONE, INTERRUPT_CRITICAL_SECTION, etc.
969  */
970  __insn_mtspr(SPR_MPL_WORLD_ACCESS_SET_0, 1);
971 
972 #if CHIP_HAS_SN()
973  /* Static network is not restricted. */
974  __insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
975 #endif
976 #if CHIP_HAS_SN_PROC()
977  __insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
978  __insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
979 #endif
980 
981  /*
982  * Set the MPL for interrupt control 0 & 1 to the corresponding
983  * values. This includes access to the SYSTEM_SAVE and EX_CONTEXT
984  * SPRs, as well as the interrupt mask.
985  */
986  __insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
987  __insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
988 
989  /* Initialize IRQ support for this cpu. */
990  setup_irq_regs();
991 
992 #ifdef CONFIG_HARDWALL
993  /* Reset the network state on this cpu. */
994  reset_network_state();
995 #endif
996 
997  init_super_pages();
998 }
999 
1000 #ifdef CONFIG_BLK_DEV_INITRD
1001 
1002 /*
1003  * Note that the kernel can potentially support other compression
1004  * techniques than gz, though we don't do so by default. If we ever
1005  * decide to do so we can either look for other filename extensions,
1006  * or just allow a file with this name to be compressed with an
1007  * arbitrary compressor (somewhat counterintuitively).
1008  */
1009 static int __initdata set_initramfs_file;
1010 static char __initdata initramfs_file[128] = "initramfs.cpio.gz";
1011 
1012 static int __init setup_initramfs_file(char *str)
1013 {
1014  if (str == NULL)
1015  return -EINVAL;
1016  strncpy(initramfs_file, str, sizeof(initramfs_file) - 1);
1017  set_initramfs_file = 1;
1018 
1019  return 0;
1020 }
1021 early_param("initramfs_file", setup_initramfs_file);
1022 
1023 /*
1024  * We look for an "initramfs.cpio.gz" file in the hvfs.
1025  * If there is one, we allocate some memory for it and it will be
1026  * unpacked to the initramfs.
1027  */
1028 static void __init load_hv_initrd(void)
1029 {
1030  HV_FS_StatInfo stat;
1031  int fd, rc;
1032  void *initrd;
1033 
1034  fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
1035  if (fd == HV_ENOENT) {
1036  if (set_initramfs_file)
1037  pr_warning("No such hvfs initramfs file '%s'\n",
1038  initramfs_file);
1039  return;
1040  }
1041  BUG_ON(fd < 0);
1042  stat = hv_fs_fstat(fd);
1043  BUG_ON(stat.size < 0);
1044  if (stat.flags & HV_FS_ISDIR) {
1045  pr_warning("Ignoring hvfs file '%s': it's a directory.\n",
1046  initramfs_file);
1047  return;
1048  }
1049  initrd = alloc_bootmem_pages(stat.size);
1050  rc = hv_fs_pread(fd, (HV_VirtAddr) initrd, stat.size, 0);
1051  if (rc != stat.size) {
1052  pr_err("Error reading %d bytes from hvfs file '%s': %d\n",
1053  stat.size, initramfs_file, rc);
1054  free_initrd_mem((unsigned long) initrd, stat.size);
1055  return;
1056  }
1057  initrd_start = (unsigned long) initrd;
1058  initrd_end = initrd_start + stat.size;
1059 }
1060 
1061 void __init free_initrd_mem(unsigned long begin, unsigned long end)
1062 {
1063  free_bootmem(__pa(begin), end - begin);
1064 }
1065 
1066 #else
1067 static inline void load_hv_initrd(void) {}
1068 #endif /* CONFIG_BLK_DEV_INITRD */
1069 
1070 static void __init validate_hv(void)
1071 {
1072  /*
1073  * It may already be too late, but let's check our built-in
1074  * configuration against what the hypervisor is providing.
1075  */
1076  unsigned long glue_size = hv_sysconf(HV_SYSCONF_GLUE_SIZE);
1077  int hv_page_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL);
1078  int hv_hpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE);
1079  HV_ASIDRange asid_range;
1080 
1081 #ifndef CONFIG_SMP
1082  HV_Topology topology = hv_inquire_topology();
1083  BUG_ON(topology.coord.x != 0 || topology.coord.y != 0);
1084  if (topology.width != 1 || topology.height != 1) {
1085  pr_warning("Warning: booting UP kernel on %dx%d grid;"
1086  " will ignore all but first tile.\n",
1087  topology.width, topology.height);
1088  }
1089 #endif
1090 
1091  if (PAGE_OFFSET + HV_GLUE_START_CPA + glue_size > (unsigned long)_text)
1092  early_panic("Hypervisor glue size %ld is too big!\n",
1093  glue_size);
1094  if (hv_page_size != PAGE_SIZE)
1095  early_panic("Hypervisor page size %#x != our %#lx\n",
1096  hv_page_size, PAGE_SIZE);
1097  if (hv_hpage_size != HPAGE_SIZE)
1098  early_panic("Hypervisor huge page size %#x != our %#lx\n",
1099  hv_hpage_size, HPAGE_SIZE);
1100 
1101 #ifdef CONFIG_SMP
1102  /*
1103  * Some hypervisor APIs take a pointer to a bitmap array
1104  * whose size is at least the number of cpus on the chip.
1105  * We use a struct cpumask for this, so it must be big enough.
1106  */
1107  if ((smp_height * smp_width) > nr_cpu_ids)
1108  early_panic("Hypervisor %d x %d grid too big for Linux"
1109  " NR_CPUS %d\n", smp_height, smp_width,
1110  nr_cpu_ids);
1111 #endif
1112 
1113  /*
1114  * Check that we're using allowed ASIDs, and initialize the
1115  * various asid variables to their appropriate initial states.
1116  */
1117  asid_range = hv_inquire_asid(0);
1118  __get_cpu_var(current_asid) = min_asid = asid_range.start;
1119  max_asid = asid_range.start + asid_range.size - 1;
1120 
1121  if (hv_confstr(HV_CONFSTR_CHIP_MODEL, (HV_VirtAddr)chip_model,
1122  sizeof(chip_model)) < 0) {
1123  pr_err("Warning: HV_CONFSTR_CHIP_MODEL not available\n");
1124  strlcpy(chip_model, "unknown", sizeof(chip_model));
1125  }
1126 }
1127 
1128 static void __init validate_va(void)
1129 {
1130 #ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
1131  /*
1132  * Similarly, make sure we're only using allowed VAs.
1133  * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
1134  * and 0 .. KERNEL_HIGH_VADDR.
1135  * In addition, make sure we CAN'T use the end of memory, since
1136  * we use the last chunk of each pgd for the pgd_list.
1137  */
1138  int i, user_kernel_ok = 0;
1139  unsigned long max_va = 0;
1140  unsigned long list_va =
1141  ((PGD_LIST_OFFSET / sizeof(pgd_t)) << PGDIR_SHIFT);
1142 
1143  for (i = 0; ; ++i) {
1144  HV_VirtAddrRange range = hv_inquire_virtual(i);
1145  if (range.size == 0)
1146  break;
1147  if (range.start <= MEM_USER_INTRPT &&
1148  range.start + range.size >= MEM_HV_INTRPT)
1149  user_kernel_ok = 1;
1150  if (range.start == 0)
1151  max_va = range.size;
1152  BUG_ON(range.start + range.size > list_va);
1153  }
1154  if (!user_kernel_ok)
1155  early_panic("Hypervisor not configured for user/kernel VAs\n");
1156  if (max_va == 0)
1157  early_panic("Hypervisor not configured for low VAs\n");
1158  if (max_va < KERNEL_HIGH_VADDR)
1159  early_panic("Hypervisor max VA %#lx smaller than %#lx\n",
1160  max_va, KERNEL_HIGH_VADDR);
1161 
1162  /* Kernel PCs must have their high bit set; see intvec.S. */
1163  if ((long)VMALLOC_START >= 0)
1164  early_panic(
1165  "Linux VMALLOC region below the 2GB line (%#lx)!\n"
1166  "Reconfigure the kernel with fewer NR_HUGE_VMAPS\n"
1167  "or smaller VMALLOC_RESERVE.\n",
1168  VMALLOC_START);
1169 #endif
1170 }
1171 
1172 /*
1173  * cpu_lotar_map lists all the cpus that are valid for the supervisor
1174  * to cache data on at a page level, i.e. what cpus can be placed in
1175  * the LOTAR field of a PTE. It is equivalent to the set of possible
1176  * cpus plus any other cpus that are willing to share their cache.
1177  * It is set by hv_inquire_tiles(HV_INQ_TILES_LOTAR).
1178  */
1179 struct cpumask __write_once cpu_lotar_map;
1180 EXPORT_SYMBOL(cpu_lotar_map);
1181 
1182 #if CHIP_HAS_CBOX_HOME_MAP()
1183 /*
1184  * hash_for_home_map lists all the tiles that hash-for-home data
1185  * will be cached on. Note that this may include tiles that are not
1186  * valid for this supervisor to use otherwise (e.g. if a hypervisor
1187  * device is being shared between multiple supervisors).
1188  * It is set by hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE).
1189  */
1190 struct cpumask hash_for_home_map;
1191 EXPORT_SYMBOL(hash_for_home_map);
1192 #endif
1193 
1194 /*
1195  * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
1196  * flush on our behalf. It is set to cpu_possible_mask OR'ed with
1197  * hash_for_home_map, and it is what should be passed to
1198  * hv_flush_remote() to flush all caches. Note that if there are
1199  * dedicated hypervisor driver tiles that have authorized use of their
1200  * cache, those tiles will only appear in cpu_lotar_map, NOT in
1201  * cpu_cacheable_map, as they are a special case.
1202  */
1203 struct cpumask __write_once cpu_cacheable_map;
1204 EXPORT_SYMBOL(cpu_cacheable_map);
1205 
1206 static __initdata struct cpumask disabled_map;
1207 
1208 static int __init disabled_cpus(char *str)
1209 {
1210  int boot_cpu = smp_processor_id();
1211 
1212  if (str == NULL || cpulist_parse_crop(str, &disabled_map) != 0)
1213  return -EINVAL;
1214  if (cpumask_test_cpu(boot_cpu, &disabled_map)) {
1215  pr_err("disabled_cpus: can't disable boot cpu %d\n", boot_cpu);
1216  cpumask_clear_cpu(boot_cpu, &disabled_map);
1217  }
1218  return 0;
1219 }
1220 
1221 early_param("disabled_cpus", disabled_cpus);
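/*
 * Example usage (illustrative): booting with "disabled_cpus=4-7,61"
 * parses that list into disabled_map; setup_cpu_maps() later merges
 * it with the hypervisor-disabled tiles and drops the result from
 * cpu_possible, while an attempt to list the boot cpu itself is
 * rejected by the pr_err() above.
 */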
1222 
1223 void __init print_disabled_cpus(void)
1224 {
1225  if (!cpumask_empty(&disabled_map)) {
1226  char buf[100];
1227  cpulist_scnprintf(buf, sizeof(buf), &disabled_map);
1228  pr_info("CPUs not available for Linux: %s\n", buf);
1229  }
1230 }
1231 
1232 static void __init setup_cpu_maps(void)
1233 {
1234  struct cpumask hv_disabled_map, cpu_possible_init;
1235  int boot_cpu = smp_processor_id();
1236  int cpus, i, rc;
1237 
1238  /* Learn which cpus are allowed by the hypervisor. */
1239  rc = hv_inquire_tiles(HV_INQ_TILES_AVAIL,
1240  (HV_VirtAddr) cpumask_bits(&cpu_possible_init),
1241  sizeof(cpu_cacheable_map));
1242  if (rc < 0)
1243  early_panic("hv_inquire_tiles(AVAIL) failed: rc %d\n", rc);
1244  if (!cpumask_test_cpu(boot_cpu, &cpu_possible_init))
1245  early_panic("Boot CPU %d disabled by hypervisor!\n", boot_cpu);
1246 
1247  /* Compute the cpus disabled by the hvconfig file. */
1248  cpumask_complement(&hv_disabled_map, &cpu_possible_init);
1249 
1250  /* Include them with the cpus disabled by "disabled_cpus". */
1251  cpumask_or(&disabled_map, &disabled_map, &hv_disabled_map);
1252 
1253  /*
1254  * Disable every cpu after "setup_max_cpus". But don't mark
1255  * as disabled the cpus that are outside of our initial rectangle,
1256  * since that turns out to be confusing.
1257  */
1258  cpus = 1; /* this cpu */
1259  cpumask_set_cpu(boot_cpu, &disabled_map); /* ignore this cpu */
1260  for (i = 0; cpus < setup_max_cpus; ++i)
1261  if (!cpumask_test_cpu(i, &disabled_map))
1262  ++cpus;
1263  for (; i < smp_height * smp_width; ++i)
1264  cpumask_set_cpu(i, &disabled_map);
1265  cpumask_clear_cpu(boot_cpu, &disabled_map); /* reset this cpu */
1266  for (i = smp_height * smp_width; i < NR_CPUS; ++i)
1267  cpumask_clear_cpu(i, &disabled_map);
1268 
1269  /*
1270  * Setup cpu_possible map as every cpu allocated to us, minus
1271  * the results of any "disabled_cpus" settings.
1272  */
1273  cpumask_andnot(&cpu_possible_init, &cpu_possible_init, &disabled_map);
1274  init_cpu_possible(&cpu_possible_init);
1275 
1276  /* Learn which cpus are valid for LOTAR caching. */
1277  rc = hv_inquire_tiles(HV_INQ_TILES_LOTAR,
1278  (HV_VirtAddr) cpumask_bits(&cpu_lotar_map),
1279  sizeof(cpu_lotar_map));
1280  if (rc < 0) {
1281  pr_err("warning: no HV_INQ_TILES_LOTAR; using AVAIL\n");
1282  cpu_lotar_map = *cpu_possible_mask;
1283  }
1284 
1285 #if CHIP_HAS_CBOX_HOME_MAP()
1286  /* Retrieve set of CPUs used for hash-for-home caching */
1287  rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
1288  (HV_VirtAddr) hash_for_home_map.bits,
1289  sizeof(hash_for_home_map));
1290  if (rc < 0)
1291  early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
1292  cpumask_or(&cpu_cacheable_map, cpu_possible_mask, &hash_for_home_map);
1293 #else
1294  cpu_cacheable_map = *cpu_possible_mask;
1295 #endif
1296 }
1297 
1298 
1299 static int __init dataplane(char *str)
1300 {
1301  pr_warning("WARNING: dataplane support disabled in this kernel\n");
1302  return 0;
1303 }
1304 
1305 early_param("dataplane", dataplane);
1306 
1307 #ifdef CONFIG_CMDLINE_BOOL
1308 static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
1309 #endif
1310 
1311 void __init setup_arch(char **cmdline_p)
1312 {
1313  int len;
1314 
1315 #if defined(CONFIG_CMDLINE_BOOL) && defined(CONFIG_CMDLINE_OVERRIDE)
1316  len = hv_get_command_line((HV_VirtAddr) boot_command_line,
1317  COMMAND_LINE_SIZE);
1318  if (boot_command_line[0])
1319  pr_warning("WARNING: ignoring dynamic command line \"%s\"\n",
1320  boot_command_line);
1321  strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
1322 #else
1323  char *hv_cmdline;
1324 #if defined(CONFIG_CMDLINE_BOOL)
1325  if (builtin_cmdline[0]) {
1326  int builtin_len = strlcpy(boot_command_line, builtin_cmdline,
1327  COMMAND_LINE_SIZE);
1328  if (builtin_len < COMMAND_LINE_SIZE-1)
1329  boot_command_line[builtin_len++] = ' ';
1330  hv_cmdline = &boot_command_line[builtin_len];
1331  len = COMMAND_LINE_SIZE - builtin_len;
1332  } else
1333 #endif
1334  {
1335  hv_cmdline = boot_command_line;
1336  len = COMMAND_LINE_SIZE;
1337  }
1338  len = hv_get_command_line((HV_VirtAddr) hv_cmdline, len);
1339  if (len < 0 || len > COMMAND_LINE_SIZE)
1340  early_panic("hv_get_command_line failed: %d\n", len);
1341 #endif
1342 
1343  *cmdline_p = boot_command_line;
1344 
1345  /* Set disabled_map and setup_max_cpus very early */
1346  parse_early_param();
1347 
1348  /* Make sure the kernel is compatible with the hypervisor. */
1349  validate_hv();
1350  validate_va();
1351 
1352  setup_cpu_maps();
1353 
1354 
1355 #if defined(CONFIG_PCI) && !defined(__tilegx__)
1356  /*
1357  * Initialize the PCI structures. This is done before memory
1358  * setup so that we know whether or not a pci_reserve region
1359  * is necessary.
1360  */
1361  if (tile_pci_init() == 0)
1362  pci_reserve_mb = 0;
1363 
1364  /* PCI systems reserve a region just below 4GB for mapping iomem. */
1365  pci_reserve_end_pfn = (1 << (32 - PAGE_SHIFT));
1366  pci_reserve_start_pfn = pci_reserve_end_pfn -
1367  (pci_reserve_mb << (20 - PAGE_SHIFT));
1368 #endif
1369 
1370  init_mm.start_code = (unsigned long) _text;
1371  init_mm.end_code = (unsigned long) _etext;
1372  init_mm.end_data = (unsigned long) _edata;
1373  init_mm.brk = (unsigned long) _end;
1374 
1375  setup_memory();
1376  store_permanent_mappings();
1377  setup_bootmem_allocator();
1378 
1379  /*
1380  * NOTE: before this point _nobody_ is allowed to allocate
1381  * any memory using the bootmem allocator.
1382  */
1383 
1384 #ifdef CONFIG_SWIOTLB
1385  swiotlb_init(0);
1386 #endif
1387 
1388  paging_init();
1389  setup_numa_mapping();
1390  zone_sizes_init();
1391  set_page_homes();
1392  setup_cpu(1);
1393  setup_clock();
1394  load_hv_initrd();
1395 }
1396 
1397 
1398 /*
1399  * Set up per-cpu memory.
1400  */
1401 
1402 unsigned long __per_cpu_offset[NR_CPUS] __write_once;
1403 EXPORT_SYMBOL(__per_cpu_offset);
1404 
1405 static size_t __initdata pfn_offset[MAX_NUMNODES] = { 0 };
1406 static unsigned long __initdata percpu_pfn[NR_CPUS] = { 0 };
1407 
1408 /*
1409  * As the percpu code allocates pages, we return the pages from the
1410  * end of the node for the specified cpu.
1411  */
1412 static void *__init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
1413 {
1414  int nid = cpu_to_node(cpu);
1415  unsigned long pfn = node_percpu_pfn[nid] + pfn_offset[nid];
1416 
1417  BUG_ON(size % PAGE_SIZE != 0);
1418  pfn_offset[nid] += size / PAGE_SIZE;
1419  BUG_ON(node_percpu[nid] < size);
1420  node_percpu[nid] -= size;
1421  if (percpu_pfn[cpu] == 0)
1422  percpu_pfn[cpu] = pfn;
1423  return pfn_to_kaddr(pfn);
1424 }
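/*
 * Allocation walk-through (illustrative, assumed pfn value): if
 * node 1's per-cpu reservation starts at node_percpu_pfn[1] =
 * 0x41000, the first pcpu_fc_alloc() call for a cpu on that node
 * returns the kernel address of pfn 0x41000, records that pfn in
 * percpu_pfn[cpu] so setup_per_cpu_areas() can later re-home the
 * pages on that cpu, and advances pfn_offset[1] by size / PAGE_SIZE
 * so the next call continues where this one left off.
 */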
1425 
1426 /*
1427  * Pages reserved for percpu memory are not freeable, and in any case we are
1428  * on a short path to panic() in setup_per_cpu_area() at this point anyway.
1429  */
1430 static void __init pcpu_fc_free(void *ptr, size_t size)
1431 {
1432 }
1433 
1434 /*
1435  * Set up vmalloc page tables using bootmem for the percpu code.
1436  */
1437 static void __init pcpu_fc_populate_pte(unsigned long addr)
1438 {
1439  pgd_t *pgd;
1440  pud_t *pud;
1441  pmd_t *pmd;
1442  pte_t *pte;
1443 
1444  BUG_ON(pgd_addr_invalid(addr));
1445  if (addr < VMALLOC_START || addr >= VMALLOC_END)
1446  panic("PCPU addr %#lx outside vmalloc range %#lx..%#lx;"
1447  " try increasing CONFIG_VMALLOC_RESERVE\n",
1448  addr, VMALLOC_START, VMALLOC_END);
1449 
1450  pgd = swapper_pg_dir + pgd_index(addr);
1451  pud = pud_offset(pgd, addr);
1452  BUG_ON(!pud_present(*pud));
1453  pmd = pmd_offset(pud, addr);
1454  if (pmd_present(*pmd)) {
1455  BUG_ON(pmd_huge_page(*pmd));
1456  } else {
1457  pte = __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE,
1458  HV_PAGE_TABLE_ALIGN, 0);
1459  pmd_populate_kernel(&init_mm, pmd, pte);
1460  }
1461 }
1462 
1463 void __init setup_per_cpu_areas(void)
1464 {
1465  struct page *pg;
1466  unsigned long delta, pfn, lowmem_va;
1467  unsigned long size = percpu_size();
1468  char *ptr;
1469  int rc, cpu, i;
1470 
1471  rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_fc_alloc,
1472  pcpu_fc_free, pcpu_fc_populate_pte);
1473  if (rc < 0)
1474  panic("Cannot initialize percpu area (err=%d)", rc);
1475 
1476  delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1477  for_each_possible_cpu(cpu) {
1478  __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1479 
1480  /* finv the copy out of cache so we can change homecache */
1481  ptr = pcpu_base_addr + pcpu_unit_offsets[cpu];
1482  __finv_buffer(ptr, size);
1483  pfn = percpu_pfn[cpu];
1484 
1485  /* Rewrite the page tables to cache on that cpu */
1486  pg = pfn_to_page(pfn);
1487  for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {
1488 
1489  /* Update the vmalloc mapping and page home. */
1490  unsigned long addr = (unsigned long)ptr + i;
1491  pte_t *ptep = virt_to_pte(NULL, addr);
1492  pte_t pte = *ptep;
1493  BUG_ON(pfn != pte_pfn(pte));
1494  pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
1495  pte = set_remote_cache_cpu(pte, cpu);
1496  set_pte_at(&init_mm, addr, ptep, pte);
1497 
1498  /* Update the lowmem mapping for consistency. */
1499  lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
1500  ptep = virt_to_pte(NULL, lowmem_va);
1501  if (pte_huge(*ptep)) {
1502  printk(KERN_DEBUG "early shatter of huge page"
1503  " at %#lx\n", lowmem_va);
1504  shatter_pmd((pmd_t *)ptep);
1505  ptep = virt_to_pte(NULL, lowmem_va);
1506  BUG_ON(pte_huge(*ptep));
1507  }
1508  BUG_ON(pfn != pte_pfn(*ptep));
1509  set_pte_at(&init_mm, lowmem_va, ptep, pte);
1510  }
1511  }
1512 
1513  /* Set our thread pointer appropriately. */
1514  set_my_cpu_offset(__per_cpu_offset[smp_processor_id()]);
1515 
1516  /* Make sure the finv's have completed. */
1517  mb_incoherent();
1518 
1519  /* Flush the TLB so we reference it properly from here on out. */
1520  local_flush_tlb_all();
1521 }
1522 
1523 static struct resource data_resource = {
1524  .name = "Kernel data",
1525  .start = 0,
1526  .end = 0,
1527  .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1528 };
1529 
1530 static struct resource code_resource = {
1531  .name = "Kernel code",
1532  .start = 0,
1533  .end = 0,
1534  .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1535 };
1536 
1537 /*
1538  * On Pro, we reserve all resources above 4GB so that PCI won't try to put
1539  * mappings above 4GB.
1540  */
1541 #if defined(CONFIG_PCI) && !defined(__tilegx__)
1542 static struct resource* __init
1543 insert_non_bus_resource(void)
1544 {
1545  struct resource *res =
1546  kzalloc(sizeof(struct resource), GFP_ATOMIC);
1547  res->name = "Non-Bus Physical Address Space";
1548  res->start = (1ULL << 32);
1549  res->end = -1LL;
1550  res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1551  if (insert_resource(&iomem_resource, res)) {
1552  kfree(res);
1553  return NULL;
1554  }
1555  return res;
1556 }
1557 #endif
1558 
1559 static struct resource* __init
1560 insert_ram_resource(u64 start_pfn, u64 end_pfn)
1561 {
1562  struct resource *res =
1563  kzalloc(sizeof(struct resource), GFP_ATOMIC);
1564  res->name = "System RAM";
1565  res->start = start_pfn << PAGE_SHIFT;
1566  res->end = (end_pfn << PAGE_SHIFT) - 1;
1567  res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1568  if (insert_resource(&iomem_resource, res)) {
1569  kfree(res);
1570  return NULL;
1571  }
1572  return res;
1573 }
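/*
 * Resulting /proc/iomem layout (sketch only; addresses are
 * illustrative, not from a real system):
 *
 *   00000000-2fffffff : System RAM
 *     00000000-004fffff : Kernel code
 *     00500000-008fffff : Kernel data
 *   100000000-ffffffffffffffff : Non-Bus Physical Address Space
 *
 * with the window just below 4GB left unclaimed so PCI BARs can be
 * mapped there when a pci_reserve region is in effect.
 */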
1574 
1575 /*
1576  * Request address space for all standard resources
1577  *
1578  * If the system includes PCI root complex drivers, we need to create
1579  * a window just below 4GB where PCI BARs can be mapped.
1580  */
1581 static int __init request_standard_resources(void)
1582 {
1583  int i;
1584  enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
1585 
1586 #if defined(CONFIG_PCI) && !defined(__tilegx__)
1587  insert_non_bus_resource();
1588 #endif
1589 
1590  for_each_online_node(i) {
1591  u64 start_pfn = node_start_pfn[i];
1592  u64 end_pfn = node_end_pfn[i];
1593 
1594 #if defined(CONFIG_PCI) && !defined(__tilegx__)
1595  if (start_pfn <= pci_reserve_start_pfn &&
1596  end_pfn > pci_reserve_start_pfn) {
1597  if (end_pfn > pci_reserve_end_pfn)
1598  insert_ram_resource(pci_reserve_end_pfn,
1599  end_pfn);
1600  end_pfn = pci_reserve_start_pfn;
1601  }
1602 #endif
1603  insert_ram_resource(start_pfn, end_pfn);
1604  }
1605 
1606  code_resource.start = __pa(_text - CODE_DELTA);
1607  code_resource.end = __pa(_etext - CODE_DELTA)-1;
1608  data_resource.start = __pa(_sdata);
1609  data_resource.end = __pa(_end)-1;
1610 
1611  insert_resource(&iomem_resource, &code_resource);
1612  insert_resource(&iomem_resource, &data_resource);
1613 
1614 #ifdef CONFIG_KEXEC
1615  insert_resource(&iomem_resource, &crashk_res);
1616 #endif
1617 
1618  return 0;
1619 }
1620 
1621 subsys_initcall(request_standard_resources);