Linux Kernel 3.7.1
numa_emulation.c
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>

#include "numa_internal.h"

static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}
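
/*
 * Usage sketch (inferred from the parsing in numa_emulation() below, not
 * from authoritative documentation): the numa=fake= boot parameter names
 * either a node count or a fixed node size, e.g.
 *
 *   numa=fake=8      - split system RAM into 8 interleaved fake nodes
 *   numa=fake=128M   - split system RAM into 128MB fake nodes
 *
 * numa_emu_cmdline() only records the string; all parsing happens later
 * in numa_emulation().
 */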

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}
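
/*
 * Note on mem_hole_size() above: PFN_UP() rounds the start up and
 * PFN_DOWN() rounds the end down to page boundaries, so only whole pages
 * inside [start, end) are counted.  With 4KB pages, a range such as
 * [0x1200, 0x1800) contains no complete page (start_pfn == 2,
 * end_pfn == 1), so the reported hole size is 0.
 */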

/*
 * Sets up an emulated memblk of @size carved out of physical block
 * @phys_blk for node @nid.  The return value is -errno if something went
 * wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from
 * addr to max_addr.  The return value is 0 on success, negative otherwise.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size. x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
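
	/*
	 * Worked example (assuming FAKE_NODE_MIN_SIZE is 32MB): with
	 * 1000MB of usable memory and nr_nodes = 3, size is ~333MB per
	 * node.  The 13MB remainder above the 320MB multiple, times 3
	 * nodes, buys one extra 32MB increment, so big = 1: one node gets
	 * 352MB and the other two get 320MB each.
	 */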
	if (!size) {
		pr_err("Not enough memory for each node. "
			"NUMA emulation disabled.\n");
		return -1;
	}

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
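
/*
 * Illustration for split_nodes_interleave() above (a sketch, not from the
 * original source): with numa=fake=4 on a machine with two physical
 * nodes, each pass of the for_each_node_mask() loop carves one chunk from
 * each physical node in turn, so fake nids 0 and 2 typically land on one
 * physical node and fake nids 1 and 3 on the other.
 */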

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.  The return value is 0 on success, negative otherwise.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 min_size;
	int nid = 0;
	int i, ret;

	if (!size)
		return -1;
	/*
	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
	 * increased accordingly if the requested size is too small.  This
	 * creates a uniform distribution of node sizes across the entire
	 * machine (but not necessarily over physical nodes).
	 */
	min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
			FAKE_NODE_MIN_HASH_MASK;
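
	/*
	 * Example of the rounding above (assuming FAKE_NODE_MIN_SIZE is
	 * 32MB): a computed min_size of 50MB is not a 32MB multiple, so it
	 * is rounded up to (50MB + 32MB) & FAKE_NODE_MIN_HASH_MASK = 64MB
	 * before being compared against the requested size.
	 */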
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size &= FAKE_NODE_MIN_HASH_MASK;

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is
 * modified to reflect the emulated configuration on success.  If
 * emulation is disabled or fails, emu_nid_to_phys[] is filled with the
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
	 */
	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		u64 phys;

		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
					      phys_size, PAGE_SIZE);
		if (!phys) {
			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}
		memblock_reserve(phys, phys_size);
		phys_dist = __va(phys);

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = 0;
	dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (dfl_phys_nid == NUMA_NO_NODE)
				dfl_phys_nid = emu_nid_to_phys[i];
		}
	}
	if (dfl_phys_nid == NUMA_NO_NODE) {
		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
		goto no_emu;
	}

	/* commit */
	*numa_meminfo = ei;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
	}
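
	/*
	 * Example (an illustration, not from the original source): an APIC
	 * ID that previously mapped to physical node 0 is remapped to the
	 * lowest emulated nid j with emu_nid_to_phys[j] == 0, since the
	 * inner loop breaks at the first match.
	 */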

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}
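
	/*
	 * Sketch of the override path above (inferred from the get_option()
	 * loop, not from official documentation): a suffix such as
	 * numa=fake=2:10,20,20,10, supplies the 2x2 emulated distance table
	 * in row-major order, one value per (i, j) pair.  get_option()
	 * returns 2 only when the parsed integer is followed by a comma, so
	 * a value without a trailing comma falls back to the computed
	 * distance.
	 */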

	/* free the copied physical distance table */
	if (phys_dist)
		memblock_free(__pa(phys_dist), phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */