Linux Kernel 3.7.1
smpboot.c
1  /*
2  * x86 SMP booting functions
3  *
4  * (c) 1995 Alan Cox, Building #3 <[email protected]>
5  * (c) 1998, 1999, 2000, 2009 Ingo Molnar <[email protected]>
6  * Copyright 2001 Andi Kleen, SuSE Labs.
7  *
8  * Much of the core SMP work is based on previous work by Thomas Radke, to
9  * whom a great many thanks are extended.
10  *
11  * Thanks to Intel for making available several different Pentium,
12  * Pentium Pro and Pentium-II/Xeon MP machines.
13  * Original development of Linux SMP code supported by Caldera.
14  *
15  * This code is released under the GNU General Public License version 2 or
16  * later.
17  *
18  * Fixes
19  * Felix Koop : NR_CPUS used properly
20  * Jose Renau : Handle single CPU case.
21  * Alan Cox : By repeated request 8) - Total BogoMIPS report.
22  * Greg Wright : Fix for kernel stacks panic.
23  * Erich Boleyn : MP v1.4 and additional changes.
24  * Matthias Sattler : Changes for 2.1 kernel map.
25  * Michel Lespinasse : Changes for 2.1 kernel map.
26  * Michael Chastain : Change trampoline.S to gnu as.
27  * Alan Cox : Dumb bug: 'B' step PPro's are fine
28  * Ingo Molnar : Added APIC timers, based on code
29  * from Jose Renau
30  * Ingo Molnar : various cleanups and rewrites
31  * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
32  * Maciej W. Rozycki : Bits for genuine 82489DX APICs
33  * Andi Kleen : Changed for SMP boot into long mode.
34  * Martin J. Bligh : Added support for multi-quad systems
35  * Dave Jones : Report invalid combinations of Athlon CPUs.
36  * Rusty Russell : Hacked into shape for new "hotplug" boot process.
37  * Andi Kleen : Converted to new state machine.
38  * Ashok Raj : CPU hotplug support
39  * Glauber Costa : i386 and x86_64 integration
40  */
41 
42 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43 
44 #include <linux/init.h>
45 #include <linux/smp.h>
46 #include <linux/module.h>
47 #include <linux/sched.h>
48 #include <linux/percpu.h>
49 #include <linux/bootmem.h>
50 #include <linux/err.h>
51 #include <linux/nmi.h>
52 #include <linux/tboot.h>
53 #include <linux/stackprotector.h>
54 #include <linux/gfp.h>
55 #include <linux/cpuidle.h>
56 
57 #include <asm/acpi.h>
58 #include <asm/desc.h>
59 #include <asm/nmi.h>
60 #include <asm/irq.h>
61 #include <asm/idle.h>
62 #include <asm/realmode.h>
63 #include <asm/cpu.h>
64 #include <asm/numa.h>
65 #include <asm/pgtable.h>
66 #include <asm/tlbflush.h>
67 #include <asm/mtrr.h>
68 #include <asm/mwait.h>
69 #include <asm/apic.h>
70 #include <asm/io_apic.h>
71 #include <asm/i387.h>
72 #include <asm/fpu-internal.h>
73 #include <asm/setup.h>
74 #include <asm/uv/uv.h>
75 #include <linux/mc146818rtc.h>
76 
77 #include <asm/smpboot_hooks.h>
78 #include <asm/i8259.h>
79 
80 #include <asm/realmode.h>
81 
82 /* State of each CPU */
83 DEFINE_PER_CPU(int, cpu_state) = { 0 };
84 
85 #ifdef CONFIG_HOTPLUG_CPU
86 /*
87  * We need this for trampoline_base protection from concurrent accesses when
88  * off- and onlining cores wildly.
89  */
90 static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
91 
92 void cpu_hotplug_driver_lock(void)
93 {
94  mutex_lock(&x86_cpu_hotplug_driver_mutex);
95 }
96 
97 void cpu_hotplug_driver_unlock(void)
98 {
99  mutex_unlock(&x86_cpu_hotplug_driver_mutex);
100 }
101 
102 ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
103 ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
104 #endif
105 
106 /* Number of siblings per CPU package */
107 int smp_num_siblings = 1;
108 EXPORT_SYMBOL(smp_num_siblings);
109 
110 /* Last level cache ID of each logical CPU */
111 DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
112 
113 /* representing HT siblings of each logical CPU */
114 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
115 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
116 
117 /* representing HT and core siblings of each logical CPU */
118 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
119 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
120 
121 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
122 
123 /* Per CPU bogomips and other parameters */
124 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
125 EXPORT_PER_CPU_SYMBOL(cpu_info);
126 
127 atomic_t init_deasserted;
128 
129 /*
130  * Report back to the Boot Processor.
131  * Running on AP.
132  */
133 static void __cpuinit smp_callin(void)
134 {
135  int cpuid, phys_id;
136  unsigned long timeout;
137 
138  /*
139  * If waken up by an INIT in an 82489DX configuration
140  * we may get here before an INIT-deassert IPI reaches
141  * our local APIC. We have to wait for the IPI or we'll
142  * lock up on an APIC access.
143  */
144  if (apic->wait_for_init_deassert)
145  apic->wait_for_init_deassert(&init_deasserted);
146 
147  /*
148  * (This works even if the APIC is not enabled.)
149  */
150  phys_id = read_apic_id();
151  cpuid = smp_processor_id();
152  if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
153  panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
154  phys_id, cpuid);
155  }
156  pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
157 
158  /*
159  * STARTUP IPIs are fragile beasts as they might sometimes
160  * trigger some glue motherboard logic. Complete APIC bus
161  * silence for 1 second, this overestimates the time the
162  * boot CPU is spending to send the up to 2 STARTUP IPIs
163  * by a factor of two. This should be enough.
164  */
165 
166  /*
167  * Waiting 2s total for startup (udelay is not yet working)
168  */
169  timeout = jiffies + 2*HZ;
170  while (time_before(jiffies, timeout)) {
171  /*
172  * Has the boot CPU finished its STARTUP sequence?
173  */
174  if (cpumask_test_cpu(cpuid, cpu_callout_mask))
175  break;
176  cpu_relax();
177  }
178 
179  if (!time_before(jiffies, timeout)) {
180  panic("%s: CPU%d started up but did not get a callout!\n",
181  __func__, cpuid);
182  }
183 
184  /*
185  * the boot CPU has finished the init stage and is spinning
186  * on callin_map until we finish. We are free to set up this
187  * CPU, first the APIC. (this is probably redundant on most
188  * boards)
189  */
190 
191  pr_debug("CALLIN, before setup_local_APIC()\n");
192  if (apic->smp_callin_clear_local_apic)
193  apic->smp_callin_clear_local_apic();
194  setup_local_APIC();
195  end_local_APIC_setup();
196 
197  /*
198  * Need to setup vector mappings before we enable interrupts.
199  */
200  setup_vector_irq(smp_processor_id());
201 
202  /*
203  * Save our processor parameters. Note: this information
204  * is needed for clock calibration.
205  */
206  smp_store_cpu_info(cpuid);
207 
208  /*
209  * Get our bogomips.
210  * Update loops_per_jiffy in cpu_data. Previous call to
211  * smp_store_cpu_info() stored a value that is close but not as
212  * accurate as the value just calculated.
213  */
214  calibrate_delay();
215  cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
216  pr_debug("Stack at about %p\n", &cpuid);
217 
218  /*
219  * This must be done before setting cpu_online_mask
220  * or calling notify_cpu_starting.
221  */
222  set_cpu_sibling_map(raw_smp_processor_id());
223  wmb();
224 
225  notify_cpu_starting(cpuid);
226 
227  /*
228  * Allow the master to continue.
229  */
230  cpumask_set_cpu(cpuid, cpu_callin_mask);
231 }
232 
233 /*
234  * Activate a secondary processor.
235  */
236 notrace static void __cpuinit start_secondary(void *unused)
237 {
238  /*
239  * Don't put *anything* before cpu_init(), SMP booting is too
240  * fragile that we want to limit the things done here to the
241  * most necessary things.
242  */
243  cpu_init();
244  x86_cpuinit.early_percpu_clock_init();
245  preempt_disable();
246  smp_callin();
247 
248 #ifdef CONFIG_X86_32
249  /* switch away from the initial page table */
250  load_cr3(swapper_pg_dir);
251  __flush_tlb_all();
252 #endif
253 
254  /* otherwise gcc will move up smp_processor_id before the cpu_init */
255  barrier();
256  /*
257  * Check TSC synchronization with the BP:
258  */
259  check_tsc_sync_target();
260 
261  /*
262  * We need to hold vector_lock so that the set of online cpus
263  * does not change while we are assigning vectors to cpus. Holding
264  * this lock ensures we don't half assign or remove an irq from a cpu.
265  */
266  lock_vector_lock();
267  set_cpu_online(smp_processor_id(), true);
268  unlock_vector_lock();
269  per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
270  x86_platform.nmi_init();
271 
272  /* enable local interrupts */
273  local_irq_enable();
274 
275  /* to prevent fake stack check failure in clock setup */
276  boot_init_stack_canary();
277 
278  x86_cpuinit.setup_percpu_clockev();
279 
280  wmb();
281  cpu_idle();
282 }
283 
284 /*
285  * The bootstrap kernel entry code has set these up. Save them for
286  * a given CPU
287  */
288 
289 void __cpuinit smp_store_cpu_info(int id)
290 {
291  struct cpuinfo_x86 *c = &cpu_data(id);
292 
293  *c = boot_cpu_data;
294  c->cpu_index = id;
295  if (id != 0)
296  identify_secondary_cpu(c);
297 }
298 
299 static bool __cpuinit
300 topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
301 {
302  int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
303 
304  return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
305  "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
306  "[node: %d != %d]. Ignoring dependency.\n",
307  cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
308 }
309 
310 #define link_mask(_m, c1, c2) \
311 do { \
312  cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
313  cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
314 } while (0)
315 
316 static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
317 {
318  if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
319  int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
320 
321  if (c->phys_proc_id == o->phys_proc_id &&
322  per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
323  c->compute_unit_id == o->compute_unit_id)
324  return topology_sane(c, o, "smt");
325 
326  } else if (c->phys_proc_id == o->phys_proc_id &&
327  c->cpu_core_id == o->cpu_core_id) {
328  return topology_sane(c, o, "smt");
329  }
330 
331  return false;
332 }
333 
334 static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
335 {
336  int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
337 
338  if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
339  per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
340  return topology_sane(c, o, "llc");
341 
342  return false;
343 }
344 
345 static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
346 {
347  if (c->phys_proc_id == o->phys_proc_id) {
348  if (cpu_has(c, X86_FEATURE_AMD_DCM))
349  return true;
350 
351  return topology_sane(c, o, "mc");
352  }
353  return false;
354 }
355 
356 void __cpuinit set_cpu_sibling_map(int cpu)
357 {
358  bool has_mc = boot_cpu_data.x86_max_cores > 1;
359  bool has_smt = smp_num_siblings > 1;
360  struct cpuinfo_x86 *c = &cpu_data(cpu);
361  struct cpuinfo_x86 *o;
362  int i;
363 
364  cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
365 
366  if (!has_smt && !has_mc) {
367  cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
368  cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
369  cpumask_set_cpu(cpu, cpu_core_mask(cpu));
370  c->booted_cores = 1;
371  return;
372  }
373 
374  for_each_cpu(i, cpu_sibling_setup_mask) {
375  o = &cpu_data(i);
376 
377  if ((i == cpu) || (has_smt && match_smt(c, o)))
378  link_mask(sibling, cpu, i);
379 
380  if ((i == cpu) || (has_mc && match_llc(c, o)))
381  link_mask(llc_shared, cpu, i);
382 
383  }
384 
385  /*
386  * This needs a separate iteration over the cpus because we rely on all
387  * cpu_sibling_mask links to be set-up.
388  */
389  for_each_cpu(i, cpu_sibling_setup_mask) {
390  o = &cpu_data(i);
391 
392  if ((i == cpu) || (has_mc && match_mc(c, o))) {
393  link_mask(core, cpu, i);
394 
395  /*
396  * Does this new cpu bringup a new core?
397  */
398  if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
399  /*
400  * for each core in package, increment
401  * the booted_cores for this new cpu
402  */
403  if (cpumask_first(cpu_sibling_mask(i)) == i)
404  c->booted_cores++;
405  /*
406  * increment the core count for all
407  * the other cpus in this package
408  */
409  if (i != cpu)
410  cpu_data(i).booted_cores++;
411  } else if (i != cpu && !c->booted_cores)
412  c->booted_cores = cpu_data(i).booted_cores;
413  }
414  }
415 }
416 
417 /* maps the cpu to the sched domain representing multi-core */
418 const struct cpumask *cpu_coregroup_mask(int cpu)
419 {
420  return cpu_llc_shared_mask(cpu);
421 }
422 
423 static void impress_friends(void)
424 {
425  int cpu;
426  unsigned long bogosum = 0;
427  /*
428  * Allow the user to impress friends.
429  */
430  pr_debug("Before bogomips\n");
431  for_each_possible_cpu(cpu)
432  if (cpu_online(cpu))
433  bogosum += cpu_data(cpu).loops_per_jiffy;
434  pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
435  num_online_cpus(),
436  bogosum/(500000/HZ),
437  (bogosum/(5000/HZ))%100);
438 
439  pr_debug("Before bogocount - setting activated=1\n");
440 }
441 
442 void __inquire_remote_apic(int apicid)
443 {
444  unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
445  const char * const names[] = { "ID", "VERSION", "SPIV" };
446  int timeout;
447  u32 status;
448 
449  pr_info("Inquiring remote APIC 0x%x...\n", apicid);
450 
451  for (i = 0; i < ARRAY_SIZE(regs); i++) {
452  pr_info("... APIC 0x%x %s: ", apicid, names[i]);
453 
454  /*
455  * Wait for idle.
456  */
457  status = safe_apic_wait_icr_idle();
458  if (status)
459  pr_cont("a previous APIC delivery may have failed\n");
460 
461  apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
462 
463  timeout = 0;
464  do {
465  udelay(100);
466  status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
467  } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
468 
469  switch (status) {
470  case APIC_ICR_RR_VALID:
471  status = apic_read(APIC_RRR);
472  pr_cont("%08x\n", status);
473  break;
474  default:
475  pr_cont("failed\n");
476  }
477  }
478 }
479 
480 /*
481  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
482  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
483  * won't ... remember to clear down the APIC, etc later.
484  */
485 int __cpuinit
486 wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
487 {
488  unsigned long send_status, accept_status = 0;
489  int maxlvt;
490 
491  /* Target chip */
492  /* Boot on the stack */
493  /* Kick the second */
494  apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
495 
496  pr_debug("Waiting for send to finish...\n");
497  send_status = safe_apic_wait_icr_idle();
498 
499  /*
500  * Give the other CPU some time to accept the IPI.
501  */
502  udelay(200);
503  if (APIC_INTEGRATED(apic_version[phys_apicid])) {
504  maxlvt = lapic_get_maxlvt();
505  if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
506  apic_write(APIC_ESR, 0);
507  accept_status = (apic_read(APIC_ESR) & 0xEF);
508  }
509  pr_debug("NMI sent\n");
510 
511  if (send_status)
512  pr_err("APIC never delivered???\n");
513  if (accept_status)
514  pr_err("APIC delivery error (%lx)\n", accept_status);
515 
516  return (send_status | accept_status);
517 }
518 
519 static int __cpuinit
520 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
521 {
522  unsigned long send_status, accept_status = 0;
523  int maxlvt, num_starts, j;
524 
525  maxlvt = lapic_get_maxlvt();
526 
527  /*
528  * Be paranoid about clearing APIC errors.
529  */
530  if (APIC_INTEGRATED(apic_version[phys_apicid])) {
531  if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
532  apic_write(APIC_ESR, 0);
533  apic_read(APIC_ESR);
534  }
535 
536  pr_debug("Asserting INIT\n");
537 
538  /*
539  * Turn INIT on target chip
540  */
541  /*
542  * Send IPI
543  */
544  apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
545  phys_apicid);
546 
547  pr_debug("Waiting for send to finish...\n");
548  send_status = safe_apic_wait_icr_idle();
549 
550  mdelay(10);
551 
552  pr_debug("Deasserting INIT\n");
553 
554  /* Target chip */
555  /* Send IPI */
556  apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
557 
558  pr_debug("Waiting for send to finish...\n");
559  send_status = safe_apic_wait_icr_idle();
560 
561  mb();
562  atomic_set(&init_deasserted, 1);
563 
564  /*
565  * Should we send STARTUP IPIs ?
566  *
567  * Determine this based on the APIC version.
568  * If we don't have an integrated APIC, don't send the STARTUP IPIs.
569  */
570  if (APIC_INTEGRATED(apic_version[phys_apicid]))
571  num_starts = 2;
572  else
573  num_starts = 0;
574 
575  /*
576  * Paravirt / VMI wants a startup IPI hook here to set up the
577  * target processor state.
578  */
579  startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
580  stack_start);
581 
582  /*
583  * Run STARTUP IPI loop.
584  */
585  pr_debug("#startup loops: %d\n", num_starts);
586 
587  for (j = 1; j <= num_starts; j++) {
588  pr_debug("Sending STARTUP #%d\n", j);
589  if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
590  apic_write(APIC_ESR, 0);
591  apic_read(APIC_ESR);
592  pr_debug("After apic_write\n");
593 
594  /*
595  * STARTUP IPI
596  */
597 
598  /* Target chip */
599  /* Boot on the stack */
600  /* Kick the second */
601  apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
602  phys_apicid);
603 
604  /*
605  * Give the other CPU some time to accept the IPI.
606  */
607  udelay(300);
608 
609  pr_debug("Startup point 1\n");
610 
611  pr_debug("Waiting for send to finish...\n");
612  send_status = safe_apic_wait_icr_idle();
613 
614  /*
615  * Give the other CPU some time to accept the IPI.
616  */
617  udelay(200);
618  if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
619  apic_write(APIC_ESR, 0);
620  accept_status = (apic_read(APIC_ESR) & 0xEF);
621  if (send_status || accept_status)
622  break;
623  }
624  pr_debug("After Startup\n");
625 
626  if (send_status)
627  pr_err("APIC never delivered???\n");
628  if (accept_status)
629  pr_err("APIC delivery error (%lx)\n", accept_status);
630 
631  return (send_status | accept_status);
632 }
633 
634 /* reduce the number of lines printed when booting a large cpu count system */
635 static void __cpuinit announce_cpu(int cpu, int apicid)
636 {
637  static int current_node = -1;
638  int node = early_cpu_to_node(cpu);
639 
640  if (system_state == SYSTEM_BOOTING) {
641  if (node != current_node) {
642  if (current_node > (-1))
643  pr_cont(" OK\n");
644  current_node = node;
645  pr_info("Booting Node %3d, Processors ", node);
646  }
647  pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
648  return;
649  } else
650  pr_info("Booting Node %d Processor %d APIC 0x%x\n",
651  node, cpu, apicid);
652 }
653 
654 /*
655  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
656  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
657  * Returns zero if CPU booted OK, else error code from
658  * ->wakeup_secondary_cpu.
659  */
660 static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
661 {
662  volatile u32 *trampoline_status =
663  (volatile u32 *) __va(real_mode_header->trampoline_status);
664  /* start_ip had better be page-aligned! */
665  unsigned long start_ip = real_mode_header->trampoline_start;
666 
667  unsigned long boot_error = 0;
668  int timeout;
669 
670  /* Just in case we booted with a single CPU. */
671  alternatives_enable_smp();
672 
673  idle->thread.sp = (unsigned long) (((struct pt_regs *)
674  (THREAD_SIZE + task_stack_page(idle))) - 1);
675  per_cpu(current_task, cpu) = idle;
676 
677 #ifdef CONFIG_X86_32
678  /* Stack for startup_32 can be just as for start_secondary onwards */
679  irq_ctx_init(cpu);
680 #else
681  clear_tsk_thread_flag(idle, TIF_FORK);
682  initial_gs = per_cpu_offset(cpu);
683  per_cpu(kernel_stack, cpu) =
684  (unsigned long)task_stack_page(idle) -
685  KERNEL_STACK_OFFSET + THREAD_SIZE;
686 #endif
687  early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
688  initial_code = (unsigned long)start_secondary;
689  stack_start = idle->thread.sp;
690 
691  /* So we see what's up */
692  announce_cpu(cpu, apicid);
693 
694  /*
695  * This grunge runs the startup process for
696  * the targeted processor.
697  */
698 
699  atomic_set(&init_deasserted, 0);
700 
701  if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
702 
703  pr_debug("Setting warm reset code and vector.\n");
704 
705  smpboot_setup_warm_reset_vector(start_ip);
706  /*
707  * Be paranoid about clearing APIC errors.
708  */
709  if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
710  apic_write(APIC_ESR, 0);
711  apic_read(APIC_ESR);
712  }
713  }
714 
715  /*
716  * Kick the secondary CPU. Use the method in the APIC driver
717  * if it's defined - or use an INIT boot APIC message otherwise:
718  */
719  if (apic->wakeup_secondary_cpu)
720  boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
721  else
722  boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
723 
724  if (!boot_error) {
725  /*
726  * allow APs to start initializing.
727  */
728  pr_debug("Before Callout %d\n", cpu);
729  cpumask_set_cpu(cpu, cpu_callout_mask);
730  pr_debug("After Callout %d\n", cpu);
731 
732  /*
733  * Wait 5s total for a response
734  */
735  for (timeout = 0; timeout < 50000; timeout++) {
736  if (cpumask_test_cpu(cpu, cpu_callin_mask))
737  break; /* It has booted */
738  udelay(100);
739  /*
740  * Allow other tasks to run while we wait for the
741  * AP to come online. This also gives a chance
742  * for the MTRR work(triggered by the AP coming online)
743  * to be completed in the stop machine context.
744  */
745  schedule();
746  }
747 
748  if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
749  print_cpu_msr(&cpu_data(cpu));
750  pr_debug("CPU%d: has booted.\n", cpu);
751  } else {
752  boot_error = 1;
753  if (*trampoline_status == 0xA5A5A5A5)
754  /* trampoline started but...? */
755  pr_err("CPU%d: Stuck ??\n", cpu);
756  else
757  /* trampoline code not run */
758  pr_err("CPU%d: Not responding\n", cpu);
759  if (apic->inquire_remote_apic)
760  apic->inquire_remote_apic(apicid);
761  }
762  }
763 
764  if (boot_error) {
765  /* Try to put things back the way they were before ... */
766  numa_remove_cpu(cpu); /* was set by numa_add_cpu */
767 
768  /* was set by do_boot_cpu() */
769  cpumask_clear_cpu(cpu, cpu_callout_mask);
770 
771  /* was set by cpu_init() */
772  cpumask_clear_cpu(cpu, cpu_initialized_mask);
773 
774  set_cpu_present(cpu, false);
775  per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
776  }
777 
778  /* mark "stuck" area as not stuck */
779  *trampoline_status = 0;
780 
781  if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
782  /*
783  * Cleanup possible dangling ends...
784  */
785  smpboot_restore_warm_reset_vector();
786  }
787  return boot_error;
788 }
789 
790 int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
791 {
792  int apicid = apic->cpu_present_to_apicid(cpu);
793  unsigned long flags;
794  int err;
795 
796  WARN_ON(irqs_disabled());
797 
798  pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
799 
800  if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
801  !physid_isset(apicid, phys_cpu_present_map) ||
802  !apic->apic_id_valid(apicid)) {
803  pr_err("%s: bad cpu %d\n", __func__, cpu);
804  return -EINVAL;
805  }
806 
807  /*
808  * Already booted CPU?
809  */
810  if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
811  pr_debug("do_boot_cpu %d Already started\n", cpu);
812  return -ENOSYS;
813  }
814 
815  /*
816  * Save current MTRR state in case it was changed since early boot
817  * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
818  */
819  mtrr_save_state();
820 
821  per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
822 
823  /* the FPU context is blank, nobody can own it */
824  __cpu_disable_lazy_restore(cpu);
825 
826  err = do_boot_cpu(apicid, cpu, tidle);
827  if (err) {
828  pr_debug("do_boot_cpu failed %d\n", err);
829  return -EIO;
830  }
831 
832  /*
833  * Check TSC synchronization with the AP (keep irqs disabled
834  * while doing so):
835  */
836  local_irq_save(flags);
837  check_tsc_sync_source(cpu);
838  local_irq_restore(flags);
839 
840  while (!cpu_online(cpu)) {
841  cpu_relax();
842  touch_nmi_watchdog();
843  }
844 
845  return 0;
846 }
847 
848 /**
849  * arch_disable_smp_support() - disables SMP support for x86 at runtime
850  */
851 void arch_disable_smp_support(void)
852 {
853  disable_ioapic_support();
854 }
855 
856 /*
857  * Fall back to non SMP mode after errors.
858  *
859  * RED-PEN audit/test this more. I bet there is more state messed up here.
860  */
861 static __init void disable_smp(void)
862 {
863  init_cpu_present(cpumask_of(0));
864  init_cpu_possible(cpumask_of(0));
865  smpboot_clear_io_apic_irqs();
866 
867  if (smp_found_config)
868  physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
869  else
870  physid_set_mask_of_physid(0, &phys_cpu_present_map);
871  cpumask_set_cpu(0, cpu_sibling_mask(0));
872  cpumask_set_cpu(0, cpu_core_mask(0));
873 }
874 
875 /*
876  * Various sanity checks.
877  */
878 static int __init smp_sanity_check(unsigned max_cpus)
879 {
880  preempt_disable();
881 
882 #if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
883  if (def_to_bigsmp && nr_cpu_ids > 8) {
884  unsigned int cpu;
885  unsigned nr;
886 
887  pr_warn("More than 8 CPUs detected - skipping them\n"
888  "Use CONFIG_X86_BIGSMP\n");
889 
890  nr = 0;
891  for_each_present_cpu(cpu) {
892  if (nr >= 8)
893  set_cpu_present(cpu, false);
894  nr++;
895  }
896 
897  nr = 0;
898  for_each_possible_cpu(cpu) {
899  if (nr >= 8)
900  set_cpu_possible(cpu, false);
901  nr++;
902  }
903 
904  nr_cpu_ids = 8;
905  }
906 #endif
907 
908  if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
909  pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
910  hard_smp_processor_id());
911 
912  physid_set(hard_smp_processor_id(), phys_cpu_present_map);
913  }
914 
915  /*
916  * If we couldn't find an SMP configuration at boot time,
917  * get out of here now!
918  */
919  if (!smp_found_config && !acpi_lapic) {
920  preempt_enable();
921  pr_notice("SMP motherboard not detected\n");
922  disable_smp();
923  if (APIC_init_uniprocessor())
924  pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
925  return -1;
926  }
927 
928  /*
929  * Should not be necessary because the MP table should list the boot
930  * CPU too, but we do it for the sake of robustness anyway.
931  */
932  if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
933  pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
934  boot_cpu_physical_apicid);
935  physid_set(hard_smp_processor_id(), phys_cpu_present_map);
936  }
937  preempt_enable();
938 
939  /*
940  * If we couldn't find a local APIC, then get out of here now!
941  */
942  if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
943  !cpu_has_apic) {
944  if (!disable_apic) {
945  pr_err("BIOS bug, local APIC #%d not detected!...\n",
946  boot_cpu_physical_apicid);
947  pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
948  }
949  smpboot_clear_io_apic();
950  disable_ioapic_support();
951  return -1;
952  }
953 
954  verify_local_APIC();
955 
956  /*
957  * If SMP should be disabled, then really disable it!
958  */
959  if (!max_cpus) {
960  pr_info("SMP mode deactivated\n");
961  smpboot_clear_io_apic();
962 
963  connect_bsp_APIC();
964  setup_local_APIC();
965  bsp_end_local_APIC_setup();
966  return -1;
967  }
968 
969  return 0;
970 }
971 
972 static void __init smp_cpu_index_default(void)
973 {
974  int i;
975  struct cpuinfo_x86 *c;
976 
977  for_each_possible_cpu(i) {
978  c = &cpu_data(i);
979  /* mark all to hotplug */
980  c->cpu_index = nr_cpu_ids;
981  }
982 }
983 
984 /*
985  * Prepare for SMP bootup. The MP table or ACPI has been read
986  * earlier. Just do some sanity checking here and enable APIC mode.
987  */
988 void __init native_smp_prepare_cpus(unsigned int max_cpus)
989 {
990  unsigned int i;
991 
992  preempt_disable();
993  smp_cpu_index_default();
994 
995  /*
996  * Setup boot CPU information
997  */
998  smp_store_cpu_info(0); /* Final full version of the data */
999  cpumask_copy(cpu_callin_mask, cpumask_of(0));
1000  mb();
1001 
1002  current_thread_info()->cpu = 0; /* needed? */
1003  for_each_possible_cpu(i) {
1004  zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1005  zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1006  zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1007  }
1008  set_cpu_sibling_map(0);
1009 
1010 
1011  if (smp_sanity_check(max_cpus) < 0) {
1012  pr_info("SMP disabled\n");
1013  disable_smp();
1014  goto out;
1015  }
1016 
1017  default_setup_apic_routing();
1018 
1019  preempt_disable();
1020  if (read_apic_id() != boot_cpu_physical_apicid) {
1021  panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1022  read_apic_id(), boot_cpu_physical_apicid);
1023  /* Or can we switch back to PIC here? */
1024  }
1025  preempt_enable();
1026 
1027  connect_bsp_APIC();
1028 
1029  /*
1030  * Switch from PIC to APIC mode.
1031  */
1032  setup_local_APIC();
1033 
1034  /*
1035  * Enable IO APIC before setting up error vector
1036  */
1037  if (!skip_ioapic_setup && nr_ioapics)
1038  enable_IO_APIC();
1039 
1040  bsp_end_local_APIC_setup();
1041 
1042  if (apic->setup_portio_remap)
1043  apic->setup_portio_remap();
1044 
1045  smpboot_setup_io_apic();
1046  /*
1047  * Set up local APIC timer on boot CPU.
1048  */
1049 
1050  pr_info("CPU%d: ", 0);
1051  print_cpu_info(&cpu_data(0));
1052  x86_init.timers.setup_percpu_clockev();
1053 
1054  if (is_uv_system())
1055  uv_system_init();
1056 
1057  set_mtrr_aps_delayed_init();
1058 out:
1059  preempt_enable();
1060 }
1061 
1062 void arch_enable_nonboot_cpus_begin(void)
1063 {
1064  set_mtrr_aps_delayed_init();
1065 }
1066 
1067 void arch_enable_nonboot_cpus_end(void)
1068 {
1069  mtrr_aps_init();
1070 }
1071 
1072 /*
1073  * Early setup to make printk work.
1074  */
1075 void __init native_smp_prepare_boot_cpu(void)
1076 {
1077  int me = smp_processor_id();
1078  switch_to_new_gdt(me);
1079  /* already set me in cpu_online_mask in boot_cpu_init() */
1080  cpumask_set_cpu(me, cpu_callout_mask);
1081  per_cpu(cpu_state, me) = CPU_ONLINE;
1082 }
1083 
1084 void __init native_smp_cpus_done(unsigned int max_cpus)
1085 {
1086  pr_debug("Boot done\n");
1087 
1088  nmi_selftest();
1089  impress_friends();
1090 #ifdef CONFIG_X86_IO_APIC
1091  setup_ioapic_dest();
1092 #endif
1093  mtrr_aps_init();
1094 }
1095 
1096 static int __initdata setup_possible_cpus = -1;
1097 static int __init _setup_possible_cpus(char *str)
1098 {
1099  get_option(&str, &setup_possible_cpus);
1100  return 0;
1101 }
1102 early_param("possible_cpus", _setup_possible_cpus);
1103 
1104 
1105 /*
1106  * cpu_possible_mask should be static, it cannot change as cpu's
1107  * are onlined, or offlined. The reason is per-cpu data-structures
1108  * are allocated by some modules at init time, and dont expect to
1109  * do this dynamically on cpu arrival/departure.
1110  * cpu_present_mask on the other hand can change dynamically.
1111  * In case when cpu_hotplug is not compiled, then we resort to current
1112  * behaviour, which is cpu_possible == cpu_present.
1113  * - Ashok Raj
1114  *
1115  * Three ways to find out the number of additional hotplug CPUs:
1116  * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
1117  * - The user can overwrite it with possible_cpus=NUM
1118  * - Otherwise don't reserve additional CPUs.
1119  * We do this because additional CPUs waste a lot of memory.
1120  * -AK
1121  */
1122 __init void prefill_possible_map(void)
1123 {
1124  int i, possible;
1125 
1126  /* no processor from mptable or madt */
1127  if (!num_processors)
1128  num_processors = 1;
1129 
1130  i = setup_max_cpus ?: 1;
1131  if (setup_possible_cpus == -1) {
1132  possible = num_processors;
1133 #ifdef CONFIG_HOTPLUG_CPU
1134  if (setup_max_cpus)
1135  possible += disabled_cpus;
1136 #else
1137  if (possible > i)
1138  possible = i;
1139 #endif
1140  } else
1141  possible = setup_possible_cpus;
1142 
1143  total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1144 
1145  /* nr_cpu_ids could be reduced via nr_cpus= */
1146  if (possible > nr_cpu_ids) {
1147  pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
1148  possible, nr_cpu_ids);
1149  possible = nr_cpu_ids;
1150  }
1151 
1152 #ifdef CONFIG_HOTPLUG_CPU
1153  if (!setup_max_cpus)
1154 #endif
1155  if (possible > i) {
1156  pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1157  possible, setup_max_cpus);
1158  possible = i;
1159  }
1160 
1161  pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1162  possible, max_t(int, possible - num_processors, 0));
1163 
1164  for (i = 0; i < possible; i++)
1165  set_cpu_possible(i, true);
1166  for (; i < NR_CPUS; i++)
1167  set_cpu_possible(i, false);
1168 
1169  nr_cpu_ids = possible;
1170 }
1171 
1172 #ifdef CONFIG_HOTPLUG_CPU
1173 
1174 static void remove_siblinginfo(int cpu)
1175 {
1176  int sibling;
1177  struct cpuinfo_x86 *c = &cpu_data(cpu);
1178 
1179  for_each_cpu(sibling, cpu_core_mask(cpu)) {
1180  cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
1181  /*
1182  * last thread sibling in this cpu core going down
1183  */
1184  if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
1185  cpu_data(sibling).booted_cores--;
1186  }
1187 
1188  for_each_cpu(sibling, cpu_sibling_mask(cpu))
1189  cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
1190  cpumask_clear(cpu_sibling_mask(cpu));
1191  cpumask_clear(cpu_core_mask(cpu));
1192  c->phys_proc_id = 0;
1193  c->cpu_core_id = 0;
1194  cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1195 }
1196 
1197 static void __ref remove_cpu_from_maps(int cpu)
1198 {
1199  set_cpu_online(cpu, false);
1200  cpumask_clear_cpu(cpu, cpu_callout_mask);
1201  cpumask_clear_cpu(cpu, cpu_callin_mask);
1202  /* was set by cpu_init() */
1203  cpumask_clear_cpu(cpu, cpu_initialized_mask);
1204  numa_remove_cpu(cpu);
1205 }
1206 
1207 void cpu_disable_common(void)
1208 {
1209  int cpu = smp_processor_id();
1210 
1211  remove_siblinginfo(cpu);
1212 
1213  /* It's now safe to remove this processor from the online map */
1214  lock_vector_lock();
1215  remove_cpu_from_maps(cpu);
1216  unlock_vector_lock();
1217  fixup_irqs();
1218 }
1219 
1220 int native_cpu_disable(void)
1221 {
1222  int cpu = smp_processor_id();
1223 
1224  /*
1225  * Perhaps use cpufreq to drop frequency, but that could go
1226  * into generic code.
1227  *
1228  * We won't take down the boot processor on i386 due to some
1229  * interrupts only being able to be serviced by the BSP.
1230  * Especially so if we're not using an IOAPIC -zwane
1231  */
1232  if (cpu == 0)
1233  return -EBUSY;
1234 
1235  clear_local_APIC();
1236 
1237  cpu_disable_common();
1238  return 0;
1239 }
1240 
1241 void native_cpu_die(unsigned int cpu)
1242 {
1243  /* We don't do anything here: idle task is faking death itself. */
1244  unsigned int i;
1245 
1246  for (i = 0; i < 10; i++) {
1247  /* They ack this in play_dead by setting CPU_DEAD */
1248  if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1249  if (system_state == SYSTEM_RUNNING)
1250  pr_info("CPU %u is now offline\n", cpu);
1251  return;
1252  }
1253  msleep(100);
1254  }
1255  pr_err("CPU %u didn't die...\n", cpu);
1256 }
1257 
1258 void play_dead_common(void)
1259 {
1260  idle_task_exit();
1261  reset_lazy_tlbstate();
1262  amd_e400_remove_cpu(raw_smp_processor_id());
1263 
1264  mb();
1265  /* Ack it */
1266  __this_cpu_write(cpu_state, CPU_DEAD);
1267 
1268  /*
1269  * With physical CPU hotplug, we should halt the cpu
1270  */
1271  local_irq_disable();
1272 }
1273 
1274 /*
1275  * We need to flush the caches before going to sleep, lest we have
1276  * dirty data in our caches when we come back up.
1277  */
1278 static inline void mwait_play_dead(void)
1279 {
1280  unsigned int eax, ebx, ecx, edx;
1281  unsigned int highest_cstate = 0;
1282  unsigned int highest_subcstate = 0;
1283  int i;
1284  void *mwait_ptr;
1285  struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1286 
1287  if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
1288  return;
1289  if (!this_cpu_has(X86_FEATURE_CLFLSH))
1290  return;
1291  if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1292  return;
1293 
1294  eax = CPUID_MWAIT_LEAF;
1295  ecx = 0;
1296  native_cpuid(&eax, &ebx, &ecx, &edx);
1297 
1298  /*
1299  * eax will be 0 if EDX enumeration is not valid.
1300  * Initialized below to cstate, sub_cstate value when EDX is valid.
1301  */
1302  if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1303  eax = 0;
1304  } else {
1305  edx >>= MWAIT_SUBSTATE_SIZE;
1306  for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1307  if (edx & MWAIT_SUBSTATE_MASK) {
1308  highest_cstate = i;
1309  highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1310  }
1311  }
1312  eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1313  (highest_subcstate - 1);
1314  }
1315 
1316  /*
1317  * This should be a memory location in a cache line which is
1318  * unlikely to be touched by other processors. The actual
1319  * content is immaterial as it is not actually modified in any way.
1320  */
1321  mwait_ptr = &current_thread_info()->flags;
1322 
1323  wbinvd();
1324 
1325  while (1) {
1326  /*
1327  * The CLFLUSH is a workaround for erratum AAI65 for
1328  * the Xeon 7400 series. It's not clear it is actually
1329  * needed, but it should be harmless in either case.
1330  * The WBINVD is insufficient due to the spurious-wakeup
1331  * case where we return around the loop.
1332  */
1333  clflush(mwait_ptr);
1334  __monitor(mwait_ptr, 0, 0);
1335  mb();
1336  __mwait(eax, 0);
1337  }
1338 }
1339 
1340 static inline void hlt_play_dead(void)
1341 {
1342  if (__this_cpu_read(cpu_info.x86) >= 4)
1343  wbinvd();
1344 
1345  while (1) {
1346  native_halt();
1347  }
1348 }
1349 
1350 void native_play_dead(void)
1351 {
1352  play_dead_common();
1353  tboot_shutdown(TB_SHUTDOWN_WFS);
1354 
1355  mwait_play_dead(); /* Only returns on failure */
1356  if (cpuidle_play_dead())
1357  hlt_play_dead();
1358 }
1359 
1360 #else /* ... !CONFIG_HOTPLUG_CPU */
1361 int native_cpu_disable(void)
1362 {
1363  return -ENOSYS;
1364 }
1365 
1366 void native_cpu_die(unsigned int cpu)
1367 {
1368  /* We said "no" in __cpu_disable */
1369  BUG();
1370 }
1371 
1372 void native_play_dead(void)
1373 {
1374  BUG();
1375 }
1376 
1377 #endif