Linux Kernel 3.7.1
machine_kexec_64.c
/*
 * PPC64 code to handle Linux booting another kernel.
 *
 * Copyright (C) 2004-2005, IBM Corp.
 *
 * Created by: Milton D Miller II
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */


#include <linux/kexec.h>
#include <linux/smp.h>
#include <linux/thread_info.h>
#include <linux/init_task.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/cpu.h>

#include <asm/page.h>
#include <asm/current.h>
#include <asm/machdep.h>
#include <asm/cacheflush.h>
#include <asm/paca.h>
#include <asm/mmu.h>
#include <asm/sections.h>    /* _end */
#include <asm/prom.h>
#include <asm/smp.h>
#include <asm/hw_breakpoint.h>

int default_machine_kexec_prepare(struct kimage *image)
{
    int i;
    unsigned long begin, end;    /* limits of segment */
    unsigned long low, high;     /* limits of blocked memory range */
    struct device_node *node;
    const unsigned long *basep;
    const unsigned int *sizep;

    if (!ppc_md.hpte_clear_all)
        return -ENOENT;

    /*
     * Since we use the kernel fault handlers and paging code to
     * handle the virtual mode, we must make sure no destination
     * overlaps kernel static data or bss.
     */
    for (i = 0; i < image->nr_segments; i++)
        if (image->segment[i].mem < __pa(_end))
            return -ETXTBSY;

    /*
     * For non-LPAR, we absolutely cannot overwrite the mmu hash
     * table, since we are still using the bolted entries in it to
     * do the copy. Check that here.
     *
     * It is safe if the end is below the start of the blocked
     * region (end <= low), or if the beginning is after the
     * end of the blocked region (begin >= high). Use the
     * boolean identity !(a || b) === (!a && !b).
     */
    if (htab_address) {
        low = __pa(htab_address);
        high = low + htab_size_bytes;

        for (i = 0; i < image->nr_segments; i++) {
            begin = image->segment[i].mem;
            end = begin + image->segment[i].memsz;

            if ((begin < high) && (end > low))
                return -ETXTBSY;
        }
    }

    /* We also should not overwrite the tce tables */
    for_each_node_by_type(node, "pci") {
        basep = of_get_property(node, "linux,tce-base", NULL);
        sizep = of_get_property(node, "linux,tce-size", NULL);
        if (basep == NULL || sizep == NULL)
            continue;

        low = *basep;
        high = low + (*sizep);

        for (i = 0; i < image->nr_segments; i++) {
            begin = image->segment[i].mem;
            end = begin + image->segment[i].memsz;

            if ((begin < high) && (end > low))
                return -ETXTBSY;
        }
    }

    return 0;
}

#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE)

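/*
 * Note on the list walked by copy_segments() below: the kimage head is a
 * chain of page-aligned entries. An IND_DESTINATION entry selects the
 * current destination page, an IND_INDIRECTION entry points at the next
 * page of entries, an IND_SOURCE entry names a source page to copy, and
 * IND_DONE terminates the walk.
 */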
static void copy_segments(unsigned long ind)
{
    unsigned long entry;
    unsigned long *ptr;
    void *dest;
    void *addr;

    /*
     * We rely on kexec_load to create a list that properly
     * initializes these pointers before they are used.
     * We will still crash if the list is wrong, but at least
     * the compiler will be quiet.
     */
    ptr = NULL;
    dest = NULL;

    for (entry = ind; !(entry & IND_DONE); entry = *ptr++) {
        addr = __va(entry & PAGE_MASK);

        switch (entry & IND_FLAGS) {
        case IND_DESTINATION:
            dest = addr;
            break;
        case IND_INDIRECTION:
            ptr = addr;
            break;
        case IND_SOURCE:
            copy_page(dest, addr);
            dest += PAGE_SIZE;
        }
    }
}

void kexec_copy_flush(struct kimage *image)
{
    long i, nr_segments = image->nr_segments;
    struct kexec_segment ranges[KEXEC_SEGMENT_MAX];

    /* save the ranges on the stack to efficiently flush the icache */
    memcpy(ranges, image->segment, sizeof(ranges));

    /*
     * After this call we may not use anything allocated in dynamic
     * memory, including *image.
     *
     * Only globals and the stack are allowed.
     */
    copy_segments(image->head);

    /*
     * we need to clear the icache for all dest pages sometime,
     * including ones that were in place on the original copy
     */
    for (i = 0; i < nr_segments; i++)
        flush_icache_range((unsigned long)__va(ranges[i].mem),
            (unsigned long)__va(ranges[i].mem + ranges[i].memsz));
}

#ifdef CONFIG_SMP

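/*
 * Release flag: the CPU driving the kexec sets this once every other
 * online CPU has reported KEXEC_STATE_IRQS_OFF, letting the CPUs
 * spinning in kexec_smp_down() proceed to kexec_smp_wait().
 */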
static int kexec_all_irq_disabled = 0;

static void kexec_smp_down(void *arg)
{
    local_irq_disable();
    mb(); /* make sure our irqs are disabled before we say they are */
    get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
    while (kexec_all_irq_disabled == 0)
        cpu_relax();
    mb(); /* make sure all irqs are disabled before this */
    hw_breakpoint_disable();
    /*
     * Now every CPU has IRQs off, we can clear out any pending
     * IPIs and be sure that no more will come in after this.
     */
    if (ppc_md.kexec_cpu_down)
        ppc_md.kexec_cpu_down(0, 1);

    kexec_smp_wait();
    /* NOTREACHED */
}

static void kexec_prepare_cpus_wait(int wait_state)
{
    int my_cpu, i, notified = -1;

    hw_breakpoint_disable();
    my_cpu = get_cpu();
    /* Make sure each CPU has at least made it to the state we need.
     *
     * FIXME: There is a (slim) chance of a problem if not all of the CPUs
     * are correctly onlined. If somehow we start a CPU on boot with RTAS
     * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
     * time, the boot CPU will time out. If it does eventually execute
     * stuff, the secondary will start up (paca[].cpu_start was written) and
     * get into a peculiar state. If the platform supports
     * smp_ops->take_timebase(), the secondary CPU will probably be spinning
     * in there. If not (i.e. pseries), the secondary will continue on and
     * try to online itself/idle/etc. If it survives that, we need to find
     * these possible-but-not-online-but-should-be CPUs and chaperone them
     * into kexec_smp_wait().
     */
    for_each_online_cpu(i) {
        if (i == my_cpu)
            continue;

        while (paca[i].kexec_state < wait_state) {
            barrier();
            if (i != notified) {
                printk(KERN_INFO "kexec: waiting for cpu %d "
                       "(physical %d) to enter %i state\n",
                       i, paca[i].hw_cpu_id, wait_state);
                notified = i;
            }
        }
    }
    mb();
}

/*
 * We need to make sure each present CPU is online. The next kernel will scan
 * the device tree and assume primary threads are online and query secondary
 * threads via RTAS to online them if required. If we don't online primary
 * threads, they will be stuck. However, we also online secondary threads as we
 * may be using 'cede offline'. In this case RTAS doesn't see the secondary
 * threads as offline -- and again, these CPUs will be stuck.
 *
 * So, we online all CPUs that should be running, including secondary threads.
 */
static void wake_offline_cpus(void)
{
    int cpu = 0;

    for_each_present_cpu(cpu) {
        if (!cpu_online(cpu)) {
            printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
                   cpu);
            cpu_up(cpu);
        }
    }
}

static void kexec_prepare_cpus(void)
{
    wake_offline_cpus();
    smp_call_function(kexec_smp_down, NULL, /* wait */0);
    local_irq_disable();
    mb(); /* make sure IRQs are disabled before we say they are */
    get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;

    kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
    /* we are sure every CPU has IRQs off at this point */
    kexec_all_irq_disabled = 1;

    /* after we tell the others to go down */
    if (ppc_md.kexec_cpu_down)
        ppc_md.kexec_cpu_down(0, 0);

    /*
     * Before removing MMU mappings make sure all CPUs have entered real
     * mode:
     */
    kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);

    put_cpu();
}

#else /* ! SMP */

static void kexec_prepare_cpus(void)
{
    /*
     * move the secondaries to us so that we can copy
     * the new kernel 0-0x100 safely
     *
     * do this if kexec in setup.c ?
     *
     * We need to release the cpus if we are ever going from an
     * UP to an SMP kernel.
     */
    smp_release_cpus();
    if (ppc_md.kexec_cpu_down)
        ppc_md.kexec_cpu_down(0, 0);
    local_irq_disable();
}

#endif /* SMP */

/*
 * kexec thread structure and stack.
 *
 * We need to make sure that this is 16384-byte aligned due to the
 * way process stacks are handled. It also must be statically allocated
 * or allocated as part of the kimage, because everything else may be
 * overwritten when we copy the kexec image. We piggyback on the
 * "init_task" linker section here to statically allocate a stack.
 *
 * We could use a smaller stack if we don't care about anything using
 * current, but that audit has not been performed.
 */
static union thread_union kexec_stack __init_task_data =
    { };

/*
 * For similar reasons to the stack above, the kexecing CPU needs to be on a
 * static PACA; we switch to kexec_paca.
 */
struct paca_struct kexec_paca;

/* Our assembly helper, in kexec_stub.S */
extern void kexec_sequence(void *newstack, unsigned long start,
                           void *image, void *control,
                           void (*clear_all)(void)) __noreturn;

/* too late to fail here */
void default_machine_kexec(struct kimage *image)
{
    /* prepare control code if any */

    /*
     * If the kexec boot is the normal one, need to shut down other cpus
     * into our wait loop and quiesce interrupts.
     * Otherwise, in the case of crashed mode (crashing_cpu >= 0),
     * stopping other CPUs and collecting their pt_regs is done before
     * using debugger IPI.
     */

    if (crashing_cpu == -1)
        kexec_prepare_cpus();

    pr_debug("kexec: Starting switchover sequence.\n");

    /* switch to a statically allocated stack. Based on irq stack code.
     * XXX: the task struct will likely be invalid once we do the copy!
     */
    kexec_stack.thread_info.task = current_thread_info()->task;
    kexec_stack.thread_info.flags = 0;

    /* We need a static PACA, too; copy this CPU's PACA over and switch to
     * it. Also poison per_cpu_offset to catch anyone using non-static
     * data.
     */
    memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
    kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
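    /*
     * Bias the global paca array pointer so that paca[paca_index]
     * resolves to the static kexec_paca; RELOC_HIDE keeps the compiler
     * from making assumptions about the out-of-bounds pointer
     * arithmetic.
     */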
    paca = (struct paca_struct *)RELOC_HIDE(&kexec_paca, 0) -
        kexec_paca.paca_index;
    setup_paca(&kexec_paca);

    /* XXX: If anyone does 'dynamic lppacas' this will also need to be
     * switched to a static version!
     */

    /* Some things are best done in assembly. Finding globals with
     * a toc is easier in C, so pass in what we can.
     */
    kexec_sequence(&kexec_stack, image->start, image,
                   page_address(image->control_code_page),
                   ppc_md.hpte_clear_all);
    /* NOTREACHED */
}

/* Values we need to export to the second kernel via the device tree. */
static unsigned long htab_base;

static struct property htab_base_prop = {
    .name = "linux,htab-base",
    .length = sizeof(unsigned long),
    .value = &htab_base,
};

static struct property htab_size_prop = {
    .name = "linux,htab-size",
    .length = sizeof(unsigned long),
    .value = &htab_size_bytes,
};

static int __init export_htab_values(void)
{
    struct device_node *node;
    struct property *prop;

    /* On machines with no htab htab_address is NULL */
    if (!htab_address)
        return -ENODEV;

    node = of_find_node_by_path("/chosen");
    if (!node)
        return -ENODEV;

    /* remove any stale properties so ours can be found */
    prop = of_find_property(node, htab_base_prop.name, NULL);
    if (prop)
        prom_remove_property(node, prop);
    prop = of_find_property(node, htab_size_prop.name, NULL);
    if (prop)
        prom_remove_property(node, prop);

    htab_base = __pa(htab_address);
    prom_add_property(node, &htab_base_prop);
    prom_add_property(node, &htab_size_prop);

    of_node_put(node);
    return 0;
}
late_initcall(export_htab_values);