Linux Kernel 3.7.1
memcpy.c
/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop
 * unrolling, but in some cases the glibc version is still slightly faster.
 * This lends more credibility to the claim that gcc can generate very good
 * code as long as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
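/*
 * For scale: the "simple byte-at-a-time copy" used as the baseline in the
 * comment above looks like the sketch below. Illustrative userspace code,
 * not part of this file's build.
 */
#if 0
#include <stddef.h>

static void *byte_memcpy(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        while (n--)
                *d++ = *s++;
        return dst;
}
#endif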

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label) do {                                     \
        volatile int dummy;                                             \
        /* The following branch is never taken, it's just here to */    \
        /* prevent gcc from optimizing away our exception code. */      \
        if (unlikely(dummy != dummy))                                   \
                goto label;                                             \
} while (0)
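/*
 * Why preserve_branch() works: "dummy" is volatile, so gcc must load it
 * twice and cannot fold "dummy != dummy" to false at compile time, which
 * keeps the otherwise-unreferenced error label (and the code behind it)
 * alive. A minimal userspace sketch of the same trick ("dummy" is
 * initialized here, unlike above, so the comparison is well defined):
 */
#if 0
static int demo(int x)
{
        volatile int dummy = 0;

        if (x)
                return x;
        if (dummy != dummy)     /* never taken at run time */
                goto fixup;
        return 0;
fixup:
        return -1;              /* reached only via the asm fixup path */
}
#endif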

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2) ({                    \
        unsigned int _r;                                \
        asm volatile (                                  \
                "mtsar %3\n"                            \
                "shrpw %1, %2, %%sar, %0\n"             \
                : "=r"(_r)                              \
                : "r"(w0), "r"(w1), "r"(sh_2)           \
        );                                              \
        _r;                                             \
})
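/*
 * MERGE() is a funnel shift: shrpw concatenates w0 (high word) and w1 (low
 * word) and shifts the 64-bit pair right by sh_2 bits, where sh_1 + sh_2
 * == 32. On this big-endian machine it is equivalent to the portable
 * sketch below (valid for 0 < sh_1 < 32, which copy_dstaligned()
 * guarantees):
 */
#if 0
#include <stdint.h>

static uint32_t merge(uint32_t w0, int sh_1, uint32_t w1, int sh_2)
{
        return (w0 << sh_1) | (w1 >> sh_2);
}

/* e.g. merge(0xAABBCCDD, 8, 0x11223344, 24) == 0xBBCCDD11 */
#endif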
#define THRESHOLD 16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...)                                           \
        do {                                                            \
                printk(KERN_DEBUG "%s:%d:%s ",                          \
                        __FILE__, __LINE__, __func__);                  \
                printk(KERN_DEBUG fmt, ##args);                         \
        } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)     \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t), "+r"(_a)                             \
        :                                               \
        : "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)    \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : "+r"(_a)                                      \
        : _tt(_t)                                       \
        : "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)         \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        : _tt(_t)                                       \
        : "r"(_a)                                       \
        : "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)        \
        __asm__ __volatile__ (                          \
        "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"     \
        ASM_EXCEPTIONTABLE_ENTRY(1b,_e)                 \
        :                                               \
        : _tt(_t), "r"(_a)                              \
        : "r8")

#define ldw(_s,_o,_a,_t,_e)     def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)     def_store_insn(stw,"r",_s,_t,_o,_a,_e)

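/*
 * Written out for clarity, ldw(s_space, 0, src, a1, cda_ldw_exc) expands
 * under the definitions above to the statement below. This assumes the
 * usual parisc ASM_EXCEPTIONTABLE_ENTRY, which records the address of the
 * instruction at local label 1 together with its fixup label in the
 * __ex_table section:
 */
#if 0
        __asm__ __volatile__ (
                "1:\tldw 0(%%sr1,%1), %0\n\t"
                ASM_EXCEPTIONTABLE_ENTRY(1b, cda_ldw_exc)
                : "=r"(a1)
                : "r"(src)
                : "r8");
#endif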
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
        __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
        __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
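/*
 * prefetch_src()/prefetch_dst() issue a load whose result is discarded
 * (target %r0) purely to pull the line into the cache. On targets without
 * space registers the same effect is usually written with the gcc builtin;
 * a portable sketch:
 */
#if 0
static inline void prefetch_portable(const void *addr)
{
        /* 0 = prefetch for read, 3 = high temporal locality */
        __builtin_prefetch(addr, 0, 3);
}
#endif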

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop. This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
                                        unsigned long len, unsigned long o_dst,
                                        unsigned long o_src, unsigned long o_len)
{
        /* gcc complains that a2 and a3 may be uninitialized, but actually
         * they cannot be. Initialize a2/a3 to shut gcc up.
         */
        register unsigned int a0, a1, a2 = 0, a3 = 0;
        int sh_1, sh_2;
        struct exception_data *d;

        /* prefetch_src((const void *)src); */

        /* Calculate how to shift a word read at the memory operation
           aligned srcp to make it aligned for copy. */
        sh_1 = 8 * (src % sizeof(unsigned int));
        sh_2 = 8 * sizeof(unsigned int) - sh_1;

        /* Make src aligned by rounding it down. */
        src &= -sizeof(unsigned int);

        switch (len % 4)
        {
                case 2:
                        /* a1 = ((unsigned int *) src)[0];
                           a2 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a1, cda_ldw_exc);
                        ldw(s_space, 4, src, a2, cda_ldw_exc);
                        src -= 1 * sizeof(unsigned int);
                        dst -= 3 * sizeof(unsigned int);
                        len += 2;
                        goto do1;
                case 3:
                        /* a0 = ((unsigned int *) src)[0];
                           a1 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a0, cda_ldw_exc);
                        ldw(s_space, 4, src, a1, cda_ldw_exc);
                        src -= 0 * sizeof(unsigned int);
                        dst -= 2 * sizeof(unsigned int);
                        len += 1;
                        goto do2;
                case 0:
                        if (len == 0)
                                return 0;
                        /* a3 = ((unsigned int *) src)[0];
                           a0 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a3, cda_ldw_exc);
                        ldw(s_space, 4, src, a0, cda_ldw_exc);
                        src -= -1 * sizeof(unsigned int);
                        dst -= 1 * sizeof(unsigned int);
                        len += 0;
                        goto do3;
                case 1:
                        /* a2 = ((unsigned int *) src)[0];
                           a3 = ((unsigned int *) src)[1]; */
                        ldw(s_space, 0, src, a2, cda_ldw_exc);
                        ldw(s_space, 4, src, a3, cda_ldw_exc);
                        src -= -2 * sizeof(unsigned int);
                        dst -= 0 * sizeof(unsigned int);
                        len -= 1;
                        if (len == 0)
                                goto do0;
                        goto do4;                       /* No-op. */
        }

        do
        {
                /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
                /* a0 = ((unsigned int *) src)[0]; */
                ldw(s_space, 0, src, a0, cda_ldw_exc);
                /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
                stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
                /* a1 = ((unsigned int *) src)[1]; */
                ldw(s_space, 4, src, a1, cda_ldw_exc);
                /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
                stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
                /* a2 = ((unsigned int *) src)[2]; */
                ldw(s_space, 8, src, a2, cda_ldw_exc);
                /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
                stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
                /* a3 = ((unsigned int *) src)[3]; */
                ldw(s_space, 12, src, a3, cda_ldw_exc);
                /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
                stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

                src += 4 * sizeof(unsigned int);
                dst += 4 * sizeof(unsigned int);
                len -= 4;
        }
        while (len != 0);

do0:
        /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
        stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        return 0;

handle_load_error:
        __asm__ __volatile__ ("cda_ldw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len * 4 - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("cda_stw_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len * 4 - d->fault_addr + o_dst;
}
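/*
 * The switch on (len % 4) above computes an entry point into the unrolled
 * loop, the same jump-into-the-loop idea as Duff's device. A portable
 * userspace sketch of that control structure, shown for a plain aligned
 * word copy (hypothetical helper, not used by this file):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static void word_copy_duff(uint32_t *dst, const uint32_t *src, size_t len)
{
        size_t n = (len + 3) / 4;       /* loop trips, counting the partial one */

        if (len == 0)
                return;
        switch (len % 4) {
        case 0: do {    *dst++ = *src++;
        case 3:         *dst++ = *src++;
        case 2:         *dst++ = *src++;
        case 1:         *dst++ = *src++;
                } while (--n);
        }
}
#endif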


/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
        register unsigned long src, dst, t1, t2, t3;
        register unsigned char *pcs, *pcd;
        register unsigned int *pws, *pwd;
        register double *pds, *pdd;
        unsigned long ret = 0;
        unsigned long o_dst, o_src, o_len;
        struct exception_data *d;

        src = (unsigned long)srcp;
        dst = (unsigned long)dstp;
        pcs = (unsigned char *)srcp;
        pcd = (unsigned char *)dstp;

        o_dst = dst; o_src = src; o_len = len;

        /* prefetch_src((const void *)srcp); */

        if (len < THRESHOLD)
                goto byte_copy;

        /* Check alignment */
        t1 = (src ^ dst);
        if (unlikely(t1 & (sizeof(double)-1)))
                goto unaligned_copy;

        /* src and dst have same alignment. */

        /* Copy bytes till we are double-aligned. */
        t2 = src & (sizeof(double) - 1);
        if (unlikely(t2 != 0)) {
                t2 = sizeof(double) - t2;
                while (t2 && len) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        len--;
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        t2--;
                }
        }

        pds = (double *)pcs;
        pdd = (double *)pcd;

#if 0
        /* Copy 8 doubles at a time */
        while (len >= 8*sizeof(double)) {
                register double r1, r2, r3, r4, r5, r6, r7, r8;
                /* prefetch_src((char *)pds + L1_CACHE_BYTES); */
                flddma(s_space, pds, r1, pmc_load_exc);
                flddma(s_space, pds, r2, pmc_load_exc);
                flddma(s_space, pds, r3, pmc_load_exc);
                flddma(s_space, pds, r4, pmc_load_exc);
                fstdma(d_space, r1, pdd, pmc_store_exc);
                fstdma(d_space, r2, pdd, pmc_store_exc);
                fstdma(d_space, r3, pdd, pmc_store_exc);
                fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
                if (L1_CACHE_BYTES <= 32)
                        prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
                flddma(s_space, pds, r5, pmc_load_exc);
                flddma(s_space, pds, r6, pmc_load_exc);
                flddma(s_space, pds, r7, pmc_load_exc);
                flddma(s_space, pds, r8, pmc_load_exc);
                fstdma(d_space, r5, pdd, pmc_store_exc);
                fstdma(d_space, r6, pdd, pmc_store_exc);
                fstdma(d_space, r7, pdd, pmc_store_exc);
                fstdma(d_space, r8, pdd, pmc_store_exc);
                len -= 8*sizeof(double);
        }
#endif

        pws = (unsigned int *)pds;
        pwd = (unsigned int *)pdd;

word_copy:
        while (len >= 8*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
                /* prefetch_src((char *)pws + L1_CACHE_BYTES); */
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);

                ldwma(s_space, pws, r5, pmc_load_exc);
                ldwma(s_space, pws, r6, pmc_load_exc);
                ldwma(s_space, pws, r7, pmc_load_exc);
                ldwma(s_space, pws, r8, pmc_load_exc);
                stwma(d_space, r5, pwd, pmc_store_exc);
                stwma(d_space, r6, pwd, pmc_store_exc);
                stwma(d_space, r7, pwd, pmc_store_exc);
                stwma(d_space, r8, pwd, pmc_store_exc);
                len -= 8*sizeof(unsigned int);
        }

        while (len >= 4*sizeof(unsigned int)) {
                register unsigned int r1,r2,r3,r4;
                ldwma(s_space, pws, r1, pmc_load_exc);
                ldwma(s_space, pws, r2, pmc_load_exc);
                ldwma(s_space, pws, r3, pmc_load_exc);
                ldwma(s_space, pws, r4, pmc_load_exc);
                stwma(d_space, r1, pwd, pmc_store_exc);
                stwma(d_space, r2, pwd, pmc_store_exc);
                stwma(d_space, r3, pwd, pmc_store_exc);
                stwma(d_space, r4, pwd, pmc_store_exc);
                len -= 4*sizeof(unsigned int);
        }

        pcs = (unsigned char *)pws;
        pcd = (unsigned char *)pwd;

byte_copy:
        while (len) {
                /* *pcd++ = *pcs++; */
                ldbma(s_space, pcs, t3, pmc_load_exc);
                stbma(d_space, t3, pcd, pmc_store_exc);
                len--;
        }

        return 0;

unaligned_copy:
        /* possibly we are aligned on a word, but not on a double... */
        if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
                t2 = src & (sizeof(unsigned int) - 1);

                if (unlikely(t2 != 0)) {
                        t2 = sizeof(unsigned int) - t2;
                        while (t2) {
                                /* *pcd++ = *pcs++; */
                                ldbma(s_space, pcs, t3, pmc_load_exc);
                                stbma(d_space, t3, pcd, pmc_store_exc);
                                len--;
                                t2--;
                        }
                }

                pws = (unsigned int *)pcs;
                pwd = (unsigned int *)pcd;
                goto word_copy;
        }

        /* Align the destination. */
        if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
                t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
                while (t2) {
                        /* *pcd++ = *pcs++; */
                        ldbma(s_space, pcs, t3, pmc_load_exc);
                        stbma(d_space, t3, pcd, pmc_store_exc);
                        len--;
                        t2--;
                }
                dst = (unsigned long)pcd;
                src = (unsigned long)pcs;
        }

        ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
                              o_dst, o_src, o_len);
        if (ret)
                return ret;

        pcs += (len & -sizeof(unsigned int));
        pcd += (len & -sizeof(unsigned int));
        len %= sizeof(unsigned int);

        preserve_branch(handle_load_error);
        preserve_branch(handle_store_error);

        goto byte_copy;

handle_load_error:
        __asm__ __volatile__ ("pmc_load_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
                o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
        return o_len - d->fault_addr + o_src;

handle_store_error:
        __asm__ __volatile__ ("pmc_store_exc:\n");
        d = &__get_cpu_var(exception_data);
        DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
                o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
        return o_len - d->fault_addr + o_dst;
}
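/*
 * The handlers above return "bytes not transferred": fault_addr - o_src
 * bytes were copied before the access faulted, so o_len minus that many
 * remain. A worked example with made-up numbers (fault_addr here plays the
 * role of d->fault_addr captured by the exception fixup):
 */
#if 0
static unsigned long bytes_not_copied(unsigned long o_len,
                                      unsigned long fault_addr,
                                      unsigned long o_src)
{
        return o_len - fault_addr + o_src;
}

/* copying 100 bytes from 0x1000, load faults at 0x1040: 36 bytes remain */
#endif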

#ifdef __KERNEL__
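/*
 * The wrappers below select address spaces for pa_memcpy(): all loads go
 * through %sr1 (s_space) and all stores through %sr2 (d_space), so
 * mtsp(..., 1) picks where the source bytes come from and mtsp(..., 2)
 * where they go. get_user_space() returns the user space id (0 when
 * running with KERNEL_DS), get_kernel_space() is always space 0.
 */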
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, src, len);
}

EXPORT_SYMBOL(copy_to_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_kernel_space(), 2);
        return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
        mtsp(get_user_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
        mtsp(get_kernel_space(), 1);
        mtsp(get_kernel_space(), 2);
        pa_memcpy(dst, src, count);
        return dst;
}

EXPORT_SYMBOL(__copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif