Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
memset.c
Go to the documentation of this file.
1 /* A memset for CRIS.
2  Copyright (C) 1999-2005 Axis Communications.
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  1. Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11 
12  2. Neither the name of Axis Communications nor the names of its
13  contributors may be used to endorse or promote products derived
14  from this software without specific prior written permission.
15 
16  THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
17  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
20  COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
21  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26  IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  POSSIBILITY OF SUCH DAMAGE. */
28 
29 /* FIXME: This file should really only be used for reference, as the
30  result depends on gcc generating the code we expect rather than
31  on what we describe. An assembly file should be used instead. */
32 
33 /* Note the multiple occurrence of the expression "12*4", including the
34  asm. It is hard to get it into the asm in a good way. Thus better to
35  expose the problem everywhere: no macro. */
36 
37 /* Assuming one cycle per dword written or read (ok, not really true; the
38  world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
39  <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
40  48-byte block to set. */
41 
42 #define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) /* In bytes: one full 48-byte block, per the cycle estimate above. */
43 
44 /* No name ambiguities in this file, so the asm below writes register names without a prefix. */
45 __asm__ (".syntax no_register_prefix");
46 
47 void *memset(void *pdst, int c, unsigned int plen)
48 {
49  /* Now we want the parameters in special registers. Make sure the
50  compiler does something usable with this. */
51 
52  register char *return_dst __asm__ ("r10") = pdst;
53  register int n __asm__ ("r12") = plen;
54  register int lc __asm__ ("r11") = c;
55 
56  /* Most apps use memset sanely. Memsetting about 3..4 bytes or less get
57  penalized here compared to the generic implementation. */
58 
59  /* This is fragile performancewise at best. Check with newer GCC
60  releases, if they compile cascaded "x |= x << 8" to sane code. */
61  __asm__("movu.b %0,r13 \n\
62  lslq 8,r13 \n\
63  move.b %0,r13 \n\
64  move.d r13,%0 \n\
65  lslq 16,r13 \n\
66  or.d r13,%0"
67  : "=r" (lc) /* Inputs. */
68  : "0" (lc) /* Outputs. */
69  : "r13"); /* Trash. */
70 
71  {
72  register char *dst __asm__ ("r13") = pdst;
73 
74  if (((unsigned long) pdst & 3) != 0
75  /* Oops! n = 0 must be a valid call, regardless of alignment. */
76  && n >= 3)
77  {
78  if ((unsigned long) dst & 1)
79  {
80  *dst = (char) lc;
81  n--;
82  dst++;
83  }
84 
85  if ((unsigned long) dst & 2)
86  {
87  *(short *) dst = lc;
88  n -= 2;
89  dst += 2;
90  }
91  }
92 
93  /* Decide which setting method to use. */
95  {
96  /* It is not optimal to tell the compiler about clobbering any
97  registers; that will move the saving/restoring of those registers
98  to the function prologue/epilogue, and make non-block sizes
99  suboptimal. */
100  __asm__ volatile
101  ("\
102  ;; GCC does promise correct register allocations, but let's \n\
103  ;; make sure it keeps its promises. \n\
104  .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
105  .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
106  .endif \n\
107  \n\
108  ;; Save the registers we'll clobber in the movem process \n\
109  ;; on the stack. Don't mention them to gcc, it will only be \n\
110  ;; upset. \n\
111  subq 11*4,sp \n\
112  movem r10,[sp] \n\
113  \n\
114  move.d r11,r0 \n\
115  move.d r11,r1 \n\
116  move.d r11,r2 \n\
117  move.d r11,r3 \n\
118  move.d r11,r4 \n\
119  move.d r11,r5 \n\
120  move.d r11,r6 \n\
121  move.d r11,r7 \n\
122  move.d r11,r8 \n\
123  move.d r11,r9 \n\
124  move.d r11,r10 \n\
125  \n\
126  ;; Now we've got this: \n\
127  ;; r13 - dst \n\
128  ;; r12 - n \n\
129  \n\
130  ;; Update n for the first loop \n\
131  subq 12*4,r12 \n\
132 0: \n\
133 "
134 #ifdef __arch_common_v10_v32
135  /* Cater to branch offset difference between v32 and v10. We
136  assume the branch below has an 8-bit offset. */
137 " setf\n"
138 #endif
139 " subq 12*4,r12 \n\
140  bge 0b \n\
141  movem r11,[r13+] \n\
142  \n\
143  ;; Compensate for last loop underflowing n. \n\
144  addq 12*4,r12 \n\
145  \n\
146  ;; Restore registers from stack. \n\
147  movem [sp+],r10"
148 
149  /* Outputs. */
150  : "=r" (dst), "=r" (n)
151 
152  /* Inputs. */
153  : "0" (dst), "1" (n), "r" (lc));
154  }
155 
156  /* An ad-hoc unroll, used for 4*12-1..16 bytes. */
157  while (n >= 16)
158  {
159  *(long *) dst = lc; dst += 4;
160  *(long *) dst = lc; dst += 4;
161  *(long *) dst = lc; dst += 4;
162  *(long *) dst = lc; dst += 4;
163  n -= 16;
164  }
165 
166  switch (n)
167  {
168  case 0:
169  break;
170 
171  case 1:
172  *dst = (char) lc;
173  break;
174 
175  case 2:
176  *(short *) dst = (short) lc;
177  break;
178 
179  case 3:
180  *(short *) dst = (short) lc; dst += 2;
181  *dst = (char) lc;
182  break;
183 
184  case 4:
185  *(long *) dst = lc;
186  break;
187 
188  case 5:
189  *(long *) dst = lc; dst += 4;
190  *dst = (char) lc;
191  break;
192 
193  case 6:
194  *(long *) dst = lc; dst += 4;
195  *(short *) dst = (short) lc;
196  break;
197 
198  case 7:
199  *(long *) dst = lc; dst += 4;
200  *(short *) dst = (short) lc; dst += 2;
201  *dst = (char) lc;
202  break;
203 
204  case 8:
205  *(long *) dst = lc; dst += 4;
206  *(long *) dst = lc;
207  break;
208 
209  case 9:
210  *(long *) dst = lc; dst += 4;
211  *(long *) dst = lc; dst += 4;
212  *dst = (char) lc;
213  break;
214 
215  case 10:
216  *(long *) dst = lc; dst += 4;
217  *(long *) dst = lc; dst += 4;
218  *(short *) dst = (short) lc;
219  break;
220 
221  case 11:
222  *(long *) dst = lc; dst += 4;
223  *(long *) dst = lc; dst += 4;
224  *(short *) dst = (short) lc; dst += 2;
225  *dst = (char) lc;
226  break;
227 
228  case 12:
229  *(long *) dst = lc; dst += 4;
230  *(long *) dst = lc; dst += 4;
231  *(long *) dst = lc;
232  break;
233 
234  case 13:
235  *(long *) dst = lc; dst += 4;
236  *(long *) dst = lc; dst += 4;
237  *(long *) dst = lc; dst += 4;
238  *dst = (char) lc;
239  break;
240 
241  case 14:
242  *(long *) dst = lc; dst += 4;
243  *(long *) dst = lc; dst += 4;
244  *(long *) dst = lc; dst += 4;
245  *(short *) dst = (short) lc;
246  break;
247 
248  case 15:
249  *(long *) dst = lc; dst += 4;
250  *(long *) dst = lc; dst += 4;
251  *(long *) dst = lc; dst += 4;
252  *(short *) dst = (short) lc; dst += 2;
253  *dst = (char) lc;
254  break;
255  }
256  }
257 
258  return return_dst;
259 }