bcm27xx: 6.1: add kernel patches
[openwrt/openwrt.git] / target / linux / bcm27xx / patches-6.1 / 950-0121-Improve-__copy_to_user-and-__copy_from_user-performa.patch
1 From 9a10ffd029c81a694db6666c1dedfa687396093b Mon Sep 17 00:00:00 2001
2 From: popcornmix <popcornmix@gmail.com>
3 Date: Mon, 28 Nov 2016 16:50:04 +0000
4 Subject: [PATCH] Improve __copy_to_user and __copy_from_user
5 performance
6
7 Provide a __copy_from_user that uses memcpy. On BCM2708, use
8 optimised memcpy/memmove/memcmp/memset implementations.
9
10 arch/arm: Add mmiocpy/set aliases for memcpy/set
11
12 See: https://github.com/raspberrypi/linux/issues/1082
13
14 copy_from_user: CPU_SW_DOMAIN_PAN compatibility
15
16 The downstream copy_from_user acceleration must also play nice with
17 CONFIG_CPU_SW_DOMAIN_PAN.
18
19 See: https://github.com/raspberrypi/linux/issues/1381
20
21 Signed-off-by: Phil Elwell <phil@raspberrypi.org>
22
23 Fix copy_from_user if BCM2835_FAST_MEMCPY=n
24
25 The change which introduced CONFIG_BCM2835_FAST_MEMCPY unconditionally
26 changed the behaviour of arm_copy_from_user. The page pinning code
27 is not safe on ARMv7 if LPAE & high memory is enabled and causes
28 crashes which look like PTE corruption.
29
30 Make __copy_from_user_memcpy conditional on CONFIG_BCM2835_FAST_MEMCPY=y
31 which is really an ARMv6 / Pi1 optimization and not necessary on newer
32 ARM processors.
33
34 arm: fix mmap unlocks in uaccess_with_memcpy.c
35
36 This is a regression that was added with the commit 192a4e923ef092924dd013e7326f2ec520ee4783 as of rpi-5.8.y, since that is when the move to the mmap locking API was introduced - d8ed45c5dcd455fc5848d47f86883a1b872ac0d0
37
38 The issue is that when the patch to improve performance for the __copy_to_user and __copy_from_user functions was added for the Raspberry Pi, some of the mmaps were incorrectly mapped to write instead of read. This would cause a variety of issues, and in my case, prevent the booting of a squashfs filesystem on rpi-5.8.y and above. An example of the panic you would see from this can be seen at https://pastebin.com/raw/jBz5xCzL
39
40 Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
41 Signed-off-by: Christopher Blake <chrisrblake93@gmail.com>
42
43 arch/arm: Add __memset alias to memset_rpi.S
44
45 memset_rpi.S is an optimised memset implementation, but doesn't define
46 __memset (which was just added to memset.S). As a result, building
47 for the BCM2835 platform causes a link failure.
48
49 Add __memset as yet another alias to our common implementation.
50
51 Signed-off-by: Phil Elwell <phil@raspberrypi.com>
52
53 arm: Fix custom rpi __memset32 and __memset64
54
55 See: https://github.com/raspberrypi/linux/issues/4798
56
57 Signed-off-by: Phil Elwell <phil@raspberrypi.com>
58
59 arm: Fix annoying .eh_frame section warnings
60
61 Replace the cfi directives with the UNWIND equivalents. This prevents
62 the .eh_frame section from being created, eliminating the warnings.
63
64 Signed-off-by: Phil Elwell <phil@raspberrypi.com>
65 ---
66 arch/arm/include/asm/string.h | 5 +
67 arch/arm/include/asm/uaccess.h | 3 +
68 arch/arm/lib/Makefile | 14 +-
69 arch/arm/lib/arm-mem.h | 159 ++++++++++
70 arch/arm/lib/copy_from_user.S | 4 +-
71 arch/arm/lib/exports_rpi.c | 37 +++
72 arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++
73 arch/arm/lib/memcpy_rpi.S | 63 ++++
74 arch/arm/lib/memcpymove.h | 488 +++++++++++++++++++++++++++++
75 arch/arm/lib/memmove_rpi.S | 63 ++++
76 arch/arm/lib/memset_rpi.S | 132 ++++++++
77 arch/arm/lib/uaccess_with_memcpy.c | 125 +++++++-
78 arch/arm/mach-bcm/Kconfig | 24 ++
79 13 files changed, 1396 insertions(+), 6 deletions(-)
80 create mode 100644 arch/arm/lib/arm-mem.h
81 create mode 100644 arch/arm/lib/exports_rpi.c
82 create mode 100644 arch/arm/lib/memcmp_rpi.S
83 create mode 100644 arch/arm/lib/memcpy_rpi.S
84 create mode 100644 arch/arm/lib/memcpymove.h
85 create mode 100644 arch/arm/lib/memmove_rpi.S
86 create mode 100644 arch/arm/lib/memset_rpi.S
87
88 --- a/arch/arm/include/asm/string.h
89 +++ b/arch/arm/include/asm/string.h
90 @@ -65,4 +65,9 @@ static inline void *memset64(uint64_t *p
91
92 #endif
93
94 +#ifdef CONFIG_BCM2835_FAST_MEMCPY
95 +#define __HAVE_ARCH_MEMCMP
96 +extern int memcmp(const void *, const void *, size_t);
97 +#endif
98 +
99 #endif
100 --- a/arch/arm/include/asm/uaccess.h
101 +++ b/arch/arm/include/asm/uaccess.h
102 @@ -509,6 +509,9 @@ do { \
103 extern unsigned long __must_check
104 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
105
106 +extern unsigned long __must_check
107 +__copy_from_user_std(void *to, const void __user *from, unsigned long n);
108 +
109 static inline unsigned long __must_check
110 raw_copy_from_user(void *to, const void __user *from, unsigned long n)
111 {
112 --- a/arch/arm/lib/Makefile
113 +++ b/arch/arm/lib/Makefile
114 @@ -7,8 +7,8 @@
115
116 lib-y := changebit.o csumipv6.o csumpartial.o \
117 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
118 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
119 - memmove.o memset.o setbit.o \
120 + delay.o delay-loop.o findbit.o memchr.o \
121 + setbit.o \
122 strchr.o strrchr.o \
123 testchangebit.o testclearbit.o testsetbit.o \
124 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
125 @@ -25,6 +25,16 @@ else
126 lib-y += backtrace.o
127 endif
128
129 +# Choose optimised implementations for Raspberry Pi
130 +ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
131 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
132 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
133 + obj-$(CONFIG_MODULES) += exports_rpi.o
134 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
135 +else
136 + lib-y += memcpy.o memmove.o memset.o
137 +endif
138 +
139 # using lib_ here won't override already available weak symbols
140 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
141
142 --- /dev/null
143 +++ b/arch/arm/lib/arm-mem.h
144 @@ -0,0 +1,159 @@
145 +/*
146 +Copyright (c) 2013, Raspberry Pi Foundation
147 +Copyright (c) 2013, RISC OS Open Ltd
148 +All rights reserved.
149 +
150 +Redistribution and use in source and binary forms, with or without
151 +modification, are permitted provided that the following conditions are met:
152 + * Redistributions of source code must retain the above copyright
153 + notice, this list of conditions and the following disclaimer.
154 + * Redistributions in binary form must reproduce the above copyright
155 + notice, this list of conditions and the following disclaimer in the
156 + documentation and/or other materials provided with the distribution.
157 + * Neither the name of the copyright holder nor the
158 + names of its contributors may be used to endorse or promote products
159 + derived from this software without specific prior written permission.
160 +
161 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
162 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
163 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
164 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
165 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
166 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
167 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
168 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
169 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
170 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
171 +*/
172 +
173 +.macro myfunc fname /* Begin a global function called fname */
174 + .func fname /* debug-information function marker */
175 + .global fname /* make the symbol visible to the linker */
176 +fname:
177 +.endm
178 +
179 +.macro preload_leading_step1 backwards, ptr, base
180 +/* If the destination is already 16-byte aligned, then we need to preload
181 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
182 + * are no gaps when the inner loop starts.
183 + * On exit ptr = base (base-1 when backwards) rounded down to 32 bytes. */
184 + .if backwards
185 + sub ptr, base, #1 /* step to base-1 before rounding down */
186 + bic ptr, ptr, #31
187 + .else
188 + bic ptr, base, #31 /* round down to a 32-byte boundary */
189 + .endif
190 + .set OFFSET, 0
191 + .rept prefetch_distance+1 /* one pld per 32-byte step, starting at ptr itself */
192 + pld [ptr, #OFFSET]
193 + .if backwards
194 + .set OFFSET, OFFSET-32 /* walk towards lower addresses */
195 + .else
196 + .set OFFSET, OFFSET+32 /* walk towards higher addresses */
197 + .endif
198 + .endr
199 +.endm
200 +
201 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
202 +/* However, if the destination is not 16-byte aligned, we may need to
203 + * preload one more cache line than that. The question we need to ask is:
204 + * are the leading bytes more than the amount by which the source
205 + * pointer will be rounded down for preloading, and if so, by how many
206 + * cache lines?
207 + * ptr is only read here; tmp is scratch and is overwritten. */
208 + .if backwards
209 +/* Here we compare against how many bytes we are into the
210 + * cache line, counting down from the highest such address.
211 + * Effectively, we want to calculate
212 + * leading_bytes = dst&15
213 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
214 + * extra_needed = leading_bytes - cacheline_offset
215 + * and test if extra_needed is <= 0, or rearranging:
216 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
217 + */
218 + mov tmp, base, lsl #32-5 /* low 5 bits of base moved to the top of tmp */
219 + sbc tmp, tmp, leading_bytes, lsl #32-5 /* NOTE(review): sbc also consumes the incoming carry flag; the "-1" above presumably relies on its state on entry - confirm */
220 + adds tmp, tmp, leading_bytes, lsl #32-5
221 + bcc 61f /* no carry out of the 5-bit sum: skip the extra preload */
222 + pld [ptr, #-32*(prefetch_distance+1)]
223 + .else
224 +/* Effectively, we want to calculate
225 + * leading_bytes = (-dst)&15
226 + * cacheline_offset = (src+leading_bytes)&31
227 + * extra_needed = leading_bytes - cacheline_offset
228 + * and test if extra_needed is <= 0.
229 + */
230 + mov tmp, base, lsl #32-5 /* low 5 bits of base moved to the top of tmp */
231 + add tmp, tmp, leading_bytes, lsl #32-5
232 + rsbs tmp, tmp, leading_bytes, lsl #32-5
233 + bls 61f /* extra_needed <= 0: skip the extra preload */
234 + pld [ptr, #32*(prefetch_distance+1)]
235 + .endif
236 +61:
237 +.endm
238 +
239 +.macro preload_trailing backwards, base, remain, tmp
240 + /* We need either 0, 1 or 2 extra preloads; tmp is scratch and is overwritten */
241 + .if backwards
242 + rsb tmp, base, #0 /* negate base so the backwards case reuses the forward arithmetic */
243 + mov tmp, tmp, lsl #32-5
244 + .else
245 + mov tmp, base, lsl #32-5 /* low 5 bits of base moved to the top of tmp */
246 + .endif
247 + adds tmp, tmp, remain, lsl #32-5 /* add the low 5 bits of remain, also at the top */
248 + adceqs tmp, tmp, #0
249 + /* The instruction above has two effects: ensures Z is only
250 + * set if C was clear (so Z indicates that both shifted quantities
251 + * were 0), and clears C if Z was set (so C indicates that the sum
252 + * of the shifted quantities was greater and not equal to 32) */
253 + beq 82f /* Z set: no extra preloads */
254 + .if backwards
255 + sub tmp, base, #1
256 + bic tmp, tmp, #31
257 + .else
258 + bic tmp, base, #31 /* tmp = base rounded down to 32 bytes; bic leaves the flags alone */
259 + .endif
260 + bcc 81f /* C clear: only the nearer extra preload */
261 + .if backwards
262 + pld [tmp, #-32*(prefetch_distance+1)] /* C set: both extra preloads, farther line first */
263 +81:
264 + pld [tmp, #-32*prefetch_distance]
265 + .else
266 + pld [tmp, #32*(prefetch_distance+2)] /* C set: both extra preloads, farther line first */
267 +81:
268 + pld [tmp, #32*(prefetch_distance+1)]
269 + .endif
270 +82:
271 +.endm
272 +
273 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 /* Preload every 32-byte line spanned by (remain << shift) bytes at base */
274 + .if backwards
275 + sub tmp0, base, #1
276 + bic tmp0, tmp0, #31
277 + pld [tmp0] /* line holding base-1 */
278 + sub tmp1, base, remain, lsl #shift /* lowest address of the span */
279 + .else
280 + bic tmp0, base, #31
281 + pld [tmp0] /* line holding base */
282 + add tmp1, base, remain, lsl #shift
283 + sub tmp1, tmp1, #1 /* address of the final byte of the span */
284 + .endif
285 + bic tmp1, tmp1, #31 /* tmp1 = line holding the far end of the span */
286 + cmp tmp1, tmp0
287 + beq 92f /* the whole span lies in the line already preloaded */
288 + .if narrow_case
289 + /* In this case, all the data fits in either 1 or 2 cache lines */
290 + pld [tmp1]
291 + .else
292 +91:
293 + .if backwards
294 + sub tmp0, tmp0, #32
295 + .else
296 + add tmp0, tmp0, #32
297 + .endif
298 + cmp tmp0, tmp1
299 + pld [tmp0] /* pld sits between cmp and bne; the branch still tests the cmp result */
300 + bne 91b /* repeat until the far-end line has been preloaded */
301 + .endif
302 +92:
303 +.endm
304 --- a/arch/arm/lib/copy_from_user.S
305 +++ b/arch/arm/lib/copy_from_user.S
306 @@ -104,7 +104,8 @@ UNWIND( .save {r0, r2, r3, \regs} )
307
308 .text
309
310 -ENTRY(arm_copy_from_user)
311 +ENTRY(__copy_from_user_std)
312 +WEAK(arm_copy_from_user)
313 #ifdef CONFIG_CPU_SPECTRE
314 ldr r3, =TASK_SIZE
315 uaccess_mask_range_ptr r1, r2, r3, ip
316 @@ -113,6 +114,7 @@ ENTRY(arm_copy_from_user)
317 #include "copy_template.S"
318
319 ENDPROC(arm_copy_from_user)
320 +ENDPROC(__copy_from_user_std)
321
322 .pushsection .text.fixup,"ax"
323 .align 0
324 --- /dev/null
325 +++ b/arch/arm/lib/exports_rpi.c
326 @@ -0,0 +1,37 @@
327 +/**
328 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
329 + *
330 + * Redistribution and use in source and binary forms, with or without
331 + * modification, are permitted provided that the following conditions
332 + * are met:
333 + * 1. Redistributions of source code must retain the above copyright
334 + * notice, this list of conditions, and the following disclaimer,
335 + * without modification.
336 + * 2. Redistributions in binary form must reproduce the above copyright
337 + * notice, this list of conditions and the following disclaimer in the
338 + * documentation and/or other materials provided with the distribution.
339 + * 3. The names of the above-listed copyright holders may not be used
340 + * to endorse or promote products derived from this software without
341 + * specific prior written permission.
342 + *
343 + * ALTERNATIVELY, this software may be distributed under the terms of the
344 + * GNU General Public License ("GPL") version 2, as published by the Free
345 + * Software Foundation.
346 + *
347 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
348 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
349 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
350 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
351 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
352 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
353 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
354 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
355 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
356 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
357 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
358 + */
359 +
360 +#include <linux/kernel.h>
361 +#include <linux/module.h>
362 +
363 +EXPORT_SYMBOL(memcmp);
364 --- /dev/null
365 +++ b/arch/arm/lib/memcmp_rpi.S
366 @@ -0,0 +1,285 @@
367 +/*
368 +Copyright (c) 2013, Raspberry Pi Foundation
369 +Copyright (c) 2013, RISC OS Open Ltd
370 +All rights reserved.
371 +
372 +Redistribution and use in source and binary forms, with or without
373 +modification, are permitted provided that the following conditions are met:
374 + * Redistributions of source code must retain the above copyright
375 + notice, this list of conditions and the following disclaimer.
376 + * Redistributions in binary form must reproduce the above copyright
377 + notice, this list of conditions and the following disclaimer in the
378 + documentation and/or other materials provided with the distribution.
379 + * Neither the name of the copyright holder nor the
380 + names of its contributors may be used to endorse or promote products
381 + derived from this software without specific prior written permission.
382 +
383 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
384 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
385 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
386 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
387 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
388 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
389 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
390 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
391 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
392 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
393 +*/
394 +
395 +#include <linux/linkage.h>
396 +#include "arm-mem.h"
397 +
398 +/* Prevent the stack from becoming executable */
399 +#if defined(__linux__) && defined(__ELF__)
400 +.section .note.GNU-stack,"",%progbits
401 +#endif
402 +
403 + .text
404 + .arch armv6
405 + .object_arch armv4
406 + .arm
407 + .altmacro
408 + .p2align 2
409 +
410 +.macro memcmp_process_head unaligned /* Load the next 16 bytes of each buffer into DAT0-3 and DAT4-7 */
411 + .if unaligned
412 + ldr DAT0, [S_1], #4 /* unaligned variant: four post-incremented single-word loads from S_1 */
413 + ldr DAT1, [S_1], #4
414 + ldr DAT2, [S_1], #4
415 + ldr DAT3, [S_1], #4
416 + .else
417 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
418 + .endif
419 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7} /* S_2 always uses a multi-word load */
420 +.endm
421 +
422 +.macro memcmp_process_tail /* Compare DAT0-3 against DAT4-7 pairwise */
423 + cmp DAT0, DAT4
424 + cmpeq DAT1, DAT5 /* each later compare executes only while all earlier pairs were equal */
425 + cmpeq DAT2, DAT6
426 + cmpeq DAT3, DAT7
427 + bne 200f /* mismatch: flags are those of the first differing pair */
428 +.endm
429 +
430 +.macro memcmp_leading_31bytes /* Compare the leading bytes selected by bits 0-4 of OFF, reducing N to match */
431 + movs DAT0, OFF, lsl #31 /* N flag = OFF bit 0 (1 byte), C flag = OFF bit 1 (2 bytes) */
432 + ldrmib DAT0, [S_1], #1
433 + ldrcsh DAT1, [S_1], #2
434 + ldrmib DAT4, [S_2], #1
435 + ldrcsh DAT5, [S_2], #2
436 + movpl DAT0, #0 /* registers that were not loaded are zeroed so their compare is neutral */
437 + movcc DAT1, #0
438 + movpl DAT4, #0
439 + movcc DAT5, #0
440 + submi N, N, #1
441 + subcs N, N, #2
442 + cmp DAT0, DAT4
443 + cmpeq DAT1, DAT5
444 + bne 200f
445 + movs DAT0, OFF, lsl #29 /* N flag = OFF bit 2 (4 bytes), C flag = OFF bit 3 (8 bytes) */
446 + ldrmi DAT0, [S_1], #4
447 + ldrcs DAT1, [S_1], #4
448 + ldrcs DAT2, [S_1], #4
449 + ldrmi DAT4, [S_2], #4
450 + ldmcsia S_2!, {DAT5, DAT6}
451 + movpl DAT0, #0 /* again zero whatever was not loaded */
452 + movcc DAT1, #0
453 + movcc DAT2, #0
454 + movpl DAT4, #0
455 + movcc DAT5, #0
456 + movcc DAT6, #0
457 + submi N, N, #4
458 + subcs N, N, #8
459 + cmp DAT0, DAT4
460 + cmpeq DAT1, DAT5
461 + cmpeq DAT2, DAT6
462 + bne 200f
463 + tst OFF, #16 /* OFF bit 4: one further 16-byte chunk */
464 + beq 105f
465 + memcmp_process_head 1 /* uses the single-word-load variant for S_1 */
466 + sub N, N, #16
467 + memcmp_process_tail
468 +105:
469 +.endm
470 +
471 +.macro memcmp_trailing_15bytes unaligned /* Compare the final bytes selected by bits 0-3 of N; N is destroyed */
472 + movs N, N, lsl #29 /* C flag = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
473 + .if unaligned
474 + ldrcs DAT0, [S_1], #4
475 + ldrcs DAT1, [S_1], #4
476 + .else
477 + ldmcsia S_1!, {DAT0, DAT1}
478 + .endif
479 + ldrmi DAT2, [S_1], #4
480 + ldmcsia S_2!, {DAT4, DAT5}
481 + ldrmi DAT6, [S_2], #4
482 + movcc DAT0, #0 /* registers that were not loaded are zeroed so their compare is neutral */
483 + movcc DAT1, #0
484 + movpl DAT2, #0
485 + movcc DAT4, #0
486 + movcc DAT5, #0
487 + movpl DAT6, #0
488 + cmp DAT0, DAT4
489 + cmpeq DAT1, DAT5
490 + cmpeq DAT2, DAT6
491 + bne 200f
492 + movs N, N, lsl #2 /* C flag = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
493 + ldrcsh DAT0, [S_1], #2
494 + ldrmib DAT1, [S_1] /* last access: no pointer write-back */
495 + ldrcsh DAT4, [S_2], #2
496 + ldrmib DAT5, [S_2]
497 + movcc DAT0, #0
498 + movpl DAT1, #0
499 + movcc DAT4, #0
500 + movpl DAT5, #0
501 + cmp DAT0, DAT4
502 + cmpeq DAT1, DAT5
503 + bne 200f
504 +.endm
505 +
506 +.macro memcmp_long_inner_loop unaligned /* Prefetching compare loop; ends by returning, so it never falls through */
507 +110:
508 + memcmp_process_head unaligned /* first 16 bytes of this 32-byte iteration */
509 + pld [S_2, #prefetch_distance*32 + 16]
510 + memcmp_process_tail
511 + memcmp_process_head unaligned /* second 16 bytes */
512 + pld [S_1, OFF]
513 + memcmp_process_tail
514 + subs N, N, #32
515 + bhs 110b
516 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
517 + * deal with final preloads */
518 + preload_trailing 0, S_1, N, DAT0
519 + preload_trailing 0, S_2, N, DAT0
520 + add N, N, #(prefetch_distance+2)*32 - 16 /* re-bias N for the 16-byte loop below, which stops when subs borrows */
521 +120:
522 + memcmp_process_head unaligned
523 + memcmp_process_tail
524 + subs N, N, #16
525 + bhs 120b
526 + /* Trailing words and bytes */
527 + tst N, #15
528 + beq 199f
529 + memcmp_trailing_15bytes unaligned
530 +199: /* Reached end without detecting a difference */
531 + mov a1, #0
532 + setend le /* switch data accesses back to little-endian */
533 + pop {DAT1-DAT6, pc} /* restore registers and return */
534 +.endm
535 +
536 +.macro memcmp_short_inner_loop unaligned /* Compare loop without prefetching; ends by returning, so it never falls through */
537 + subs N, N, #16 /* simplifies inner loop termination */
538 + blo 122f /* fewer than 16 bytes: go straight to the tail */
539 +120:
540 + memcmp_process_head unaligned
541 + memcmp_process_tail
542 + subs N, N, #16
543 + bhs 120b
544 +122: /* Trailing words and bytes */
545 + tst N, #15
546 + beq 199f
547 + memcmp_trailing_15bytes unaligned
548 +199: /* Reached end without detecting a difference */
549 + mov a1, #0
550 + setend le /* switch data accesses back to little-endian */
551 + pop {DAT1-DAT6, pc} /* restore registers and return */
552 +.endm
553 +
554 +/*
555 + * int memcmp(const void *s1, const void *s2, size_t n);
556 + * On entry:
557 + * a1 = pointer to buffer 1
558 + * a2 = pointer to buffer 2
559 + * a3 = number of bytes to compare (as unsigned chars)
560 + * On exit:
561 + * a1 = >0/=0/<0 if s1 >/=/< s2
562 + */
563 +
564 +.set prefetch_distance, 2
565 +
566 +ENTRY(memcmp)
567 + S_1 .req a1 /* buffer 1 pointer */
568 + S_2 .req a2 /* buffer 2 pointer */
569 + N .req a3 /* bytes left to compare */
570 + DAT0 .req a4
571 + DAT1 .req v1
572 + DAT2 .req v2
573 + DAT3 .req v3
574 + DAT4 .req v4
575 + DAT5 .req v5
576 + DAT6 .req v6
577 + DAT7 .req ip
578 + OFF .req lr /* lr is saved by the push below, so it is reused as a scratch register */
579 +
580 + push {DAT1-DAT6, lr}
581 + setend be /* lowest-addressed bytes are most significant */
582 +
583 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
584 + cmp N, #(prefetch_distance+3)*32 - 1
585 + blo 170f /* too short for the prefetching loop */
586 +
587 + /* Long case */
588 + /* Adjust N so that the decrement instruction can also test for
589 + * inner loop termination. We want it to stop when there are
590 + * (prefetch_distance+1) complete blocks to go. */
591 + sub N, N, #(prefetch_distance+2)*32
592 + preload_leading_step1 0, DAT0, S_1
593 + preload_leading_step1 0, DAT1, S_2
594 + tst S_2, #31
595 + beq 154f /* S_2 already 32-byte aligned: no leading bytes to handle */
596 + rsb OFF, S_2, #0 /* no need to AND with 15 here */
597 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
598 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
599 + memcmp_leading_31bytes
600 +154: /* Second source now cacheline (32-byte) aligned; we have at
601 + * least one prefetch to go. */
602 + /* Prefetch offset is best selected such that it lies in the
603 + * first 8 of each 32 bytes - but it's just as easy to aim for
604 + * the first one */
605 + and OFF, S_1, #31
606 + rsb OFF, OFF, #32*prefetch_distance /* OFF = 32*prefetch_distance - (S_1 & 31) */
607 + tst S_1, #3
608 + bne 140f /* S_1 not word aligned: use the single-word-load variant */
609 + memcmp_long_inner_loop 0 /* returns via pop {..., pc}; does not fall through */
610 +140: memcmp_long_inner_loop 1
611 +
612 +170: /* Short case */
613 + teq N, #0
614 + beq 199f /* nothing to compare: take the equal exit */
615 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
616 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
617 + tst S_2, #3
618 + beq 174f
619 +172: subs N, N, #1 /* byte-at-a-time until S_2 is word aligned or N runs out */
620 + blo 199f
621 + ldrb DAT0, [S_1], #1
622 + ldrb DAT4, [S_2], #1
623 + cmp DAT0, DAT4
624 + bne 200f
625 + tst S_2, #3
626 + bne 172b
627 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
628 + tst S_1, #3
629 + bne 140f /* S_1 not word aligned: use the single-word-load variant */
630 + memcmp_short_inner_loop 0 /* returns via pop {..., pc}; does not fall through */
631 +140: memcmp_short_inner_loop 1
632 +
633 +200: /* Difference found: determine sign. */
634 + movhi a1, #1 /* flags still come from the failing cmp: unsigned higher gives a positive result */
635 + movlo a1, #-1
636 + setend le /* switch data accesses back to little-endian */
637 + pop {DAT1-DAT6, pc} /* restore registers and return */
638 +
639 + .unreq S_1
640 + .unreq S_2
641 + .unreq N
642 + .unreq DAT0
643 + .unreq DAT1
644 + .unreq DAT2
645 + .unreq DAT3
646 + .unreq DAT4
647 + .unreq DAT5
648 + .unreq DAT6
649 + .unreq DAT7
650 + .unreq OFF
651 +ENDPROC(memcmp)
652 --- /dev/null
653 +++ b/arch/arm/lib/memcpy_rpi.S
654 @@ -0,0 +1,63 @@
655 +/*
656 +Copyright (c) 2013, Raspberry Pi Foundation
657 +Copyright (c) 2013, RISC OS Open Ltd
658 +All rights reserved.
659 +
660 +Redistribution and use in source and binary forms, with or without
661 +modification, are permitted provided that the following conditions are met:
662 + * Redistributions of source code must retain the above copyright
663 + notice, this list of conditions and the following disclaimer.
664 + * Redistributions in binary form must reproduce the above copyright
665 + notice, this list of conditions and the following disclaimer in the
666 + documentation and/or other materials provided with the distribution.
667 + * Neither the name of the copyright holder nor the
668 + names of its contributors may be used to endorse or promote products
669 + derived from this software without specific prior written permission.
670 +
671 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
672 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
673 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
674 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
675 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
676 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
677 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
678 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
679 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
680 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
681 +*/
682 +
683 +#include <linux/linkage.h>
684 +#include <asm/assembler.h>
685 +#include <asm/unwind.h>
686 +#include "arm-mem.h"
687 +#include "memcpymove.h"
688 +
689 +/* Prevent the stack from becoming executable */
690 +#if defined(__linux__) && defined(__ELF__)
691 +.section .note.GNU-stack,"",%progbits
692 +#endif
693 +
694 + .text
695 + .arch armv6
696 + .object_arch armv4
697 + .arm
698 + .altmacro
699 + .p2align 2
700 +
701 +/*
702 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
703 + * On entry:
704 + * a1 = pointer to destination
705 + * a2 = pointer to source
706 + * a3 = number of bytes to copy
707 + * On exit:
708 + * a1 preserved
709 + */
710 +
711 +.set prefetch_distance, 3
712 +
713 +ENTRY(mmiocpy) /* mmiocpy and memcpy are two entry symbols for the same code */
714 +ENTRY(memcpy)
715 + memcpy 0 /* expands a macro named memcpy that is not defined in the visible part of this patch; NOTE(review): 0 is presumably its "backwards" flag - confirm */
716 +ENDPROC(memcpy)
717 +ENDPROC(mmiocpy)
718 --- /dev/null
719 +++ b/arch/arm/lib/memcpymove.h
720 @@ -0,0 +1,488 @@
721 +/*
722 +Copyright (c) 2013, Raspberry Pi Foundation
723 +Copyright (c) 2013, RISC OS Open Ltd
724 +All rights reserved.
725 +
726 +Redistribution and use in source and binary forms, with or without
727 +modification, are permitted provided that the following conditions are met:
728 + * Redistributions of source code must retain the above copyright
729 + notice, this list of conditions and the following disclaimer.
730 + * Redistributions in binary form must reproduce the above copyright
731 + notice, this list of conditions and the following disclaimer in the
732 + documentation and/or other materials provided with the distribution.
733 + * Neither the name of the copyright holder nor the
734 + names of its contributors may be used to endorse or promote products
735 + derived from this software without specific prior written permission.
736 +
737 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
738 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
739 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
740 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
741 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
742 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
743 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
744 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
745 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
746 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
747 +*/
748 +
749 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 /* Copy 1, 2, 4 or 8 words, building each stored word from two adjacent loaded words shifted by align*8 and 32-align*8 bits */
750 + .if words == 1
751 + .if backwards
752 + mov r1, r0, lsl #32-align*8 /* r0 carries the previously loaded word in */
753 + ldr r0, [S, #-4]!
754 + orr r1, r1, r0, lsr #align*8
755 + str r1, [D, #-4]!
756 + .else
757 + mov r0, r1, lsr #align*8 /* r1 carries the previously loaded word in */
758 + ldr r1, [S, #4]!
759 + orr r0, r0, r1, lsl #32-align*8
760 + str r0, [D], #4
761 + .endif
762 + .elseif words == 2
763 + .if backwards
764 + ldr r1, [S, #-4]!
765 + mov r2, r0, lsl #32-align*8 /* r0 carries the previously loaded word in */
766 + ldr r0, [S, #-4]!
767 + orr r2, r2, r1, lsr #align*8
768 + mov r1, r1, lsl #32-align*8
769 + orr r1, r1, r0, lsr #align*8
770 + stmdb D!, {r1, r2}
771 + .else
772 + ldr r1, [S, #4]!
773 + mov r0, r2, lsr #align*8 /* r2 carries the previously loaded word in */
774 + ldr r2, [S, #4]!
775 + orr r0, r0, r1, lsl #32-align*8
776 + mov r1, r1, lsr #align*8
777 + orr r1, r1, r2, lsl #32-align*8
778 + stmia D!, {r0, r1}
779 + .endif
780 + .elseif words == 4
781 + .if backwards
782 + ldmdb S!, {r2, r3}
783 + mov r4, r0, lsl #32-align*8 /* r0 carries the previously loaded word in */
784 + ldmdb S!, {r0, r1}
785 + orr r4, r4, r3, lsr #align*8
786 + mov r3, r3, lsl #32-align*8
787 + orr r3, r3, r2, lsr #align*8
788 + mov r2, r2, lsl #32-align*8
789 + orr r2, r2, r1, lsr #align*8
790 + mov r1, r1, lsl #32-align*8
791 + orr r1, r1, r0, lsr #align*8
792 + stmdb D!, {r1, r2, r3, r4}
793 + .else
794 + ldmib S!, {r1, r2}
795 + mov r0, r4, lsr #align*8 /* r4 carries the previously loaded word in */
796 + ldmib S!, {r3, r4}
797 + orr r0, r0, r1, lsl #32-align*8
798 + mov r1, r1, lsr #align*8
799 + orr r1, r1, r2, lsl #32-align*8
800 + mov r2, r2, lsr #align*8
801 + orr r2, r2, r3, lsl #32-align*8
802 + mov r3, r3, lsr #align*8
803 + orr r3, r3, r4, lsl #32-align*8
804 + stmia D!, {r0, r1, r2, r3}
805 + .endif
806 + .elseif words == 8
807 + .if backwards
808 + ldmdb S!, {r4, r5, r6, r7}
809 + mov r8, r0, lsl #32-align*8 /* r0 carries the previously loaded word in */
810 + ldmdb S!, {r0, r1, r2, r3}
811 + .if use_pld
812 + pld [S, OFF] /* the prefetch is issued only in this 8-word form */
813 + .endif
814 + orr r8, r8, r7, lsr #align*8
815 + mov r7, r7, lsl #32-align*8
816 + orr r7, r7, r6, lsr #align*8
817 + mov r6, r6, lsl #32-align*8
818 + orr r6, r6, r5, lsr #align*8
819 + mov r5, r5, lsl #32-align*8
820 + orr r5, r5, r4, lsr #align*8
821 + mov r4, r4, lsl #32-align*8
822 + orr r4, r4, r3, lsr #align*8
823 + mov r3, r3, lsl #32-align*8
824 + orr r3, r3, r2, lsr #align*8
825 + mov r2, r2, lsl #32-align*8
826 + orr r2, r2, r1, lsr #align*8
827 + mov r1, r1, lsl #32-align*8
828 + orr r1, r1, r0, lsr #align*8
829 + stmdb D!, {r5, r6, r7, r8} /* higher-addressed half is stored first when descending */
830 + stmdb D!, {r1, r2, r3, r4}
831 + .else
832 + ldmib S!, {r1, r2, r3, r4}
833 + mov r0, r8, lsr #align*8 /* r8 carries the previously loaded word in */
834 + ldmib S!, {r5, r6, r7, r8}
835 + .if use_pld
836 + pld [S, OFF] /* the prefetch is issued only in this 8-word form */
837 + .endif
838 + orr r0, r0, r1, lsl #32-align*8
839 + mov r1, r1, lsr #align*8
840 + orr r1, r1, r2, lsl #32-align*8
841 + mov r2, r2, lsr #align*8
842 + orr r2, r2, r3, lsl #32-align*8
843 + mov r3, r3, lsr #align*8
844 + orr r3, r3, r4, lsl #32-align*8
845 + mov r4, r4, lsr #align*8
846 + orr r4, r4, r5, lsl #32-align*8
847 + mov r5, r5, lsr #align*8
848 + orr r5, r5, r6, lsl #32-align*8
849 + mov r6, r6, lsr #align*8
850 + orr r6, r6, r7, lsl #32-align*8
851 + mov r7, r7, lsr #align*8
852 + orr r7, r7, r8, lsl #32-align*8
853 + stmia D!, {r0, r1, r2, r3}
854 + stmia D!, {r4, r5, r6, r7}
855 + .endif
856 + .endif
857 +.endm
858 +
859 +.macro memcpy_leading_15bytes backwards, align /* Copy the leading bytes selected by bits 0-3 of DAT2 and subtract DAT2 from N */
860 + movs DAT1, DAT2, lsl #31 /* N flag = DAT2 bit 0 (1 byte), C flag = DAT2 bit 1 (2 bytes) */
861 + sub N, N, DAT2
862 + .if backwards
863 + ldrmib DAT0, [S, #-1]!
864 + ldrcsh DAT1, [S, #-2]!
865 + strmib DAT0, [D, #-1]!
866 + strcsh DAT1, [D, #-2]!
867 + .else
868 + ldrmib DAT0, [S], #1
869 + ldrcsh DAT1, [S], #2
870 + strmib DAT0, [D], #1
871 + strcsh DAT1, [D], #2
872 + .endif
873 + movs DAT1, DAT2, lsl #29 /* N flag = DAT2 bit 2 (4 bytes), C flag = DAT2 bit 3 (8 bytes) */
874 + .if backwards
875 + ldrmi DAT0, [S, #-4]!
876 + .if align == 0
877 + ldmcsdb S!, {DAT1, DAT2}
878 + .else
879 + ldrcs DAT2, [S, #-4]! /* align != 0: two single-word loads replace the ldm; DAT2 may be overwritten because the flags are already set */
880 + ldrcs DAT1, [S, #-4]!
881 + .endif
882 + strmi DAT0, [D, #-4]!
883 + stmcsdb D!, {DAT1, DAT2}
884 + .else
885 + ldrmi DAT0, [S], #4
886 + .if align == 0
887 + ldmcsia S!, {DAT1, DAT2}
888 + .else
889 + ldrcs DAT1, [S], #4 /* align != 0: two single-word loads replace the ldm */
890 + ldrcs DAT2, [S], #4
891 + .endif
892 + strmi DAT0, [D], #4
893 + stmcsia D!, {DAT1, DAT2}
894 + .endif
895 +.endm
896 +
897 +.macro memcpy_trailing_15bytes backwards, align /* Copy the final bytes selected by bits 0-3 of N; N is destroyed */
898 + movs N, N, lsl #29 /* C flag = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
899 + .if backwards
900 + .if align == 0
901 + ldmcsdb S!, {DAT0, DAT1}
902 + .else
903 + ldrcs DAT1, [S, #-4]! /* align != 0: two single-word loads replace the ldm */
904 + ldrcs DAT0, [S, #-4]!
905 + .endif
906 + ldrmi DAT2, [S, #-4]!
907 + stmcsdb D!, {DAT0, DAT1}
908 + strmi DAT2, [D, #-4]!
909 + .else
910 + .if align == 0
911 + ldmcsia S!, {DAT0, DAT1}
912 + .else
913 + ldrcs DAT0, [S], #4 /* align != 0: two single-word loads replace the ldm */
914 + ldrcs DAT1, [S], #4
915 + .endif
916 + ldrmi DAT2, [S], #4
917 + stmcsia D!, {DAT0, DAT1}
918 + strmi DAT2, [D], #4
919 + .endif
920 + movs N, N, lsl #2 /* C flag = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
921 + .if backwards
922 + ldrcsh DAT0, [S, #-2]!
923 + ldrmib DAT1, [S, #-1] /* final byte: no pointer write-back */
924 + strcsh DAT0, [D, #-2]!
925 + strmib DAT1, [D, #-1]
926 + .else
927 + ldrcsh DAT0, [S], #2
928 + ldrmib DAT1, [S] /* final byte: no pointer write-back */
929 + strcsh DAT0, [D], #2
930 + strmib DAT1, [D]
931 + .endif
932 +.endm
933 +
934 +.macro memcpy_long_inner_loop backwards, align /* Prefetching copy loop; ends by returning, so it never falls through */
935 + .if align != 0
936 + .if backwards
937 + ldr DAT0, [S, #-align]! /* move S back by align bytes and load the word that seeds unaligned_words */
938 + .else
939 + ldr LAST, [S, #-align]! /* move S back by align bytes and load the word that seeds unaligned_words */
940 + .endif
941 + .endif
942 +110:
943 + .if align == 0
944 + .if backwards
945 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
946 + pld [S, OFF]
947 + stmdb D!, {DAT4, DAT5, DAT6, LAST} /* higher-addressed half is stored first when descending */
948 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
949 + .else
950 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
951 + pld [S, OFF]
952 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
953 + stmia D!, {DAT4, DAT5, DAT6, LAST}
954 + .endif
955 + .else
956 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
957 + .endif
958 + subs N, N, #32 /* 32 bytes copied per iteration */
959 + bhs 110b
960 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
961 + preload_trailing backwards, S, N, OFF
962 + add N, N, #(prefetch_distance+2)*32 - 32 /* re-bias N for the non-prefetching loop below, which stops when subs borrows */
963 +120:
964 + .if align == 0
965 + .if backwards
966 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
967 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
968 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
969 + .else
970 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
971 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
972 + stmia D!, {DAT4, DAT5, DAT6, LAST}
973 + .endif
974 + .else
975 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
976 + .endif
977 + subs N, N, #32
978 + bhs 120b
979 + tst N, #16 /* one more 16-byte chunk? */
980 + .if align == 0
981 + .if backwards
982 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
983 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
984 + .else
985 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
986 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
987 + .endif
988 + .else
989 + beq 130f
990 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
991 +130:
992 + .endif
993 + /* Trailing words and bytes */
994 + tst N, #15
995 + beq 199f
996 + .if align != 0
997 + add S, S, #align /* undo the -align adjustment made before the loop */
998 + .endif
999 + memcpy_trailing_15bytes backwards, align
1000 +199:
1001 + pop {DAT3, DAT4, DAT5, DAT6, DAT7} /* NOTE(review): the matching pushes are outside this macro, presumably in its caller - confirm */
1002 + pop {D, DAT1, DAT2, pc} /* reloads D from the stack and returns */
1003 +.endm
1004 +
1005 +.macro memcpy_medium_inner_loop backwards, align /* 16 bytes per pass with no preload, then the sub-16-byte tail, then return */
1006 +120:
1007 + .if backwards
1008 + .if align == 0
1009 + ldmdb S!, {DAT0, DAT1, DAT2, LAST} /* align == 0: four words with one LDM */
1010 + .else
1011 + ldr LAST, [S, #-4]! /* otherwise four single-word loads, highest address first */
1012 + ldr DAT2, [S, #-4]!
1013 + ldr DAT1, [S, #-4]!
1014 + ldr DAT0, [S, #-4]!
1015 + .endif
1016 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
1017 + .else
1018 + .if align == 0
1019 + ldmia S!, {DAT0, DAT1, DAT2, LAST} /* align == 0: four words with one LDM */
1020 + .else
1021 + ldr DAT0, [S], #4 /* otherwise four single-word loads */
1022 + ldr DAT1, [S], #4
1023 + ldr DAT2, [S], #4
1024 + ldr LAST, [S], #4
1025 + .endif
1026 + stmia D!, {DAT0, DAT1, DAT2, LAST}
1027 + .endif
1028 + subs N, N, #16
1029 + bhs 120b /* repeat until the subtraction borrows */
1030 + /* Trailing words and bytes */
1031 + tst N, #15
1032 + beq 199f /* nothing below 16 bytes remains */
1033 + memcpy_trailing_15bytes backwards, align
1034 +199:
1035 + pop {D, DAT1, DAT2, pc} /* reloads the D value pushed on entry and returns by popping into pc */
1036 +.endm
1037 +
1038 +.macro memcpy_short_inner_loop backwards, align /* at most one 16-byte block, then the sub-16-byte tail, then return */
1039 + tst N, #16 /* NE below means a 16-byte block is present */
1040 + .if backwards
1041 + .if align == 0
1042 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST} /* align == 0: four words with one LDM */
1043 + .else
1044 + ldrne LAST, [S, #-4]! /* otherwise four single-word loads, highest address first */
1045 + ldrne DAT2, [S, #-4]!
1046 + ldrne DAT1, [S, #-4]!
1047 + ldrne DAT0, [S, #-4]!
1048 + .endif
1049 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
1050 + .else
1051 + .if align == 0
1052 + ldmneia S!, {DAT0, DAT1, DAT2, LAST} /* align == 0: four words with one LDM */
1053 + .else
1054 + ldrne DAT0, [S], #4 /* otherwise four single-word loads */
1055 + ldrne DAT1, [S], #4
1056 + ldrne DAT2, [S], #4
1057 + ldrne LAST, [S], #4
1058 + .endif
1059 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
1060 + .endif
1061 + memcpy_trailing_15bytes backwards, align
1062 +199: /* also the target of the early-exit branches in the short case of the memcpy macro */
1063 + pop {D, DAT1, DAT2, pc} /* reloads the D value pushed on entry and returns by popping into pc */
1064 +.endm
1065 +
1066 +.macro memcpy backwards /* copy body for both directions (memmove expands it with backwards=1); picks a long, medium or short path from N */
1067 + D .req a1 /* destination pointer */
1068 + S .req a2 /* source pointer */
1069 + N .req a3 /* byte count */
1070 + DAT0 .req a4
1071 + DAT1 .req v1
1072 + DAT2 .req v2
1073 + DAT3 .req v3
1074 + DAT4 .req v4
1075 + DAT5 .req v5
1076 + DAT6 .req v6
1077 + DAT7 .req sl
1078 + LAST .req ip
1079 + OFF .req lr /* preload offset; lr is reusable because it is pushed just below */
1080 +
1081 + UNWIND( .fnstart )
1082 +
1083 + push {D, DAT1, DAT2, lr} /* D is saved so the final pop can hand back the original destination */
1084 + UNWIND( .fnend )
1085 +
1086 + UNWIND( .fnstart )
1087 + UNWIND( .save {D, DAT1, DAT2, lr} )
1088 +
1089 + .if backwards
1090 + add D, D, N /* backwards copies start one past the end of each buffer */
1091 + add S, S, N
1092 + .endif
1093 +
1094 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1095 + cmp N, #31
1096 + blo 170f /* fewer than 31 bytes: short case */
1097 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1098 + cmp N, #(prefetch_distance+3)*32 - 1
1099 + blo 160f /* not enough data to preload ahead: medium case */
1100 +
1101 + /* Long case */
1102 + push {DAT3, DAT4, DAT5, DAT6, DAT7} /* extra registers used only on this path */
1103 + UNWIND( .fnend )
1104 +
1105 + UNWIND( .fnstart )
1106 + UNWIND( .save {D, DAT1, DAT2, lr} )
1107 + UNWIND( .save {DAT3, DAT4, DAT5, DAT6, DAT7} )
1108 +
1109 + /* Adjust N so that the decrement instruction can also test for
1110 + * inner loop termination. We want it to stop when there are
1111 + * (prefetch_distance+1) complete blocks to go. */
1112 + sub N, N, #(prefetch_distance+2)*32
1113 + preload_leading_step1 backwards, DAT0, S
1114 + .if backwards
1115 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1116 + * ands DAT2, D, #60, 2
1117 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
1118 + */
1119 + .word 0xE210513C
1120 + beq 154f /* destination already 16-byte aligned */
1121 + .else
1122 + ands DAT2, D, #15
1123 + beq 154f /* destination already 16-byte aligned */
1124 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1125 + .endif
1126 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1127 + memcpy_leading_15bytes backwards, 1
1128 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1129 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1130 + .if backwards
1131 + rsb OFF, S, #3
1132 + and OFF, OFF, #28
1133 + sub OFF, OFF, #32*(prefetch_distance+1)
1134 + .else
1135 + and OFF, S, #28
1136 + rsb OFF, OFF, #32*prefetch_distance
1137 + .endif
1138 + movs DAT0, S, lsl #31 /* N flag = bit 0 of S, C = bit 1 of S, Z set when bit 0 is clear */
1139 + bhi 157f /* C set and Z clear: S & 3 == 3 */
1140 + bcs 156f /* C set, bit 0 clear: S & 3 == 2 */
1141 + bmi 155f /* bit 0 set, bit 1 clear: S & 3 == 1 */
1142 + memcpy_long_inner_loop backwards, 0 /* word-aligned source; every expansion ends by popping pc, so none falls into the next */
1143 +155: memcpy_long_inner_loop backwards, 1
1144 +156: memcpy_long_inner_loop backwards, 2
1145 +157: memcpy_long_inner_loop backwards, 3
1146 +
1147 + UNWIND( .fnend )
1148 +
1149 + UNWIND( .fnstart )
1150 + UNWIND( .save {D, DAT1, DAT2, lr} )
1151 +
1152 +160: /* Medium case */
1153 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1154 + sub N, N, #16 /* simplifies inner loop termination */
1155 + .if backwards
1156 + ands DAT2, D, #15 /* backwards: the leading count is simply D & 15 */
1157 + beq 164f
1158 + .else
1159 + ands DAT2, D, #15
1160 + beq 164f
1161 + rsb DAT2, DAT2, #16 /* forwards: 16 - (D & 15) bytes until D is aligned */
1162 + .endif
1163 + memcpy_leading_15bytes backwards, align /* NOTE(review): align is not a parameter of this macro - confirm what it resolves to here */
1164 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1165 + tst S, #3
1166 + bne 140f /* source not word-aligned: use the single-word-load variant */
1167 + memcpy_medium_inner_loop backwards, 0
1168 +140: memcpy_medium_inner_loop backwards, 1
1169 +
1170 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1171 + teq N, #0
1172 + beq 199f /* nothing to copy: go straight to the return at the next 199 label */
1173 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1174 + tst D, #3
1175 + beq 174f /* destination already word-aligned */
1176 +172: subs N, N, #1 /* one byte at a time until D is word-aligned or N runs out */
1177 + blo 199f
1178 + .if backwards
1179 + ldrb DAT0, [S, #-1]!
1180 + strb DAT0, [D, #-1]!
1181 + .else
1182 + ldrb DAT0, [S], #1
1183 + strb DAT0, [D], #1
1184 + .endif
1185 + tst D, #3
1186 + bne 172b
1187 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1188 + tst S, #3
1189 + bne 140f /* source not word-aligned: use the single-word-load variant */
1190 + memcpy_short_inner_loop backwards, 0
1191 +140: memcpy_short_inner_loop backwards, 1
1192 +
1193 + UNWIND( .fnend )
1194 +
1195 + .unreq D
1196 + .unreq S
1197 + .unreq N
1198 + .unreq DAT0
1199 + .unreq DAT1
1200 + .unreq DAT2
1201 + .unreq DAT3
1202 + .unreq DAT4
1203 + .unreq DAT5
1204 + .unreq DAT6
1205 + .unreq DAT7
1206 + .unreq LAST
1207 + .unreq OFF
1208 +.endm
1209 --- /dev/null
1210 +++ b/arch/arm/lib/memmove_rpi.S
1211 @@ -0,0 +1,63 @@
1212 +/*
1213 +Copyright (c) 2013, Raspberry Pi Foundation
1214 +Copyright (c) 2013, RISC OS Open Ltd
1215 +All rights reserved.
1216 +
1217 +Redistribution and use in source and binary forms, with or without
1218 +modification, are permitted provided that the following conditions are met:
1219 + * Redistributions of source code must retain the above copyright
1220 + notice, this list of conditions and the following disclaimer.
1221 + * Redistributions in binary form must reproduce the above copyright
1222 + notice, this list of conditions and the following disclaimer in the
1223 + documentation and/or other materials provided with the distribution.
1224 + * Neither the name of the copyright holder nor the
1225 + names of its contributors may be used to endorse or promote products
1226 + derived from this software without specific prior written permission.
1227 +
1228 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1229 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1230 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1231 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1232 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1233 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1234 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1235 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1236 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1237 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1238 +*/
1239 +
1240 +#include <linux/linkage.h>
1241 +#include <asm/assembler.h>
1242 +#include <asm/unwind.h>
1243 +#include "arm-mem.h"
1244 +#include "memcpymove.h"
1245 +
1246 +/* Prevent the stack from becoming executable */
1247 +#if defined(__linux__) && defined(__ELF__)
1248 +.section .note.GNU-stack,"",%progbits
1249 +#endif
1250 +
1251 + .text
1252 + .arch armv6
1253 + .object_arch armv4
1254 + .arm
1255 + .altmacro
1256 + .p2align 2
1257 +
1258 +/*
1259 + * void *memmove(void *s1, const void *s2, size_t n);
1260 + * On entry:
1261 + * a1 = pointer to destination
1262 + * a2 = pointer to source
1263 + * a3 = number of bytes to copy
1264 + * On exit:
1265 + * a1 preserved
1266 + */
1267 +
1268 +.set prefetch_distance, 3 /* used by the memcpy macro in its size thresholds and preload offsets */
1269 +
1270 +ENTRY(memmove)
1271 + cmp a2, a1 /* source - destination */
1272 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
1273 + memcpy 1 /* otherwise expand the memcpy macro with backwards=1 */
1274 +ENDPROC(memmove)
1275 --- /dev/null
1276 +++ b/arch/arm/lib/memset_rpi.S
1277 @@ -0,0 +1,132 @@
1278 +/*
1279 +Copyright (c) 2013, Raspberry Pi Foundation
1280 +Copyright (c) 2013, RISC OS Open Ltd
1281 +All rights reserved.
1282 +
1283 +Redistribution and use in source and binary forms, with or without
1284 +modification, are permitted provided that the following conditions are met:
1285 + * Redistributions of source code must retain the above copyright
1286 + notice, this list of conditions and the following disclaimer.
1287 + * Redistributions in binary form must reproduce the above copyright
1288 + notice, this list of conditions and the following disclaimer in the
1289 + documentation and/or other materials provided with the distribution.
1290 + * Neither the name of the copyright holder nor the
1291 + names of its contributors may be used to endorse or promote products
1292 + derived from this software without specific prior written permission.
1293 +
1294 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1295 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1296 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1297 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1298 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1299 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1300 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1301 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1302 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1303 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1304 +*/
1305 +
1306 +#include <linux/linkage.h>
1307 +#include "arm-mem.h"
1308 +
1309 +/* Prevent the stack from becoming executable */
1310 +#if defined(__linux__) && defined(__ELF__)
1311 +.section .note.GNU-stack,"",%progbits
1312 +#endif
1313 +
1314 + .text
1315 + .arch armv6
1316 + .object_arch armv4
1317 + .arm
1318 + .altmacro
1319 + .p2align 2
1320 +
1321 +/*
1322 + * void *memset(void *s, int c, size_t n);
1323 + * On entry:
1324 + * a1 = pointer to buffer to fill
1325 + * a2 = byte pattern to fill with (caller-narrowed)
1326 + * a3 = number of bytes to fill
1327 + * On exit:
1328 + * a1 preserved
1329 + */
1330 +ENTRY(mmioset)
1331 +ENTRY(memset)
1332 +ENTRY(__memset) /* mmioset, memset and __memset all label this same entry point */
1333 +
1334 + S .req a1 /* buffer pointer, advanced as bytes are stored */
1335 + DAT0 .req a2 /* fill pattern */
1336 + N .req a3 /* bytes remaining */
1337 + DAT1 .req a4
1338 + DAT2 .req ip
1339 + DAT3 .req lr /* usable as data once lr has been pushed below */
1340 +
1341 + orr DAT0, DAT0, DAT0, lsl #8 /* replicate the fill byte into the low halfword... */
1342 + orr DAT0, DAT0, DAT0, lsl #16 /* ...then into all four bytes of the word */
1343 +
1344 +ENTRY(__memset32) /* entering here skips the byte replication above */
1345 + mov DAT1, DAT0
1346 +
1347 +ENTRY(__memset64) /* entering here also skips the DAT1 = DAT0 copy, so a4 is used as passed */
1348 + push {S, lr}
1349 +
1350 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1351 + cmp N, #31
1352 + blo 170f /* fewer than 31 bytes: short case */
1353 +
1354 +161: sub N, N, #16 /* simplifies inner loop termination */
1355 + /* Leading words and bytes */
1356 + tst S, #15
1357 + beq 164f /* already 16-byte aligned */
1358 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1359 + movs DAT2, DAT3, lsl #31 /* N flag = bit 0 (1 byte), C = bit 1 (2 bytes) */
1360 + submi N, N, #1 /* conditional subs carry no S suffix, so the flags persist */
1361 + strmib DAT0, [S], #1
1362 + subcs N, N, #2
1363 + strcsh DAT0, [S], #2
1364 + movs DAT2, DAT3, lsl #29 /* N flag = bit 2 (4 bytes), C = bit 3 (8 bytes) */
1365 + submi N, N, #4
1366 + strmi DAT0, [S], #4
1367 + subcs N, N, #8
1368 + stmcsia S!, {DAT0, DAT1}
1369 +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
1370 + mov DAT2, DAT0
1371 + mov DAT3, DAT1
1372 + /* Now the inner loop of 16-byte stores */
1373 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1374 + subs N, N, #16
1375 + bhs 165b /* repeat until the subtraction borrows */
1376 +166: /* Trailing words and bytes */
1377 + movs N, N, lsl #29 /* C = bit 3 (8 bytes), N flag = bit 2 (4 bytes) */
1378 + stmcsia S!, {DAT0, DAT1}
1379 + strmi DAT0, [S], #4
1380 + movs N, N, lsl #2 /* C = original bit 1 (2 bytes), N flag = original bit 0 (1 byte) */
1381 + strcsh DAT0, [S], #2
1382 + strmib DAT0, [S]
1383 +199: pop {S, pc} /* reloads the a1 value pushed on entry and returns by popping into pc */
1384 +
1385 +170: /* Short case */
1386 + mov DAT2, DAT0
1387 + mov DAT3, DAT1
1388 + tst S, #3
1389 + beq 174f /* already word-aligned */
1390 +172: subs N, N, #1 /* byte stores until S is word-aligned or N runs out */
1391 + blo 199b
1392 + strb DAT0, [S], #1
1393 + tst S, #3
1394 + bne 172b
1395 +174: tst N, #16 /* one 16-byte store if bit 4 of the remaining count is set */
1396 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
1397 + b 166b /* share the trailing-bytes code above */
1398 +
1399 + .unreq S
1400 + .unreq DAT0
1401 + .unreq N
1402 + .unreq DAT1
1403 + .unreq DAT2
1404 + .unreq DAT3
1405 +ENDPROC(__memset64)
1406 +ENDPROC(__memset32)
1407 +ENDPROC(__memset)
1408 +ENDPROC(memset)
1409 +ENDPROC(mmioset)
1410 --- a/arch/arm/lib/uaccess_with_memcpy.c
1411 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1412 @@ -19,6 +19,14 @@
1413 #include <asm/current.h>
1414 #include <asm/page.h>
1415
1416 +#ifndef COPY_FROM_USER_THRESHOLD
1417 +#define COPY_FROM_USER_THRESHOLD 64 /* default in bytes; a definition made before this point takes precedence */
1418 +#endif
1419 +
1420 +#ifndef COPY_TO_USER_THRESHOLD
1421 +#define COPY_TO_USER_THRESHOLD 64 /* default in bytes; a definition made before this point takes precedence */
1422 +#endif
1423 +
1424 static int
1425 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1426 {
1427 @@ -43,7 +51,7 @@ pin_page_for_write(const void __user *_a
1428 return 0;
1429
1430 pmd = pmd_offset(pud, addr);
1431 - if (unlikely(pmd_none(*pmd)))
1432 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1433 return 0;
1434
1435 /*
1436 @@ -86,7 +94,46 @@ pin_page_for_write(const void __user *_a
1437 return 1;
1438 }
1439
1440 -static unsigned long noinline
1441 +static int
1442 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) /* returns 1 with *ptep and *ptlp filled in (PTE lock held), or 0 if the page cannot be used as-is */
1443 +{
1444 + unsigned long addr = (unsigned long)_addr;
1445 + pgd_t *pgd;
1446 + p4d_t *p4d;
1447 + pmd_t *pmd;
1448 + pte_t *pte;
1449 + pud_t *pud;
1450 + spinlock_t *ptl;
1451 +
1452 + pgd = pgd_offset(current->mm, addr); /* walk current->mm's page tables; any missing or bad level returns 0 */
1453 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1454 + return 0;
1455 +
1456 + p4d = p4d_offset(pgd, addr);
1457 + if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d)))
1458 + return 0;
1459 +
1460 + pud = pud_offset(p4d, addr);
1461 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1462 + return 0;
1463 +
1464 + pmd = pmd_offset(pud, addr);
1465 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1466 + return 0;
1467 +
1468 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); /* maps the PTE and takes its lock */
1469 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) { /* not present or not young: release and report failure */
1470 + pte_unmap_unlock(pte, ptl);
1471 + return 0;
1472 + }
1473 +
1474 + *ptep = pte; /* success: the caller now owns the mapping and the lock */
1475 + *ptlp = ptl;
1476 +
1477 + return 1; /* caller releases with pte_unmap_unlock() */
1478 +}
1479 +
1480 +unsigned long noinline
1481 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1482 {
1483 unsigned long ua_flags;
1484 @@ -134,6 +181,52 @@ out:
1485 return n;
1486 }
1487
1488 +unsigned long noinline
1489 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n) /* page-by-page memcpy() from user memory; returns the number of bytes left uncopied */
1490 +{
1491 + unsigned long ua_flags;
1492 + int atomic;
1493 +
1494 + /* the mmap semaphore is taken only if not in an atomic context */
1495 + atomic = in_atomic();
1496 +
1497 + if (!atomic)
1498 + mmap_read_lock(current->mm);
1499 + while (n) {
1500 + pte_t *pte;
1501 + spinlock_t *ptl;
1502 + int tocopy;
1503 +
1504 + while (!pin_page_for_read(from, &pte, &ptl)) { /* page not usable yet: touch it via __get_user() and retry */
1505 + char temp;
1506 + if (!atomic)
1507 + mmap_read_unlock(current->mm); /* the mmap lock is dropped around __get_user() */
1508 + if (__get_user(temp, (char __user *)from))
1509 + goto out; /* read failed; the mmap lock is not held at this point */
1510 + if (!atomic)
1511 + mmap_read_lock(current->mm);
1512 + }
1513 +
1514 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1; /* bytes from `from` up to the end of its page */
1515 + if (tocopy > n)
1516 + tocopy = n;
1517 +
1518 + ua_flags = uaccess_save_and_enable(); /* user access is enabled only around the memcpy() */
1519 + memcpy(to, (const void *)from, tocopy);
1520 + uaccess_restore(ua_flags);
1521 + to += tocopy;
1522 + from += tocopy;
1523 + n -= tocopy;
1524 +
1525 + pte_unmap_unlock(pte, ptl); /* releases what pin_page_for_read() took */
1526 + }
1527 + if (!atomic)
1528 + mmap_read_unlock(current->mm);
1529 +
1530 +out:
1531 + return n;
1532 +}
1533 +
1534 unsigned long
1535 arm_copy_to_user(void __user *to, const void *from, unsigned long n)
1536 {
1537 @@ -144,7 +237,7 @@ arm_copy_to_user(void __user *to, const
1538 * With frame pointer disabled, tail call optimization kicks in
1539 * as well making this test almost invisible.
1540 */
1541 - if (n < 64) {
1542 + if (n < COPY_TO_USER_THRESHOLD) {
1543 unsigned long ua_flags = uaccess_save_and_enable();
1544 n = __copy_to_user_std(to, from, n);
1545 uaccess_restore(ua_flags);
1546 @@ -154,6 +247,32 @@ arm_copy_to_user(void __user *to, const
1547 }
1548 return n;
1549 }
1550 +
1551 +unsigned long __must_check
1552 +arm_copy_from_user(void *to, const void __user *from, unsigned long n)
1553 +{
1554 +#ifdef CONFIG_BCM2835_FAST_MEMCPY
1555 + /*
1556 + * Copies shorter than COPY_FROM_USER_THRESHOLD go straight to
1557 + * __copy_from_user_std(); only larger ones take the out-of-line
1558 + * page-pinning __copy_from_user_memcpy(), which keeps the cost
1559 + * of small copies low. Test the from-user threshold here (not
1560 + * the to-user one) so overriding it actually takes effect.
1561 + */
1562 + if (n < COPY_FROM_USER_THRESHOLD) {
1563 + unsigned long ua_flags = uaccess_save_and_enable();
1564 + n = __copy_from_user_std(to, from, n);
1565 + uaccess_restore(ua_flags);
1566 + } else {
1567 + n = __copy_from_user_memcpy(to, from, n);
1568 + }
1569 +#else
1570 + unsigned long ua_flags = uaccess_save_and_enable(); /* without the option, always use the standard routine */
1571 + n = __copy_from_user_std(to, from, n);
1572 + uaccess_restore(ua_flags);
1573 +#endif
1574 + return n; /* result of whichever copy helper ran */
1575 +}
1576
1577 static unsigned long noinline
1578 __clear_user_memset(void __user *addr, unsigned long n)
1579 --- a/arch/arm/mach-bcm/Kconfig
1580 +++ b/arch/arm/mach-bcm/Kconfig
1581 @@ -182,6 +182,30 @@ config ARCH_BCM_53573
1582 The base chip is BCM53573 and there are some packaging modifications
1583 like BCM47189 and BCM47452.
1584
1585 +config ARCH_BCM_63XX
1586 + bool "Broadcom BCM63xx DSL SoC"
1587 + depends on ARCH_MULTI_V7
1588 + select ARCH_HAS_RESET_CONTROLLER
1589 + select ARM_ERRATA_754322
1590 + select ARM_ERRATA_764369 if SMP
1591 + select ARM_GIC
1592 + select ARM_GLOBAL_TIMER
1593 + select CACHE_L2X0
1594 + select HAVE_ARM_ARCH_TIMER
1595 + select HAVE_ARM_TWD if SMP
1596 + select HAVE_ARM_SCU if SMP
1597 + help
1598 + This enables support for systems based on Broadcom DSL SoCs.
1599 + It currently supports the 'BCM63XX' ARM-based family, which includes
1600 + the BCM63138 variant.
1601 +
1602 +config BCM2835_FAST_MEMCPY
1603 + bool "Enable optimized __copy_to_user and __copy_from_user"
1604 + depends on ARCH_BCM2835 && ARCH_MULTI_V6
1605 + default y
1606 + help
1607 + Optimized versions of __copy_to_user and __copy_from_user for Pi1.
1608 +
1609 config ARCH_BRCMSTB
1610 bool "Broadcom BCM7XXX based boards"
1611 depends on ARCH_MULTI_V7