* [PATCH v2 01/10] ARM: Use optimized reads[bwl] and writes[bwl] functions
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
The optimized versions exist for ARM32, but since 0d53f3c584a2
("arm: use asm-generic/io.h") they have gone unused. Activate them
again.
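For illustration only (hypothetical register address, not part of this patch),
a driver would use the re-activated accessors roughly like this to move a
buffer through a 32-bit FIFO data register:

  #include <io.h>
  #include <linux/types.h>

  /* hypothetical MMIO address of a 32-bit FIFO data port */
  static void __iomem *fifo_data = (void __iomem *)0x40001000;

  static void drain_rx_fifo(u32 *buf, int words)
  {
          readsl(fifo_data, buf, words);  /* repeated 32-bit loads from one address */
  }

  static void fill_tx_fifo(const u32 *buf, int words)
  {
          writesl(fifo_data, buf, words); /* repeated 32-bit stores to one address */
  }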
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/include/asm/io.h | 24 ++++++++++++++++++++++++
arch/arm/lib32/io-readsb.S | 6 +++---
arch/arm/lib32/io-readsl.S | 6 +++---
arch/arm/lib32/io-readsw-armv4.S | 6 +++---
arch/arm/lib32/io-writesb.S | 6 +++---
arch/arm/lib32/io-writesl.S | 6 +++---
6 files changed, 39 insertions(+), 15 deletions(-)
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 486b142950..9e9b13ad18 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -3,6 +3,30 @@
#ifndef __ASM_ARM_IO_H
#define __ASM_ARM_IO_H
+#include <linux/compiler.h>
+
+#ifndef CONFIG_CPU_64
+/*
+ * Generic IO read/write. These perform native-endian accesses. Note
+ * that some architectures will want to re-define __raw_{read,write}w.
+ */
+void __raw_writesb(volatile void __iomem *addr, const void *data, int bytelen);
+void __raw_writesw(volatile void __iomem *addr, const void *data, int wordlen);
+void __raw_writesl(volatile void __iomem *addr, const void *data, int longlen);
+
+void __raw_readsb(const volatile void __iomem *addr, void *data, int bytelen);
+void __raw_readsw(const volatile void __iomem *addr, void *data, int wordlen);
+void __raw_readsl(const volatile void __iomem *addr, void *data, int longlen);
+
+#define readsb(p,d,l) __raw_readsb(p,d,l)
+#define readsw(p,d,l) __raw_readsw(p,d,l)
+#define readsl(p,d,l) __raw_readsl(p,d,l)
+
+#define writesb(p,d,l) __raw_writesb(p,d,l)
+#define writesw(p,d,l) __raw_writesw(p,d,l)
+#define writesl(p,d,l) __raw_writesl(p,d,l)
+#endif
+
#define IO_SPACE_LIMIT 0
#define memcpy_fromio memcpy_fromio
diff --git a/arch/arm/lib32/io-readsb.S b/arch/arm/lib32/io-readsb.S
index f853c48021..41f68092c5 100644
--- a/arch/arm/lib32/io-readsb.S
+++ b/arch/arm/lib32/io-readsb.S
@@ -7,7 +7,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.section .text.readsb
+.section .text.__raw_readsb
.Linsb_align: rsb ip, ip, #4
cmp ip, r2
@@ -22,7 +22,7 @@
subs r2, r2, ip
bne .Linsb_aligned
-ENTRY(readsb)
+ENTRY(__raw_readsb)
teq r2, #0 @ do we have to check for the zero len?
moveq pc, lr
ands ip, r1, #3
@@ -119,4 +119,4 @@ ENTRY(readsb)
strgtb r3, [r1]
ldmfd sp!, {r4 - r6, pc}
-ENDPROC(readsb)
+ENDPROC(__raw_readsb)
diff --git a/arch/arm/lib32/io-readsl.S b/arch/arm/lib32/io-readsl.S
index bb8b96ded0..e1855fd636 100644
--- a/arch/arm/lib32/io-readsl.S
+++ b/arch/arm/lib32/io-readsl.S
@@ -7,9 +7,9 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.section .text.readsl
+.section .text.__raw_readsl
-ENTRY(readsl)
+ENTRY(__raw_readsl)
teq r2, #0 @ do we have to check for the zero len?
moveq pc, lr
ands ip, r1, #3
@@ -75,4 +75,4 @@ ENTRY(readsl)
8: mov r3, ip, get_byte_0
strb r3, [r1, #0]
mov pc, lr
-ENDPROC(readsl)
+ENDPROC(__raw_readsl)
diff --git a/arch/arm/lib32/io-readsw-armv4.S b/arch/arm/lib32/io-readsw-armv4.S
index 25f2778860..9fb7fd7576 100644
--- a/arch/arm/lib32/io-readsw-armv4.S
+++ b/arch/arm/lib32/io-readsw-armv4.S
@@ -15,7 +15,7 @@
#endif
.endm
-.section .text.readsw
+.section .text.__raw_readsw
.Linsw_align: movs ip, r1, lsl #31
bne .Linsw_noalign
@@ -23,7 +23,7 @@
sub r2, r2, #1
strh ip, [r1], #2
-ENTRY(readsw)
+ENTRY(__raw_readsw)
teq r2, #0
moveq pc, lr
tst r1, #3
@@ -127,4 +127,4 @@ ENTRY(readsw)
_BE_ONLY_( movne ip, ip, lsr #24 )
strneb ip, [r1]
ldmfd sp!, {r4, pc}
-ENDPROC(readsw)
+ENDPROC(__raw_readsw)
diff --git a/arch/arm/lib32/io-writesb.S b/arch/arm/lib32/io-writesb.S
index 313839bff6..b6ce85f0d4 100644
--- a/arch/arm/lib32/io-writesb.S
+++ b/arch/arm/lib32/io-writesb.S
@@ -27,7 +27,7 @@
#endif
.endm
-.section .text.writesb
+.section .text.__raw_writesb
.Loutsb_align: rsb ip, ip, #4
cmp ip, r2
@@ -42,7 +42,7 @@
subs r2, r2, ip
bne .Loutsb_aligned
-ENTRY(writesb)
+ENTRY(__raw_writesb)
teq r2, #0 @ do we have to check for the zero len?
moveq pc, lr
ands ip, r1, #3
@@ -90,4 +90,4 @@ ENTRY(writesb)
strgtb r3, [r0]
ldmfd sp!, {r4, r5, pc}
-ENDPROC(writesb)
+ENDPROC(__raw_writesb)
diff --git a/arch/arm/lib32/io-writesl.S b/arch/arm/lib32/io-writesl.S
index d9a29d9153..ed91ae19b7 100644
--- a/arch/arm/lib32/io-writesl.S
+++ b/arch/arm/lib32/io-writesl.S
@@ -7,9 +7,9 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.section .text.writesl
+.section .text.__raw_writesl
-ENTRY(writesl)
+ENTRY(__raw_writesl)
teq r2, #0 @ do we have to check for the zero len?
moveq pc, lr
ands ip, r1, #3
@@ -63,4 +63,4 @@ ENTRY(writesl)
str ip, [r0]
bne 6b
mov pc, lr
-ENDPROC(writesl)
+ENDPROC(__raw_writesl)
--
2.39.5
* [PATCH v2 02/10] ARM: rename logical shift macros push pull into lspush lspull
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
Adoption of Linux commit:
| commit d98b90ea22b0a28d9d787769704a9cf1ea5a513a
| Author: Victor Kamensky <victor.kamensky@linaro.org>
| Date: Tue Feb 25 08:41:09 2014 +0100
|
| ARM: 7990/1: asm: rename logical shift macros push pull into lspush lspull
|
| Renames logical shift macros, 'push' and 'pull', defined in
| arch/arm/include/asm/assembler.h, into 'lspush' and 'lspull'.
| That eliminates name conflict between 'push' logical shift macro
| and 'push' instruction mnemonic. That allows assembler.h to be
| included in .S files that use 'push' instruction.
|
| Suggested-by: Will Deacon <will.deacon@arm.com>
| Signed-off-by: Victor Kamensky <victor.kamensky@linaro.org>
| Acked-by: Nicolas Pitre <nico@linaro.org>
| Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
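The rename is purely mechanical: on little-endian the macros still expand to
lsr/lsl and implement the usual shift-and-merge step for misaligned word
copies. Roughly, as a C sketch (illustrative only, not part of the patch):

  #include <linux/types.h>

  /*
   * Combine two aligned source words into one destination word when the
   * source is offset by 'off' bytes (1..3); little-endian case.
   * 'lspull' is the right shift, 'lspush' the complementary left shift.
   */
  static inline u32 shift_merge(u32 lo, u32 hi, unsigned int off)
  {
          unsigned int pull = 8 * off;        /* lspull amount */
          unsigned int push = 32 - pull;      /* lspush amount */

          return (lo >> pull) | (hi << push);
  }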
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/include/asm/assembler.h | 8 ++++----
arch/arm/lib32/copy_template.S | 36 ++++++++++++++++++------------------
arch/arm/lib32/io-readsl.S | 12 ++++++------
arch/arm/lib32/io-writesl.S | 12 ++++++------
4 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 5db0f692ee..4e7ad57170 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -20,8 +20,8 @@
* Endian independent macros for shifting bytes within registers.
*/
#ifndef __ARMEB__
-#define pull lsr
-#define push lsl
+#define lspull lsr
+#define lspush lsl
#define get_byte_0 lsl #0
#define get_byte_1 lsr #8
#define get_byte_2 lsr #16
@@ -31,8 +31,8 @@
#define put_byte_2 lsl #16
#define put_byte_3 lsl #24
#else
-#define pull lsl
-#define push lsr
+#define lspull lsl
+#define lspush lsr
#define get_byte_0 lsr #24
#define get_byte_1 lsr #16
#define get_byte_2 lsr #8
diff --git a/arch/arm/lib32/copy_template.S b/arch/arm/lib32/copy_template.S
index f66cd6e667..897e3db3ff 100644
--- a/arch/arm/lib32/copy_template.S
+++ b/arch/arm/lib32/copy_template.S
@@ -192,24 +192,24 @@
12: PLD( pld [r1, #124] )
13: ldr4w r1, r4, r5, r6, r7, abort=19f
- mov r3, lr, pull #\pull
+ mov r3, lr, lspull #\pull
subs r2, r2, #32
ldr4w r1, r8, r9, ip, lr, abort=19f
- orr r3, r3, r4, push #\push
- mov r4, r4, pull #\pull
- orr r4, r4, r5, push #\push
- mov r5, r5, pull #\pull
- orr r5, r5, r6, push #\push
- mov r6, r6, pull #\pull
- orr r6, r6, r7, push #\push
- mov r7, r7, pull #\pull
- orr r7, r7, r8, push #\push
- mov r8, r8, pull #\pull
- orr r8, r8, r9, push #\push
- mov r9, r9, pull #\pull
- orr r9, r9, ip, push #\push
- mov ip, ip, pull #\pull
- orr ip, ip, lr, push #\push
+ orr r3, r3, r4, lspush #\push
+ mov r4, r4, lspull #\pull
+ orr r4, r4, r5, lspush #\push
+ mov r5, r5, lspull #\pull
+ orr r5, r5, r6, lspush #\push
+ mov r6, r6, lspull #\pull
+ orr r6, r6, r7, lspush #\push
+ mov r7, r7, lspull #\pull
+ orr r7, r7, r8, lspush #\push
+ mov r8, r8, lspull #\pull
+ orr r8, r8, r9, lspush #\push
+ mov r9, r9, lspull #\pull
+ orr r9, r9, ip, lspush #\push
+ mov ip, ip, lspull #\pull
+ orr ip, ip, lr, lspush #\push
str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
bge 12b
PLD( cmn r2, #96 )
@@ -220,10 +220,10 @@
14: ands ip, r2, #28
beq 16f
-15: mov r3, lr, pull #\pull
+15: mov r3, lr, lspull #\pull
ldr1w r1, lr, abort=21f
subs ip, ip, #4
- orr r3, r3, lr, push #\push
+ orr r3, r3, lr, lspush #\push
str1w r0, r3, abort=21f
bgt 15b
CALGN( cmp r2, #0 )
diff --git a/arch/arm/lib32/io-readsl.S b/arch/arm/lib32/io-readsl.S
index e1855fd636..7bcd0d45bc 100644
--- a/arch/arm/lib32/io-readsl.S
+++ b/arch/arm/lib32/io-readsl.S
@@ -46,25 +46,25 @@ ENTRY(__raw_readsl)
strb ip, [r1], #1
4: subs r2, r2, #1
- mov ip, r3, pull #24
+ mov ip, r3, lspull #24
ldrne r3, [r0]
- orrne ip, ip, r3, push #8
+ orrne ip, ip, r3, lspush #8
strne ip, [r1], #4
bne 4b
b 8f
5: subs r2, r2, #1
- mov ip, r3, pull #16
+ mov ip, r3, lspull #16
ldrne r3, [r0]
- orrne ip, ip, r3, push #16
+ orrne ip, ip, r3, lspush #16
strne ip, [r1], #4
bne 5b
b 7f
6: subs r2, r2, #1
- mov ip, r3, pull #8
+ mov ip, r3, lspull #8
ldrne r3, [r0]
- orrne ip, ip, r3, push #24
+ orrne ip, ip, r3, lspush #24
strne ip, [r1], #4
bne 6b
diff --git a/arch/arm/lib32/io-writesl.S b/arch/arm/lib32/io-writesl.S
index ed91ae19b7..61164234de 100644
--- a/arch/arm/lib32/io-writesl.S
+++ b/arch/arm/lib32/io-writesl.S
@@ -40,26 +40,26 @@ ENTRY(__raw_writesl)
blt 5f
bgt 6f
-4: mov ip, r3, pull #16
+4: mov ip, r3, lspull #16
ldr r3, [r1], #4
subs r2, r2, #1
- orr ip, ip, r3, push #16
+ orr ip, ip, r3, lspush #16
str ip, [r0]
bne 4b
mov pc, lr
-5: mov ip, r3, pull #8
+5: mov ip, r3, lspull #8
ldr r3, [r1], #4
subs r2, r2, #1
- orr ip, ip, r3, push #24
+ orr ip, ip, r3, lspush #24
str ip, [r0]
bne 5b
mov pc, lr
-6: mov ip, r3, pull #24
+6: mov ip, r3, lspull #24
ldr r3, [r1], #4
subs r2, r2, #1
- orr ip, ip, r3, push #8
+ orr ip, ip, r3, lspush #8
str ip, [r0]
bne 6b
mov pc, lr
--
2.39.5
* [PATCH v2 03/10] ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
Adoption of Linux commit:
| commit 6ebbf2ce437b33022d30badd49dc94d33ecfa498
| Author: Russell King <rmk+kernel@arm.linux.org.uk>
| Date: Mon Jun 30 16:29:12 2014 +0100
|
| ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+
|
| ARMv6 and greater introduced a new instruction ("bx") which can be used
| to return from function calls. Recent CPUs perform better when the
| "bx lr" instruction is used rather than the "mov pc, lr" instruction,
| and this sequence is strongly recommended to be used by the ARM
| architecture manual (section A.4.1.1).
|
| We provide a new macro "ret" with all its variants for the condition
| code which will resolve to the appropriate instruction.
|
| Rather than doing this piecemeal, and miss some instances, change all
| the "mov pc" instances to use the new macro, with the exception of
| the "movs" instruction and the kprobes code. This allows us to detect
| the "mov pc, lr" case and fix it up - and also gives us the possibility
| of deploying this for other registers depending on the CPU selection.
|
| Reported-by: Will Deacon <will.deacon@arm.com>
| Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1
| Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S
| Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood
| Tested-by: Shawn Guo <shawn.guo@freescale.com>
| Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs
| Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385
| Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci
| Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp
| Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx
| Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen
| Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M
| Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile
| Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/cpu/cache-armv4.S | 11 ++++++-----
arch/arm/cpu/cache-armv5.S | 13 +++++++------
arch/arm/cpu/cache-armv6.S | 13 +++++++------
arch/arm/cpu/cache-armv7.S | 9 +++++----
arch/arm/cpu/hyp.S | 3 ++-
arch/arm/cpu/setupc_32.S | 7 ++++---
arch/arm/cpu/sm_as.S | 3 ++-
arch/arm/include/asm/assembler.h | 22 ++++++++++++++++++++++
arch/arm/lib32/ashldi3.S | 3 ++-
arch/arm/lib32/ashrdi3.S | 3 ++-
arch/arm/lib32/lshrdi3.S | 3 ++-
arch/arm/lib32/runtime-offset.S | 2 +-
12 files changed, 62 insertions(+), 30 deletions(-)
diff --git a/arch/arm/cpu/cache-armv4.S b/arch/arm/cpu/cache-armv4.S
index 78a098b2fe..024a94c583 100644
--- a/arch/arm/cpu/cache-armv4.S
+++ b/arch/arm/cpu/cache-armv4.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <init.h>
+#include <asm/assembler.h>
#define CACHE_DLINESIZE 32
@@ -22,7 +23,7 @@ ENTRY(v4_mmu_cache_on)
mov r0, #0
mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
#endif
- mov pc, r12
+ ret r12
ENDPROC(v4_mmu_cache_on)
__common_mmu_cache_on:
@@ -43,7 +44,7 @@ ENTRY(v4_mmu_cache_off)
mcr p15, 0, r0, c7, c7 @ invalidate whole cache v4
mcr p15, 0, r0, c8, c7 @ invalidate whole TLB v4
#endif
- mov pc, lr
+ ret lr
ENDPROC(v4_mmu_cache_off)
.section .text.v4_mmu_cache_flush
@@ -105,7 +106,7 @@ ENTRY(v4_dma_inv_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
/*
* dma_clean_range(start, end)
@@ -125,7 +126,7 @@ ENTRY(v4_dma_clean_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
/*
* dma_flush_range(start, end)
@@ -143,5 +144,5 @@ ENTRY(v4_dma_flush_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
diff --git a/arch/arm/cpu/cache-armv5.S b/arch/arm/cpu/cache-armv5.S
index bcb7ebf466..6d9cbba015 100644
--- a/arch/arm/cpu/cache-armv5.S
+++ b/arch/arm/cpu/cache-armv5.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <init.h>
+#include <asm/assembler.h>
#define CACHE_DLINESIZE 32
@@ -22,7 +23,7 @@ ENTRY(v5_mmu_cache_on)
mov r0, #0
mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
#endif
- mov pc, r12
+ ret r12
ENDPROC(v5_mmu_cache_on)
__common_mmu_cache_on:
@@ -43,7 +44,7 @@ ENTRY(v5_mmu_cache_off)
mcr p15, 0, r0, c7, c7 @ invalidate whole cache v4
mcr p15, 0, r0, c8, c7 @ invalidate whole TLB v4
#endif
- mov pc, lr
+ ret lr
ENDPROC(v5_mmu_cache_off)
.section .text.v5_mmu_cache_flush
@@ -52,7 +53,7 @@ ENTRY(v5_mmu_cache_flush)
bne 1b
mcr p15, 0, r0, c7, c5, 0 @ flush I cache
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
ENDPROC(v5_mmu_cache_flush)
/*
@@ -80,7 +81,7 @@ ENTRY(v5_dma_inv_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
/*
* dma_clean_range(start, end)
@@ -100,7 +101,7 @@ ENTRY(v5_dma_clean_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
/*
* dma_flush_range(start, end)
@@ -118,5 +119,5 @@ ENTRY(v5_dma_flush_range)
cmp r0, r1
blo 1b
mcr p15, 0, r0, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
diff --git a/arch/arm/cpu/cache-armv6.S b/arch/arm/cpu/cache-armv6.S
index cc720314c0..ab965623a3 100644
--- a/arch/arm/cpu/cache-armv6.S
+++ b/arch/arm/cpu/cache-armv6.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <init.h>
+#include <asm/assembler.h>
#define HARVARD_CACHE
#define CACHE_LINE_SIZE 32
@@ -24,7 +25,7 @@ ENTRY(v6_mmu_cache_on)
mov r0, #0
mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
#endif
- mov pc, r12
+ ret r12
ENDPROC(v6_mmu_cache_on)
__common_mmu_cache_on:
@@ -46,7 +47,7 @@ ENTRY(v6_mmu_cache_off)
mcr p15, 0, r0, c7, c7 @ invalidate whole cache v4
mcr p15, 0, r0, c8, c7 @ invalidate whole TLB v4
#endif
- mov pc, lr
+ ret lr
.section .text.v6_mmu_cache_flush
ENTRY(v6_mmu_cache_flush)
@@ -55,7 +56,7 @@ ENTRY(v6_mmu_cache_flush)
mcr p15, 0, r1, c7, c5, 0 @ invalidate I+BTB
mcr p15, 0, r1, c7, c15, 0 @ clean+invalidate unified
mcr p15, 0, r1, c7, c10, 4 @ drain WB
- mov pc, lr
+ ret lr
ENDPROC(v6_mmu_cache_flush)
/*
@@ -95,7 +96,7 @@ ENTRY(v6_dma_inv_range)
blo 1b
mov r0, #0
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
- mov pc, lr
+ ret lr
ENDPROC(v6_dma_inv_range)
/*
@@ -117,7 +118,7 @@ ENTRY(v6_dma_clean_range)
blo 1b
mov r0, #0
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
- mov pc, lr
+ ret lr
ENDPROC(v6_dma_clean_range)
/*
@@ -139,5 +140,5 @@ ENTRY(v6_dma_flush_range)
blo 1b
mov r0, #0
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
- mov pc, lr
+ ret lr
ENDPROC(v6_dma_flush_range)
diff --git a/arch/arm/cpu/cache-armv7.S b/arch/arm/cpu/cache-armv7.S
index efd9fe412f..3f6e5e6b73 100644
--- a/arch/arm/cpu/cache-armv7.S
+++ b/arch/arm/cpu/cache-armv7.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <init.h>
+#include <asm/assembler.h>
.section .text.v7_mmu_cache_on
ENTRY(v7_mmu_cache_on)
@@ -140,7 +141,7 @@ iflush:
mcr p15, 0, r12, c7, c5, 0 @ invalidate I+BTB
dsb
isb
- mov pc, lr
+ ret lr
ENDPROC(__v7_mmu_cache_flush_invalidate)
/*
@@ -182,7 +183,7 @@ ENTRY(v7_dma_inv_range)
cmp r0, r1
blo 1b
dsb
- mov pc, lr
+ ret lr
ENDPROC(v7_dma_inv_range)
/*
@@ -201,7 +202,7 @@ ENTRY(v7_dma_clean_range)
cmp r0, r1
blo 1b
dsb
- mov pc, lr
+ ret lr
ENDPROC(v7_dma_clean_range)
/*
@@ -220,5 +221,5 @@ ENTRY(v7_dma_flush_range)
cmp r0, r1
blo 1b
dsb
- mov pc, lr
+ ret lr
ENDPROC(v7_dma_flush_range)
diff --git a/arch/arm/cpu/hyp.S b/arch/arm/cpu/hyp.S
index b5e4807877..016bcd79c0 100644
--- a/arch/arm/cpu/hyp.S
+++ b/arch/arm/cpu/hyp.S
@@ -4,6 +4,7 @@
#include <asm/system.h>
#include <asm/opcodes-virt.h>
#include <init.h>
+#include <asm/assembler.h>
.arch_extension sec
.arch_extension virt
@@ -80,7 +81,7 @@ THUMB( orr r12, r12, #PSR_T_BIT )
__ERET
1: msr cpsr_c, r12
2:
- mov pc, r2
+ ret r2
ENDPROC(armv7_hyp_install)
ENTRY(armv7_switch_to_hyp)
diff --git a/arch/arm/cpu/setupc_32.S b/arch/arm/cpu/setupc_32.S
index eafc9b52c6..d3449d9646 100644
--- a/arch/arm/cpu/setupc_32.S
+++ b/arch/arm/cpu/setupc_32.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <asm/sections.h>
+#include <asm/assembler.h>
.section .text.setupc
@@ -32,7 +33,7 @@ ENTRY(setup_c)
bl sync_caches_for_execution
sub lr, r5, r4 /* adjust return address to new location */
pop {r4, r5}
- mov pc, lr
+ ret lr
ENDPROC(setup_c)
/*
@@ -76,13 +77,13 @@ ENTRY(relocate_to_adr)
ldr r0,=1f
sub r0, r0, r8
add r0, r0, r6
- mov pc, r0 /* jump to relocated address */
+ ret r0 /* jump to relocated address */
1:
bl relocate_to_current_adr /* relocate binary */
mov lr, r7
pop {r3, r4, r5, r6, r7, r8}
- mov pc, lr
+ ret lr
ENDPROC(relocate_to_adr)
diff --git a/arch/arm/cpu/sm_as.S b/arch/arm/cpu/sm_as.S
index f55ac8661c..32007147d4 100644
--- a/arch/arm/cpu/sm_as.S
+++ b/arch/arm/cpu/sm_as.S
@@ -5,6 +5,7 @@
#include <asm-generic/memory_layout.h>
#include <asm/secure.h>
#include <asm/system.h>
+#include <asm/assembler.h>
.arch_extension sec
.arch_extension virt
@@ -147,7 +148,7 @@ secure_monitor:
hyp_trap:
mrs lr, elr_hyp @ for older asm: .byte 0x00, 0xe3, 0x0e, 0xe1
- mov pc, lr @ do no switch modes, but
+ ret lr @ do no switch modes, but
@ return to caller
ENTRY(psci_cpu_entry)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 4e7ad57170..e8f5625a0a 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -340,4 +340,26 @@
blx\c \dst
.endif
.endm
+
+ .irp c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+ .macro ret\c, reg
+#if __LINUX_ARM_ARCH__ < 6
+ mov\c pc, \reg
+#else
+ .ifeqs "\reg", "lr"
+ bx\c \reg
+ .else
+ mov\c pc, \reg
+ .endif
+#endif
+ .endm
+ .endr
+
+ .macro ret.w, reg
+ ret \reg
+#ifdef CONFIG_THUMB2_BAREBOX
+ nop
+#endif
+ .endm
+
#endif
diff --git a/arch/arm/lib32/ashldi3.S b/arch/arm/lib32/ashldi3.S
index b62e06f602..dccb732078 100644
--- a/arch/arm/lib32/ashldi3.S
+++ b/arch/arm/lib32/ashldi3.S
@@ -23,6 +23,7 @@ General Public License for more details.
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
#ifdef __ARMEB__
#define al r1
@@ -44,7 +45,7 @@ ENTRY(__aeabi_llsl)
THUMB( lsrmi r3, al, ip )
THUMB( orrmi ah, ah, r3 )
mov al, al, lsl r2
- mov pc, lr
+ ret lr
ENDPROC(__ashldi3)
ENDPROC(__aeabi_llsl)
diff --git a/arch/arm/lib32/ashrdi3.S b/arch/arm/lib32/ashrdi3.S
index db849b65fc..3db06281e5 100644
--- a/arch/arm/lib32/ashrdi3.S
+++ b/arch/arm/lib32/ashrdi3.S
@@ -23,6 +23,7 @@ General Public License for more details.
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
#ifdef __ARMEB__
#define al r1
@@ -44,7 +45,7 @@ ENTRY(__aeabi_lasr)
THUMB( lslmi r3, ah, ip )
THUMB( orrmi al, al, r3 )
mov ah, ah, asr r2
- mov pc, lr
+ ret lr
ENDPROC(__ashrdi3)
ENDPROC(__aeabi_lasr)
diff --git a/arch/arm/lib32/lshrdi3.S b/arch/arm/lib32/lshrdi3.S
index e77e96c7bc..5af522482c 100644
--- a/arch/arm/lib32/lshrdi3.S
+++ b/arch/arm/lib32/lshrdi3.S
@@ -23,6 +23,7 @@ General Public License for more details.
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
#ifdef __ARMEB__
#define al r1
@@ -44,7 +45,7 @@ ENTRY(__aeabi_llsr)
THUMB( lslmi r3, ah, ip )
THUMB( orrmi al, al, r3 )
mov ah, ah, lsr r2
- mov pc, lr
+ ret lr
ENDPROC(__lshrdi3)
ENDPROC(__aeabi_llsr)
diff --git a/arch/arm/lib32/runtime-offset.S b/arch/arm/lib32/runtime-offset.S
index ac104de119..d9ba864b3b 100644
--- a/arch/arm/lib32/runtime-offset.S
+++ b/arch/arm/lib32/runtime-offset.S
@@ -14,7 +14,7 @@ ENTRY(get_runtime_offset)
ldr r1, linkadr
subs r0, r0, r1
THUMB( adds r0, r0, #1)
- mov pc, lr
+ ret lr
linkadr:
.word get_runtime_offset
--
2.39.5
* [PATCH v2 04/10] ARM: update lib1funcs.S from Linux
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX
This updates lib1funcs.S from Linux-6.10.
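lib1funcs.S provides the (A)EABI integer division helpers (__udivsi3,
__aeabi_uidiv, __aeabi_uidivmod, ...). On ARM cores without a hardware
divider the compiler lowers plain C division to calls into these routines,
e.g. (illustrative only):

  /* gcc emits calls into lib1funcs.S for these on cores without udiv/sdiv */
  unsigned int baud_divisor(unsigned int clk_hz, unsigned int baud)
  {
          return clk_hz / (16 * baud);        /* -> __aeabi_uidiv */
  }

  unsigned int block_remainder(unsigned int len, unsigned int blksz)
  {
          return len % blksz;                 /* -> __aeabi_uidivmod */
  }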
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/lib32/lib1funcs.S | 80 +++++++++++++++++++++++++++++-----------------
1 file changed, 51 insertions(+), 29 deletions(-)
diff --git a/arch/arm/lib32/lib1funcs.S b/arch/arm/lib32/lib1funcs.S
index bf1d0192d6..cd8af72737 100644
--- a/arch/arm/lib32/lib1funcs.S
+++ b/arch/arm/lib32/lib1funcs.S
@@ -27,11 +27,17 @@ WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
-*/
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
+.syntax unified
.macro ARM_DIV_BODY dividend, divisor, result, curbit
@@ -92,7 +98,7 @@ General Public License for more details.
subhs \dividend, \dividend, \divisor, lsr #3
orrhs \result, \result, \curbit, lsr #3
cmp \dividend, #0 @ Early termination?
- movnes \curbit, \curbit, lsr #4 @ No, any more bits to do?
+ movsne \curbit, \curbit, lsr #4 @ No, any more bits to do?
movne \divisor, \divisor, lsr #4
bne 1b
@@ -163,7 +169,7 @@ General Public License for more details.
#endif
- @ Perform all needed substractions to keep only the reminder.
+ @ Perform all needed subtractions to keep only the reminder.
@ Do comparisons in batch of 4 first.
subs \order, \order, #3 @ yes, 3 is intended here
blt 2f
@@ -178,14 +184,14 @@ General Public License for more details.
subhs \dividend, \dividend, \divisor, lsr #3
cmp \dividend, #1
mov \divisor, \divisor, lsr #4
- subges \order, \order, #4
+ subsge \order, \order, #4
bge 1b
tst \order, #3
teqne \dividend, #0
beq 5f
- @ Either 1, 2 or 3 comparison/substractions are left.
+ @ Either 1, 2 or 3 comparison/subtractions are left.
2: cmn \order, #2
blt 4f
beq 3f
@@ -201,12 +207,16 @@ General Public License for more details.
.endm
-.section .text.__udivsi3
+#ifdef CONFIG_ARM_PATCH_IDIV
+ .align 3
+#endif
+
ENTRY(__udivsi3)
ENTRY(__aeabi_uidiv)
+UNWIND(.fnstart)
subs r2, r1, #1
- moveq pc, lr
+ reteq lr
bcc Ldiv0
cmp r0, r1
bls 11f
@@ -216,22 +226,23 @@ ENTRY(__aeabi_uidiv)
ARM_DIV_BODY r0, r1, r2, r3
mov r0, r2
- mov pc, lr
+ ret lr
11: moveq r0, #1
movne r0, #0
- mov pc, lr
+ ret lr
12: ARM_DIV2_ORDER r1, r2
mov r0, r0, lsr r2
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__udivsi3)
ENDPROC(__aeabi_uidiv)
-.section .text.__umodsi3
ENTRY(__umodsi3)
+UNWIND(.fnstart)
subs r2, r1, #1 @ compare divisor with 1
bcc Ldiv0
@@ -239,17 +250,22 @@ ENTRY(__umodsi3)
moveq r0, #0
tsthi r1, r2 @ see if divisor is power of 2
andeq r0, r0, r2
- movls pc, lr
+ retls lr
ARM_MOD_BODY r0, r1, r2, r3
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__umodsi3)
-.section .text.__divsi3
+#ifdef CONFIG_ARM_PATCH_IDIV
+ .align 3
+#endif
+
ENTRY(__divsi3)
ENTRY(__aeabi_idiv)
+UNWIND(.fnstart)
cmp r1, #0
eor ip, r0, r1 @ save the sign of the result.
@@ -268,29 +284,30 @@ ENTRY(__aeabi_idiv)
cmp ip, #0
rsbmi r0, r0, #0
- mov pc, lr
+ ret lr
10: teq ip, r0 @ same sign ?
rsbmi r0, r0, #0
- mov pc, lr
+ ret lr
11: movlo r0, #0
moveq r0, ip, asr #31
orreq r0, r0, #1
- mov pc, lr
+ ret lr
12: ARM_DIV2_ORDER r1, r2
cmp ip, #0
mov r0, r3, lsr r2
rsbmi r0, r0, #0
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__divsi3)
ENDPROC(__aeabi_idiv)
-.section .text.__modsi3
ENTRY(__modsi3)
+UNWIND(.fnstart)
cmp r1, #0
beq Ldiv0
@@ -308,44 +325,49 @@ ENTRY(__modsi3)
10: cmp ip, #0
rsbmi r0, r0, #0
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__modsi3)
#ifdef CONFIG_AEABI
-.section .text.__aeabi_uidivmod
ENTRY(__aeabi_uidivmod)
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr} )
stmfd sp!, {r0, r1, ip, lr}
bl __aeabi_uidiv
ldmfd sp!, {r1, r2, ip, lr}
mul r3, r0, r2
sub r1, r1, r3
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__aeabi_uidivmod)
-.section .text.__aeabi_idivmod
ENTRY(__aeabi_idivmod)
-
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr} )
stmfd sp!, {r0, r1, ip, lr}
bl __aeabi_idiv
ldmfd sp!, {r1, r2, ip, lr}
mul r3, r0, r2
sub r1, r1, r3
- mov pc, lr
+ ret lr
+UNWIND(.fnend)
ENDPROC(__aeabi_idivmod)
#endif
-.section .text.Ldiv0
Ldiv0:
-
+UNWIND(.fnstart)
+UNWIND(.pad #4)
+UNWIND(.save {lr})
str lr, [sp, #-8]!
bl __div0
mov r0, #0 @ About as wrong as it could be.
ldr pc, [sp], #8
-
-
+UNWIND(.fnend)
+ENDPROC(Ldiv0)
--
2.39.5
* [PATCH v2 05/10] ARM: update findbit.S from Linux
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX
This updates findbit.S from Linux-6.10.
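findbit.S implements the _find_{first,next}_{zero_,}bit_{le,be} routines
behind the generic bitmap helpers. A typical caller looks roughly like this
(illustrative sketch; bitmap name and size are made up, header name
indicative):

  #include <linux/bitops.h>

  #define NR_IDS 128
  static DECLARE_BITMAP(id_map, NR_IDS);

  /* allocate the lowest free id, or -1 if the map is full */
  static int alloc_id(void)
  {
          int id = find_first_zero_bit(id_map, NR_IDS);

          if (id >= NR_IDS)
                  return -1;

          set_bit(id, id_map);
          return id;
  }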
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/lib32/findbit.S | 243 ++++++++++++++++++-----------------------------
1 file changed, 94 insertions(+), 149 deletions(-)
diff --git a/arch/arm/lib32/findbit.S b/arch/arm/lib32/findbit.S
index 82a0f34dc2..b7ac2d3c07 100644
--- a/arch/arm/lib32/findbit.S
+++ b/arch/arm/lib32/findbit.S
@@ -1,9 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * Originally from Linux kernel
- * arch/arm/lib/findbit.S
+ * linux/arch/arm/lib/findbit.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*
* 16th March 2001 - John Ripley <jripley@sonicblue.com>
* Fixed so that "size" is an exclusive not an inclusive quantity.
@@ -13,182 +12,128 @@
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
.text
-/*
- * Purpose : Find a 'zero' bit
- * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
- */
-ENTRY(_find_first_zero_bit_le)
- teq r1, #0
- beq 3f
- mov r2, #0
-1:
- ARM( ldrb r3, [r0, r2, lsr #3] )
- THUMB( lsr r3, r2, #3 )
- THUMB( ldrb r3, [r0, r3] )
- eors r3, r3, #0xff @ invert bits
- bne .L_found @ any now set - found zero bit
- add r2, r2, #8 @ next bit pointer
-2: cmp r2, r1 @ any more?
- blo 1b
-3: mov r0, r1 @ no free bits
- mov pc, lr
-ENDPROC(_find_first_zero_bit_le)
-
-/*
- * Purpose : Find next 'zero' bit
- * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(_find_next_zero_bit_le)
- teq r1, #0
- beq 3b
- ands ip, r2, #7
- beq 1b @ If new byte, goto old routine
- ARM( ldrb r3, [r0, r2, lsr #3] )
- THUMB( lsr r3, r2, #3 )
- THUMB( ldrb r3, [r0, r3] )
- eor r3, r3, #0xff @ now looking for a 1 bit
- movs r3, r3, lsr ip @ shift off unused bits
- bne .L_found
- orr r2, r2, #7 @ if zero, then no bits here
- add r2, r2, #1 @ align bit pointer
- b 2b @ loop for next bit
-ENDPROC(_find_next_zero_bit_le)
+#ifdef __ARMEB__
+#define SWAB_ENDIAN le
+#else
+#define SWAB_ENDIAN be
+#endif
-/*
- * Purpose : Find a 'one' bit
- * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
- */
-ENTRY(_find_first_bit_le)
+ .macro find_first, endian, set, name
+ENTRY(_find_first_\name\()bit_\endian)
+ UNWIND( .fnstart)
teq r1, #0
beq 3f
mov r2, #0
-1:
- ARM( ldrb r3, [r0, r2, lsr #3] )
- THUMB( lsr r3, r2, #3 )
- THUMB( ldrb r3, [r0, r3] )
- movs r3, r3
- bne .L_found @ any now set - found zero bit
- add r2, r2, #8 @ next bit pointer
+1: ldr r3, [r0], #4
+ .ifeq \set
+ mvns r3, r3 @ invert/test bits
+ .else
+ movs r3, r3 @ test bits
+ .endif
+ .ifc \endian, SWAB_ENDIAN
+ bne .L_found_swab
+ .else
+ bne .L_found @ found the bit?
+ .endif
+ add r2, r2, #32 @ next index
2: cmp r2, r1 @ any more?
blo 1b
-3: mov r0, r1 @ no free bits
- mov pc, lr
-ENDPROC(_find_first_bit_le)
+3: mov r0, r1 @ no more bits
+ ret lr
+ UNWIND( .fnend)
+ENDPROC(_find_first_\name\()bit_\endian)
+ .endm
-/*
- * Purpose : Find next 'one' bit
- * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(_find_next_bit_le)
- teq r1, #0
- beq 3b
- ands ip, r2, #7
- beq 1b @ If new byte, goto old routine
- ARM( ldrb r3, [r0, r2, lsr #3] )
- THUMB( lsr r3, r2, #3 )
- THUMB( ldrb r3, [r0, r3] )
+ .macro find_next, endian, set, name
+ENTRY(_find_next_\name\()bit_\endian)
+ UNWIND( .fnstart)
+ cmp r2, r1
+ bhs 3b
+ mov ip, r2, lsr #5 @ word index
+ add r0, r0, ip, lsl #2
+ ands ip, r2, #31 @ bit position
+ beq 1b
+ ldr r3, [r0], #4
+ .ifeq \set
+ mvn r3, r3 @ invert bits
+ .endif
+ .ifc \endian, SWAB_ENDIAN
+ rev_l r3, ip
+ .if .Lrev_l_uses_tmp
+ @ we need to recompute ip because rev_l will have overwritten
+ @ it.
+ and ip, r2, #31 @ bit position
+ .endif
+ .endif
movs r3, r3, lsr ip @ shift off unused bits
bne .L_found
- orr r2, r2, #7 @ if zero, then no bits here
+ orr r2, r2, #31 @ no zero bits
add r2, r2, #1 @ align bit pointer
b 2b @ loop for next bit
-ENDPROC(_find_next_bit_le)
+ UNWIND( .fnend)
+ENDPROC(_find_next_\name\()bit_\endian)
+ .endm
-#ifdef __ARMEB__
+ .macro find_bit, endian, set, name
+ find_first \endian, \set, \name
+ find_next \endian, \set, \name
+ .endm
-ENTRY(_find_first_zero_bit_be)
- teq r1, #0
- beq 3f
- mov r2, #0
-1: eor r3, r2, #0x18 @ big endian byte ordering
- ARM( ldrb r3, [r0, r3, lsr #3] )
- THUMB( lsr r3, #3 )
- THUMB( ldrb r3, [r0, r3] )
- eors r3, r3, #0xff @ invert bits
- bne .L_found @ any now set - found zero bit
- add r2, r2, #8 @ next bit pointer
-2: cmp r2, r1 @ any more?
- blo 1b
-3: mov r0, r1 @ no free bits
- mov pc, lr
-ENDPROC(_find_first_zero_bit_be)
+/* _find_first_zero_bit_le and _find_next_zero_bit_le */
+ find_bit le, 0, zero_
-ENTRY(_find_next_zero_bit_be)
- teq r1, #0
- beq 3b
- ands ip, r2, #7
- beq 1b @ If new byte, goto old routine
- eor r3, r2, #0x18 @ big endian byte ordering
- ARM( ldrb r3, [r0, r3, lsr #3] )
- THUMB( lsr r3, #3 )
- THUMB( ldrb r3, [r0, r3] )
- eor r3, r3, #0xff @ now looking for a 1 bit
- movs r3, r3, lsr ip @ shift off unused bits
- bne .L_found
- orr r2, r2, #7 @ if zero, then no bits here
- add r2, r2, #1 @ align bit pointer
- b 2b @ loop for next bit
-ENDPROC(_find_next_zero_bit_be)
+/* _find_first_bit_le and _find_next_bit_le */
+ find_bit le, 1
-ENTRY(_find_first_bit_be)
- teq r1, #0
- beq 3f
- mov r2, #0
-1: eor r3, r2, #0x18 @ big endian byte ordering
- ARM( ldrb r3, [r0, r3, lsr #3] )
- THUMB( lsr r3, #3 )
- THUMB( ldrb r3, [r0, r3] )
- movs r3, r3
- bne .L_found @ any now set - found zero bit
- add r2, r2, #8 @ next bit pointer
-2: cmp r2, r1 @ any more?
- blo 1b
-3: mov r0, r1 @ no free bits
- mov pc, lr
-ENDPROC(_find_first_bit_be)
+#ifdef __ARMEB__
-ENTRY(_find_next_bit_be)
- teq r1, #0
- beq 3b
- ands ip, r2, #7
- beq 1b @ If new byte, goto old routine
- eor r3, r2, #0x18 @ big endian byte ordering
- ARM( ldrb r3, [r0, r3, lsr #3] )
- THUMB( lsr r3, #3 )
- THUMB( ldrb r3, [r0, r3] )
- movs r3, r3, lsr ip @ shift off unused bits
- bne .L_found
- orr r2, r2, #7 @ if zero, then no bits here
- add r2, r2, #1 @ align bit pointer
- b 2b @ loop for next bit
-ENDPROC(_find_next_bit_be)
+/* _find_first_zero_bit_be and _find_next_zero_bit_be */
+ find_bit be, 0, zero_
+
+/* _find_first_bit_be and _find_next_bit_be */
+ find_bit be, 1
#endif
/*
* One or more bits in the LSB of r3 are assumed to be set.
*/
+.L_found_swab:
+ UNWIND( .fnstart)
+ rev_l r3, ip
.L_found:
-#if __LINUX_ARM_ARCH__ >= 5
+#if __LINUX_ARM_ARCH__ >= 7
+ rbit r3, r3 @ reverse bits
+ clz r3, r3 @ count high zero bits
+ add r0, r2, r3 @ add offset of first set bit
+#elif __LINUX_ARM_ARCH__ >= 5
rsb r0, r3, #0
- and r3, r3, r0
- clz r3, r3
- rsb r3, r3, #31
- add r0, r2, r3
+ and r3, r3, r0 @ mask out lowest bit set
+ clz r3, r3 @ count high zero bits
+ rsb r3, r3, #31 @ offset of first set bit
+ add r0, r2, r3 @ add offset of first set bit
#else
- tst r3, #0x0f
+ mov ip, #~0
+ tst r3, ip, lsr #16 @ test bits 0-15
+ addeq r2, r2, #16
+ moveq r3, r3, lsr #16
+ tst r3, #0x00ff
+ addeq r2, r2, #8
+ moveq r3, r3, lsr #8
+ tst r3, #0x000f
addeq r2, r2, #4
- movne r3, r3, lsl #4
- tst r3, #0x30
+ moveq r3, r3, lsr #4
+ tst r3, #0x0003
addeq r2, r2, #2
- movne r3, r3, lsl #2
- tst r3, #0x40
+ moveq r3, r3, lsr #2
+ tst r3, #0x0001
addeq r2, r2, #1
mov r0, r2
#endif
cmp r1, r0 @ Clamp to maxbit
movlo r0, r1
- mov pc, lr
-
+ ret lr
+ UNWIND( .fnend)
--
2.39.5
* [PATCH v2 06/10] ARM: update io-* from Linux
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
This updates io-readsb.S, io-readsl.S, io-readsw-armv4.S, io-writesb.S,
io-writesl.S and io-writesw-armv4.S from Linux-6.10.
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/lib32/io-readsb.S | 28 +++++++++++++++-------------
arch/arm/lib32/io-readsl.S | 16 +++++++++-------
arch/arm/lib32/io-readsw-armv4.S | 22 ++++++++++++----------
arch/arm/lib32/io-writesb.S | 30 ++++++++++++++++--------------
arch/arm/lib32/io-writesl.S | 20 +++++++++++---------
arch/arm/lib32/io-writesw-armv4.S | 18 ++++++++++--------
6 files changed, 73 insertions(+), 61 deletions(-)
diff --git a/arch/arm/lib32/io-readsb.S b/arch/arm/lib32/io-readsb.S
index 41f68092c5..2777a49b22 100644
--- a/arch/arm/lib32/io-readsb.S
+++ b/arch/arm/lib32/io-readsb.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
* linux/arch/arm/lib/io-readsb.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.section .text.__raw_readsb
.Linsb_align: rsb ip, ip, #4
@@ -15,16 +17,16 @@
cmp ip, #2
ldrb r3, [r0]
strb r3, [r1], #1
- ldrgeb r3, [r0]
- strgeb r3, [r1], #1
- ldrgtb r3, [r0]
- strgtb r3, [r1], #1
+ ldrbge r3, [r0]
+ strbge r3, [r1], #1
+ ldrbgt r3, [r0]
+ strbgt r3, [r1], #1
subs r2, r2, ip
bne .Linsb_aligned
ENTRY(__raw_readsb)
teq r2, #0 @ do we have to check for the zero len?
- moveq pc, lr
+ reteq lr
ands ip, r1, #3
bne .Linsb_align
@@ -71,7 +73,7 @@ ENTRY(__raw_readsb)
bpl .Linsb_16_lp
tst r2, #15
- ldmeqfd sp!, {r4 - r6, pc}
+ ldmfdeq sp!, {r4 - r6, pc}
.Linsb_no_16: tst r2, #8
beq .Linsb_no_8
@@ -108,15 +110,15 @@ ENTRY(__raw_readsb)
str r3, [r1], #4
.Linsb_no_4: ands r2, r2, #3
- ldmeqfd sp!, {r4 - r6, pc}
+ ldmfdeq sp!, {r4 - r6, pc}
cmp r2, #2
ldrb r3, [r0]
strb r3, [r1], #1
- ldrgeb r3, [r0]
- strgeb r3, [r1], #1
- ldrgtb r3, [r0]
- strgtb r3, [r1]
+ ldrbge r3, [r0]
+ strbge r3, [r1], #1
+ ldrbgt r3, [r0]
+ strbgt r3, [r1]
ldmfd sp!, {r4 - r6, pc}
ENDPROC(__raw_readsb)
diff --git a/arch/arm/lib32/io-readsl.S b/arch/arm/lib32/io-readsl.S
index 7bcd0d45bc..aecac1f9eb 100644
--- a/arch/arm/lib32/io-readsl.S
+++ b/arch/arm/lib32/io-readsl.S
@@ -1,17 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/io-readsl.S
+ * linux/arch/arm/lib/io-readsl.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.section .text.__raw_readsl
ENTRY(__raw_readsl)
teq r2, #0 @ do we have to check for the zero len?
- moveq pc, lr
+ reteq lr
ands ip, r1, #3
bne 3f
@@ -29,10 +31,10 @@ ENTRY(__raw_readsl)
2: movs r2, r2, lsl #31
ldrcs r3, [r0, #0]
ldrcs ip, [r0, #0]
- stmcsia r1!, {r3, ip}
+ stmiacs r1!, {r3, ip}
ldrne r3, [r0, #0]
strne r3, [r1, #0]
- mov pc, lr
+ ret lr
3: ldr r3, [r0]
cmp ip, #2
@@ -74,5 +76,5 @@ ENTRY(__raw_readsl)
strb r3, [r1, #1]
8: mov r3, ip, get_byte_0
strb r3, [r1, #0]
- mov pc, lr
+ ret lr
ENDPROC(__raw_readsl)
diff --git a/arch/arm/lib32/io-readsw-armv4.S b/arch/arm/lib32/io-readsw-armv4.S
index 9fb7fd7576..f5c633027c 100644
--- a/arch/arm/lib32/io-readsw-armv4.S
+++ b/arch/arm/lib32/io-readsw-armv4.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/io-readsw-armv4.S
+ * linux/arch/arm/lib/io-readsw-armv4.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.macro pack, rd, hw1, hw2
#ifndef __ARMEB__
orr \rd, \hw1, \hw2, lsl #16
@@ -25,7 +27,7 @@
ENTRY(__raw_readsw)
teq r2, #0
- moveq pc, lr
+ reteq lr
tst r1, #3
bne .Linsw_align
@@ -75,8 +77,8 @@ ENTRY(__raw_readsw)
pack r3, r3, ip
str r3, [r1], #4
-.Lno_insw_2: ldrneh r3, [r0]
- strneh r3, [r1]
+.Lno_insw_2: ldrhne r3, [r0]
+ strhne r3, [r1]
ldmfd sp!, {r4, r5, pc}
@@ -93,7 +95,7 @@ ENTRY(__raw_readsw)
#endif
.Linsw_noalign: stmfd sp!, {r4, lr}
- ldrccb ip, [r1, #-1]!
+ ldrbcc ip, [r1, #-1]!
bcc 1f
ldrh ip, [r0]
@@ -120,11 +122,11 @@ ENTRY(__raw_readsw)
3: tst r2, #1
strb ip, [r1], #1
- ldrneh ip, [r0]
+ ldrhne ip, [r0]
_BE_ONLY_( movne ip, ip, ror #8 )
- strneb ip, [r1], #1
+ strbne ip, [r1], #1
_LE_ONLY_( movne ip, ip, lsr #8 )
_BE_ONLY_( movne ip, ip, lsr #24 )
- strneb ip, [r1]
+ strbne ip, [r1]
ldmfd sp!, {r4, pc}
ENDPROC(__raw_readsw)
diff --git a/arch/arm/lib32/io-writesb.S b/arch/arm/lib32/io-writesb.S
index b6ce85f0d4..0bfb1f914e 100644
--- a/arch/arm/lib32/io-writesb.S
+++ b/arch/arm/lib32/io-writesb.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/io-writesb.S
+ * linux/arch/arm/lib/io-writesb.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.macro outword, rd
#ifndef __ARMEB__
strb \rd, [r0]
@@ -35,16 +37,16 @@
cmp ip, #2
ldrb r3, [r1], #1
strb r3, [r0]
- ldrgeb r3, [r1], #1
- strgeb r3, [r0]
- ldrgtb r3, [r1], #1
- strgtb r3, [r0]
+ ldrbge r3, [r1], #1
+ strbge r3, [r0]
+ ldrbgt r3, [r1], #1
+ strbgt r3, [r0]
subs r2, r2, ip
bne .Loutsb_aligned
ENTRY(__raw_writesb)
teq r2, #0 @ do we have to check for the zero len?
- moveq pc, lr
+ reteq lr
ands ip, r1, #3
bne .Loutsb_align
@@ -63,7 +65,7 @@ ENTRY(__raw_writesb)
bpl .Loutsb_16_lp
tst r2, #15
- ldmeqfd sp!, {r4, r5, pc}
+ ldmfdeq sp!, {r4, r5, pc}
.Loutsb_no_16: tst r2, #8
beq .Loutsb_no_8
@@ -79,15 +81,15 @@ ENTRY(__raw_writesb)
outword r3
.Loutsb_no_4: ands r2, r2, #3
- ldmeqfd sp!, {r4, r5, pc}
+ ldmfdeq sp!, {r4, r5, pc}
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0]
- ldrgeb r3, [r1], #1
- strgeb r3, [r0]
- ldrgtb r3, [r1]
- strgtb r3, [r0]
+ ldrbge r3, [r1], #1
+ strbge r3, [r0]
+ ldrbgt r3, [r1]
+ strbgt r3, [r0]
ldmfd sp!, {r4, r5, pc}
ENDPROC(__raw_writesb)
diff --git a/arch/arm/lib32/io-writesl.S b/arch/arm/lib32/io-writesl.S
index 61164234de..c300a62daf 100644
--- a/arch/arm/lib32/io-writesl.S
+++ b/arch/arm/lib32/io-writesl.S
@@ -1,17 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/io-writesl.S
+ * linux/arch/arm/lib/io-writesl.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.section .text.__raw_writesl
ENTRY(__raw_writesl)
teq r2, #0 @ do we have to check for the zero len?
- moveq pc, lr
+ reteq lr
ands ip, r1, #3
bne 3f
@@ -27,12 +29,12 @@ ENTRY(__raw_writesl)
bpl 1b
ldmfd sp!, {r4, lr}
2: movs r2, r2, lsl #31
- ldmcsia r1!, {r3, ip}
+ ldmiacs r1!, {r3, ip}
strcs r3, [r0, #0]
ldrne r3, [r1, #0]
strcs ip, [r0, #0]
strne r3, [r0, #0]
- mov pc, lr
+ ret lr
3: bic r1, r1, #3
ldr r3, [r1], #4
@@ -46,7 +48,7 @@ ENTRY(__raw_writesl)
orr ip, ip, r3, lspush #16
str ip, [r0]
bne 4b
- mov pc, lr
+ ret lr
5: mov ip, r3, lspull #8
ldr r3, [r1], #4
@@ -54,7 +56,7 @@ ENTRY(__raw_writesl)
orr ip, ip, r3, lspush #24
str ip, [r0]
bne 5b
- mov pc, lr
+ ret lr
6: mov ip, r3, lspull #24
ldr r3, [r1], #4
@@ -62,5 +64,5 @@ ENTRY(__raw_writesl)
orr ip, ip, r3, lspush #8
str ip, [r0]
bne 6b
- mov pc, lr
+ ret lr
ENDPROC(__raw_writesl)
diff --git a/arch/arm/lib32/io-writesw-armv4.S b/arch/arm/lib32/io-writesw-armv4.S
index 5cfa74356c..717237f3cc 100644
--- a/arch/arm/lib32/io-writesw-armv4.S
+++ b/arch/arm/lib32/io-writesw-armv4.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/io-writesw-armv4.S
+ * linux/arch/arm/lib/io-writesw-armv4.S
+ *
+ * Copyright (C) 1995-2000 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+.syntax unified
+
.macro outword, rd
#ifndef __ARMEB__
strh \rd, [r0]
@@ -30,7 +32,7 @@
ENTRY(__raw_writesw)
teq r2, #0
- moveq pc, lr
+ reteq lr
ands r3, r1, #3
bne .Loutsw_align
@@ -60,8 +62,8 @@ ENTRY(__raw_writesw)
ldr r3, [r1], #4
outword r3
-.Lno_outsw_2: ldrneh r3, [r1]
- strneh r3, [r0]
+.Lno_outsw_2: ldrhne r3, [r1]
+ strhne r3, [r0]
ldmfd sp!, {r4, r5, pc}
@@ -94,6 +96,6 @@ ENTRY(__raw_writesw)
tst r2, #1
3: movne ip, r3, lsr #8
- strneh ip, [r0]
- mov pc, lr
+ strhne ip, [r0]
+ ret lr
ENDPROC(__raw_writesw)
--
2.39.5
* [PATCH v2 07/10] ARM: always assume the unified syntax for assembly code
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
Adoption of Linux commit:
| commit 75fea300d73ae5b18957949a53ec770daaeb6fc2
| Author: Nicolas Pitre <nico@fluxnic.net>
| Date: Wed Nov 29 07:52:52 2017 +0100
|
| ARM: 8723/2: always assume the "unified" syntax for assembly code
|
| The GNU assembler has implemented the "unified syntax" parsing since
| 2005. This "unified" syntax is required when the kernel is built in
| Thumb2 mode. However the "unified" syntax is a mixed bag of features,
| including not requiring a `#' prefix with immediate operands. This leads
| to situations where some code builds just fine in Thumb2 mode and fails
| to build in ARM mode if that prefix is missing. This behavior
| discrepancy makes build tests less valuable, forcing both ARM and Thumb2
| builds for proper coverage.
|
| Let's "fix" this issue by always using the "unified" syntax for both ARM
| and Thumb2 mode. Given that the documented minimum binutils version that
| properly builds the kernel is version 2.20 released in 2010, we can
| assume that any toolchain capable of building the latest kernel is also
| "unified syntax" capable.
|
| With this, a bunch of macros used to mask some differences between both
| syntaxes can be removed, with the side effect of making LTO easier.
|
| Suggested-by: Robin Murphy <robin.murphy@arm.com>
| Signed-off-by: Nicolas Pitre <nico@linaro.org>
| Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/Kconfig | 4 ---
arch/arm/include/asm/unified.h | 75 +--------------------------------------
arch/arm/lib32/io-readsb.S | 2 --
arch/arm/lib32/io-readsl.S | 2 --
arch/arm/lib32/io-readsw-armv4.S | 2 --
arch/arm/lib32/io-writesb.S | 2 --
arch/arm/lib32/io-writesl.S | 2 --
arch/arm/lib32/io-writesw-armv4.S | 2 --
arch/arm/lib32/lib1funcs.S | 2 --
9 files changed, 1 insertion(+), 92 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9b90c8009a..0251f2dcef 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -286,9 +286,6 @@ config BOARD_ARM_GENERIC_DT_AARCH64
depends on BOARD_ARM_GENERIC_DT
default y
-config ARM_ASM_UNIFIED
- bool
-
config AEABI
bool "Use the ARM EABI to compile barebox"
depends on !CPU_V8
@@ -299,7 +296,6 @@ config AEABI
To use this you need GCC version 4.0.0 or later.
config THUMB2_BAREBOX
- select ARM_ASM_UNIFIED
select AEABI
depends on !ARCH_TEGRA && !ARCH_AT91
depends on CPU_V7 && !CPU_32v4T && !CPU_32v5 && !CPU_32v6
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index 5501d7f703..68b1deecfb 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -6,7 +6,7 @@
#ifndef __ASM_UNIFIED_H
#define __ASM_UNIFIED_H
-#if defined(__ASSEMBLY__) && defined(CONFIG_ARM_ASM_UNIFIED)
+#if defined(__ASSEMBLY__) && defined(CONFIG_CPU_32)
.syntax unified
#endif
@@ -40,77 +40,4 @@
#endif /* CONFIG_THUMB2_BAREBOX */
-#ifndef CONFIG_ARM_ASM_UNIFIED
-
-/*
- * If the unified assembly syntax isn't used (in ARM mode), these
- * macros expand to an empty string
- */
-#ifdef __ASSEMBLY__
- .macro it, cond
- .endm
- .macro itt, cond
- .endm
- .macro ite, cond
- .endm
- .macro ittt, cond
- .endm
- .macro itte, cond
- .endm
- .macro itet, cond
- .endm
- .macro itee, cond
- .endm
- .macro itttt, cond
- .endm
- .macro ittte, cond
- .endm
- .macro ittet, cond
- .endm
- .macro ittee, cond
- .endm
- .macro itett, cond
- .endm
- .macro itete, cond
- .endm
- .macro iteet, cond
- .endm
- .macro iteee, cond
- .endm
-#else /* !__ASSEMBLY__ */
-__asm__(
-" .macro it, cond\n"
-" .endm\n"
-" .macro itt, cond\n"
-" .endm\n"
-" .macro ite, cond\n"
-" .endm\n"
-" .macro ittt, cond\n"
-" .endm\n"
-" .macro itte, cond\n"
-" .endm\n"
-" .macro itet, cond\n"
-" .endm\n"
-" .macro itee, cond\n"
-" .endm\n"
-" .macro itttt, cond\n"
-" .endm\n"
-" .macro ittte, cond\n"
-" .endm\n"
-" .macro ittet, cond\n"
-" .endm\n"
-" .macro ittee, cond\n"
-" .endm\n"
-" .macro itett, cond\n"
-" .endm\n"
-" .macro itete, cond\n"
-" .endm\n"
-" .macro iteet, cond\n"
-" .endm\n"
-" .macro iteee, cond\n"
-" .endm\n");
-#endif /* __ASSEMBLY__ */
-
-#endif /* CONFIG_ARM_ASM_UNIFIED */
-
#endif /* !__ASM_UNIFIED_H */
diff --git a/arch/arm/lib32/io-readsb.S b/arch/arm/lib32/io-readsb.S
index 2777a49b22..66a89074bf 100644
--- a/arch/arm/lib32/io-readsb.S
+++ b/arch/arm/lib32/io-readsb.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.section .text.__raw_readsb
.Linsb_align: rsb ip, ip, #4
diff --git a/arch/arm/lib32/io-readsl.S b/arch/arm/lib32/io-readsl.S
index aecac1f9eb..6c01cb5dfa 100644
--- a/arch/arm/lib32/io-readsl.S
+++ b/arch/arm/lib32/io-readsl.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.section .text.__raw_readsl
ENTRY(__raw_readsl)
diff --git a/arch/arm/lib32/io-readsw-armv4.S b/arch/arm/lib32/io-readsw-armv4.S
index f5c633027c..b82ec390e9 100644
--- a/arch/arm/lib32/io-readsw-armv4.S
+++ b/arch/arm/lib32/io-readsw-armv4.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.macro pack, rd, hw1, hw2
#ifndef __ARMEB__
orr \rd, \hw1, \hw2, lsl #16
diff --git a/arch/arm/lib32/io-writesb.S b/arch/arm/lib32/io-writesb.S
index 0bfb1f914e..e90fa9e340 100644
--- a/arch/arm/lib32/io-writesb.S
+++ b/arch/arm/lib32/io-writesb.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.macro outword, rd
#ifndef __ARMEB__
strb \rd, [r0]
diff --git a/arch/arm/lib32/io-writesl.S b/arch/arm/lib32/io-writesl.S
index c300a62daf..8f7128589d 100644
--- a/arch/arm/lib32/io-writesl.S
+++ b/arch/arm/lib32/io-writesl.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.section .text.__raw_writesl
ENTRY(__raw_writesl)
diff --git a/arch/arm/lib32/io-writesw-armv4.S b/arch/arm/lib32/io-writesw-armv4.S
index 717237f3cc..9c478f5696 100644
--- a/arch/arm/lib32/io-writesw-armv4.S
+++ b/arch/arm/lib32/io-writesw-armv4.S
@@ -7,8 +7,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-.syntax unified
-
.macro outword, rd
#ifndef __ARMEB__
strh \rd, [r0]
diff --git a/arch/arm/lib32/lib1funcs.S b/arch/arm/lib32/lib1funcs.S
index cd8af72737..7e402df1cd 100644
--- a/arch/arm/lib32/lib1funcs.S
+++ b/arch/arm/lib32/lib1funcs.S
@@ -37,8 +37,6 @@ Boston, MA 02111-1307, USA. */
#include <asm/assembler.h>
#include <asm/unwind.h>
-.syntax unified
-
.macro ARM_DIV_BODY dividend, divisor, result, curbit
#if __LINUX_ARM_ARCH__ >= 5
--
2.39.5
* [PATCH v2 08/10] ARM: update memcpy.S and memset.S from Linux
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
This updates the assembler optimized memcpy() and memset() functions
from Linux-6.10.
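One functional detail worth noting: the updated memset() masks the fill value
to 8 bits up front (the new "and r1, r1, #255" instruction), matching the C
rule that the int argument is converted to unsigned char. Illustrative
example (not part of the patch):

  #include <string.h>

  static void fill_example(void)
  {
          char buf[16];

          /* the value argument is converted to unsigned char, so both
           * calls store the byte 0xab throughout buf */
          memset(buf, 0xab, sizeof(buf));
          memset(buf, 0x1ab, sizeof(buf));
  }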
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
arch/arm/include/asm/assembler.h | 6 +++
arch/arm/lib32/copy_template.S | 58 +++++++++++++-----------
arch/arm/lib32/memcpy.S | 30 +++++++------
arch/arm/lib32/memset.S | 96 +++++++++++++++++++++++++---------------
4 files changed, 117 insertions(+), 73 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index e8f5625a0a..c84c8ec734 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -67,6 +67,12 @@
#define CALGN(code...)
#endif
+#ifndef CONFIG_CPU_64
+/* the frame pointer used for stack unwinding */
+ARM( fpreg .req r11 )
+THUMB( fpreg .req r7 )
+#endif
+
/*
* Enable and disable interrupts
*/
diff --git a/arch/arm/lib32/copy_template.S b/arch/arm/lib32/copy_template.S
index 897e3db3ff..777e185701 100644
--- a/arch/arm/lib32/copy_template.S
+++ b/arch/arm/lib32/copy_template.S
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre)
+/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre) */
/*
* linux/arch/arm/lib/copy_template.s
@@ -48,6 +48,12 @@
* data as needed by the implementation including this code. Called
* upon code entry.
*
+ * usave reg1 reg2
+ *
+ * Unwind annotation macro is corresponding for 'enter' macro.
+ * It tell unwinder that preserved some provided registers on the stack
+ * and additional data by a prior 'enter' macro.
+ *
* exit reg1 reg2
*
* Restore registers with the values previously saved with the
@@ -61,8 +67,10 @@
* than one 32bit instruction in Thumb-2)
*/
-
- enter r4, lr
+ UNWIND( .fnstart )
+ enter r4, UNWIND(fpreg,) lr
+ UNWIND( .setfp fpreg, sp )
+ UNWIND( mov fpreg, sp )
subs r2, r2, #4
blt 8f
@@ -73,12 +81,12 @@
bne 10f
1: subs r2, r2, #(28)
- stmfd sp!, {r5 - r8}
+ stmfd sp!, {r5, r6, r8, r9}
blt 5f
CALGN( ands ip, r0, #31 )
CALGN( rsb r3, ip, #32 )
- CALGN( sbcnes r4, r3, r2 ) @ C is always set here
+ CALGN( sbcsne r4, r3, r2 ) @ C is always set here
CALGN( bcs 2f )
CALGN( adr r4, 6f )
CALGN( subs r2, r2, r3 ) @ C gets set
@@ -92,9 +100,9 @@
PLD( pld [r1, #92] )
3: PLD( pld [r1, #124] )
-4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4: ldr8w r1, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f
subs r2, r2, #32
- str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ str8w r0, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f
bge 3b
PLD( cmn r2, #96 )
PLD( bge 4b )
@@ -114,8 +122,8 @@
ldr1w r1, r4, abort=20f
ldr1w r1, r5, abort=20f
ldr1w r1, r6, abort=20f
- ldr1w r1, r7, abort=20f
ldr1w r1, r8, abort=20f
+ ldr1w r1, r9, abort=20f
ldr1w r1, lr, abort=20f
#if LDR1W_SHIFT < STR1W_SHIFT
@@ -132,13 +140,13 @@
str1w r0, r4, abort=20f
str1w r0, r5, abort=20f
str1w r0, r6, abort=20f
- str1w r0, r7, abort=20f
str1w r0, r8, abort=20f
+ str1w r0, r9, abort=20f
str1w r0, lr, abort=20f
CALGN( bcs 2b )
-7: ldmfd sp!, {r5 - r8}
+7: ldmfd sp!, {r5, r6, r8, r9}
8: movs r2, r2, lsl #31
ldr1b r1, r3, ne, abort=21f
@@ -148,7 +156,7 @@
str1b r0, r4, cs, abort=21f
str1b r0, ip, cs, abort=21f
- exit r4, pc
+ exit r4, UNWIND(fpreg,) pc
9: rsb ip, ip, #4
cmp ip, #2
@@ -177,11 +185,11 @@
CALGN( ands ip, r0, #31 )
CALGN( rsb ip, ip, #32 )
- CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( sbcsne r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
-11: stmfd sp!, {r5 - r9}
+11: stmfd sp!, {r5, r6, r8 - r10}
PLD( pld [r1, #0] )
PLD( subs r2, r2, #96 )
@@ -191,31 +199,31 @@
PLD( pld [r1, #92] )
12: PLD( pld [r1, #124] )
-13: ldr4w r1, r4, r5, r6, r7, abort=19f
+13: ldr4w r1, r4, r5, r6, r8, abort=19f
mov r3, lr, lspull #\pull
subs r2, r2, #32
- ldr4w r1, r8, r9, ip, lr, abort=19f
+ ldr4w r1, r9, r10, ip, lr, abort=19f
orr r3, r3, r4, lspush #\push
mov r4, r4, lspull #\pull
orr r4, r4, r5, lspush #\push
mov r5, r5, lspull #\pull
orr r5, r5, r6, lspush #\push
mov r6, r6, lspull #\pull
- orr r6, r6, r7, lspush #\push
- mov r7, r7, lspull #\pull
- orr r7, r7, r8, lspush #\push
+ orr r6, r6, r8, lspush #\push
mov r8, r8, lspull #\pull
orr r8, r8, r9, lspush #\push
mov r9, r9, lspull #\pull
- orr r9, r9, ip, lspush #\push
+ orr r9, r9, r10, lspush #\push
+ mov r10, r10, lspull #\pull
+ orr r10, r10, ip, lspush #\push
mov ip, ip, lspull #\pull
orr ip, ip, lr, lspush #\push
- str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+ str8w r0, r3, r4, r5, r6, r8, r9, r10, ip, abort=19f
bge 12b
PLD( cmn r2, #96 )
PLD( bge 13b )
- ldmfd sp!, {r5 - r9}
+ ldmfd sp!, {r5, r6, r8 - r10}
14: ands ip, r2, #28
beq 16f
@@ -241,6 +249,7 @@
18: forward_copy_shift pull=24 push=8
+ UNWIND( .fnend )
/*
* Abort preamble and completion macros.
@@ -250,14 +259,13 @@
*/
.macro copy_abort_preamble
-19: ldmfd sp!, {r5 - r9}
+19: ldmfd sp!, {r5, r6, r8 - r10}
b 21f
-20: ldmfd sp!, {r5 - r8}
+20: ldmfd sp!, {r5, r6, r8, r9}
21:
.endm
.macro copy_abort_end
- ldmfd sp!, {r4, pc}
+ ldmfd sp!, {r4, UNWIND(fpreg,) pc}
.endm
-
diff --git a/arch/arm/lib32/memcpy.S b/arch/arm/lib32/memcpy.S
index d40296e4bf..90f2b645aa 100644
--- a/arch/arm/lib32/memcpy.S
+++ b/arch/arm/lib32/memcpy.S
@@ -1,12 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre)
-
/*
- * linux/arch/arm/lib/memcpy.S
+ * linux/arch/arm/lib/memcpy.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: MontaVista Software, Inc.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
#define LDR1W_SHIFT 0
#define STR1W_SHIFT 0
@@ -24,7 +27,7 @@
.endm
.macro ldr1b ptr reg cond=al abort
- ldr\cond\()b \reg, [\ptr], #1
+ ldrb\cond \reg, [\ptr], #1
.endm
.macro str1w ptr reg abort
@@ -36,27 +39,28 @@
.endm
.macro str1b ptr reg cond=al abort
- str\cond\()b \reg, [\ptr], #1
+ strb\cond \reg, [\ptr], #1
.endm
- .macro enter reg1 reg2
- stmdb sp!, {r0, \reg1, \reg2}
+ .macro enter regs:vararg
+UNWIND( .save {r0, \regs} )
+ stmdb sp!, {r0, \regs}
.endm
- .macro exit reg1 reg2
- ldmfd sp!, {r0, \reg1, \reg2}
+ .macro exit regs:vararg
+ ldmfd sp!, {r0, \regs}
.endm
.text
/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-.weak memcpy
-ENTRY(memcpy)
ENTRY(__memcpy)
+ENTRY(mmiocpy)
+WEAK(memcpy)
#include "copy_template.S"
-ENDPROC(__memcpy)
ENDPROC(memcpy)
-
+ENDPROC(mmiocpy)
+ENDPROC(__memcpy)
diff --git a/arch/arm/lib32/memset.S b/arch/arm/lib32/memset.S
index 4ba74e0c6c..de75ae4d5a 100644
--- a/arch/arm/lib32/memset.S
+++ b/arch/arm/lib32/memset.S
@@ -1,19 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/memset.S
- * ASM optimised string functions
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * ASM optimised string functions
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
.text
.align 5
-.weak memset
ENTRY(__memset)
-ENTRY(memset)
+ENTRY(mmioset)
+WEAK(memset)
+UNWIND( .fnstart )
+ and r1, r1, #255 @ cast to unsigned char
ands r3, r0, #3 @ 1 unaligned?
mov ip, r0 @ preserve r0 as return value
bne 6f @ 1
@@ -23,34 +27,38 @@ ENTRY(memset)
1: orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
- cmp r2, #16
+7: cmp r2, #16
blt 4f
+UNWIND( .fnend )
#if ! CALGN(1)+0
/*
- * We need an 2 extra registers for this loop - use r8 and the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
*/
+UNWIND( .fnstart )
+UNWIND( .save {r8, lr} )
stmfd sp!, {r8, lr}
mov r8, r1
- mov lr, r1
+ mov lr, r3
2: subs r2, r2, #64
- stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
- stmgeia ip!, {r1, r3, r8, lr}
- stmgeia ip!, {r1, r3, r8, lr}
- stmgeia ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
bgt 2b
- ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go.
+ ldmfdeq sp!, {r8, pc} @ Now <64 bytes to go.
/*
* No need to correct the count; we're only testing bits from now on
*/
tst r2, #32
- stmneia ip!, {r1, r3, r8, lr}
- stmneia ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
tst r2, #16
- stmneia ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
ldmfd sp!, {r8, lr}
+UNWIND( .fnend )
#else
@@ -59,13 +67,15 @@ ENTRY(memset)
* whole cache lines at once.
*/
+UNWIND( .fnstart )
+UNWIND( .save {r4-r8, lr} )
stmfd sp!, {r4-r8, lr}
mov r4, r1
- mov r5, r1
+ mov r5, r3
mov r6, r1
- mov r7, r1
+ mov r7, r3
mov r8, r1
- mov lr, r1
+ mov lr, r3
cmp r2, #96
tstgt ip, #31
@@ -75,48 +85,64 @@ ENTRY(memset)
rsb r8, r8, #32
sub r2, r2, r8
movs r8, r8, lsl #(32 - 4)
- stmcsia ip!, {r4, r5, r6, r7}
- stmmiia ip!, {r4, r5}
+ stmiacs ip!, {r4, r5, r6, r7}
+ stmiami ip!, {r4, r5}
tst r8, #(1 << 30)
mov r8, r1
strne r1, [ip], #4
3: subs r2, r2, #64
- stmgeia ip!, {r1, r3-r8, lr}
- stmgeia ip!, {r1, r3-r8, lr}
+ stmiage ip!, {r1, r3-r8, lr}
+ stmiage ip!, {r1, r3-r8, lr}
bgt 3b
- ldmeqfd sp!, {r4-r8, pc}
+ ldmfdeq sp!, {r4-r8, pc}
tst r2, #32
- stmneia ip!, {r1, r3-r8, lr}
+ stmiane ip!, {r1, r3-r8, lr}
tst r2, #16
- stmneia ip!, {r4-r7}
+ stmiane ip!, {r4-r7}
ldmfd sp!, {r4-r8, lr}
+UNWIND( .fnend )
#endif
+UNWIND( .fnstart )
4: tst r2, #8
- stmneia ip!, {r1, r3}
+ stmiane ip!, {r1, r3}
tst r2, #4
strne r1, [ip], #4
/*
- * When we get here, we've got less than 4 bytes to zero. We
+ * When we get here, we've got less than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
- strneb r1, [ip], #1
- strneb r1, [ip], #1
+ strbne r1, [ip], #1
+ strbne r1, [ip], #1
tst r2, #1
- strneb r1, [ip], #1
- mov pc, lr
+ strbne r1, [ip], #1
+ ret lr
6: subs r2, r2, #4 @ 1 do we have enough
blt 5b @ 1 bytes to align with?
cmp r3, #2 @ 1
- strltb r1, [ip], #1 @ 1
- strleb r1, [ip], #1 @ 1
+ strblt r1, [ip], #1 @ 1
+ strble r1, [ip], #1 @ 1
strb r1, [ip], #1 @ 1
add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
b 1b
+UNWIND( .fnend )
ENDPROC(memset)
+ENDPROC(mmioset)
ENDPROC(__memset)
+
+ENTRY(__memset32)
+UNWIND( .fnstart )
+ mov r3, r1 @ copy r1 to r3 and fall into memset64
+UNWIND( .fnend )
+ENDPROC(__memset32)
+ENTRY(__memset64)
+UNWIND( .fnstart )
+ mov ip, r0 @ preserve r0 as return value
+ b 7b @ jump into the middle of memset
+UNWIND( .fnend )
+ENDPROC(__memset64)
--
2.39.5
^ permalink raw reply [flat|nested] 14+ messages in thread
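The memset.S update above adds an explicit "and r1, r1, #255" before the
existing orr shifts that smear the fill byte across a whole register. A
rough C picture of that pattern replication (only a sketch; the real code
stores several registers per iteration and handles alignment and the tail
in assembler-specific ways):

#include <stddef.h>
#include <stdint.h>

static void *memset_sketch(void *s, int c, size_t n)
{
	unsigned char *p = s;
	uint32_t pattern = (unsigned char)c;	/* the "and r1, r1, #255" */

	pattern |= pattern << 8;	/* 0x000000cc -> 0x0000cccc */
	pattern |= pattern << 16;	/* 0x0000cccc -> 0xcccccccc */

	while (((uintptr_t)p & 3) && n) {	/* align to a word boundary */
		*p++ = (unsigned char)c;
		n--;
	}
	while (n >= 4) {			/* word stores, as stmia does */
		*(uint32_t *)p = pattern;
		p += 4;
		n -= 4;
	}
	while (n--)				/* trailing bytes */
		*p++ = (unsigned char)c;

	return s;
}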
* [PATCH v2 09/10] lib/string.c: export non optimized memmove as __default_memmove
2024-09-26 11:17 [PATCH v2 00/10] ARM: add assembler optimized memmove Sascha Hauer
` (7 preceding siblings ...)
2024-09-26 11:17 ` [PATCH v2 08/10] ARM: update memcpy.S and memset.S from Linux Sascha Hauer
@ 2024-09-26 11:17 ` Sascha Hauer
2024-09-26 11:17 ` [PATCH v2 10/10] ARM: add optimized memmove Sascha Hauer
2024-09-27 10:39 ` [PATCH v2 00/10] ARM: add assembler " Sascha Hauer
10 siblings, 0 replies; 14+ messages in thread
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
When an architecture defines __HAVE_ARCH_MEMMOVE, the generic
non-optimized memmove is not compiled in. It is still needed on ARM64
as a fallback in certain cases, so make it available as
__default_memmove, just as is already done for __default_memcpy.
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
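For illustration, the intended arrangement can be sketched in plain C as
below. The plain GCC alias attribute stands in for barebox's __alias()
macro, and the byte loops only stand in for the actual __default_memmove
body:

#include <stddef.h>

/* always built, regardless of __HAVE_ARCH_MEMMOVE */
void *__default_memmove(void *dest, const void *src, size_t count)
{
	char *d = dest;
	const char *s = src;

	if (d <= s) {
		while (count--)
			*d++ = *s++;
	} else {
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	}

	return dest;
}

#ifndef __HAVE_ARCH_MEMMOVE
/* no architecture override: memmove/__memmove resolve to the default */
void *memmove(void *dest, const void *src, size_t count)
	__attribute__((alias("__default_memmove")));
void *__memmove(void *dest, const void *src, size_t count)
	__attribute__((alias("__default_memmove")));
#endif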
lib/string.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/lib/string.c b/lib/string.c
index 374f326143..98dd3cffdd 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -666,7 +666,6 @@ void *mempcpy(void *dest, const void *src, size_t count)
}
EXPORT_SYMBOL(mempcpy);
-#ifndef __HAVE_ARCH_MEMMOVE
/**
* memmove - Copy one area of memory to another
* @dest: Where to copy to
@@ -675,7 +674,7 @@ EXPORT_SYMBOL(mempcpy);
*
* Unlike memcpy(), memmove() copes with overlapping areas.
*/
-void * memmove(void * dest,const void *src,size_t count)
+void *__default_memmove(void * dest,const void *src,size_t count)
{
char *tmp, *s;
@@ -694,6 +693,13 @@ void * memmove(void * dest,const void *src,size_t count)
return dest;
}
+EXPORT_SYMBOL(__default_memmove);
+
+#ifndef __HAVE_ARCH_MEMMOVE
+void *memmove(void * dest, const void *src, size_t count)
+ __alias(__default_memmove);
+void *__memmove(void * dest, const void *src, size_t count)
+ __alias(__default_memmove);
#endif
EXPORT_SYMBOL(memmove);
--
2.39.5
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 10/10] ARM: add optimized memmove
2024-09-26 11:17 [PATCH v2 00/10] ARM: add assembler optimized memmove Sascha Hauer
` (8 preceding siblings ...)
2024-09-26 11:17 ` [PATCH v2 09/10] lib/string.c: export non optimized memmove as __default_memmove Sascha Hauer
@ 2024-09-26 11:17 ` Sascha Hauer
2024-09-27 5:12 ` Marco Felsch
2024-09-27 10:39 ` [PATCH v2 00/10] ARM: add assembler " Sascha Hauer
10 siblings, 1 reply; 14+ messages in thread
From: Sascha Hauer @ 2024-09-26 11:17 UTC (permalink / raw)
To: open list:BAREBOX; +Cc: Ahmad Fatoum
Until now there has been no assembler optimized version of memmove() for
ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
memcpy() for ARM64 from Linux-6.10.
Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
---
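The direction decision made by the ARM32 entry point below
(subs ip, r0, r1; cmphi r2, ip; bls __memcpy) corresponds roughly to the
following C. This is only a sketch of the logic, not code from this
series:

#include <stdint.h>
#include <string.h>

void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/*
	 * If dest is not inside (src, src + n), a forward copy cannot
	 * overwrite source bytes that still have to be read, so the
	 * faster memcpy path is taken. The unsigned subtraction wraps
	 * for dest < src, which therefore also takes the memcpy branch.
	 */
	if ((uintptr_t)d - (uintptr_t)s >= n)
		return memcpy(dest, src, n);

	/* overlapping with dest above src: copy downwards, high to low */
	while (n--)
		d[n] = s[n];

	return dest;
}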
arch/arm/include/asm/cache.h | 8 ++
arch/arm/include/asm/string.h | 4 +-
arch/arm/lib32/Makefile | 1 +
arch/arm/lib32/memmove.S | 206 +++++++++++++++++++++++++++++++
arch/arm/lib64/copy_template.S | 180 ---------------------------
arch/arm/lib64/memcpy.S | 274 ++++++++++++++++++++++++++++++++++-------
arch/arm/lib64/memset.S | 18 +--
arch/arm/lib64/string.c | 17 +++
include/string.h | 2 +
lib/string.c | 1 -
10 files changed, 478 insertions(+), 233 deletions(-)
diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index 261c30129a..dd022c1f23 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -3,6 +3,13 @@
#ifndef __ASM_CACHE_H
#define __ASM_CACHE_H
+#ifdef CONFIG_CPU_64
+#define L1_CACHE_SHIFT (6)
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+#endif
+
+#ifndef __ASSEMBLY__
+
void v8_invalidate_icache_all(void);
void v8_flush_dcache_all(void);
void v8_invalidate_dcache_all(void);
@@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
void sync_caches_for_execution(void);
#include <asm-generic/cache.h>
+#endif
#endif
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index 2322b846b2..f79392e53d 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -9,10 +9,12 @@
extern void *memcpy(void *, const void *, __kernel_size_t);
#define __HAVE_ARCH_MEMSET
extern void *memset(void *, int, __kernel_size_t);
-
+#define __HAVE_ARCH_MEMMOVE
+extern void *memmove(void *, const void *, __kernel_size_t);
#endif
extern void *__memcpy(void *, const void *, __kernel_size_t);
extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
#endif
diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
index 511a029062..a139a80fb8 100644
--- a/arch/arm/lib32/Makefile
+++ b/arch/arm/lib32/Makefile
@@ -21,6 +21,7 @@ obj-y += lshrdi3.o
obj-y += runtime-offset.o
pbl-y += runtime-offset.o
obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memcpy.o
+obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memmove.o
obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memset.o
obj-$(CONFIG_ARM_UNWIND) += unwind.o
obj-$(CONFIG_MODULES) += module.o
diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
new file mode 100644
index 0000000000..6410554039
--- /dev/null
+++ b/arch/arm/lib32/memmove.S
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * linux/arch/arm/lib/memmove.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: (C) MontaVista Software Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/unwind.h>
+
+ .text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards. This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(__memmove)
+WEAK(memmove)
+ UNWIND( .fnstart )
+
+ subs ip, r0, r1
+ cmphi r2, ip
+ bls __memcpy
+ UNWIND( .fnend )
+
+ UNWIND( .fnstart )
+ UNWIND( .save {r0, r4, fpreg, lr} )
+ stmfd sp!, {r0, r4, UNWIND(fpreg,) lr}
+ UNWIND( .setfp fpreg, sp )
+ UNWIND( mov fpreg, sp )
+ add r1, r1, r2
+ add r0, r0, r2
+ subs r2, r2, #4
+ blt 8f
+ ands ip, r0, #3
+ PLD( pld [r1, #-4] )
+ bne 9f
+ ands ip, r1, #3
+ bne 10f
+
+1: subs r2, r2, #(28)
+ stmfd sp!, {r5, r6, r8, r9}
+ blt 5f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( sbcsne r4, ip, r2 ) @ C is always set here
+ CALGN( bcs 2f )
+ CALGN( adr r4, 6f )
+ CALGN( subs r2, r2, ip ) @ C is set here
+ CALGN( rsb ip, ip, #32 )
+ CALGN( add pc, r4, ip )
+
+ PLD( pld [r1, #-4] )
+2: PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #-32] )
+ PLD( blt 4f )
+ PLD( pld [r1, #-64] )
+ PLD( pld [r1, #-96] )
+
+3: PLD( pld [r1, #-128] )
+4: ldmdb r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
+ subs r2, r2, #32
+ stmdb r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
+ bge 3b
+ PLD( cmn r2, #96 )
+ PLD( bge 4b )
+
+5: ands ip, r2, #28
+ rsb ip, ip, #32
+ addne pc, pc, ip @ C is always clear here
+ b 7f
+6: W(nop)
+ W(ldr) r3, [r1, #-4]!
+ W(ldr) r4, [r1, #-4]!
+ W(ldr) r5, [r1, #-4]!
+ W(ldr) r6, [r1, #-4]!
+ W(ldr) r8, [r1, #-4]!
+ W(ldr) r9, [r1, #-4]!
+ W(ldr) lr, [r1, #-4]!
+
+ add pc, pc, ip
+ nop
+ W(nop)
+ W(str) r3, [r0, #-4]!
+ W(str) r4, [r0, #-4]!
+ W(str) r5, [r0, #-4]!
+ W(str) r6, [r0, #-4]!
+ W(str) r8, [r0, #-4]!
+ W(str) r9, [r0, #-4]!
+ W(str) lr, [r0, #-4]!
+
+ CALGN( bcs 2b )
+
+7: ldmfd sp!, {r5, r6, r8, r9}
+
+8: movs r2, r2, lsl #31
+ ldrbne r3, [r1, #-1]!
+ ldrbcs r4, [r1, #-1]!
+ ldrbcs ip, [r1, #-1]
+ strbne r3, [r0, #-1]!
+ strbcs r4, [r0, #-1]!
+ strbcs ip, [r0, #-1]
+ ldmfd sp!, {r0, r4, UNWIND(fpreg,) pc}
+
+9: cmp ip, #2
+ ldrbgt r3, [r1, #-1]!
+ ldrbge r4, [r1, #-1]!
+ ldrb lr, [r1, #-1]!
+ strbgt r3, [r0, #-1]!
+ strbge r4, [r0, #-1]!
+ subs r2, r2, ip
+ strb lr, [r0, #-1]!
+ blt 8b
+ ands ip, r1, #3
+ beq 1b
+
+10: bic r1, r1, #3
+ cmp ip, #2
+ ldr r3, [r1, #0]
+ beq 17f
+ blt 18f
+
+
+ .macro backward_copy_shift push pull
+
+ subs r2, r2, #28
+ blt 14f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( sbcsne r4, ip, r2 ) @ C is always set here
+ CALGN( subcc r2, r2, ip )
+ CALGN( bcc 15f )
+
+11: stmfd sp!, {r5, r6, r8 - r10}
+
+ PLD( pld [r1, #-4] )
+ PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #-32] )
+ PLD( blt 13f )
+ PLD( pld [r1, #-64] )
+ PLD( pld [r1, #-96] )
+
+12: PLD( pld [r1, #-128] )
+13: ldmdb r1!, {r8, r9, r10, ip}
+ mov lr, r3, lspush #\push
+ subs r2, r2, #32
+ ldmdb r1!, {r3, r4, r5, r6}
+ orr lr, lr, ip, lspull #\pull
+ mov ip, ip, lspush #\push
+ orr ip, ip, r10, lspull #\pull
+ mov r10, r10, lspush #\push
+ orr r10, r10, r9, lspull #\pull
+ mov r9, r9, lspush #\push
+ orr r9, r9, r8, lspull #\pull
+ mov r8, r8, lspush #\push
+ orr r8, r8, r6, lspull #\pull
+ mov r6, r6, lspush #\push
+ orr r6, r6, r5, lspull #\pull
+ mov r5, r5, lspush #\push
+ orr r5, r5, r4, lspull #\pull
+ mov r4, r4, lspush #\push
+ orr r4, r4, r3, lspull #\pull
+ stmdb r0!, {r4 - r6, r8 - r10, ip, lr}
+ bge 12b
+ PLD( cmn r2, #96 )
+ PLD( bge 13b )
+
+ ldmfd sp!, {r5, r6, r8 - r10}
+
+14: ands ip, r2, #28
+ beq 16f
+
+15: mov lr, r3, lspush #\push
+ ldr r3, [r1, #-4]!
+ subs ip, ip, #4
+ orr lr, lr, r3, lspull #\pull
+ str lr, [r0, #-4]!
+ bgt 15b
+ CALGN( cmp r2, #0 )
+ CALGN( bge 11b )
+
+16: add r1, r1, #(\pull / 8)
+ b 8b
+
+ .endm
+
+
+ backward_copy_shift push=8 pull=24
+
+17: backward_copy_shift push=16 pull=16
+
+18: backward_copy_shift push=24 pull=8
+
+ UNWIND( .fnend )
+ENDPROC(memmove)
+ENDPROC(__memmove)
diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
deleted file mode 100644
index 8e4ff059d1..0000000000
--- a/arch/arm/lib64/copy_template.S
+++ /dev/null
@@ -1,180 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
-/*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- */
-
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
- *
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
- */
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-dst .req x6
-
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
-
- mov dst, dstin
- cmp count, #16
- /*When memory length is less than 16, the accessed are not aligned.*/
- b.lo .Ltiny15
-
- neg tmp2, src
- ands tmp2, tmp2, #15/* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * Copy the leading memory data from src to dst in an increasing
- * address order.By this way,the risk of overwritting the source
- * memory data is eliminated when the distance between src and
- * dst is less than 16. The memory accesses here are alignment.
- */
- tbz tmp2, #0, 1f
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-1:
- tbz tmp2, #1, 2f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-2:
- tbz tmp2, #2, 3f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltiny15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-1:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-2:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-.Ltiny15:
- /*
- * Prefer to break one ldp/stp into several load/store to access
- * memory in an increasing address order,rather than to load/store 16
- * bytes from (src-16) to (dst-16) and to backward the src to aligned
- * address,which way is used in original cortex memcpy. If keeping
- * the original memcpy process here, memmove need to satisfy the
- * precondition that src address is at least 16 bytes bigger than dst
- * address,otherwise some source data will be overwritten when memove
- * call memcpy directly. To make memmove simpler and decouple the
- * memcpy's dependency on memmove, withdrew the original process.
- */
- tbz count, #3, 1f
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-1:
- tbz count, #2, 2f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-2:
- tbz count, #1, 3f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-3:
- tbz count, #0, .Lexitfunc
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-
- b .Lexitfunc
-
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail.
- */
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
- b .Lexitfunc
-
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
-.Lcpy_body_large:
- /* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
- subs count, count, #64
- b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
-.Lexitfunc:
diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
index 92845b25a6..98b453d3fd 100644
--- a/arch/arm/lib64/memcpy.S
+++ b/arch/arm/lib64/memcpy.S
@@ -1,63 +1,249 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
/*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
+ * Copyright (c) 2012-2021, Arm Limited.
*
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
*
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
*/
- .macro ldrb1 ptr, regB, val
- ldrb \ptr, [\regB], \val
- .endm
- .macro strb1 ptr, regB, val
- strb \ptr, [\regB], \val
- .endm
+#define L(label) .L ## label
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+SYM_FUNC_START(__pi_memcpy)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
- .macro ldrh1 ptr, regB, val
- ldrh \ptr, [\regB], \val
- .endm
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
- .macro strh1 ptr, regB, val
- strh \ptr, [\regB], \val
- .endm
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
- .macro ldr1 ptr, regB, val
- ldr \ptr, [\regB], \val
- .endm
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
- .macro str1 ptr, regB, val
- str \ptr, [\regB], \val
- .endm
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
- .macro ldp1 ptr, regB, regC, val
- ldp \ptr, \regB, [\regC], \val
- .endm
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
- .macro stp1 ptr, regB, regC, val
- stp \ptr, \regB, [\regC], \val
- .endm
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
- .weak __arch_memcpy
-ENTRY(__arch_memcpy)
-#include "copy_template.S"
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
ret
-ENDPROC(__arch_memcpy)
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+SYM_FUNC_END(__pi_memcpy)
+
+SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
+SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+
+SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
+
+SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
+SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
index ff201750f1..f059203983 100644
--- a/arch/arm/lib64/memset.S
+++ b/arch/arm/lib64/memset.S
@@ -1,10 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
-/* SPDX-FileCopyrightText: 2013 Linaro */
-
/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
* This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
* be found @
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
@@ -13,6 +12,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/cache.h>
/*
* Fill in the buffer with character c (alignment handled by the hardware)
@@ -42,8 +42,7 @@ dst .req x8
tmp3w .req w9
tmp3 .req x9
- .weak memset
-ENTRY(__arch_memset)
+SYM_FUNC_START(__pi_memset)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -115,6 +114,7 @@ ENTRY(__arch_memset)
* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line, this ensures the entire loop is in one line.
*/
+ .p2align L1_CACHE_SHIFT
.Lnot_short:
sub dst, dst, #16/* Pre-bias. */
sub count, count, #64
@@ -201,4 +201,8 @@ ENTRY(__arch_memset)
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
-ENDPROC(__arch_memset)
+SYM_FUNC_END(__pi_memset)
+
+SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
+
+SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
index 938790e1a9..c7954d6efe 100644
--- a/arch/arm/lib64/string.c
+++ b/arch/arm/lib64/string.c
@@ -6,6 +6,7 @@
void *__arch_memset(void *dst, int c, __kernel_size_t size);
void *__arch_memcpy(void * dest, const void *src, size_t count);
+void *__arch_memmove(void * dest, const void *src, size_t count);
static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
{
@@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
void *__memcpy(void * dest, const void *src, size_t count)
__alias(_memcpy);
+
+static void *_memmove(void * dest, const void *src, size_t count)
+{
+ if (likely(get_cr() & CR_M))
+ return __arch_memmove(dest, src, count);
+
+ return __default_memmove(dest, src, count);
+}
+
+void __weak *memmove(void * dest, const void *src, size_t count)
+{
+ return _memmove(dest, src, count);
+}
+
+void *__memmove(void * dest, const void *src, size_t count)
+ __alias(_memmove);
diff --git a/include/string.h b/include/string.h
index cbe6eddf7f..986ccd83dd 100644
--- a/include/string.h
+++ b/include/string.h
@@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
void *__default_memcpy(void * dest,const void *src,size_t count);
void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
+void *__default_memmove(void * dest,const void *src,size_t count);
+
char *parse_assignment(char *str);
int strverscmp(const char *a, const char *b);
diff --git a/lib/string.c b/lib/string.c
index 98dd3cffdd..50c2016c2b 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
void *__memmove(void * dest, const void *src, size_t count)
__alias(__default_memmove);
#endif
-EXPORT_SYMBOL(memmove);
#ifndef __HAVE_ARCH_MEMCMP
/**
--
2.39.5
^ permalink raw reply [flat|nested] 14+ messages in thread
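One detail of the new AArch64 memcpy worth spelling out: small and medium
sizes are handled by loading from both ends of the buffer before any store
is issued, so two possibly overlapping stores always cover the whole range
and no length-dependent loop is needed. For the 16..32 byte case this is
roughly the following (a sketch assuming 16 <= n <= 32, not code from the
series):

#include <stdint.h>
#include <string.h>

static void copy16_32(unsigned char *dst, const unsigned char *src, size_t n)
{
	uint8_t head[16], tail[16];

	/* both loads happen before the stores, so src/dst may overlap */
	memcpy(head, src, 16);			/* the ldp from [src] */
	memcpy(tail, src + n - 16, 16);		/* the ldp from [srcend, -16] */
	memcpy(dst, head, 16);			/* the stp to [dstin] */
	memcpy(dst + n - 16, tail, 16);		/* the stp to [dstend, -16] */
}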
* Re: [PATCH v2 10/10] ARM: add optimized memmove
2024-09-26 11:17 ` [PATCH v2 10/10] ARM: add optimized memmove Sascha Hauer
@ 2024-09-27 5:12 ` Marco Felsch
2024-09-27 10:04 ` Sascha Hauer
0 siblings, 1 reply; 14+ messages in thread
From: Marco Felsch @ 2024-09-27 5:12 UTC (permalink / raw)
To: Sascha Hauer; +Cc: open list:BAREBOX, Ahmad Fatoum
Hi Sascha,
On 24-09-26, Sascha Hauer wrote:
> Until now there has been no assembler optimized version of memmove() for
> ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
> memcpy() for ARM64 from Linux-6.10.
out of curiosity, did you make performance measurements?
Regards,
Marco
> Reviewed-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
> Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
> ---
> arch/arm/include/asm/cache.h | 8 ++
> arch/arm/include/asm/string.h | 4 +-
> arch/arm/lib32/Makefile | 1 +
> arch/arm/lib32/memmove.S | 206 +++++++++++++++++++++++++++++++
> arch/arm/lib64/copy_template.S | 180 ---------------------------
> arch/arm/lib64/memcpy.S | 274 ++++++++++++++++++++++++++++++++++-------
> arch/arm/lib64/memset.S | 18 +--
> arch/arm/lib64/string.c | 17 +++
> include/string.h | 2 +
> lib/string.c | 1 -
> 10 files changed, 478 insertions(+), 233 deletions(-)
>
> diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
> index 261c30129a..dd022c1f23 100644
> --- a/arch/arm/include/asm/cache.h
> +++ b/arch/arm/include/asm/cache.h
> @@ -3,6 +3,13 @@
> #ifndef __ASM_CACHE_H
> #define __ASM_CACHE_H
>
> +#ifdef CONFIG_CPU_64
> +#define L1_CACHE_SHIFT (6)
> +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
> +#endif
> +
> +#ifndef __ASSEMBLY__
> +
> void v8_invalidate_icache_all(void);
> void v8_flush_dcache_all(void);
> void v8_invalidate_dcache_all(void);
> @@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
> void sync_caches_for_execution(void);
>
> #include <asm-generic/cache.h>
> +#endif
>
> #endif
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index 2322b846b2..f79392e53d 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -9,10 +9,12 @@
> extern void *memcpy(void *, const void *, __kernel_size_t);
> #define __HAVE_ARCH_MEMSET
> extern void *memset(void *, int, __kernel_size_t);
> -
> +#define __HAVE_ARCH_MEMMOVE
> +extern void *memmove(void *, const void *, __kernel_size_t);
> #endif
>
> extern void *__memcpy(void *, const void *, __kernel_size_t);
> extern void *__memset(void *, int, __kernel_size_t);
> +extern void *__memmove(void *, const void *, __kernel_size_t);
>
> #endif
> diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
> index 511a029062..a139a80fb8 100644
> --- a/arch/arm/lib32/Makefile
> +++ b/arch/arm/lib32/Makefile
> @@ -21,6 +21,7 @@ obj-y += lshrdi3.o
> obj-y += runtime-offset.o
> pbl-y += runtime-offset.o
> obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memcpy.o
> +obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memmove.o
> obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memset.o
> obj-$(CONFIG_ARM_UNWIND) += unwind.o
> obj-$(CONFIG_MODULES) += module.o
> diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
> new file mode 100644
> index 0000000000..6410554039
> --- /dev/null
> +++ b/arch/arm/lib32/memmove.S
> @@ -0,0 +1,206 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * linux/arch/arm/lib/memmove.S
> + *
> + * Author: Nicolas Pitre
> + * Created: Sep 28, 2005
> + * Copyright: (C) MontaVista Software Inc.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +#include <asm/unwind.h>
> +
> + .text
> +
> +/*
> + * Prototype: void *memmove(void *dest, const void *src, size_t n);
> + *
> + * Note:
> + *
> + * If the memory regions don't overlap, we simply branch to memcpy which is
> + * normally a bit faster. Otherwise the copy is done going downwards. This
> + * is a transposition of the code from copy_template.S but with the copy
> + * occurring in the opposite direction.
> + */
> +
> +ENTRY(__memmove)
> +WEAK(memmove)
> + UNWIND( .fnstart )
> +
> + subs ip, r0, r1
> + cmphi r2, ip
> + bls __memcpy
> + UNWIND( .fnend )
> +
> + UNWIND( .fnstart )
> + UNWIND( .save {r0, r4, fpreg, lr} )
> + stmfd sp!, {r0, r4, UNWIND(fpreg,) lr}
> + UNWIND( .setfp fpreg, sp )
> + UNWIND( mov fpreg, sp )
> + add r1, r1, r2
> + add r0, r0, r2
> + subs r2, r2, #4
> + blt 8f
> + ands ip, r0, #3
> + PLD( pld [r1, #-4] )
> + bne 9f
> + ands ip, r1, #3
> + bne 10f
> +
> +1: subs r2, r2, #(28)
> + stmfd sp!, {r5, r6, r8, r9}
> + blt 5f
> +
> + CALGN( ands ip, r0, #31 )
> + CALGN( sbcsne r4, ip, r2 ) @ C is always set here
> + CALGN( bcs 2f )
> + CALGN( adr r4, 6f )
> + CALGN( subs r2, r2, ip ) @ C is set here
> + CALGN( rsb ip, ip, #32 )
> + CALGN( add pc, r4, ip )
> +
> + PLD( pld [r1, #-4] )
> +2: PLD( subs r2, r2, #96 )
> + PLD( pld [r1, #-32] )
> + PLD( blt 4f )
> + PLD( pld [r1, #-64] )
> + PLD( pld [r1, #-96] )
> +
> +3: PLD( pld [r1, #-128] )
> +4: ldmdb r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
> + subs r2, r2, #32
> + stmdb r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
> + bge 3b
> + PLD( cmn r2, #96 )
> + PLD( bge 4b )
> +
> +5: ands ip, r2, #28
> + rsb ip, ip, #32
> + addne pc, pc, ip @ C is always clear here
> + b 7f
> +6: W(nop)
> + W(ldr) r3, [r1, #-4]!
> + W(ldr) r4, [r1, #-4]!
> + W(ldr) r5, [r1, #-4]!
> + W(ldr) r6, [r1, #-4]!
> + W(ldr) r8, [r1, #-4]!
> + W(ldr) r9, [r1, #-4]!
> + W(ldr) lr, [r1, #-4]!
> +
> + add pc, pc, ip
> + nop
> + W(nop)
> + W(str) r3, [r0, #-4]!
> + W(str) r4, [r0, #-4]!
> + W(str) r5, [r0, #-4]!
> + W(str) r6, [r0, #-4]!
> + W(str) r8, [r0, #-4]!
> + W(str) r9, [r0, #-4]!
> + W(str) lr, [r0, #-4]!
> +
> + CALGN( bcs 2b )
> +
> +7: ldmfd sp!, {r5, r6, r8, r9}
> +
> +8: movs r2, r2, lsl #31
> + ldrbne r3, [r1, #-1]!
> + ldrbcs r4, [r1, #-1]!
> + ldrbcs ip, [r1, #-1]
> + strbne r3, [r0, #-1]!
> + strbcs r4, [r0, #-1]!
> + strbcs ip, [r0, #-1]
> + ldmfd sp!, {r0, r4, UNWIND(fpreg,) pc}
> +
> +9: cmp ip, #2
> + ldrbgt r3, [r1, #-1]!
> + ldrbge r4, [r1, #-1]!
> + ldrb lr, [r1, #-1]!
> + strbgt r3, [r0, #-1]!
> + strbge r4, [r0, #-1]!
> + subs r2, r2, ip
> + strb lr, [r0, #-1]!
> + blt 8b
> + ands ip, r1, #3
> + beq 1b
> +
> +10: bic r1, r1, #3
> + cmp ip, #2
> + ldr r3, [r1, #0]
> + beq 17f
> + blt 18f
> +
> +
> + .macro backward_copy_shift push pull
> +
> + subs r2, r2, #28
> + blt 14f
> +
> + CALGN( ands ip, r0, #31 )
> + CALGN( sbcsne r4, ip, r2 ) @ C is always set here
> + CALGN( subcc r2, r2, ip )
> + CALGN( bcc 15f )
> +
> +11: stmfd sp!, {r5, r6, r8 - r10}
> +
> + PLD( pld [r1, #-4] )
> + PLD( subs r2, r2, #96 )
> + PLD( pld [r1, #-32] )
> + PLD( blt 13f )
> + PLD( pld [r1, #-64] )
> + PLD( pld [r1, #-96] )
> +
> +12: PLD( pld [r1, #-128] )
> +13: ldmdb r1!, {r8, r9, r10, ip}
> + mov lr, r3, lspush #\push
> + subs r2, r2, #32
> + ldmdb r1!, {r3, r4, r5, r6}
> + orr lr, lr, ip, lspull #\pull
> + mov ip, ip, lspush #\push
> + orr ip, ip, r10, lspull #\pull
> + mov r10, r10, lspush #\push
> + orr r10, r10, r9, lspull #\pull
> + mov r9, r9, lspush #\push
> + orr r9, r9, r8, lspull #\pull
> + mov r8, r8, lspush #\push
> + orr r8, r8, r6, lspull #\pull
> + mov r6, r6, lspush #\push
> + orr r6, r6, r5, lspull #\pull
> + mov r5, r5, lspush #\push
> + orr r5, r5, r4, lspull #\pull
> + mov r4, r4, lspush #\push
> + orr r4, r4, r3, lspull #\pull
> + stmdb r0!, {r4 - r6, r8 - r10, ip, lr}
> + bge 12b
> + PLD( cmn r2, #96 )
> + PLD( bge 13b )
> +
> + ldmfd sp!, {r5, r6, r8 - r10}
> +
> +14: ands ip, r2, #28
> + beq 16f
> +
> +15: mov lr, r3, lspush #\push
> + ldr r3, [r1, #-4]!
> + subs ip, ip, #4
> + orr lr, lr, r3, lspull #\pull
> + str lr, [r0, #-4]!
> + bgt 15b
> + CALGN( cmp r2, #0 )
> + CALGN( bge 11b )
> +
> +16: add r1, r1, #(\pull / 8)
> + b 8b
> +
> + .endm
> +
> +
> + backward_copy_shift push=8 pull=24
> +
> +17: backward_copy_shift push=16 pull=16
> +
> +18: backward_copy_shift push=24 pull=8
> +
> + UNWIND( .fnend )
> +ENDPROC(memmove)
> +ENDPROC(__memmove)
> diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
> deleted file mode 100644
> index 8e4ff059d1..0000000000
> --- a/arch/arm/lib64/copy_template.S
> +++ /dev/null
> @@ -1,180 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> -/*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> - *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> - */
> -
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> - *
> - * Parameters:
> - * x0 - dest
> - * x1 - src
> - * x2 - n
> - * Returns:
> - * x0 - dest
> - */
> -dstin .req x0
> -src .req x1
> -count .req x2
> -tmp1 .req x3
> -tmp1w .req w3
> -tmp2 .req x4
> -tmp2w .req w4
> -dst .req x6
> -
> -A_l .req x7
> -A_h .req x8
> -B_l .req x9
> -B_h .req x10
> -C_l .req x11
> -C_h .req x12
> -D_l .req x13
> -D_h .req x14
> -
> - mov dst, dstin
> - cmp count, #16
> - /*When memory length is less than 16, the accessed are not aligned.*/
> - b.lo .Ltiny15
> -
> - neg tmp2, src
> - ands tmp2, tmp2, #15/* Bytes to reach alignment. */
> - b.eq .LSrcAligned
> - sub count, count, tmp2
> - /*
> - * Copy the leading memory data from src to dst in an increasing
> - * address order.By this way,the risk of overwritting the source
> - * memory data is eliminated when the distance between src and
> - * dst is less than 16. The memory accesses here are alignment.
> - */
> - tbz tmp2, #0, 1f
> - ldrb1 tmp1w, src, #1
> - strb1 tmp1w, dst, #1
> -1:
> - tbz tmp2, #1, 2f
> - ldrh1 tmp1w, src, #2
> - strh1 tmp1w, dst, #2
> -2:
> - tbz tmp2, #2, 3f
> - ldr1 tmp1w, src, #4
> - str1 tmp1w, dst, #4
> -3:
> - tbz tmp2, #3, .LSrcAligned
> - ldr1 tmp1, src, #8
> - str1 tmp1, dst, #8
> -
> -.LSrcAligned:
> - cmp count, #64
> - b.ge .Lcpy_over64
> - /*
> - * Deal with small copies quickly by dropping straight into the
> - * exit block.
> - */
> -.Ltail63:
> - /*
> - * Copy up to 48 bytes of data. At this point we only need the
> - * bottom 6 bits of count to be accurate.
> - */
> - ands tmp1, count, #0x30
> - b.eq .Ltiny15
> - cmp tmp1w, #0x20
> - b.eq 1f
> - b.lt 2f
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -1:
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -2:
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -.Ltiny15:
> - /*
> - * Prefer to break one ldp/stp into several load/store to access
> - * memory in an increasing address order,rather than to load/store 16
> - * bytes from (src-16) to (dst-16) and to backward the src to aligned
> - * address,which way is used in original cortex memcpy. If keeping
> - * the original memcpy process here, memmove need to satisfy the
> - * precondition that src address is at least 16 bytes bigger than dst
> - * address,otherwise some source data will be overwritten when memove
> - * call memcpy directly. To make memmove simpler and decouple the
> - * memcpy's dependency on memmove, withdrew the original process.
> - */
> - tbz count, #3, 1f
> - ldr1 tmp1, src, #8
> - str1 tmp1, dst, #8
> -1:
> - tbz count, #2, 2f
> - ldr1 tmp1w, src, #4
> - str1 tmp1w, dst, #4
> -2:
> - tbz count, #1, 3f
> - ldrh1 tmp1w, src, #2
> - strh1 tmp1w, dst, #2
> -3:
> - tbz count, #0, .Lexitfunc
> - ldrb1 tmp1w, src, #1
> - strb1 tmp1w, dst, #1
> -
> - b .Lexitfunc
> -
> -.Lcpy_over64:
> - subs count, count, #128
> - b.ge .Lcpy_body_large
> - /*
> - * Less than 128 bytes to copy, so handle 64 here and then jump
> - * to the tail.
> - */
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> - ldp1 B_l, B_h, src, #16
> - ldp1 C_l, C_h, src, #16
> - stp1 B_l, B_h, dst, #16
> - stp1 C_l, C_h, dst, #16
> - ldp1 D_l, D_h, src, #16
> - stp1 D_l, D_h, dst, #16
> -
> - tst count, #0x3f
> - b.ne .Ltail63
> - b .Lexitfunc
> -
> - /*
> - * Critical loop. Start at a new cache line boundary. Assuming
> - * 64 bytes per line this ensures the entire loop is in one line.
> - */
> -.Lcpy_body_large:
> - /* pre-get 64 bytes data. */
> - ldp1 A_l, A_h, src, #16
> - ldp1 B_l, B_h, src, #16
> - ldp1 C_l, C_h, src, #16
> - ldp1 D_l, D_h, src, #16
> -1:
> - /*
> - * interlace the load of next 64 bytes data block with store of the last
> - * loaded 64 bytes data.
> - */
> - stp1 A_l, A_h, dst, #16
> - ldp1 A_l, A_h, src, #16
> - stp1 B_l, B_h, dst, #16
> - ldp1 B_l, B_h, src, #16
> - stp1 C_l, C_h, dst, #16
> - ldp1 C_l, C_h, src, #16
> - stp1 D_l, D_h, dst, #16
> - ldp1 D_l, D_h, src, #16
> - subs count, count, #64
> - b.ge 1b
> - stp1 A_l, A_h, dst, #16
> - stp1 B_l, B_h, dst, #16
> - stp1 C_l, C_h, dst, #16
> - stp1 D_l, D_h, dst, #16
> -
> - tst count, #0x3f
> - b.ne .Ltail63
> -.Lexitfunc:
> diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
> index 92845b25a6..98b453d3fd 100644
> --- a/arch/arm/lib64/memcpy.S
> +++ b/arch/arm/lib64/memcpy.S
> @@ -1,63 +1,249 @@
> /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> /*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> + * Copyright (c) 2012-2021, Arm Limited.
> *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> + * Adapted from the original at:
> + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
> */
>
> #include <linux/linkage.h>
> #include <asm/assembler.h>
>
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses.
> *
> - * Parameters:
> - * x0 - dest
> - * x1 - src
> - * x2 - n
> - * Returns:
> - * x0 - dest
> */
> - .macro ldrb1 ptr, regB, val
> - ldrb \ptr, [\regB], \val
> - .endm
>
> - .macro strb1 ptr, regB, val
> - strb \ptr, [\regB], \val
> - .endm
> +#define L(label) .L ## label
> +
> +#define dstin x0
> +#define src x1
> +#define count x2
> +#define dst x3
> +#define srcend x4
> +#define dstend x5
> +#define A_l x6
> +#define A_lw w6
> +#define A_h x7
> +#define B_l x8
> +#define B_lw w8
> +#define B_h x9
> +#define C_l x10
> +#define C_lw w10
> +#define C_h x11
> +#define D_l x12
> +#define D_h x13
> +#define E_l x14
> +#define E_h x15
> +#define F_l x16
> +#define F_h x17
> +#define G_l count
> +#define G_h dst
> +#define H_l src
> +#define H_h srcend
> +#define tmp1 x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> + from a single entry point. It uses unaligned accesses and branchless
> + sequences to keep the code small, simple and improve performance.
> +
> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> + copies of up to 128 bytes, and large copies. The overhead of the overlap
> + check is negligible since it is only required for large copies.
> +
> + Large copies use a software pipelined loop processing 64 bytes per iteration.
> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
> + The loop tail is handled by always copying 64 bytes from the end.
> +*/
> +
> +SYM_FUNC_START(__pi_memcpy)
> + add srcend, src, count
> + add dstend, dstin, count
> + cmp count, 128
> + b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
> +
> + /* Small copies: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> + ldp A_l, A_h, [src]
> + ldp D_l, D_h, [srcend, -16]
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret
> +
> + /* Copy 8-15 bytes. */
> +L(copy16):
> + tbz count, 3, L(copy8)
> + ldr A_l, [src]
> + ldr A_h, [srcend, -8]
> + str A_l, [dstin]
> + str A_h, [dstend, -8]
> + ret
> +
> + .p2align 3
> + /* Copy 4-7 bytes. */
> +L(copy8):
> + tbz count, 2, L(copy4)
> + ldr A_lw, [src]
> + ldr B_lw, [srcend, -4]
> + str A_lw, [dstin]
> + str B_lw, [dstend, -4]
> + ret
>
> - .macro ldrh1 ptr, regB, val
> - ldrh \ptr, [\regB], \val
> - .endm
> + /* Copy 0..3 bytes using a branchless sequence. */
> +L(copy4):
> + cbz count, L(copy0)
> + lsr tmp1, count, 1
> + ldrb A_lw, [src]
> + ldrb C_lw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret
>
> - .macro strh1 ptr, regB, val
> - strh \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Medium copies: 33..128 bytes. */
> +L(copy32_128):
> + ldp A_l, A_h, [src]
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + ldp D_l, D_h, [srcend, -16]
> + cmp count, 64
> + b.hi L(copy128)
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> - .macro ldr1 ptr, regB, val
> - ldr \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Copy 65..128 bytes. */
> +L(copy128):
> + ldp E_l, E_h, [src, 32]
> + ldp F_l, F_h, [src, 48]
> + cmp count, 96
> + b.ls L(copy96)
> + ldp G_l, G_h, [srcend, -64]
> + ldp H_l, H_h, [srcend, -48]
> + stp G_l, G_h, [dstend, -64]
> + stp H_l, H_h, [dstend, -48]
> +L(copy96):
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp E_l, E_h, [dstin, 32]
> + stp F_l, F_h, [dstin, 48]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> - .macro str1 ptr, regB, val
> - str \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Copy more than 128 bytes. */
> +L(copy_long):
> + /* Use backwards copy if there is an overlap. */
> + sub tmp1, dstin, src
> + cbz tmp1, L(copy0)
> + cmp tmp1, count
> + b.lo L(copy_long_backwards)
>
> - .macro ldp1 ptr, regB, regC, val
> - ldp \ptr, \regB, [\regC], \val
> - .endm
> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
>
> - .macro stp1 ptr, regB, regC, val
> - stp \ptr, \regB, [\regC], \val
> - .endm
> + ldp D_l, D_h, [src]
> + and tmp1, dstin, 15
> + bic dst, dstin, 15
> + sub src, src, tmp1
> + add count, count, tmp1 /* Count is now 16 too large. */
> + ldp A_l, A_h, [src, 16]
> + stp D_l, D_h, [dstin]
> + ldp B_l, B_h, [src, 32]
> + ldp C_l, C_h, [src, 48]
> + ldp D_l, D_h, [src, 64]!
> + subs count, count, 128 + 16 /* Test and readjust count. */
> + b.ls L(copy64_from_end)
>
> - .weak __arch_memcpy
> -ENTRY(__arch_memcpy)
> -#include "copy_template.S"
> +L(loop64):
> + stp A_l, A_h, [dst, 16]
> + ldp A_l, A_h, [src, 16]
> + stp B_l, B_h, [dst, 32]
> + ldp B_l, B_h, [src, 32]
> + stp C_l, C_h, [dst, 48]
> + ldp C_l, C_h, [src, 48]
> + stp D_l, D_h, [dst, 64]!
> + ldp D_l, D_h, [src, 64]!
> + subs count, count, 64
> + b.hi L(loop64)
> +
> + /* Write the last iteration and copy 64 bytes from the end. */
> +L(copy64_from_end):
> + ldp E_l, E_h, [srcend, -64]
> + stp A_l, A_h, [dst, 16]
> + ldp A_l, A_h, [srcend, -48]
> + stp B_l, B_h, [dst, 32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dst, 48]
> + ldp C_l, C_h, [srcend, -16]
> + stp D_l, D_h, [dst, 64]
> + stp E_l, E_h, [dstend, -64]
> + stp A_l, A_h, [dstend, -48]
> + stp B_l, B_h, [dstend, -32]
> + stp C_l, C_h, [dstend, -16]
> ret
> -ENDPROC(__arch_memcpy)
> +
> + .p2align 4
> +
> + /* Large backwards copy for overlapping copies.
> + Copy 16 bytes and then align dst to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldp D_l, D_h, [srcend, -16]
> + and tmp1, dstend, 15
> + sub srcend, srcend, tmp1
> + sub count, count, tmp1
> + ldp A_l, A_h, [srcend, -16]
> + stp D_l, D_h, [dstend, -16]
> + ldp B_l, B_h, [srcend, -32]
> + ldp C_l, C_h, [srcend, -48]
> + ldp D_l, D_h, [srcend, -64]!
> + sub dstend, dstend, tmp1
> + subs count, count, 128
> + b.ls L(copy64_from_start)
> +
> +L(loop64_backwards):
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [srcend, -16]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [srcend, -48]
> + stp D_l, D_h, [dstend, -64]!
> + ldp D_l, D_h, [srcend, -64]!
> + subs count, count, 64
> + b.hi L(loop64_backwards)
> +
> + /* Write the last iteration and copy 64 bytes from the start. */
> +L(copy64_from_start):
> + ldp G_l, G_h, [src, 48]
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [src, 32]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [src, 16]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [src]
> + stp D_l, D_h, [dstend, -64]
> + stp G_l, G_h, [dstin, 48]
> + stp A_l, A_h, [dstin, 32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstin]
> + ret
> +SYM_FUNC_END(__pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
> +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
> +
> +SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
> +SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
> diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
> index ff201750f1..f059203983 100644
> --- a/arch/arm/lib64/memset.S
> +++ b/arch/arm/lib64/memset.S
> @@ -1,10 +1,9 @@
> /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> /*
> + * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
> * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> * be found @
> *
> * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> @@ -13,6 +12,7 @@
>
> #include <linux/linkage.h>
> #include <asm/assembler.h>
> +#include <asm/cache.h>
>
> /*
> * Fill in the buffer with character c (alignment handled by the hardware)
> @@ -42,8 +42,7 @@ dst .req x8
> tmp3w .req w9
> tmp3 .req x9
>
> - .weak memset
> -ENTRY(__arch_memset)
> +SYM_FUNC_START(__pi_memset)
> mov dst, dstin /* Preserve return value. */
> and A_lw, val, #255
> orr A_lw, A_lw, A_lw, lsl #8
> @@ -115,6 +114,7 @@ ENTRY(__arch_memset)
> * Critical loop. Start at a new cache line boundary. Assuming
> * 64 bytes per line, this ensures the entire loop is in one line.
> */
> + .p2align L1_CACHE_SHIFT
> .Lnot_short:
> sub dst, dst, #16/* Pre-bias. */
> sub count, count, #64
> @@ -201,4 +201,8 @@ ENTRY(__arch_memset)
> ands count, count, zva_bits_x
> b.ne .Ltail_maybe_long
> ret
> -ENDPROC(__arch_memset)
> +SYM_FUNC_END(__pi_memset)
> +
> +SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
> +
> +SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
> diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
> index 938790e1a9..c7954d6efe 100644
> --- a/arch/arm/lib64/string.c
> +++ b/arch/arm/lib64/string.c
> @@ -6,6 +6,7 @@
>
> void *__arch_memset(void *dst, int c, __kernel_size_t size);
> void *__arch_memcpy(void * dest, const void *src, size_t count);
> +void *__arch_memmove(void * dest, const void *src, size_t count);
>
> static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
> {
> @@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
>
> void *__memcpy(void * dest, const void *src, size_t count)
> __alias(_memcpy);
> +
> +static void *_memmove(void * dest, const void *src, size_t count)
> +{
> + if (likely(get_cr() & CR_M))
> + return __arch_memmove(dest, src, count);
> +
> + return __default_memmove(dest, src, count);
> +}
> +
> +void __weak *memmove(void * dest, const void *src, size_t count)
> +{
> + return _memmove(dest, src, count);
> +}
> +
> +void *__memmove(void * dest, const void *src, size_t count)
> + __alias(_memmove);
> diff --git a/include/string.h b/include/string.h
> index cbe6eddf7f..986ccd83dd 100644
> --- a/include/string.h
> +++ b/include/string.h
> @@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
> void *__default_memcpy(void * dest,const void *src,size_t count);
> void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
>
> +void *__default_memmove(void * dest,const void *src,size_t count);
> +
> char *parse_assignment(char *str);
>
> int strverscmp(const char *a, const char *b);
> diff --git a/lib/string.c b/lib/string.c
> index 98dd3cffdd..50c2016c2b 100644
> --- a/lib/string.c
> +++ b/lib/string.c
> @@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
> void *__memmove(void * dest, const void *src, size_t count)
> __alias(__default_memmove);
> #endif
> -EXPORT_SYMBOL(memmove);
>
> #ifndef __HAVE_ARCH_MEMCMP
> /**
>
> --
> 2.39.5
>
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
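A quick way to get numbers, should anyone want them, would be along these
lines. This is only a sketch: it assumes barebox's get_time_ns() clock and
picks an arbitrary offset and iteration count:

#include <clock.h>
#include <stdio.h>
#include <string.h>
#include <linux/types.h>

static void time_memmove(void *buf, size_t size)
{
	u64 start = get_time_ns();
	int i;

	/* overlapping move by one cache line, repeated for a stable number */
	for (i = 0; i < 100; i++)
		memmove(buf, (char *)buf + 64, size - 64);

	printf("100 moves of %zu bytes: %lluns\n", size - 64,
	       (unsigned long long)(get_time_ns() - start));
}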
* Re: [PATCH v2 10/10] ARM: add optimized memmove
2024-09-27 5:12 ` Marco Felsch
@ 2024-09-27 10:04 ` Sascha Hauer
0 siblings, 0 replies; 14+ messages in thread
From: Sascha Hauer @ 2024-09-27 10:04 UTC (permalink / raw)
To: Marco Felsch; +Cc: open list:BAREBOX, Ahmad Fatoum
On Fri, Sep 27, 2024 at 07:12:15AM +0200, Marco Felsch wrote:
> Hi Sascha,
>
> On 24-09-26, Sascha Hauer wrote:
> > Until now there has been no assembler optimized version of memmove() for
> > ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
> > memcpy() for ARM64 from Linux-6.10.
>
> out of curiosity, did you make performance measurements?
No, I haven't made any performance measurements. I implemented this for
framebuffer console rotation support. When moving a framebuffer one line
left or right, it's the difference between night and day. Otherwise I
think memmove is rarely used in barebox.
Sascha
--
Pengutronix e.K. | |
Steuerwalder Str. 21 | http://www.pengutronix.de/ |
31137 Hildesheim, Germany | Phone: +49-5121-206917-0 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-5555 |
^ permalink raw reply [flat|nested] 14+ messages in thread
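To make the framebuffer case mentioned above concrete: shifting the
visible contents horizontally is one overlapping copy per line, which is
exactly what memmove() is for. A sketch with an assumed 32bpp layout and a
stride given in pixels (names are illustrative, this is not barebox code):

#include <stdint.h>
#include <string.h>

static void fb_shift_left(uint32_t *fb, int width, int height, int stride)
{
	int y;

	for (y = 0; y < height; y++) {
		uint32_t *line = fb + y * stride;

		/* move pixels 1..width-1 one slot to the left */
		memmove(line, line + 1, (width - 1) * sizeof(*line));
	}
}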
* Re: [PATCH v2 00/10] ARM: add assembler optimized memmove
2024-09-26 11:17 [PATCH v2 00/10] ARM: add assembler optimized memmove Sascha Hauer
` (9 preceding siblings ...)
2024-09-26 11:17 ` [PATCH v2 10/10] ARM: add optimized memmove Sascha Hauer
@ 2024-09-27 10:39 ` Sascha Hauer
10 siblings, 0 replies; 14+ messages in thread
From: Sascha Hauer @ 2024-09-27 10:39 UTC (permalink / raw)
To: open list:BAREBOX, Sascha Hauer; +Cc: Ahmad Fatoum
On Thu, 26 Sep 2024 13:17:02 +0200, Sascha Hauer wrote:
> I realized that ARM uses the generic memmove() implementation which is
> rather slow. This series adds the assembler optimized version for ARM.
> The corresponding recent Linux code doesn't fit into barebox anymore, so
> to merge the code the surroundings have to be updated first, hence the
> series is bigger than I like it to be.
>
> Sascha
>
> [...]
Applied, thanks!
[01/10] ARM: Use optimized reads[bwl] and writes[bwl] functions
https://git.pengutronix.de/cgit/barebox/commit/?id=54885f734d50 (link may not be stable)
[02/10] ARM: rename logical shift macros push pull into lspush lspull
https://git.pengutronix.de/cgit/barebox/commit/?id=46319e748dfc (link may not be stable)
[03/10] ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+
https://git.pengutronix.de/cgit/barebox/commit/?id=e8baf08fca3f (link may not be stable)
[04/10] ARM: update lib1funcs.S from Linux
https://git.pengutronix.de/cgit/barebox/commit/?id=4a876371504b (link may not be stable)
[05/10] ARM: update findbit.S from Linux
https://git.pengutronix.de/cgit/barebox/commit/?id=7131c48f71e0 (link may not be stable)
[06/10] ARM: update io-* from Linux
https://git.pengutronix.de/cgit/barebox/commit/?id=cd6b3d722c19 (link may not be stable)
[07/10] ARM: always assume the unified syntax for assembly code
https://git.pengutronix.de/cgit/barebox/commit/?id=958c395ef6a8 (link may not be stable)
[08/10] ARM: update memcpy.S and memset.S from Linux
https://git.pengutronix.de/cgit/barebox/commit/?id=a6052c22a4e1 (link may not be stable)
[09/10] lib/string.c: export non optimized memmove as __default_memmove
https://git.pengutronix.de/cgit/barebox/commit/?id=54568a8ada22 (link may not be stable)
[10/10] ARM: add optimized memmove
https://git.pengutronix.de/cgit/barebox/commit/?id=95e8f2b2efd1 (link may not be stable)
Best regards,
--
Sascha Hauer <s.hauer@pengutronix.de>
^ permalink raw reply [flat|nested] 14+ messages in thread