summaryrefslogtreecommitdiff
path: root/packages/glibc/2.17/0047-glibc-ppc64le-25.patch
diff options
context:
space:
mode:
Diffstat (limited to 'packages/glibc/2.17/0047-glibc-ppc64le-25.patch')
-rw-r--r--packages/glibc/2.17/0047-glibc-ppc64le-25.patch411
1 files changed, 411 insertions, 0 deletions
diff --git a/packages/glibc/2.17/0047-glibc-ppc64le-25.patch b/packages/glibc/2.17/0047-glibc-ppc64le-25.patch
new file mode 100644
index 0000000..144d3f3
--- /dev/null
+++ b/packages/glibc/2.17/0047-glibc-ppc64le-25.patch
@@ -0,0 +1,411 @@
+# commit db9b4570c5dc550074140ac1d1677077fba29a26
+# Author: Alan Modra <amodra@gmail.com>
+# Date: Sat Aug 17 18:40:11 2013 +0930
+#
+# PowerPC LE strlen
+# http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html
+#
+# This is the first of nine patches adding little-endian support to the
+# existing optimised string and memory functions. I did spend some
+# time with a power7 simulator looking at cycle by cycle behaviour for
+# memchr, but most of these patches have not been run on cpu simulators
+# to check that we are going as fast as possible. I'm sure PowerPC can
+# do better. However, the little-endian support mostly leaves main
+# loops unchanged, so I'm banking on previous authors having done a
+# good job on big-endian. As with most code you stare at long enough,
+# I found some improvements for big-endian too.
+#
+# Little-endian support for strlen. Like most of the string functions,
+# I leave the main word or multiple-word loops substantially unchanged,
+# just needing to modify the tail.
+#
+# Removing the branch in the power7 functions is just a tidy. .align
+# produces a branch anyway. Modifying regs in the non-power7 functions
+# is to suit the new little-endian tail.
+#
+# * sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
+# support. Don't branch over align.
+# * sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
+# * sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support.
+# Rearrange tmp reg use to suit. Comment.
+# * sysdeps/powerpc/powerpc32/strlen.S: Likewise.
+#
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500
+@@ -31,7 +31,11 @@
+ li r0,0 /* Word with null chars to use with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ lwz r12,0(r4) /* Load word from memory. */
++#ifdef __LITTLE_ENDIAN__
++ slw r5,r5,r6
++#else
+ srw r5,r5,r6 /* MASK = MASK >> padding. */
++#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in WORD1. */
+ cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+@@ -49,9 +53,6 @@
+ cmpb r10,r12,r0
+ cmpwi cr7,r10,0
+ bne cr7,L(done)
+- b L(loop) /* We branch here (rather than falling through)
+- to skip the nops due to heavy alignment
+- of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+@@ -88,9 +89,15 @@
+ 0xff in the same position as the null byte in the original
+ word from the string. Use that to calculate the length. */
+ L(done):
+- cntlzw r0,r10 /* Count leading zeroes before the match. */
++#ifdef __LITTLE_ENDIAN__
++ addi r9, r10, -1 /* Form a mask from trailing zeros. */
++ andc r9, r9, r10
++ popcntw r0, r9 /* Count the bits in the mask. */
++#else
++ cntlzw r0,r10 /* Count leading zeros before the match. */
++#endif
+ subf r5,r3,r4
+- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
++ srwi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+ END (BP_SYM (strlen))
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:28:44.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:32:24.000000000 -0500
+@@ -31,7 +31,12 @@
+ 1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ have their high bit set. The expression here is
+ (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
+- there were no 0x00 bytes in the word.
++ there were no 0x00 bytes in the word. You get 0x80 in bytes that
++ match, but possibly false 0x80 matches in the next more significant
++ byte to a true match due to carries. For little-endian this is
++ of no consequence since the least significant match is the one
++ we're interested in, but big-endian needs method 2 to find which
++ byte matches.
+
+ 2) Given a word 'x', we can test to see _which_ byte was zero by
+ calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
+@@ -74,7 +79,7 @@
+
+ ENTRY (BP_SYM (strlen))
+
+-#define rTMP1 r0
++#define rTMP4 r0
+ #define rRTN r3 /* incoming STR arg, outgoing result */
+ #define rSTR r4 /* current string position */
+ #define rPADN r5 /* number of padding bits we prepend to the
+@@ -84,9 +89,9 @@
+ #define rWORD1 r8 /* current string word */
+ #define rWORD2 r9 /* next string word */
+ #define rMASK r9 /* mask for first string word */
+-#define rTMP2 r10
+-#define rTMP3 r11
+-#define rTMP4 r12
++#define rTMP1 r10
++#define rTMP2 r11
++#define rTMP3 r12
+
+ CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2)
+
+@@ -96,15 +101,20 @@
+ lwz rWORD1, 0(rSTR)
+ li rMASK, -1
+ addi r7F7F, r7F7F, 0x7f7f
+-/* That's the setup done, now do the first pair of words.
+- We make an exception and use method (2) on the first two words, to reduce
+- overhead. */
++/* We use method (2) on the first two words, because rFEFE isn't
++ required which reduces setup overhead. Also gives a faster return
++ for small strings on big-endian due to needing to recalculate with
++ method (2) anyway. */
++#ifdef __LITTLE_ENDIAN__
++ slw rMASK, rMASK, rPADN
++#else
+ srw rMASK, rMASK, rPADN
++#endif
+ and rTMP1, r7F7F, rWORD1
+ or rTMP2, r7F7F, rWORD1
+ add rTMP1, rTMP1, r7F7F
+- nor rTMP1, rTMP2, rTMP1
+- and. rWORD1, rTMP1, rMASK
++ nor rTMP3, rTMP2, rTMP1
++ and. rTMP3, rTMP3, rMASK
+ mtcrf 0x01, rRTN
+ bne L(done0)
+ lis rFEFE, -0x101
+@@ -113,11 +123,12 @@
+ bt 29, L(loop)
+
+ /* Handle second word of pair. */
++/* Perhaps use method (1) here for little-endian, saving one instruction? */
+ lwzu rWORD1, 4(rSTR)
+ and rTMP1, r7F7F, rWORD1
+ or rTMP2, r7F7F, rWORD1
+ add rTMP1, rTMP1, r7F7F
+- nor. rWORD1, rTMP2, rTMP1
++ nor. rTMP3, rTMP2, rTMP1
+ bne L(done0)
+
+ /* The loop. */
+@@ -131,29 +142,53 @@
+ add rTMP3, rFEFE, rWORD2
+ nor rTMP4, r7F7F, rWORD2
+ bne L(done1)
+- and. rTMP1, rTMP3, rTMP4
++ and. rTMP3, rTMP3, rTMP4
+ beq L(loop)
+
++#ifndef __LITTLE_ENDIAN__
+ and rTMP1, r7F7F, rWORD2
+ add rTMP1, rTMP1, r7F7F
+- andc rWORD1, rTMP4, rTMP1
++ andc rTMP3, rTMP4, rTMP1
+ b L(done0)
+
+ L(done1):
+ and rTMP1, r7F7F, rWORD1
+ subi rSTR, rSTR, 4
+ add rTMP1, rTMP1, r7F7F
+- andc rWORD1, rTMP2, rTMP1
++ andc rTMP3, rTMP2, rTMP1
+
+ /* When we get to here, rSTR points to the first word in the string that
+- contains a zero byte, and the most significant set bit in rWORD1 is in that
+- byte. */
++ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
++ and 0x00 otherwise. */
+ L(done0):
+- cntlzw rTMP3, rWORD1
++ cntlzw rTMP3, rTMP3
+ subf rTMP1, rRTN, rSTR
+ srwi rTMP3, rTMP3, 3
+ add rRTN, rTMP1, rTMP3
+ /* GKM FIXME: check high bound. */
+ blr
++#else
++
++L(done0):
++ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
++ andc rTMP1, rTMP1, rTMP3
++ cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
++ subf rTMP3, rRTN, rSTR
++ subfic rTMP1, rTMP1, 32-7
++ srwi rTMP1, rTMP1, 3
++ add rRTN, rTMP1, rTMP3
++ blr
++
++L(done1):
++ addi rTMP3, rTMP1, -1
++ andc rTMP3, rTMP3, rTMP1
++ cntlzw rTMP3, rTMP3
++ subf rTMP1, rRTN, rSTR
++ subfic rTMP3, rTMP3, 32-7-32
++ srawi rTMP3, rTMP3, 3
++ add rRTN, rTMP1, rTMP3
++ blr
++#endif
++
+ END (BP_SYM (strlen))
+ libc_hidden_builtin_def (strlen)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500
+@@ -32,7 +32,11 @@
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
++#ifdef __LITTLE_ENDIAN__
++ sld r5,r5,r6
++#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
++#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+@@ -50,9 +54,6 @@
+ cmpb r10,r12,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+- b L(loop) /* We branch here (rather than falling through)
+- to skip the nops due to heavy alignment
+- of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+@@ -89,9 +90,15 @@
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length. */
+ L(done):
+- cntlzd r0,r10 /* Count leading zeroes before the match. */
++#ifdef __LITTLE_ENDIAN__
++ addi r9, r10, -1 /* Form a mask from trailing zeros. */
++ andc r9, r9, r10
++ popcntd r0, r9 /* Count the bits in the mask. */
++#else
++ cntlzd r0,r10 /* Count leading zeros before the match. */
++#endif
+ subf r5,r3,r4
+- srdi r0,r0,3 /* Convert leading zeroes to bytes. */
++ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+ END (BP_SYM (strlen))
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:28:44.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:38:17.000000000 -0500
+@@ -31,7 +31,12 @@
+ 1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ have their high bit set. The expression here is
+ (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
+- there were no 0x00 bytes in the word.
++ there were no 0x00 bytes in the word. You get 0x80 in bytes that
++ match, but possibly false 0x80 matches in the next more significant
++ byte to a true match due to carries. For little-endian this is
++ of no consequence since the least significant match is the one
++ we're interested in, but big-endian needs method 2 to find which
++ byte matches.
+
+ 2) Given a word 'x', we can test to see _which_ byte was zero by
+ calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
+@@ -64,7 +69,7 @@
+ Answer:
+ 1) Added a Data Cache Block Touch early to prefetch the first 128
+ byte cache line. Adding dcbt instructions to the loop would not be
+- effective since most strings will be shorter than the cache line.*/
++ effective since most strings will be shorter than the cache line. */
+
+ /* Some notes on register usage: Under the SVR4 ABI, we can use registers
+ 0 and 3 through 12 (so long as we don't call any procedures) without
+@@ -80,7 +85,7 @@
+ ENTRY (BP_SYM (strlen))
+ CALL_MCOUNT 1
+
+-#define rTMP1 r0
++#define rTMP4 r0
+ #define rRTN r3 /* incoming STR arg, outgoing result */
+ #define rSTR r4 /* current string position */
+ #define rPADN r5 /* number of padding bits we prepend to the
+@@ -90,9 +95,9 @@
+ #define rWORD1 r8 /* current string doubleword */
+ #define rWORD2 r9 /* next string doubleword */
+ #define rMASK r9 /* mask for first string doubleword */
+-#define rTMP2 r10
+-#define rTMP3 r11
+-#define rTMP4 r12
++#define rTMP1 r10
++#define rTMP2 r11
++#define rTMP3 r12
+
+ /* Note: The Bounded pointer support in this code is broken. This code
+ was inherited from PPC32 and that support was never completed.
+@@ -109,30 +114,36 @@
+ addi r7F7F, r7F7F, 0x7f7f
+ li rMASK, -1
+ insrdi r7F7F, r7F7F, 32, 0
+-/* That's the setup done, now do the first pair of doublewords.
+- We make an exception and use method (2) on the first two doublewords,
+- to reduce overhead. */
+- srd rMASK, rMASK, rPADN
++/* We use method (2) on the first two doublewords, because rFEFE isn't
++ required which reduces setup overhead. Also gives a faster return
++ for small strings on big-endian due to needing to recalculate with
++ method (2) anyway. */
++#ifdef __LITTLE_ENDIAN__
++ sld rMASK, rMASK, rPADN
++#else
++ srd rMASK, rMASK, rPADN
++#endif
+ and rTMP1, r7F7F, rWORD1
+ or rTMP2, r7F7F, rWORD1
+ lis rFEFE, -0x101
+ add rTMP1, rTMP1, r7F7F
+ addi rFEFE, rFEFE, -0x101
+- nor rTMP1, rTMP2, rTMP1
+- and. rWORD1, rTMP1, rMASK
++ nor rTMP3, rTMP2, rTMP1
++ and. rTMP3, rTMP3, rMASK
+ mtcrf 0x01, rRTN
+ bne L(done0)
+- sldi rTMP1, rFEFE, 32
+- add rFEFE, rFEFE, rTMP1
++ sldi rTMP1, rFEFE, 32
++ add rFEFE, rFEFE, rTMP1
+ /* Are we now aligned to a doubleword boundary? */
+ bt 28, L(loop)
+
+ /* Handle second doubleword of pair. */
++/* Perhaps use method (1) here for little-endian, saving one instruction? */
+ ldu rWORD1, 8(rSTR)
+ and rTMP1, r7F7F, rWORD1
+ or rTMP2, r7F7F, rWORD1
+ add rTMP1, rTMP1, r7F7F
+- nor. rWORD1, rTMP2, rTMP1
++ nor. rTMP3, rTMP2, rTMP1
+ bne L(done0)
+
+ /* The loop. */
+@@ -146,29 +157,53 @@
+ add rTMP3, rFEFE, rWORD2
+ nor rTMP4, r7F7F, rWORD2
+ bne L(done1)
+- and. rTMP1, rTMP3, rTMP4
++ and. rTMP3, rTMP3, rTMP4
+ beq L(loop)
+
++#ifndef __LITTLE_ENDIAN__
+ and rTMP1, r7F7F, rWORD2
+ add rTMP1, rTMP1, r7F7F
+- andc rWORD1, rTMP4, rTMP1
++ andc rTMP3, rTMP4, rTMP1
+ b L(done0)
+
+ L(done1):
+ and rTMP1, r7F7F, rWORD1
+ subi rSTR, rSTR, 8
+ add rTMP1, rTMP1, r7F7F
+- andc rWORD1, rTMP2, rTMP1
++ andc rTMP3, rTMP2, rTMP1
+
+ /* When we get to here, rSTR points to the first doubleword in the string that
+- contains a zero byte, and the most significant set bit in rWORD1 is in that
+- byte. */
++ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
++ otherwise. */
+ L(done0):
+- cntlzd rTMP3, rWORD1
++ cntlzd rTMP3, rTMP3
+ subf rTMP1, rRTN, rSTR
+ srdi rTMP3, rTMP3, 3
+ add rRTN, rTMP1, rTMP3
+ /* GKM FIXME: check high bound. */
+ blr
++#else
++
++L(done0):
++ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
++ andc rTMP1, rTMP1, rTMP3
++ cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */
++ subf rTMP3, rRTN, rSTR
++ subfic rTMP1, rTMP1, 64-7
++ srdi rTMP1, rTMP1, 3
++ add rRTN, rTMP1, rTMP3
++ blr
++
++L(done1):
++ addi rTMP3, rTMP1, -1
++ andc rTMP3, rTMP3, rTMP1
++ cntlzd rTMP3, rTMP3
++ subf rTMP1, rRTN, rSTR
++ subfic rTMP3, rTMP3, 64-7-64
++ sradi rTMP3, rTMP3, 3
++ add rRTN, rTMP1, rTMP3
++ blr
++#endif
++
+ END (BP_SYM (strlen))
+ libc_hidden_builtin_def (strlen)