diff options
Diffstat (limited to 'packages/glibc/2.17/0047-glibc-ppc64le-25.patch')
-rw-r--r-- | packages/glibc/2.17/0047-glibc-ppc64le-25.patch | 411 |
1 files changed, 411 insertions, 0 deletions
diff --git a/packages/glibc/2.17/0047-glibc-ppc64le-25.patch b/packages/glibc/2.17/0047-glibc-ppc64le-25.patch new file mode 100644 index 0000000..144d3f3 --- /dev/null +++ b/packages/glibc/2.17/0047-glibc-ppc64le-25.patch @@ -0,0 +1,411 @@ +# commit db9b4570c5dc550074140ac1d1677077fba29a26 +# Author: Alan Modra <amodra@gmail.com> +# Date: Sat Aug 17 18:40:11 2013 +0930 +# +# PowerPC LE strlen +# http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html +# +# This is the first of nine patches adding little-endian support to the +# existing optimised string and memory functions. I did spend some +# time with a power7 simulator looking at cycle by cycle behaviour for +# memchr, but most of these patches have not been run on cpu simulators +# to check that we are going as fast as possible. I'm sure PowerPC can +# do better. However, the little-endian support mostly leaves main +# loops unchanged, so I'm banking on previous authors having done a +# good job on big-endian.. As with most code you stare at long enough, +# I found some improvements for big-endian too. +# +# Little-endian support for strlen. Like most of the string functions, +# I leave the main word or multiple-word loops substantially unchanged, +# just needing to modify the tail. +# +# Removing the branch in the power7 functions is just a tidy. .align +# produces a branch anyway. Modifying regs in the non-power7 functions +# is to suit the new little-endian tail. +# +# * sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian +# support. Don't branch over align. +# * sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise. +# * sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support. +# Rearrange tmp reg use to suit. Comment. +# * sysdeps/powerpc/powerpc32/strlen.S: Likewise. +# +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500 +@@ -31,7 +31,11 @@ + li r0,0 /* Word with null chars to use with cmpb. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + lwz r12,0(r4) /* Load word from memory. */ ++#ifdef __LITTLE_ENDIAN__ ++ slw r5,r5,r6 ++#else + srw r5,r5,r6 /* MASK = MASK >> padding. */ ++#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. */ + cmpb r10,r9,r0 /* Check for null bytes in WORD1. */ + cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */ +@@ -49,9 +53,6 @@ + cmpb r10,r12,r0 + cmpwi cr7,r10,0 + bne cr7,L(done) +- b L(loop) /* We branch here (rather than falling through) +- to skip the nops due to heavy alignment +- of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ +@@ -88,9 +89,15 @@ + 0xff in the same position as the null byte in the original + word from the string. Use that to calculate the length. */ + L(done): +- cntlzw r0,r10 /* Count leading zeroes before the match. */ ++#ifdef __LITTLE_ENDIAN__ ++ addi r9, r10, -1 /* Form a mask from trailing zeros. */ ++ andc r9, r9, r10 ++ popcntw r0, r9 /* Count the bits in the mask. */ ++#else ++ cntlzw r0,r10 /* Count leading zeros before the match. */ ++#endif + subf r5,r3,r4 +- srwi r0,r0,3 /* Convert leading zeroes to bytes. */ ++ srwi r0,r0,3 /* Convert leading zeros to bytes. */ + add r3,r5,r0 /* Compute final length. */ + blr + END (BP_SYM (strlen)) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:28:44.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:32:24.000000000 -0500 +@@ -31,7 +31,12 @@ + 1 is subtracted you get a value in the range 0x00-0x7f, none of which + have their high bit set. The expression here is + (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when +- there were no 0x00 bytes in the word. ++ there were no 0x00 bytes in the word. You get 0x80 in bytes that ++ match, but possibly false 0x80 matches in the next more significant ++ byte to a true match due to carries. For little-endian this is ++ of no consequence since the least significant match is the one ++ we're interested in, but big-endian needs method 2 to find which ++ byte matches. + + 2) Given a word 'x', we can test to see _which_ byte was zero by + calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). +@@ -74,7 +79,7 @@ + + ENTRY (BP_SYM (strlen)) + +-#define rTMP1 r0 ++#define rTMP4 r0 + #define rRTN r3 /* incoming STR arg, outgoing result */ + #define rSTR r4 /* current string position */ + #define rPADN r5 /* number of padding bits we prepend to the +@@ -84,9 +89,9 @@ + #define rWORD1 r8 /* current string word */ + #define rWORD2 r9 /* next string word */ + #define rMASK r9 /* mask for first string word */ +-#define rTMP2 r10 +-#define rTMP3 r11 +-#define rTMP4 r12 ++#define rTMP1 r10 ++#define rTMP2 r11 ++#define rTMP3 r12 + + CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2) + +@@ -96,15 +101,20 @@ + lwz rWORD1, 0(rSTR) + li rMASK, -1 + addi r7F7F, r7F7F, 0x7f7f +-/* That's the setup done, now do the first pair of words. +- We make an exception and use method (2) on the first two words, to reduce +- overhead. */ ++/* We use method (2) on the first two words, because rFEFE isn't ++ required which reduces setup overhead. Also gives a faster return ++ for small strings on big-endian due to needing to recalculate with ++ method (2) anyway. */ ++#ifdef __LITTLE_ENDIAN__ ++ slw rMASK, rMASK, rPADN ++#else + srw rMASK, rMASK, rPADN ++#endif + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + add rTMP1, rTMP1, r7F7F +- nor rTMP1, rTMP2, rTMP1 +- and. rWORD1, rTMP1, rMASK ++ nor rTMP3, rTMP2, rTMP1 ++ and. rTMP3, rTMP3, rMASK + mtcrf 0x01, rRTN + bne L(done0) + lis rFEFE, -0x101 +@@ -113,11 +123,12 @@ + bt 29, L(loop) + + /* Handle second word of pair. */ ++/* Perhaps use method (1) here for little-endian, saving one instruction? */ + lwzu rWORD1, 4(rSTR) + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + add rTMP1, rTMP1, r7F7F +- nor. rWORD1, rTMP2, rTMP1 ++ nor. rTMP3, rTMP2, rTMP1 + bne L(done0) + + /* The loop. */ +@@ -131,29 +142,53 @@ + add rTMP3, rFEFE, rWORD2 + nor rTMP4, r7F7F, rWORD2 + bne L(done1) +- and. rTMP1, rTMP3, rTMP4 ++ and. rTMP3, rTMP3, rTMP4 + beq L(loop) + ++#ifndef __LITTLE_ENDIAN__ + and rTMP1, r7F7F, rWORD2 + add rTMP1, rTMP1, r7F7F +- andc rWORD1, rTMP4, rTMP1 ++ andc rTMP3, rTMP4, rTMP1 + b L(done0) + + L(done1): + and rTMP1, r7F7F, rWORD1 + subi rSTR, rSTR, 4 + add rTMP1, rTMP1, r7F7F +- andc rWORD1, rTMP2, rTMP1 ++ andc rTMP3, rTMP2, rTMP1 + + /* When we get to here, rSTR points to the first word in the string that +- contains a zero byte, and the most significant set bit in rWORD1 is in that +- byte. */ ++ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, ++ and 0x00 otherwise. */ + L(done0): +- cntlzw rTMP3, rWORD1 ++ cntlzw rTMP3, rTMP3 + subf rTMP1, rRTN, rSTR + srwi rTMP3, rTMP3, 3 + add rRTN, rTMP1, rTMP3 + /* GKM FIXME: check high bound. */ + blr ++#else ++ ++L(done0): ++ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */ ++ andc rTMP1, rTMP1, rTMP3 ++ cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */ ++ subf rTMP3, rRTN, rSTR ++ subfic rTMP1, rTMP1, 32-7 ++ srwi rTMP1, rTMP1, 3 ++ add rRTN, rTMP1, rTMP3 ++ blr ++ ++L(done1): ++ addi rTMP3, rTMP1, -1 ++ andc rTMP3, rTMP3, rTMP1 ++ cntlzw rTMP3, rTMP3 ++ subf rTMP1, rRTN, rSTR ++ subfic rTMP3, rTMP3, 32-7-32 ++ srawi rTMP3, rTMP3, 3 ++ add rRTN, rTMP1, rTMP3 ++ blr ++#endif ++ + END (BP_SYM (strlen)) + libc_hidden_builtin_def (strlen) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500 +@@ -32,7 +32,11 @@ + with cmpb. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + ld r12,0(r4) /* Load doubleword from memory. */ ++#ifdef __LITTLE_ENDIAN__ ++ sld r5,r5,r6 ++#else + srd r5,r5,r6 /* MASK = MASK >> padding. */ ++#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. */ + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ +@@ -50,9 +54,6 @@ + cmpb r10,r12,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) +- b L(loop) /* We branch here (rather than falling through) +- to skip the nops due to heavy alignment +- of the loop below. */ + + /* Main loop to look for the end of the string. Since it's a + small loop (< 8 instructions), align it to 32-bytes. */ +@@ -89,9 +90,15 @@ + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the length. */ + L(done): +- cntlzd r0,r10 /* Count leading zeroes before the match. */ ++#ifdef __LITTLE_ENDIAN__ ++ addi r9, r10, -1 /* Form a mask from trailing zeros. */ ++ andc r9, r9, r10 ++ popcntd r0, r9 /* Count the bits in the mask. */ ++#else ++ cntlzd r0,r10 /* Count leading zeros before the match. */ ++#endif + subf r5,r3,r4 +- srdi r0,r0,3 /* Convert leading zeroes to bytes. */ ++ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r5,r0 /* Compute final length. */ + blr + END (BP_SYM (strlen)) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:28:44.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:38:17.000000000 -0500 +@@ -31,7 +31,12 @@ + 1 is subtracted you get a value in the range 0x00-0x7f, none of which + have their high bit set. The expression here is + (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when +- there were no 0x00 bytes in the word. ++ there were no 0x00 bytes in the word. You get 0x80 in bytes that ++ match, but possibly false 0x80 matches in the next more significant ++ byte to a true match due to carries. For little-endian this is ++ of no consequence since the least significant match is the one ++ we're interested in, but big-endian needs method 2 to find which ++ byte matches. + + 2) Given a word 'x', we can test to see _which_ byte was zero by + calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). +@@ -64,7 +69,7 @@ + Answer: + 1) Added a Data Cache Block Touch early to prefetch the first 128 + byte cache line. Adding dcbt instructions to the loop would not be +- effective since most strings will be shorter than the cache line.*/ ++ effective since most strings will be shorter than the cache line. */ + + /* Some notes on register usage: Under the SVR4 ABI, we can use registers + 0 and 3 through 12 (so long as we don't call any procedures) without +@@ -80,7 +85,7 @@ + ENTRY (BP_SYM (strlen)) + CALL_MCOUNT 1 + +-#define rTMP1 r0 ++#define rTMP4 r0 + #define rRTN r3 /* incoming STR arg, outgoing result */ + #define rSTR r4 /* current string position */ + #define rPADN r5 /* number of padding bits we prepend to the +@@ -90,9 +95,9 @@ + #define rWORD1 r8 /* current string doubleword */ + #define rWORD2 r9 /* next string doubleword */ + #define rMASK r9 /* mask for first string doubleword */ +-#define rTMP2 r10 +-#define rTMP3 r11 +-#define rTMP4 r12 ++#define rTMP1 r10 ++#define rTMP2 r11 ++#define rTMP3 r12 + + /* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and that support was never completed. +@@ -109,30 +114,36 @@ + addi r7F7F, r7F7F, 0x7f7f + li rMASK, -1 + insrdi r7F7F, r7F7F, 32, 0 +-/* That's the setup done, now do the first pair of doublewords. +- We make an exception and use method (2) on the first two doublewords, +- to reduce overhead. */ +- srd rMASK, rMASK, rPADN ++/* We use method (2) on the first two doublewords, because rFEFE isn't ++ required which reduces setup overhead. Also gives a faster return ++ for small strings on big-endian due to needing to recalculate with ++ method (2) anyway. */ ++#ifdef __LITTLE_ENDIAN__ ++ sld rMASK, rMASK, rPADN ++#else ++ srd rMASK, rMASK, rPADN ++#endif + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + lis rFEFE, -0x101 + add rTMP1, rTMP1, r7F7F + addi rFEFE, rFEFE, -0x101 +- nor rTMP1, rTMP2, rTMP1 +- and. rWORD1, rTMP1, rMASK ++ nor rTMP3, rTMP2, rTMP1 ++ and. rTMP3, rTMP3, rMASK + mtcrf 0x01, rRTN + bne L(done0) +- sldi rTMP1, rFEFE, 32 +- add rFEFE, rFEFE, rTMP1 ++ sldi rTMP1, rFEFE, 32 ++ add rFEFE, rFEFE, rTMP1 + /* Are we now aligned to a doubleword boundary? */ + bt 28, L(loop) + + /* Handle second doubleword of pair. */ ++/* Perhaps use method (1) here for little-endian, saving one instruction? */ + ldu rWORD1, 8(rSTR) + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + add rTMP1, rTMP1, r7F7F +- nor. rWORD1, rTMP2, rTMP1 ++ nor. rTMP3, rTMP2, rTMP1 + bne L(done0) + + /* The loop. */ +@@ -146,29 +157,53 @@ + add rTMP3, rFEFE, rWORD2 + nor rTMP4, r7F7F, rWORD2 + bne L(done1) +- and. rTMP1, rTMP3, rTMP4 ++ and. rTMP3, rTMP3, rTMP4 + beq L(loop) + ++#ifndef __LITTLE_ENDIAN__ + and rTMP1, r7F7F, rWORD2 + add rTMP1, rTMP1, r7F7F +- andc rWORD1, rTMP4, rTMP1 ++ andc rTMP3, rTMP4, rTMP1 + b L(done0) + + L(done1): + and rTMP1, r7F7F, rWORD1 + subi rSTR, rSTR, 8 + add rTMP1, rTMP1, r7F7F +- andc rWORD1, rTMP2, rTMP1 ++ andc rTMP3, rTMP2, rTMP1 + + /* When we get to here, rSTR points to the first doubleword in the string that +- contains a zero byte, and the most significant set bit in rWORD1 is in that +- byte. */ ++ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00 ++ otherwise. */ + L(done0): +- cntlzd rTMP3, rWORD1 ++ cntlzd rTMP3, rTMP3 + subf rTMP1, rRTN, rSTR + srdi rTMP3, rTMP3, 3 + add rRTN, rTMP1, rTMP3 + /* GKM FIXME: check high bound. */ + blr ++#else ++ ++L(done0): ++ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */ ++ andc rTMP1, rTMP1, rTMP3 ++ cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */ ++ subf rTMP3, rRTN, rSTR ++ subfic rTMP1, rTMP1, 64-7 ++ srdi rTMP1, rTMP1, 3 ++ add rRTN, rTMP1, rTMP3 ++ blr ++ ++L(done1): ++ addi rTMP3, rTMP1, -1 ++ andc rTMP3, rTMP3, rTMP1 ++ cntlzd rTMP3, rTMP3 ++ subf rTMP1, rRTN, rSTR ++ subfic rTMP3, rTMP3, 64-7-64 ++ sradi rTMP3, rTMP3, 3 ++ add rRTN, rTMP1, rTMP3 ++ blr ++#endif ++ + END (BP_SYM (strlen)) + libc_hidden_builtin_def (strlen) |