author | Chris Packham <judge.packham@gmail.com> | 2021-05-18 08:46:49 (GMT)
---|---|---
committer | GitHub <noreply@github.com> | 2021-05-18 08:46:49 (GMT)
commit | 6d008334bcfa76f8b46e61d9edb6dd5335cd6632 (patch) |
tree | cd137ec7ab048fa32049a4322c10a0e27ba80c20 /packages/glibc/2.17/0053-glibc-ppc64le-31.patch |
parent | f284f4149518de6e8c403a9392be8e817bfab2e8 (diff) |
parent | 0088351811bf442aa2e7d35c564f36ca67a8a699 (diff) |
Merge pull request #1510 from messense/glibc-ppc64le-patches
Add GLIBC 2.17 support to powerpc64le-unknown-linux-gnu
Diffstat (limited to 'packages/glibc/2.17/0053-glibc-ppc64le-31.patch')
-rw-r--r-- | packages/glibc/2.17/0053-glibc-ppc64le-31.patch | 2943 |
1 files changed, 2943 insertions, 0 deletions
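The patch that follows adds `__LITTLE_ENDIAN__` variants throughout glibc's PowerPC `memcpy`/`mempcpy`: wherever two aligned (double)words are merged to form one misaligned (double)word, the shift pair swaps direction (`sld`/`srd` become `srd`/`sld`, `slwi`/`srwi` likewise), and the `vperm` operand order is reversed in the VSX path. As a rough editorial illustration only (not part of the patch or of glibc), here is a minimal C sketch of that merge step; it assumes a GCC/Clang-style `__BYTE_ORDER__` predefine instead of the target-specific `__LITTLE_ENDIAN__` macro the assembly tests, and a misalignment of 1-7 bytes so both 64-bit shifts stay well defined.

```c
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Merge two aligned 64-bit loads into the doubleword that starts `off`
   bytes (1..7) into the first one -- the operation the patched assembly
   performs with sld/srd on big-endian and srd/sld on little-endian.  */
static uint64_t merge(uint64_t w0, uint64_t w1, unsigned off)
{
    unsigned shift = off * 8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    /* Little-endian: lower addresses live in the low-order bits, so drop
       `off` bytes with a right shift and pull the rest in from the next
       word with a left shift.  */
    return (w0 >> shift) | (w1 << (64 - shift));
#else
    /* Big-endian: lower addresses live in the high-order bits, so the
       shift directions are reversed -- the sld/srd pairs in the patch.  */
    return (w0 << shift) | (w1 >> (64 - shift));
#endif
}

int main(void)
{
    unsigned char buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (unsigned char) i;

    uint64_t w0, w1, ref;
    memcpy(&w0, buf, 8);        /* aligned load of bytes 0..7   */
    memcpy(&w1, buf + 8, 8);    /* aligned load of bytes 8..15  */
    memcpy(&ref, buf + 3, 8);   /* what a misaligned load gives */

    printf("%s\n", merge(w0, w1, 3) == ref ? "match" : "mismatch");
    return 0;
}
```

The same reasoning is behind the vector changes in the patch: on little-endian, `lvsr` plus reversed `vperm` inputs select the same logical bytes that `lvsl` with the original operand order selects on big-endian.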
diff --git a/packages/glibc/2.17/0053-glibc-ppc64le-31.patch b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch new file mode 100644 index 0000000..de90661 --- /dev/null +++ b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch @@ -0,0 +1,2943 @@ +# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d +# Author: Alan Modra <amodra@gmail.com> +# Date: Sat Aug 17 18:47:22 2013 +0930 +# +# PowerPC LE memcpy +# http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html +# +# LIttle-endian support for memcpy. I spent some time cleaning up the +# 64-bit power7 memcpy, in order to avoid the extra alignment traps +# power7 takes for little-endian. It probably would have been better +# to copy the linux kernel version of memcpy. +# +# * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support. +# * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better +# use of regs. Use power7 mtocrf. Tidy function tails. +# +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -205,15 +205,28 @@ + blt cr6,5f + srwi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmplwi cr1,10,16 +@@ -341,13 +354,23 @@ + bf 30,1f + + /* there are at least two words to copy, so copy them */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 1st src word to left align it in R0 */ + srw 8,7,9 /* shift 2nd src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + lwz 6,8(5) /* load the 3rd src word */ + stw 0,0(4) /* store the 1st dst word */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 /* now left align 2nd src word into R0 */ + srw 8,6,9 /* shift 3rd src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + lwz 7,12(5) + stw 0,4(4) /* store the 2nd dst word */ +@@ -355,8 +378,13 @@ + addi 5,5,16 + bf 31,4f + /* there is a third word to copy, so copy it */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 3rd src word to left align it in R0 */ + srw 8,7,9 /* shift 4th src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + stw 0,0(4) /* store 3rd dst word */ + mr 6,7 +@@ -366,8 +394,13 @@ + b 4f + .align 4 + 1: ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 1st src word to left align it in R0 */ + srw 8,7,9 /* shift 2nd src word to right align it in R8 */ ++#endif + addi 5,5,8 + or 0,0,8 /* or them to get word to store */ + bf 31,4f +@@ -380,23 +413,43 @@ + .align 4 + 4: + /* copy 16 bytes at a time */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 
0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + lwz 6,0(5) + stw 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 + srw 8,6,9 ++#endif + or 0,0,8 + lwz 7,4(5) + stw 0,4(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + lwz 6,8(5) + stw 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 + srw 8,6,9 ++#endif + or 0,0,8 + lwz 7,12(5) + stw 0,12(4) +@@ -405,8 +458,13 @@ + bdnz+ 4b + 8: + /* calculate and store the final word */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + stw 0,0(4) + 3: +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -221,15 +221,28 @@ + blt cr6,5f + srwi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmplwi cr1,10,16 +@@ -579,7 +592,11 @@ + lwz 6,-1(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,8 ++#else + slwi 6,6,8 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. */ + blt cr5,L(wdu1_32tail) + mtctr 8 +@@ -587,8 +604,12 @@ + + lwz 8,3(4) + lwz 7,4(4) ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else + /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + b L(wdu1_loop32x) + .align 4 + L(wdu1_loop32): +@@ -597,8 +618,12 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else + /* Equivalent to srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + L(wdu1_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -615,7 +640,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,8 ++#else + slwi 6,8,8 ++#endif + bdnz+ L(wdu1_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -626,8 +655,12 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,3(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else ++/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + b L(wdu_32tailx) + + L(wdu2_32): +@@ -635,7 +668,11 @@ + lwz 6,-2(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,16 ++#else + slwi 6,6,16 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ + blt cr5,L(wdu2_32tail) + mtctr 8 +@@ -643,8 +680,11 @@ + + lwz 8,2(4) + lwz 7,4(4) +-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + b L(wdu2_loop32x) + .align 4 + L(wdu2_loop32): +@@ -653,8 +693,11 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) +-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + L(wdu2_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -672,7 +715,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,16 ++#else + slwi 6,8,16 ++#endif + bdnz+ L(wdu2_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -683,8 +730,11 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,2(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + b L(wdu_32tailx) + + L(wdu3_32): +@@ -692,7 +742,11 @@ + lwz 6,-3(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,24 ++#else + slwi 6,6,24 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. */ + blt cr5,L(wdu3_32tail) + mtctr 8 +@@ -700,8 +754,11 @@ + + lwz 8,1(4) + lwz 7,4(4) +-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + b L(wdu3_loop32x) + .align 4 + L(wdu3_loop32): +@@ -710,8 +767,11 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) +-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + L(wdu3_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -728,7 +788,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,24 ++#else + slwi 6,8,24 ++#endif + bdnz+ L(wdu3_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -739,8 +803,11 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,1(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + b L(wdu_32tailx) + .align 4 + L(wdu_32tailx): +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -385,7 +385,7 @@ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 +@@ -437,13 +437,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmplwi cr6,9,1 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else + lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else + vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -463,11 +471,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. 
*/ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -327,7 +327,7 @@ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 +@@ -379,13 +379,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmplwi cr6,9,1 +- lvsl 5,0,12 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else ++ lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 +- vperm 6,3,4,5 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -405,11 +413,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -214,15 +214,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -330,7 +343,11 @@ + ld 7,8(5) + subfic 9,10,64 + beq 2f ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++#else + sld 0,6,10 ++#endif + cmpldi 11,1 + mr 6,7 + addi 4,4,-8 +@@ -338,15 +355,25 @@ + b 1f + 2: addi 5,5,8 + .align 4 ++#ifdef __LITTLE_ENDIAN__ ++0: srd 0,6,10 ++ sld 8,7,9 ++#else + 0: sld 0,6,10 + srd 8,7,9 ++#endif + cmpldi 11,2 + ld 6,8(5) + or 0,0,8 + addi 11,11,-2 + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++1: sld 8,6,9 ++#else + sld 0,7,10 + 1: srd 8,6,9 ++#endif + or 0,0,8 + beq 8f + ld 7,16(5) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:05:51.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64. +- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2003-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -17,26 +17,24 @@ + <http://www.gnu.org/licenses/>. 
*/ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. + +- Memcpy handles short copies (< 32-bytes) using a binary move blocks +- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled +- with the appropriate combination of byte and halfword load/stores. +- There is minimal effort to optimize the alignment of short moves. ++ Memcpy handles short copies (< 32-bytes) using a binary move blocks ++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled ++ with the appropriate combination of byte and halfword load/stores. ++ There is minimal effort to optimize the alignment of short moves. + The 64-bit implementations of POWER3 and POWER4 do a reasonable job +- of handling unligned load/stores that do not cross 32-byte boundries. ++ of handling unaligned load/stores that do not cross 32-byte boundaries. + + Longer moves (>= 32-bytes) justify the effort to get at least the + destination doubleword (8-byte) aligned. Further optimization is +- posible when both source and destination are doubleword aligned. ++ possible when both source and destination are doubleword aligned. + Each case has a optimized unrolled loop. */ + + .machine power4 +-EALIGN (BP_SYM (memcpy), 5, 0) ++EALIGN (memcpy, 5, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 +@@ -44,20 +42,20 @@ + std 3,-16(1) + std 31,-8(1) + cfi_offset(31,-8) +- andi. 11,3,7 /* check alignement of dst. */ ++ andi. 11,3,7 /* check alignment of dst. */ + clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */ +- clrldi 10,4,61 /* check alignement of src. */ ++ clrldi 10,4,61 /* check alignment of src. */ + cmpldi cr6,5,8 + ble- cr1,.L2 /* If move < 32 bytes use short move code. */ +- cmpld cr6,10,11 ++ cmpld cr6,10,11 + mr 12,4 + srdi 9,5,3 /* Number of full double words remaining. */ + mtcrf 0x01,0 + mr 31,5 + beq .L0 +- ++ + subf 31,0,5 +- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */ ++ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */ + 1: bf 31,2f + lbz 6,0(12) + addi 12,12,1 +@@ -74,17 +72,17 @@ + stw 6,0(3) + addi 3,3,4 + 0: +- clrldi 10,12,61 /* check alignement of src again. */ ++ clrldi 10,12,61 /* check alignment of src again. */ + srdi 9,31,3 /* Number of full double words remaining. */ +- +- /* Copy doublewords from source to destination, assumpting the ++ ++ /* Copy doublewords from source to destination, assuming the + destination is aligned on a doubleword boundary. + + At this point we know there are at least 25 bytes left (32-7) to copy. +- The next step is to determine if the source is also doubleword aligned. ++ The next step is to determine if the source is also doubleword aligned. + If not branch to the unaligned move code at .L6. which uses + a load, shift, store strategy. +- ++ + Otherwise source and destination are doubleword aligned, and we can + the optimized doubleword copy loop. */ + .L0: +@@ -97,14 +95,14 @@ + Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration. + If the copy is not an exact multiple of 32 bytes, 1-3 + doublewords are copied as needed to set up the main loop. After +- the main loop exits there may be a tail of 1-7 bytes. These byte are ++ the main loop exits there may be a tail of 1-7 bytes. These byte are + copied a word/halfword/byte at a time as needed to preserve alignment. 
*/ + + srdi 8,31,5 + cmpldi cr1,9,4 + cmpldi cr6,11,0 + mr 11,12 +- ++ + bf 30,1f + ld 6,0(12) + ld 7,8(12) +@@ -115,7 +113,7 @@ + addi 10,3,16 + bf 31,4f + ld 0,16(12) +- std 0,16(3) ++ std 0,16(3) + blt cr1,3f + addi 11,12,24 + addi 10,3,24 +@@ -129,7 +127,7 @@ + addi 11,12,8 + std 6,0(3) + addi 10,3,8 +- ++ + .align 4 + 4: + ld 6,0(11) +@@ -144,7 +142,7 @@ + std 0,24(10) + addi 10,10,32 + bdnz 4b +-3: ++3: + + rldicr 0,31,0,60 + mtcrf 0x01,31 +@@ -152,9 +150,9 @@ + .L9: + add 3,3,0 + add 12,12,0 +- ++ + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(12) + addi 12,12,4 +@@ -173,29 +171,29 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +- +-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 +- bytes. Each case is handled without loops, using binary (1,2,4,8) +- tests. +- ++ ++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 ++ bytes. Each case is handled without loops, using binary (1,2,4,8) ++ tests. ++ + In the short (0-8 byte) case no attempt is made to force alignment +- of either source or destination. The hardware will handle the +- unaligned load/stores with small delays for crossing 32- 64-byte, and ++ of either source or destination. The hardware will handle the ++ unaligned load/stores with small delays for crossing 32- 64-byte, and + 4096-byte boundaries. Since these short moves are unlikely to be +- unaligned or cross these boundaries, the overhead to force ++ unaligned or cross these boundaries, the overhead to force + alignment is not justified. +- ++ + The longer (9-31 byte) move is more likely to cross 32- or 64-byte + boundaries. Since only loads are sensitive to the 32-/64-byte +- boundaries it is more important to align the source then the ++ boundaries it is more important to align the source then the + destination. If the source is not already word aligned, we first +- move 1-3 bytes as needed. Since we are only word aligned we don't +- use double word load/stores to insure that all loads are aligned. ++ move 1-3 bytes as needed. Since we are only word aligned we don't ++ use double word load/stores to insure that all loads are aligned. + While the destination and stores may still be unaligned, this + is only an issue for page (4096 byte boundary) crossing, which + should be rare for these short moves. The hardware handles this +- case automatically with a small delay. */ +- ++ case automatically with a small delay. */ ++ + .align 4 + .L2: + mtcrf 0x01,5 +@@ -216,15 +214,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -258,11 +269,11 @@ + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) +- addi 3,3,4 ++ addi 3,3,4 + 2: /* Move 2-3 bytes. */ + bf 30,1f + lhz 6,0(12) +- sth 6,0(3) ++ sth 6,0(3) + bf 31,0f + lbz 7,2(12) + stb 7,2(3) +@@ -283,8 +294,8 @@ + mr 12,4 + bne cr6,4f + /* Would have liked to use use ld/std here but the 630 processors are +- slow for load/store doubles that are not at least word aligned. +- Unaligned Load/Store word execute with only a 1 cycle penaltity. */ ++ slow for load/store doubles that are not at least word aligned. 
++ Unaligned Load/Store word execute with only a 1 cycle penalty. */ + lwz 6,0(4) + lwz 7,4(4) + stw 6,0(3) +@@ -299,14 +310,14 @@ + 6: + bf 30,5f + lhz 7,4(4) +- sth 7,4(3) ++ sth 7,4(3) + bf 31,0f + lbz 8,6(4) + stb 8,6(3) + ld 3,-16(1) + blr + .align 4 +-5: ++5: + bf 31,0f + lbz 6,4(4) + stb 6,4(3) +@@ -336,13 +347,23 @@ + bf 30,1f + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -351,8 +372,13 @@ + blt cr6,8f /* if total DWs = 3, then bypass loop */ + bf 31,4f + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -363,8 +389,13 @@ + b 4f + .align 4 + 1: ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,4f +@@ -375,23 +406,44 @@ + addi 4,4,8 + .align 4 + /* copy 32 bytes at a time */ +-4: sld 0,6,10 ++4: ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else ++ sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -401,9 +453,14 @@ + .align 4 + 8: + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + 3: + rldicr 0,31,0,60 +@@ -413,5 +470,5 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:05:27.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64. +- Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2003-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -17,52 +17,50 @@ + <http://www.gnu.org/licenses/>. */ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. + +- Memcpy handles short copies (< 32-bytes) using a binary move blocks +- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled +- with the appropriate combination of byte and halfword load/stores. +- There is minimal effort to optimize the alignment of short moves. ++ Memcpy handles short copies (< 32-bytes) using a binary move blocks ++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled ++ with the appropriate combination of byte and halfword load/stores. ++ There is minimal effort to optimize the alignment of short moves. 
+ The 64-bit implementations of POWER3 and POWER4 do a reasonable job +- of handling unligned load/stores that do not cross 32-byte boundries. ++ of handling unaligned load/stores that do not cross 32-byte boundaries. + + Longer moves (>= 32-bytes) justify the effort to get at least the + destination doubleword (8-byte) aligned. Further optimization is +- posible when both source and destination are doubleword aligned. +- Each case has a optimized unrolled loop. +- +- For POWER6 unaligned loads will take a 20+ cycle hicup for any ++ possible when both source and destination are doubleword aligned. ++ Each case has a optimized unrolled loop. ++ ++ For POWER6 unaligned loads will take a 20+ cycle hiccup for any + L1 cache miss that crosses a 32- or 128-byte boundary. Store +- is more forgiving and does not take a hicup until page or +- segment boundaries. So we require doubleword alignment for ++ is more forgiving and does not take a hiccup until page or ++ segment boundaries. So we require doubleword alignment for + the source but may take a risk and only require word alignment + for the destination. */ + + .machine "power6" +-EALIGN (BP_SYM (memcpy), 7, 0) ++EALIGN (memcpy, 7, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 + neg 0,3 + std 3,-16(1) + std 31,-8(1) +- andi. 11,3,7 /* check alignement of dst. */ ++ andi. 11,3,7 /* check alignment of dst. */ + clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */ +- clrldi 10,4,61 /* check alignement of src. */ ++ clrldi 10,4,61 /* check alignment of src. */ + cmpldi cr6,5,8 + ble- cr1,.L2 /* If move < 32 bytes use short move code. */ + mtcrf 0x01,0 +- cmpld cr6,10,11 ++ cmpld cr6,10,11 + srdi 9,5,3 /* Number of full double words remaining. */ + beq .L0 +- ++ + subf 5,0,5 +- /* Move 0-7 bytes as needed to get the destination doubleword alligned. +- Duplicate some code to maximize fall-throught and minimize agen delays. */ ++ /* Move 0-7 bytes as needed to get the destination doubleword aligned. ++ Duplicate some code to maximize fall-through and minimize agen delays. */ + 1: bf 31,2f + lbz 6,0(4) + stb 6,0(3) +@@ -78,7 +76,7 @@ + lwz 6,1(4) + stw 6,1(3) + b 0f +- ++ + 2: bf 30,4f + lhz 6,0(4) + sth 6,0(3) +@@ -86,26 +84,26 @@ + lwz 6,2(4) + stw 6,2(3) + b 0f +- ++ + 4: bf 29,0f + lwz 6,0(4) + stw 6,0(3) +-0: ++0: + /* Add the number of bytes until the 1st doubleword of dst to src and dst. */ + add 4,4,0 + add 3,3,0 +- +- clrldi 10,4,61 /* check alignement of src again. */ ++ ++ clrldi 10,4,61 /* check alignment of src again. */ + srdi 9,5,3 /* Number of full double words remaining. */ +- +- /* Copy doublewords from source to destination, assumpting the ++ ++ /* Copy doublewords from source to destination, assuming the + destination is aligned on a doubleword boundary. + + At this point we know there are at least 25 bytes left (32-7) to copy. +- The next step is to determine if the source is also doubleword aligned. ++ The next step is to determine if the source is also doubleword aligned. + If not branch to the unaligned move code at .L6. which uses + a load, shift, store strategy. +- ++ + Otherwise source and destination are doubleword aligned, and we can + the optimized doubleword copy loop. */ + .align 4 +@@ -123,14 +121,14 @@ + the main loop exits there may be a tail of 1-7 bytes. These byte + are copied a word/halfword/byte at a time as needed to preserve + alignment. +- ++ + For POWER6 the L1 is store-through and the L2 is store-in. The + L2 is clocked at half CPU clock so we can store 16 bytes every + other cycle. 
POWER6 also has a load/store bypass so we can do +- load, load, store, store every 2 cycles. +- ++ load, load, store, store every 2 cycles. ++ + The following code is sensitive to cache line alignment. Do not +- make any change with out first making sure thay don't result in ++ make any change with out first making sure they don't result in + splitting ld/std pairs across a cache line. */ + + mtcrf 0x02,5 +@@ -273,7 +271,7 @@ + std 8,16+96(10) + std 0,24+96(10) + ble cr5,L(das_loop_e) +- ++ + mtctr 12 + .align 4 + L(das_loop2): +@@ -326,10 +324,10 @@ + .align 4 + L(das_tail): + beq cr1,0f +- ++ + L(das_tail2): + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(4) + stw 6,0(3) +@@ -344,7 +342,7 @@ + lbz 6,4(4) + stb 6,4(3) + b 0f +- ++ + 2: bf 30,1f + lhz 6,0(4) + sth 6,0(3) +@@ -352,7 +350,7 @@ + lbz 6,2(4) + stb 6,2(3) + b 0f +- ++ + 1: bf 31,0f + lbz 6,0(4) + stb 6,0(3) +@@ -361,7 +359,7 @@ + ld 3,-16(1) + blr + +-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 ++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 + bytes. Each case is handled without loops, using binary (1,2,4,8) + tests. + +@@ -402,15 +400,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -421,7 +432,7 @@ + /* At least 6 bytes left and the source is word aligned. This allows + some speculative loads up front. */ + /* We need to special case the fall-through because the biggest delays +- are due to address computation not being ready in time for the ++ are due to address computation not being ready in time for the + AGEN. */ + lwz 6,0(12) + lwz 7,4(12) +@@ -452,7 +463,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p8): /* less then 8 bytes left. */ ++L(dus_tail16p8): /* less than 8 bytes left. */ + beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */ + cmpldi cr1,10,20 + bf 29,L(dus_tail16p2) +@@ -466,7 +477,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p4): /* less then 4 bytes left. */ ++L(dus_tail16p4): /* less than 4 bytes left. */ + addi 12,12,24 + addi 3,3,24 + bgt cr0,L(dus_tail2) +@@ -474,7 +485,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */ ++L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */ + addi 12,12,16 + addi 3,3,16 + b L(dus_tail2) +@@ -499,7 +510,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail8p4): /* less then 4 bytes left. */ ++L(dus_tail8p4): /* less than 4 bytes left. */ + addi 12,12,8 + addi 3,3,8 + bgt cr1,L(dus_tail2) +@@ -510,14 +521,14 @@ + .align 4 + L(dus_tail4): /* Move 4 bytes. */ + /* r6 already loaded speculatively. If we are here we know there is +- more then 4 bytes left. So there is no need to test. */ ++ more than 4 bytes left. So there is no need to test. */ + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 + L(dus_tail2): /* Move 2-3 bytes. */ + bf 30,L(dus_tail1) + lhz 6,0(12) +- sth 6,0(3) ++ sth 6,0(3) + bf 31,L(dus_tailX) + lbz 7,2(12) + stb 7,2(3) +@@ -537,7 +548,7 @@ + .LE8: + mr 12,4 + bne cr6,L(dus_4) +-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20 ++/* Exactly 8 bytes. 
We may cross a 32-/128-byte boundary and take a ~20 + cycle delay. This case should be rare and any attempt to avoid this + would take most of 20 cycles any way. */ + ld 6,0(4) +@@ -552,7 +563,7 @@ + stw 6,0(3) + bf 30,L(dus_5) + lhz 7,4(4) +- sth 7,4(3) ++ sth 7,4(3) + bf 31,L(dus_0) + lbz 8,6(4) + stb 8,6(3) +@@ -590,20 +601,31 @@ + bge cr0, L(du4_do) + blt cr5, L(du1_do) + beq cr5, L(du2_do) +- b L(du3_do) +- ++ b L(du3_do) ++ + .align 4 + L(du1_do): + bf 30,L(du1_1dw) + + /* there are at least two DWs to copy */ ++ /* FIXME: can combine last shift and "or" into "rldimi" */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -612,8 +634,13 @@ + blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du1_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -624,8 +651,13 @@ + b L(du1_loop) + .align 4 + L(du1_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du1_loop) +@@ -637,23 +669,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du1_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -663,9 +715,14 @@ + .align 4 + L(du1_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -674,13 +731,23 @@ + bf 30,L(du2_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -689,8 +756,13 @@ + blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du2_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -701,8 +773,13 @@ + b L(du2_loop) + .align 4 + L(du2_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du2_loop) +@@ -714,23 +791,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du2_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) 
++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -740,9 +837,14 @@ + .align 4 + L(du2_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -751,13 +853,23 @@ + bf 30,L(du3_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -766,8 +878,13 @@ + blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du3_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -778,8 +895,13 @@ + b L(du3_loop) + .align 4 + L(du3_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du3_loop) +@@ -791,23 +913,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du3_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -817,9 +959,14 @@ + .align 4 + L(du3_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -834,13 +981,23 @@ + bf 30,L(du4_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -849,8 +1006,13 @@ + blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du4_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -861,8 +1023,13 @@ + b L(du4_loop) + .align 4 + L(du4_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du4_loop) +@@ -874,23 +1041,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du4_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 
8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -900,9 +1087,14 @@ + .align 4 + L(du4_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -911,13 +1103,23 @@ + bf 30,L(du5_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -926,8 +1128,13 @@ + blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du5_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -938,8 +1145,13 @@ + b L(du5_loop) + .align 4 + L(du5_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du5_loop) +@@ -951,23 +1163,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du5_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -977,9 +1209,14 @@ + .align 4 + L(du5_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -988,13 +1225,23 @@ + bf 30,L(du6_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -1003,8 +1250,13 @@ + blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du6_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -1015,8 +1267,13 @@ + b L(du6_loop) + .align 4 + L(du6_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du6_loop) +@@ -1028,23 +1285,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du6_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 
8,7, 64-48 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -1054,9 +1331,14 @@ + .align 4 + L(du6_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -1065,13 +1347,23 @@ + bf 30,L(du7_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -1080,8 +1372,13 @@ + blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du7_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -1092,8 +1389,13 @@ + b L(du7_loop) + .align 4 + L(du7_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du7_loop) +@@ -1105,23 +1407,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du7_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -1131,12 +1453,17 @@ + .align 4 + L(du7_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) +- ++ + .align 4 + L(du_done): + rldicr 0,31,0,60 +@@ -1144,9 +1471,9 @@ + beq cr1,0f /* If the tail is 0 bytes we are done! */ + + add 3,3,0 +- add 12,12,0 ++ add 12,12,0 + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(12) + addi 12,12,4 +@@ -1165,5 +1492,5 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:05:40.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64/POWER7. 
+- Copyright (C) 2010, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2010-2014 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + +@@ -18,425 +18,366 @@ + <http://www.gnu.org/licenses/>. */ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. */ + ++#define dst 11 /* Use r11 so r3 kept unchanged. */ ++#define src 4 ++#define cnt 5 ++ + .machine power7 +-EALIGN (BP_SYM (memcpy), 5, 0) ++EALIGN (memcpy, 5, 0) + CALL_MCOUNT 3 + +- cmpldi cr1,5,31 ++ cmpldi cr1,cnt,31 + neg 0,3 +- std 3,-16(1) +- std 31,-8(1) +- cfi_offset(31,-8) + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + +- andi. 11,3,7 /* Check alignment of DST. */ +- ++#ifdef __LITTLE_ENDIAN__ ++/* In little-endian mode, power7 takes an alignment trap on any lxvd2x ++ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy ++ loop is only used for quadword aligned copies. */ ++ andi. 10,3,15 ++ clrldi 11,4,60 ++#else ++ andi. 10,3,7 /* Check alignment of DST. */ ++ clrldi 11,4,61 /* Check alignment of SRC. */ ++#endif ++ cmpld cr6,10,11 /* SRC and DST alignments match? */ + +- clrldi 10,4,61 /* Check alignment of SRC. */ +- cmpld cr6,10,11 /* SRC and DST alignments match? */ +- mr 12,4 +- mr 31,5 ++ mr dst,3 + bne cr6,L(copy_GE_32_unaligned) ++ beq L(aligned_copy) + +- srdi 9,5,3 /* Number of full quadwords remaining. */ +- +- beq L(copy_GE_32_aligned_cont) +- +- clrldi 0,0,61 +- mtcrf 0x01,0 +- subf 31,0,5 +- +- /* Get the SRC aligned to 8 bytes. */ +- +-1: bf 31,2f +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 +-2: bf 30,4f +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-4: bf 29,0f +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-0: +- clrldi 10,12,61 /* Check alignment of SRC again. */ +- srdi 9,31,3 /* Number of full doublewords remaining. */ +- +-L(copy_GE_32_aligned_cont): +- +- clrldi 11,31,61 +- mtcrf 0x01,9 +- +- srdi 8,31,5 +- cmpldi cr1,9,4 +- cmpldi cr6,11,0 +- mr 11,12 ++ mtocrf 0x01,0 ++#ifdef __LITTLE_ENDIAN__ ++ clrldi 0,0,60 ++#else ++ clrldi 0,0,61 ++#endif + +- /* Copy 1~3 doublewords so the main loop starts +- at a multiple of 32 bytes. */ +- +- bf 30,1f +- ld 6,0(12) +- ld 7,8(12) +- addi 11,12,16 +- mtctr 8 +- std 6,0(3) +- std 7,8(3) +- addi 10,3,16 +- bf 31,4f +- ld 0,16(12) +- std 0,16(3) +- blt cr1,3f +- addi 11,12,24 +- addi 10,3,24 +- b 4f +- +- .align 4 +-1: /* Copy 1 doubleword and set the counter. */ +- mr 10,3 +- mtctr 8 +- bf 31,4f +- ld 6,0(12) +- addi 11,12,8 +- std 6,0(3) +- addi 10,3,8 ++/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ ++1: ++ bf 31,2f ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 ++2: ++ bf 30,4f ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++4: ++ bf 29,8f ++ lwz 6,0(src) ++ addi src,src,4 ++ stw 6,0(dst) ++ addi dst,dst,4 ++8: ++#ifdef __LITTLE_ENDIAN__ ++ bf 28,16f ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 ++16: ++#endif ++ subf cnt,0,cnt + ++/* Main aligned copy loop. Copies 128 bytes at a time. */ + L(aligned_copy): +- /* Main aligned copy loop. Copies up to 128-bytes at a time. */ +- .align 4 +-4: +- /* check for any 32-byte or 64-byte lumps that are outside of a +- nice 128-byte range. 
R8 contains the number of 32-byte +- lumps, so drop this into the CR, and use the SO/EQ bits to help +- handle the 32- or 64- byte lumps. Then handle the rest with an +- unrolled 128-bytes-at-a-time copy loop. */ +- mtocrf 1,8 +- li 6,16 # 16() index +- li 7,32 # 32() index +- li 8,48 # 48() index +- +-L(aligned_32byte): +- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */ +- bns cr7,L(aligned_64byte) +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- addi 11,11,32 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- addi 10,10,32 +- +-L(aligned_64byte): +- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */ +- bne cr7,L(aligned_128setup) +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- addi 11,11,64 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- addi 10,10,64 +- +-L(aligned_128setup): +- /* Set up for the 128-byte at a time copy loop. */ +- srdi 8,31,7 +- cmpdi 8,0 # Any 4x lumps left? +- beq 3f # if not, move along. +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- mtctr 8 # otherwise, load the ctr and begin. +- li 8,48 # 48() index ++ li 6,16 ++ li 7,32 ++ li 8,48 ++ mtocrf 0x02,cnt ++ srdi 12,cnt,7 ++ cmpdi 12,0 ++ beq L(aligned_tail) ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ mtctr 12 + b L(aligned_128loop) + ++ .align 4 + L(aligned_128head): + /* for the 2nd + iteration of this loop. */ +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 + L(aligned_128loop): +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- stxvd2x 6,0,10 +- addi 11,11,64 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- addi 10,10,64 +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- addi 11,11,64 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- addi 10,10,64 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ stxvd2x 6,0,dst ++ addi src,src,64 ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ addi dst,dst,64 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ addi src,src,64 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ addi dst,dst,64 + bdnz L(aligned_128head) + +-3: +- /* Check for tail bytes. */ +- rldicr 0,31,0,60 +- mtcrf 0x01,31 +- beq cr6,0f +- +-.L9: +- add 3,3,0 +- add 12,12,0 +- +- /* At this point we have a tail of 0-7 bytes and we know that the +- destination is doubleword-aligned. */ +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2 bytes. */ +- bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 31,-8(1) +- ld 3,-16(1) ++L(aligned_tail): ++ mtocrf 0x01,cnt ++ bf 25,32f ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ addi src,src,64 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ addi dst,dst,64 ++32: ++ bf 26,16f ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ addi src,src,32 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ addi dst,dst,32 ++16: ++ bf 27,8f ++ lxvd2x 6,0,src ++ addi src,src,16 ++ stxvd2x 6,0,dst ++ addi dst,dst,16 ++8: ++ bf 28,4f ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 ++4: /* Copies 4~7 bytes. */ ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ + blr + +- /* Handle copies of 0~31 bytes. */ +- .align 4 ++ ++/* Handle copies of 0~31 bytes. 
*/ ++ .align 4 + L(copy_LT_32): +- cmpldi cr6,5,8 +- mr 12,4 +- mtcrf 0x01,5 ++ mr dst,3 ++ cmpldi cr6,cnt,8 ++ mtocrf 0x01,cnt + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 +- clrrdi 11,4,2 +- andi. 0,8,3 +- cmpldi cr1,5,16 +- mr 10,5 ++ andi. 0,8,3 ++ cmpldi cr1,cnt,16 + beq L(copy_LT_32_aligned) + +- /* Force 4-bytes alignment for SRC. */ +- mtocrf 0x01,0 +- subf 10,0,5 +-2: bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: bf 31,L(end_4bytes_alignment) +- +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 ++ /* Force 4-byte alignment for SRC. */ ++ mtocrf 0x01,0 ++ subf cnt,0,cnt ++2: ++ bf 30,1f ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++1: ++ bf 31,L(end_4bytes_alignment) ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 + +- .align 4 ++ .align 4 + L(end_4bytes_alignment): +- cmpldi cr1,10,16 +- mtcrf 0x01,10 ++ cmpldi cr1,cnt,16 ++ mtocrf 0x01,cnt + + L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ +- lwz 6,0(12) +- lwz 7,4(12) +- stw 6,0(3) +- lwz 8,8(12) +- stw 7,4(3) +- lwz 6,12(12) +- addi 12,12,16 +- stw 8,8(3) +- stw 6,12(3) +- addi 3,3,16 ++ lwz 6,0(src) ++ lwz 7,4(src) ++ stw 6,0(dst) ++ lwz 8,8(src) ++ stw 7,4(dst) ++ lwz 6,12(src) ++ addi src,src,16 ++ stw 8,8(dst) ++ stw 6,12(dst) ++ addi dst,dst,16 + 8: /* Copy 8 bytes. */ +- bf 28,4f ++ bf 28,L(tail4) ++ lwz 6,0(src) ++ lwz 7,4(src) ++ addi src,src,8 ++ stw 6,0(dst) ++ stw 7,4(dst) ++ addi dst,dst,8 ++ ++ .align 4 ++/* Copies 4~7 bytes. */ ++L(tail4): ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ ++ blr + +- lwz 6,0(12) +- lwz 7,4(12) +- addi 12,12,8 +- stw 6,0(3) +- stw 7,4(3) +- addi 3,3,8 +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2-3 bytes. */ ++ .align 4 ++/* Copies 2~3 bytes. */ ++L(tail2): + bf 30,1f +- +- lhz 6,0(12) +- sth 6,0(3) +- bf 31,0f +- lbz 7,2(12) +- stb 7,2(3) +- ld 3,-16(1) ++ lhz 6,0(src) ++ sth 6,0(dst) ++ bflr 31 ++ lbz 7,2(src) ++ stb 7,2(dst) + blr + +- .align 4 +-1: /* Copy 1 byte. */ +- bf 31,0f ++ .align 4 ++L(tail5): ++ bflr 31 ++ lbz 6,4(src) ++ stb 6,4(dst) ++ blr + +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 3,-16(1) ++ .align 4 ++1: ++ bflr 31 ++ lbz 6,0(src) ++ stb 6,0(dst) ++ /* Return original DST pointer. */ + blr + +- /* Handles copies of 0~8 bytes. */ +- .align 4 ++ ++/* Handles copies of 0~8 bytes. */ ++ .align 4 + L(copy_LE_8): +- bne cr6,4f ++ bne cr6,L(tail4) + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + +- lwz 6,0(4) +- lwz 7,4(4) +- stw 6,0(3) +- stw 7,4(3) +- ld 3,-16(1) /* Return original DST pointers. */ ++ lwz 6,0(src) ++ lwz 7,4(src) ++ stw 6,0(dst) ++ stw 7,4(dst) + blr + +- .align 4 +-4: /* Copies 4~7 bytes. */ +- bf 29,2b + +- lwz 6,0(4) +- stw 6,0(3) +- bf 30,5f +- lhz 7,4(4) +- sth 7,4(3) +- bf 31,0f +- lbz 8,6(4) +- stb 8,6(3) +- ld 3,-16(1) +- blr +- +- .align 4 +-5: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,4(4) +- stb 6,4(3) +- +-0: /* Return original DST pointer. */ +- ld 3,-16(1) +- blr +- +- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but +- SRC is not. Use aligned quadword loads from SRC, shifted to realign +- the data, allowing for aligned DST stores. 
*/ +- .align 4 ++/* Handle copies of 32+ bytes where DST is aligned (to quadword) but ++ SRC is not. Use aligned quadword loads from SRC, shifted to realign ++ the data, allowing for aligned DST stores. */ ++ .align 4 + L(copy_GE_32_unaligned): +- clrldi 0,0,60 /* Number of bytes until the 1st +- quadword. */ +- andi. 11,3,15 /* Check alignment of DST (against +- quadwords). */ +- srdi 9,5,4 /* Number of full quadwords remaining. */ ++ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ ++#ifndef __LITTLE_ENDIAN__ ++ andi. 10,3,15 /* Check alignment of DST (against quadwords). */ ++#endif ++ srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + +- mtcrf 0x01,0 +- subf 31,0,5 ++ mtocrf 0x01,0 ++ subf cnt,0,cnt + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ +-1: /* Copy 1 byte. */ ++1: + bf 31,2f +- +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 +-2: /* Copy 2 bytes. */ ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 ++2: + bf 30,4f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-4: /* Copy 4 bytes. */ ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++4: + bf 29,8f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-8: /* Copy 8 bytes. */ ++ lwz 6,0(src) ++ addi src,src,4 ++ stw 6,0(dst) ++ addi dst,dst,4 ++8: + bf 28,0f +- +- ld 6,0(12) +- addi 12,12,8 +- std 6,0(3) +- addi 3,3,8 ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 + 0: +- clrldi 10,12,60 /* Check alignment of SRC. */ +- srdi 9,31,4 /* Number of full quadwords remaining. */ ++ srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ + L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ +- clrldi 11,31,60 +- li 6,16 /* Index for 16-bytes offsets. */ ++ clrldi 10,cnt,60 ++ li 6,16 /* Index for 16-bytes offsets. */ + li 7,32 /* Index for 32-bytes offsets. */ +- cmpldi cr1,11,0 +- srdi 8,31,5 /* Setup the loop counter. */ +- mr 10,3 +- mr 11,12 +- mtcrf 0x01,9 +- cmpldi cr6,9,1 +- lvsl 5,0,12 +- lvx 3,0,12 +- bf 31,L(setup_unaligned_loop) +- +- /* Copy another 16 bytes to align to 32-bytes due to the loop . */ +- lvx 4,12,6 +- vperm 6,3,4,5 +- addi 11,12,16 +- addi 10,3,16 +- stvx 6,0,3 ++ cmpldi cr1,10,0 ++ srdi 8,cnt,5 /* Setup the loop counter. */ ++ mtocrf 0x01,9 ++ cmpldi cr6,9,1 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,src ++#else ++ lvsl 5,0,src ++#endif ++ lvx 3,0,src ++ li 0,0 ++ bf 31,L(setup_unaligned_loop) ++ ++ /* Copy another 16 bytes to align to 32-bytes due to the loop. */ ++ lvx 4,src,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif ++ addi src,src,16 ++ stvx 6,0,dst ++ addi dst,dst,16 + vor 3,4,4 ++ clrrdi 0,src,60 + + L(setup_unaligned_loop): +- mtctr 8 +- ble cr6,L(end_unaligned_loop) ++ mtctr 8 ++ ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ +- .align 4 ++ .align 4 + L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, +@@ -444,63 +385,56 @@ + some portions again. This is faster than having unaligned + vector instructions though. */ + +- lvx 4,11,6 /* vr4 = r11+16. 
*/ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ +- lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ +- addi 11,11,32 +- stvx 6,0,10 +- stvx 10,10,6 +- addi 10,10,32 +- ++ lvx 4,src,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif ++ lvx 3,src,7 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif ++ addi src,src,32 ++ stvx 6,0,dst ++ stvx 10,dst,6 ++ addi dst,dst,32 + bdnz L(unaligned_loop) + +- .align 4 ++ clrrdi 0,src,60 ++ ++ .align 4 + L(end_unaligned_loop): + + /* Check for tail bytes. */ +- rldicr 0,31,0,59 +- mtcrf 0x01,31 +- beq cr1,0f ++ mtocrf 0x01,cnt ++ beqlr cr1 + +- add 3,3,0 +- add 12,12,0 ++ add src,src,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ +-8: /* Copy 8 bytes. */ ++ /* Copy 8 bytes. */ + bf 28,4f +- +- lwz 6,0(12) +- lwz 7,4(12) +- addi 12,12,8 +- stw 6,0(3) +- stw 7,4(3) +- addi 3,3,8 +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2~3 bytes. */ +- bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 31,-8(1) +- ld 3,-16(1) ++ lwz 6,0(src) ++ lwz 7,4(src) ++ addi src,src,8 ++ stw 6,0(dst) ++ stw 7,4(dst) ++ addi dst,dst,8 ++4: /* Copy 4~7 bytes. */ ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ + blr + +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -367,13 +367,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmpldi cr6,9,1 +- lvsl 5,0,12 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else ++ lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 +- vperm 6,3,4,5 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -393,11 +401,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 |