author     messense <messense@icloud.com>  2021-05-13 03:35:09 (GMT)
committer  messense <messense@icloud.com>  2021-05-13 07:12:54 (GMT)
commit     798904409cfb7e6b481a290b776b7f178c9036bf (patch)
tree       81511cca575718eab971f105f41f695e38b73fe7 /packages/glibc/2.17/0053-glibc-ppc64le-31.patch
parent     f9716e8b9042eb14de85320987300aab99300df5 (diff)
Add ppc64le patches for glibc 2.17 from CentOS git
Diffstat (limited to 'packages/glibc/2.17/0053-glibc-ppc64le-31.patch')
-rw-r--r--  packages/glibc/2.17/0053-glibc-ppc64le-31.patch  2943
1 file changed, 2943 insertions, 0 deletions
diff --git a/packages/glibc/2.17/0053-glibc-ppc64le-31.patch b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch
new file mode 100644
index 0000000..de90661
--- /dev/null
+++ b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch
@@ -0,0 +1,2943 @@
+# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d
+# Author: Alan Modra <amodra@gmail.com>
+# Date: Sat Aug 17 18:47:22 2013 +0930
+#
+# PowerPC LE memcpy
+# http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
+#
+# Little-endian support for memcpy. I spent some time cleaning up the
+# 64-bit power7 memcpy, in order to avoid the extra alignment traps
+# power7 takes for little-endian. It probably would have been better
+# to copy the linux kernel version of memcpy.
+#
+# * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
+# * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+# * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better
+# use of regs. Use power7 mtocrf. Tidy function tails.
+#
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -205,15 +205,28 @@
+ blt cr6,5f
+ srwi 7,6,16
+ bgt cr6,3f
++#ifdef __LITTLE_ENDIAN__
++ sth 7,0(3)
++#else
+ sth 6,0(3)
++#endif
+ b 7f
+ .align 4
+ 3:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,24
++ stb 6,0(3)
++ sth 7,1(3)
++#else
+ stb 7,0(3)
+ sth 6,1(3)
++#endif
+ b 7f
+ .align 4
+ 5:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,8
++#endif
+ stb 6,0(3)
+ 7:
+ cmplwi cr1,10,16
+@@ -341,13 +354,23 @@
+ bf 30,1f
+
+ /* there are at least two words to copy, so copy them */
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10 /* shift 1st src word to left align it in R0 */
+ srw 8,7,9 /* shift 2nd src word to right align it in R8 */
++#endif
+ or 0,0,8 /* or them to get word to store */
+ lwz 6,8(5) /* load the 3rd src word */
+ stw 0,0(4) /* store the 1st dst word */
++#ifdef __LITTLE_ENDIAN__
++ srw 0,7,10
++ slw 8,6,9
++#else
+ slw 0,7,10 /* now left align 2nd src word into R0 */
+ srw 8,6,9 /* shift 3rd src word to right align it in R8 */
++#endif
+ or 0,0,8 /* or them to get word to store */
+ lwz 7,12(5)
+ stw 0,4(4) /* store the 2nd dst word */
+@@ -355,8 +378,13 @@
+ addi 5,5,16
+ bf 31,4f
+ /* there is a third word to copy, so copy it */
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10 /* shift 3rd src word to left align it in R0 */
+ srw 8,7,9 /* shift 4th src word to right align it in R8 */
++#endif
+ or 0,0,8 /* or them to get word to store */
+ stw 0,0(4) /* store 3rd dst word */
+ mr 6,7
+@@ -366,8 +394,13 @@
+ b 4f
+ .align 4
+ 1:
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10 /* shift 1st src word to left align it in R0 */
+ srw 8,7,9 /* shift 2nd src word to right align it in R8 */
++#endif
+ addi 5,5,8
+ or 0,0,8 /* or them to get word to store */
+ bf 31,4f
+@@ -380,23 +413,43 @@
+ .align 4
+ 4:
+ /* copy 16 bytes at a time */
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10
+ srw 8,7,9
++#endif
+ or 0,0,8
+ lwz 6,0(5)
+ stw 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srw 0,7,10
++ slw 8,6,9
++#else
+ slw 0,7,10
+ srw 8,6,9
++#endif
+ or 0,0,8
+ lwz 7,4(5)
+ stw 0,4(4)
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10
+ srw 8,7,9
++#endif
+ or 0,0,8
+ lwz 6,8(5)
+ stw 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srw 0,7,10
++ slw 8,6,9
++#else
+ slw 0,7,10
+ srw 8,6,9
++#endif
+ or 0,0,8
+ lwz 7,12(5)
+ stw 0,12(4)
+@@ -405,8 +458,13 @@
+ bdnz+ 4b
+ 8:
+ /* calculate and store the final word */
++#ifdef __LITTLE_ENDIAN__
++ srw 0,6,10
++ slw 8,7,9
++#else
+ slw 0,6,10
+ srw 8,7,9
++#endif
+ or 0,0,8
+ stw 0,0(4)
+ 3:
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -221,15 +221,28 @@
+ blt cr6,5f
+ srwi 7,6,16
+ bgt cr6,3f
++#ifdef __LITTLE_ENDIAN__
++ sth 7,0(3)
++#else
+ sth 6,0(3)
++#endif
+ b 7f
+ .align 4
+ 3:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,24
++ stb 6,0(3)
++ sth 7,1(3)
++#else
+ stb 7,0(3)
+ sth 6,1(3)
++#endif
+ b 7f
+ .align 4
+ 5:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,8
++#endif
+ stb 6,0(3)
+ 7:
+ cmplwi cr1,10,16
+@@ -579,7 +592,11 @@
+ lwz 6,-1(4)
+ cmplwi cr6,31,4
+ srwi 8,31,5 /* calculate the 32 byte loop count */
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,6,8
++#else
+ slwi 6,6,8
++#endif
+ clrlwi 31,31,27 /* The remaining bytes, < 32. */
+ blt cr5,L(wdu1_32tail)
+ mtctr 8
+@@ -587,8 +604,12 @@
+
+ lwz 8,3(4)
+ lwz 7,4(4)
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,24,32
++#else
+ /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+ rlwimi 6,8,8,(32-8),31
++#endif
+ b L(wdu1_loop32x)
+ .align 4
+ L(wdu1_loop32):
+@@ -597,8 +618,12 @@
+ lwz 7,4(4)
+ stw 10,-8(3)
+ stw 11,-4(3)
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,24,32
++#else
+ /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+ rlwimi 6,8,8,(32-8),31
++#endif
+ L(wdu1_loop32x):
+ lwz 10,8(4)
+ lwz 11,12(4)
+@@ -615,7 +640,11 @@
+ stw 6,16(3)
+ stw 7,20(3)
+ addi 3,3,32
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,8,8
++#else
+ slwi 6,8,8
++#endif
+ bdnz+ L(wdu1_loop32)
+ stw 10,-8(3)
+ stw 11,-4(3)
+@@ -626,8 +655,12 @@
+ blt cr6,L(wdu_4tail)
+ /* calculate and store the final word */
+ lwz 8,3(4)
+-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,24,32
++#else
++/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+ rlwimi 6,8,8,(32-8),31
++#endif
+ b L(wdu_32tailx)
+
+ L(wdu2_32):
+@@ -635,7 +668,11 @@
+ lwz 6,-2(4)
+ cmplwi cr6,31,4
+ srwi 8,31,5 /* calculate the 32 byte loop count */
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,6,16
++#else
+ slwi 6,6,16
++#endif
+ clrlwi 31,31,27 /* The remaining bytes, < 32. */
+ blt cr5,L(wdu2_32tail)
+ mtctr 8
+@@ -643,8 +680,11 @@
+
+ lwz 8,2(4)
+ lwz 7,4(4)
+-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,16,32
++#else
+ rlwimi 6,8,16,(32-16),31
++#endif
+ b L(wdu2_loop32x)
+ .align 4
+ L(wdu2_loop32):
+@@ -653,8 +693,11 @@
+ lwz 7,4(4)
+ stw 10,-8(3)
+ stw 11,-4(3)
+-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,16,32
++#else
+ rlwimi 6,8,16,(32-16),31
++#endif
+ L(wdu2_loop32x):
+ lwz 10,8(4)
+ lwz 11,12(4)
+@@ -672,7 +715,11 @@
+ stw 6,16(3)
+ stw 7,20(3)
+ addi 3,3,32
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,8,16
++#else
+ slwi 6,8,16
++#endif
+ bdnz+ L(wdu2_loop32)
+ stw 10,-8(3)
+ stw 11,-4(3)
+@@ -683,8 +730,11 @@
+ blt cr6,L(wdu_4tail)
+ /* calculate and store the final word */
+ lwz 8,2(4)
+-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,16,32
++#else
+ rlwimi 6,8,16,(32-16),31
++#endif
+ b L(wdu_32tailx)
+
+ L(wdu3_32):
+@@ -692,7 +742,11 @@
+ lwz 6,-3(4)
+ cmplwi cr6,31,4
+ srwi 8,31,5 /* calculate the 32 byte loop count */
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,6,24
++#else
+ slwi 6,6,24
++#endif
+ clrlwi 31,31,27 /* The remaining bytes, < 32. */
+ blt cr5,L(wdu3_32tail)
+ mtctr 8
+@@ -700,8 +754,11 @@
+
+ lwz 8,1(4)
+ lwz 7,4(4)
+-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,8,32
++#else
+ rlwimi 6,8,24,(32-24),31
++#endif
+ b L(wdu3_loop32x)
+ .align 4
+ L(wdu3_loop32):
+@@ -710,8 +767,11 @@
+ lwz 7,4(4)
+ stw 10,-8(3)
+ stw 11,-4(3)
+-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,8,32
++#else
+ rlwimi 6,8,24,(32-24),31
++#endif
+ L(wdu3_loop32x):
+ lwz 10,8(4)
+ lwz 11,12(4)
+@@ -728,7 +788,11 @@
+ stw 6,16(3)
+ stw 7,20(3)
+ addi 3,3,32
++#ifdef __LITTLE_ENDIAN__
++ srwi 6,8,24
++#else
+ slwi 6,8,24
++#endif
+ bdnz+ L(wdu3_loop32)
+ stw 10,-8(3)
+ stw 11,-4(3)
+@@ -739,8 +803,11 @@
+ blt cr6,L(wdu_4tail)
+ /* calculate and store the final word */
+ lwz 8,1(4)
+-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
++#ifdef __LITTLE_ENDIAN__
++ rldimi 6,8,8,32
++#else
+ rlwimi 6,8,24,(32-24),31
++#endif
+ b L(wdu_32tailx)
+ .align 4
+ L(wdu_32tailx):
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -385,7 +385,7 @@
+
+ beq L(copy_GE_32_unaligned_cont)
+
+- /* SRC is not quadword aligned, get it aligned. */
++ /* DST is not quadword aligned, get it aligned. */
+
+ mtcrf 0x01,0
+ subf 31,0,5
+@@ -437,13 +437,21 @@
+ mr 11,12
+ mtcrf 0x01,9
+ cmplwi cr6,9,1
++#ifdef __LITTLE_ENDIAN__
++ lvsr 5,0,12
++#else
+ lvsl 5,0,12
++#endif
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop . */
+ lvx 4,12,6
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
+ vperm 6,3,4,5
++#endif
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+@@ -463,11 +471,17 @@
+ vector instructions though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+- vperm 6,3,4,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr6. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
+ lvx 3,11,7 /* vr3 = r11+32. */
+- vperm 10,4,3,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr10. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 10,3,4,5
++#else
++ vperm 10,4,3,5
++#endif
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -327,7 +327,7 @@
+
+ beq L(copy_GE_32_unaligned_cont)
+
+- /* SRC is not quadword aligned, get it aligned. */
++ /* DST is not quadword aligned, get it aligned. */
+
+ mtcrf 0x01,0
+ subf 31,0,5
+@@ -379,13 +379,21 @@
+ mr 11,12
+ mtcrf 0x01,9
+ cmplwi cr6,9,1
+- lvsl 5,0,12
++#ifdef __LITTLE_ENDIAN__
++ lvsr 5,0,12
++#else
++ lvsl 5,0,12
++#endif
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop . */
+ lvx 4,12,6
+- vperm 6,3,4,5
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+@@ -405,11 +413,17 @@
+ vector instructions though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+- vperm 6,3,4,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr6. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
+ lvx 3,11,7 /* vr3 = r11+32. */
+- vperm 10,4,3,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr10. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 10,3,4,5
++#else
++ vperm 10,4,3,5
++#endif
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -214,15 +214,28 @@
+ blt cr6,5f
+ srdi 7,6,16
+ bgt cr6,3f
++#ifdef __LITTLE_ENDIAN__
++ sth 7,0(3)
++#else
+ sth 6,0(3)
++#endif
+ b 7f
+ .align 4
+ 3:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,24
++ stb 6,0(3)
++ sth 7,1(3)
++#else
+ stb 7,0(3)
+ sth 6,1(3)
++#endif
+ b 7f
+ .align 4
+ 5:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,8
++#endif
+ stb 6,0(3)
+ 7:
+ cmpldi cr1,10,16
+@@ -330,7 +343,11 @@
+ ld 7,8(5)
+ subfic 9,10,64
+ beq 2f
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++#else
+ sld 0,6,10
++#endif
+ cmpldi 11,1
+ mr 6,7
+ addi 4,4,-8
+@@ -338,15 +355,25 @@
+ b 1f
+ 2: addi 5,5,8
+ .align 4
++#ifdef __LITTLE_ENDIAN__
++0: srd 0,6,10
++ sld 8,7,9
++#else
+ 0: sld 0,6,10
+ srd 8,7,9
++#endif
+ cmpldi 11,2
+ ld 6,8(5)
+ or 0,0,8
+ addi 11,11,-2
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srd 0,7,10
++1: sld 8,6,9
++#else
+ sld 0,7,10
+ 1: srd 8,6,9
++#endif
+ or 0,0,8
+ beq 8f
+ ld 7,16(5)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:05:51.000000000 -0500
+@@ -1,5 +1,5 @@
+ /* Optimized memcpy implementation for PowerPC64.
+- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
++ Copyright (C) 2003-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+@@ -17,26 +17,24 @@
+ <http://www.gnu.org/licenses/>. */
+
+ #include <sysdep.h>
+-#include <bp-sym.h>
+-#include <bp-asm.h>
+
+ /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'.
+
+- Memcpy handles short copies (< 32-bytes) using a binary move blocks
+- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
+- with the appropriate combination of byte and halfword load/stores.
+- There is minimal effort to optimize the alignment of short moves.
++ Memcpy handles short copies (< 32-bytes) using a binary move blocks
++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
++ with the appropriate combination of byte and halfword load/stores.
++ There is minimal effort to optimize the alignment of short moves.
+ The 64-bit implementations of POWER3 and POWER4 do a reasonable job
+- of handling unligned load/stores that do not cross 32-byte boundries.
++ of handling unaligned load/stores that do not cross 32-byte boundaries.
+
+ Longer moves (>= 32-bytes) justify the effort to get at least the
+ destination doubleword (8-byte) aligned. Further optimization is
+- posible when both source and destination are doubleword aligned.
++ possible when both source and destination are doubleword aligned.
+ Each case has a optimized unrolled loop. */
+
+ .machine power4
+-EALIGN (BP_SYM (memcpy), 5, 0)
++EALIGN (memcpy, 5, 0)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,5,31
+@@ -44,20 +42,20 @@
+ std 3,-16(1)
+ std 31,-8(1)
+ cfi_offset(31,-8)
+- andi. 11,3,7 /* check alignement of dst. */
++ andi. 11,3,7 /* check alignment of dst. */
+ clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
+- clrldi 10,4,61 /* check alignement of src. */
++ clrldi 10,4,61 /* check alignment of src. */
+ cmpldi cr6,5,8
+ ble- cr1,.L2 /* If move < 32 bytes use short move code. */
+- cmpld cr6,10,11
++ cmpld cr6,10,11
+ mr 12,4
+ srdi 9,5,3 /* Number of full double words remaining. */
+ mtcrf 0x01,0
+ mr 31,5
+ beq .L0
+-
++
+ subf 31,0,5
+- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */
++ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */
+ 1: bf 31,2f
+ lbz 6,0(12)
+ addi 12,12,1
+@@ -74,17 +72,17 @@
+ stw 6,0(3)
+ addi 3,3,4
+ 0:
+- clrldi 10,12,61 /* check alignement of src again. */
++ clrldi 10,12,61 /* check alignment of src again. */
+ srdi 9,31,3 /* Number of full double words remaining. */
+-
+- /* Copy doublewords from source to destination, assumpting the
++
++ /* Copy doublewords from source to destination, assuming the
+ destination is aligned on a doubleword boundary.
+
+ At this point we know there are at least 25 bytes left (32-7) to copy.
+- The next step is to determine if the source is also doubleword aligned.
++ The next step is to determine if the source is also doubleword aligned.
+ If not branch to the unaligned move code at .L6. which uses
+ a load, shift, store strategy.
+-
++
+ Otherwise source and destination are doubleword aligned, and we can
+ the optimized doubleword copy loop. */
+ .L0:
+@@ -97,14 +95,14 @@
+ Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
+ If the copy is not an exact multiple of 32 bytes, 1-3
+ doublewords are copied as needed to set up the main loop. After
+- the main loop exits there may be a tail of 1-7 bytes. These byte are
++ the main loop exits there may be a tail of 1-7 bytes. These byte are
+ copied a word/halfword/byte at a time as needed to preserve alignment. */
+
+ srdi 8,31,5
+ cmpldi cr1,9,4
+ cmpldi cr6,11,0
+ mr 11,12
+-
++
+ bf 30,1f
+ ld 6,0(12)
+ ld 7,8(12)
+@@ -115,7 +113,7 @@
+ addi 10,3,16
+ bf 31,4f
+ ld 0,16(12)
+- std 0,16(3)
++ std 0,16(3)
+ blt cr1,3f
+ addi 11,12,24
+ addi 10,3,24
+@@ -129,7 +127,7 @@
+ addi 11,12,8
+ std 6,0(3)
+ addi 10,3,8
+-
++
+ .align 4
+ 4:
+ ld 6,0(11)
+@@ -144,7 +142,7 @@
+ std 0,24(10)
+ addi 10,10,32
+ bdnz 4b
+-3:
++3:
+
+ rldicr 0,31,0,60
+ mtcrf 0x01,31
+@@ -152,9 +150,9 @@
+ .L9:
+ add 3,3,0
+ add 12,12,0
+-
++
+ /* At this point we have a tail of 0-7 bytes and we know that the
+- destiniation is double word aligned. */
++ destination is double word aligned. */
+ 4: bf 29,2f
+ lwz 6,0(12)
+ addi 12,12,4
+@@ -173,29 +171,29 @@
+ ld 31,-8(1)
+ ld 3,-16(1)
+ blr
+-
+-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
+- bytes. Each case is handled without loops, using binary (1,2,4,8)
+- tests.
+-
++
++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
++ bytes. Each case is handled without loops, using binary (1,2,4,8)
++ tests.
++
+ In the short (0-8 byte) case no attempt is made to force alignment
+- of either source or destination. The hardware will handle the
+- unaligned load/stores with small delays for crossing 32- 64-byte, and
++ of either source or destination. The hardware will handle the
++ unaligned load/stores with small delays for crossing 32- 64-byte, and
+ 4096-byte boundaries. Since these short moves are unlikely to be
+- unaligned or cross these boundaries, the overhead to force
++ unaligned or cross these boundaries, the overhead to force
+ alignment is not justified.
+-
++
+ The longer (9-31 byte) move is more likely to cross 32- or 64-byte
+ boundaries. Since only loads are sensitive to the 32-/64-byte
+- boundaries it is more important to align the source then the
++ boundaries it is more important to align the source then the
+ destination. If the source is not already word aligned, we first
+- move 1-3 bytes as needed. Since we are only word aligned we don't
+- use double word load/stores to insure that all loads are aligned.
++ move 1-3 bytes as needed. Since we are only word aligned we don't
++ use double word load/stores to insure that all loads are aligned.
+ While the destination and stores may still be unaligned, this
+ is only an issue for page (4096 byte boundary) crossing, which
+ should be rare for these short moves. The hardware handles this
+- case automatically with a small delay. */
+-
++ case automatically with a small delay. */
++
+ .align 4
+ .L2:
+ mtcrf 0x01,5
+@@ -216,15 +214,28 @@
+ blt cr6,5f
+ srdi 7,6,16
+ bgt cr6,3f
++#ifdef __LITTLE_ENDIAN__
++ sth 7,0(3)
++#else
+ sth 6,0(3)
++#endif
+ b 7f
+ .align 4
+ 3:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,24
++ stb 6,0(3)
++ sth 7,1(3)
++#else
+ stb 7,0(3)
+ sth 6,1(3)
++#endif
+ b 7f
+ .align 4
+ 5:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,8
++#endif
+ stb 6,0(3)
+ 7:
+ cmpldi cr1,10,16
+@@ -258,11 +269,11 @@
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+- addi 3,3,4
++ addi 3,3,4
+ 2: /* Move 2-3 bytes. */
+ bf 30,1f
+ lhz 6,0(12)
+- sth 6,0(3)
++ sth 6,0(3)
+ bf 31,0f
+ lbz 7,2(12)
+ stb 7,2(3)
+@@ -283,8 +294,8 @@
+ mr 12,4
+ bne cr6,4f
+ /* Would have liked to use use ld/std here but the 630 processors are
+- slow for load/store doubles that are not at least word aligned.
+- Unaligned Load/Store word execute with only a 1 cycle penaltity. */
++ slow for load/store doubles that are not at least word aligned.
++ Unaligned Load/Store word execute with only a 1 cycle penalty. */
+ lwz 6,0(4)
+ lwz 7,4(4)
+ stw 6,0(3)
+@@ -299,14 +310,14 @@
+ 6:
+ bf 30,5f
+ lhz 7,4(4)
+- sth 7,4(3)
++ sth 7,4(3)
+ bf 31,0f
+ lbz 8,6(4)
+ stb 8,6(3)
+ ld 3,-16(1)
+ blr
+ .align 4
+-5:
++5:
+ bf 31,0f
+ lbz 6,4(4)
+ stb 6,4(3)
+@@ -336,13 +347,23 @@
+ bf 30,1f
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
+ sld 0,6,10
+ srd 8,7,9
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srd 0,7,10
++ sld 8,6,9
++#else
+ sld 0,7,10
+ srd 8,6,9
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -351,8 +372,13 @@
+ blt cr6,8f /* if total DWs = 3, then bypass loop */
+ bf 31,4f
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
+ sld 0,6,10
+ srd 8,7,9
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -363,8 +389,13 @@
+ b 4f
+ .align 4
+ 1:
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
+ sld 0,6,10
+ srd 8,7,9
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,4f
+@@ -375,23 +406,44 @@
+ addi 4,4,8
+ .align 4
+ /* copy 32 bytes at a time */
+-4: sld 0,6,10
++4:
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
++ sld 0,6,10
+ srd 8,7,9
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srd 0,7,10
++ sld 8,6,9
++#else
+ sld 0,7,10
+ srd 8,6,9
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
+ sld 0,6,10
+ srd 8,7,9
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srd 0,7,10
++ sld 8,6,9
++#else
+ sld 0,7,10
+ srd 8,6,9
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -401,9 +453,14 @@
+ .align 4
+ 8:
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srd 0,6,10
++ sld 8,7,9
++#else
+ sld 0,6,10
+ srd 8,7,9
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ 3:
+ rldicr 0,31,0,60
+@@ -413,5 +470,5 @@
+ ld 31,-8(1)
+ ld 3,-16(1)
+ blr
+-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
++END_GEN_TB (memcpy,TB_TOCLESS)
+ libc_hidden_builtin_def (memcpy)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:05:27.000000000 -0500
+@@ -1,5 +1,5 @@
+ /* Optimized memcpy implementation for PowerPC64.
+- Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc.
++ Copyright (C) 2003-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+@@ -17,52 +17,50 @@
+ <http://www.gnu.org/licenses/>. */
+
+ #include <sysdep.h>
+-#include <bp-sym.h>
+-#include <bp-asm.h>
+
+ /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'.
+
+- Memcpy handles short copies (< 32-bytes) using a binary move blocks
+- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
+- with the appropriate combination of byte and halfword load/stores.
+- There is minimal effort to optimize the alignment of short moves.
++ Memcpy handles short copies (< 32-bytes) using a binary move blocks
++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
++ with the appropriate combination of byte and halfword load/stores.
++ There is minimal effort to optimize the alignment of short moves.
+ The 64-bit implementations of POWER3 and POWER4 do a reasonable job
+- of handling unligned load/stores that do not cross 32-byte boundries.
++ of handling unaligned load/stores that do not cross 32-byte boundaries.
+
+ Longer moves (>= 32-bytes) justify the effort to get at least the
+ destination doubleword (8-byte) aligned. Further optimization is
+- posible when both source and destination are doubleword aligned.
+- Each case has a optimized unrolled loop.
+-
+- For POWER6 unaligned loads will take a 20+ cycle hicup for any
++ possible when both source and destination are doubleword aligned.
++ Each case has a optimized unrolled loop.
++
++ For POWER6 unaligned loads will take a 20+ cycle hiccup for any
+ L1 cache miss that crosses a 32- or 128-byte boundary. Store
+- is more forgiving and does not take a hicup until page or
+- segment boundaries. So we require doubleword alignment for
++ is more forgiving and does not take a hiccup until page or
++ segment boundaries. So we require doubleword alignment for
+ the source but may take a risk and only require word alignment
+ for the destination. */
+
+ .machine "power6"
+-EALIGN (BP_SYM (memcpy), 7, 0)
++EALIGN (memcpy, 7, 0)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,5,31
+ neg 0,3
+ std 3,-16(1)
+ std 31,-8(1)
+- andi. 11,3,7 /* check alignement of dst. */
++ andi. 11,3,7 /* check alignment of dst. */
+ clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
+- clrldi 10,4,61 /* check alignement of src. */
++ clrldi 10,4,61 /* check alignment of src. */
+ cmpldi cr6,5,8
+ ble- cr1,.L2 /* If move < 32 bytes use short move code. */
+ mtcrf 0x01,0
+- cmpld cr6,10,11
++ cmpld cr6,10,11
+ srdi 9,5,3 /* Number of full double words remaining. */
+ beq .L0
+-
++
+ subf 5,0,5
+- /* Move 0-7 bytes as needed to get the destination doubleword alligned.
+- Duplicate some code to maximize fall-throught and minimize agen delays. */
++ /* Move 0-7 bytes as needed to get the destination doubleword aligned.
++ Duplicate some code to maximize fall-through and minimize agen delays. */
+ 1: bf 31,2f
+ lbz 6,0(4)
+ stb 6,0(3)
+@@ -78,7 +76,7 @@
+ lwz 6,1(4)
+ stw 6,1(3)
+ b 0f
+-
++
+ 2: bf 30,4f
+ lhz 6,0(4)
+ sth 6,0(3)
+@@ -86,26 +84,26 @@
+ lwz 6,2(4)
+ stw 6,2(3)
+ b 0f
+-
++
+ 4: bf 29,0f
+ lwz 6,0(4)
+ stw 6,0(3)
+-0:
++0:
+ /* Add the number of bytes until the 1st doubleword of dst to src and dst. */
+ add 4,4,0
+ add 3,3,0
+-
+- clrldi 10,4,61 /* check alignement of src again. */
++
++ clrldi 10,4,61 /* check alignment of src again. */
+ srdi 9,5,3 /* Number of full double words remaining. */
+-
+- /* Copy doublewords from source to destination, assumpting the
++
++ /* Copy doublewords from source to destination, assuming the
+ destination is aligned on a doubleword boundary.
+
+ At this point we know there are at least 25 bytes left (32-7) to copy.
+- The next step is to determine if the source is also doubleword aligned.
++ The next step is to determine if the source is also doubleword aligned.
+ If not branch to the unaligned move code at .L6. which uses
+ a load, shift, store strategy.
+-
++
+ Otherwise source and destination are doubleword aligned, and we can
+ the optimized doubleword copy loop. */
+ .align 4
+@@ -123,14 +121,14 @@
+ the main loop exits there may be a tail of 1-7 bytes. These byte
+ are copied a word/halfword/byte at a time as needed to preserve
+ alignment.
+-
++
+ For POWER6 the L1 is store-through and the L2 is store-in. The
+ L2 is clocked at half CPU clock so we can store 16 bytes every
+ other cycle. POWER6 also has a load/store bypass so we can do
+- load, load, store, store every 2 cycles.
+-
++ load, load, store, store every 2 cycles.
++
+ The following code is sensitive to cache line alignment. Do not
+- make any change with out first making sure thay don't result in
++ make any change with out first making sure they don't result in
+ splitting ld/std pairs across a cache line. */
+
+ mtcrf 0x02,5
+@@ -273,7 +271,7 @@
+ std 8,16+96(10)
+ std 0,24+96(10)
+ ble cr5,L(das_loop_e)
+-
++
+ mtctr 12
+ .align 4
+ L(das_loop2):
+@@ -326,10 +324,10 @@
+ .align 4
+ L(das_tail):
+ beq cr1,0f
+-
++
+ L(das_tail2):
+ /* At this point we have a tail of 0-7 bytes and we know that the
+- destiniation is double word aligned. */
++ destination is double word aligned. */
+ 4: bf 29,2f
+ lwz 6,0(4)
+ stw 6,0(3)
+@@ -344,7 +342,7 @@
+ lbz 6,4(4)
+ stb 6,4(3)
+ b 0f
+-
++
+ 2: bf 30,1f
+ lhz 6,0(4)
+ sth 6,0(3)
+@@ -352,7 +350,7 @@
+ lbz 6,2(4)
+ stb 6,2(3)
+ b 0f
+-
++
+ 1: bf 31,0f
+ lbz 6,0(4)
+ stb 6,0(3)
+@@ -361,7 +359,7 @@
+ ld 3,-16(1)
+ blr
+
+-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
+ bytes. Each case is handled without loops, using binary (1,2,4,8)
+ tests.
+
+@@ -402,15 +400,28 @@
+ blt cr6,5f
+ srdi 7,6,16
+ bgt cr6,3f
++#ifdef __LITTLE_ENDIAN__
++ sth 7,0(3)
++#else
+ sth 6,0(3)
++#endif
+ b 7f
+ .align 4
+ 3:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,24
++ stb 6,0(3)
++ sth 7,1(3)
++#else
+ stb 7,0(3)
+ sth 6,1(3)
++#endif
+ b 7f
+ .align 4
+ 5:
++#ifdef __LITTLE_ENDIAN__
++ rotlwi 6,6,8
++#endif
+ stb 6,0(3)
+ 7:
+ cmpldi cr1,10,16
+@@ -421,7 +432,7 @@
+ /* At least 6 bytes left and the source is word aligned. This allows
+ some speculative loads up front. */
+ /* We need to special case the fall-through because the biggest delays
+- are due to address computation not being ready in time for the
++ are due to address computation not being ready in time for the
+ AGEN. */
+ lwz 6,0(12)
+ lwz 7,4(12)
+@@ -452,7 +463,7 @@
+ ld 3,-16(1)
+ blr
+ .align 4
+-L(dus_tail16p8): /* less then 8 bytes left. */
++L(dus_tail16p8): /* less than 8 bytes left. */
+ beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */
+ cmpldi cr1,10,20
+ bf 29,L(dus_tail16p2)
+@@ -466,7 +477,7 @@
+ ld 3,-16(1)
+ blr
+ .align 4
+-L(dus_tail16p4): /* less then 4 bytes left. */
++L(dus_tail16p4): /* less than 4 bytes left. */
+ addi 12,12,24
+ addi 3,3,24
+ bgt cr0,L(dus_tail2)
+@@ -474,7 +485,7 @@
+ ld 3,-16(1)
+ blr
+ .align 4
+-L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */
++L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
+ addi 12,12,16
+ addi 3,3,16
+ b L(dus_tail2)
+@@ -499,7 +510,7 @@
+ ld 3,-16(1)
+ blr
+ .align 4
+-L(dus_tail8p4): /* less then 4 bytes left. */
++L(dus_tail8p4): /* less than 4 bytes left. */
+ addi 12,12,8
+ addi 3,3,8
+ bgt cr1,L(dus_tail2)
+@@ -510,14 +521,14 @@
+ .align 4
+ L(dus_tail4): /* Move 4 bytes. */
+ /* r6 already loaded speculatively. If we are here we know there is
+- more then 4 bytes left. So there is no need to test. */
++ more than 4 bytes left. So there is no need to test. */
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+ L(dus_tail2): /* Move 2-3 bytes. */
+ bf 30,L(dus_tail1)
+ lhz 6,0(12)
+- sth 6,0(3)
++ sth 6,0(3)
+ bf 31,L(dus_tailX)
+ lbz 7,2(12)
+ stb 7,2(3)
+@@ -537,7 +548,7 @@
+ .LE8:
+ mr 12,4
+ bne cr6,L(dus_4)
+-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20
++/* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20
+ cycle delay. This case should be rare and any attempt to avoid this
+ would take most of 20 cycles any way. */
+ ld 6,0(4)
+@@ -552,7 +563,7 @@
+ stw 6,0(3)
+ bf 30,L(dus_5)
+ lhz 7,4(4)
+- sth 7,4(3)
++ sth 7,4(3)
+ bf 31,L(dus_0)
+ lbz 8,6(4)
+ stb 8,6(3)
+@@ -590,20 +601,31 @@
+ bge cr0, L(du4_do)
+ blt cr5, L(du1_do)
+ beq cr5, L(du2_do)
+- b L(du3_do)
+-
++ b L(du3_do)
++
+ .align 4
+ L(du1_do):
+ bf 30,L(du1_1dw)
+
+ /* there are at least two DWs to copy */
++ /* FIXME: can combine last shift and "or" into "rldimi" */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 8
++ sldi 8,6, 64-8
++#else
+ sldi 0,7, 8
+ srdi 8,6, 64-8
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -612,8 +634,13 @@
+ blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du1_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -624,8 +651,13 @@
+ b L(du1_loop)
+ .align 4
+ L(du1_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du1_loop)
+@@ -637,23 +669,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du1_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 8
++ sldi 8,6, 64-8
++#else
+ sldi 0,7, 8
+ srdi 8,6, 64-8
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 8
++ sldi 8,6, 64-8
++#else
+ sldi 0,7, 8
+ srdi 8,6, 64-8
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -663,9 +715,14 @@
+ .align 4
+ L(du1_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 8
++ sldi 8,7, 64-8
++#else
+ sldi 0,6, 8
+ srdi 8,7, 64-8
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -674,13 +731,23 @@
+ bf 30,L(du2_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 16
++ sldi 8,6, 64-16
++#else
+ sldi 0,7, 16
+ srdi 8,6, 64-16
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -689,8 +756,13 @@
+ blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du2_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -701,8 +773,13 @@
+ b L(du2_loop)
+ .align 4
+ L(du2_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du2_loop)
+@@ -714,23 +791,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du2_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 16
++ sldi 8,6, 64-16
++#else
+ sldi 0,7, 16
+ srdi 8,6, 64-16
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 16
++ sldi 8,6, 64-16
++#else
+ sldi 0,7, 16
+ srdi 8,6, 64-16
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -740,9 +837,14 @@
+ .align 4
+ L(du2_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 16
++ sldi 8,7, 64-16
++#else
+ sldi 0,6, 16
+ srdi 8,7, 64-16
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -751,13 +853,23 @@
+ bf 30,L(du3_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 24
++ sldi 8,6, 64-24
++#else
+ sldi 0,7, 24
+ srdi 8,6, 64-24
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -766,8 +878,13 @@
+ blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du3_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -778,8 +895,13 @@
+ b L(du3_loop)
+ .align 4
+ L(du3_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du3_loop)
+@@ -791,23 +913,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du3_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 24
++ sldi 8,6, 64-24
++#else
+ sldi 0,7, 24
+ srdi 8,6, 64-24
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 24
++ sldi 8,6, 64-24
++#else
+ sldi 0,7, 24
+ srdi 8,6, 64-24
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -817,9 +959,14 @@
+ .align 4
+ L(du3_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 24
++ sldi 8,7, 64-24
++#else
+ sldi 0,6, 24
+ srdi 8,7, 64-24
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -834,13 +981,23 @@
+ bf 30,L(du4_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 32
++ sldi 8,6, 64-32
++#else
+ sldi 0,7, 32
+ srdi 8,6, 64-32
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -849,8 +1006,13 @@
+ blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du4_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -861,8 +1023,13 @@
+ b L(du4_loop)
+ .align 4
+ L(du4_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du4_loop)
+@@ -874,23 +1041,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du4_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 32
++ sldi 8,6, 64-32
++#else
+ sldi 0,7, 32
+ srdi 8,6, 64-32
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 32
++ sldi 8,6, 64-32
++#else
+ sldi 0,7, 32
+ srdi 8,6, 64-32
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -900,9 +1087,14 @@
+ .align 4
+ L(du4_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 32
++ sldi 8,7, 64-32
++#else
+ sldi 0,6, 32
+ srdi 8,7, 64-32
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -911,13 +1103,23 @@
+ bf 30,L(du5_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 40
++ sldi 8,6, 64-40
++#else
+ sldi 0,7, 40
+ srdi 8,6, 64-40
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -926,8 +1128,13 @@
+ blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du5_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -938,8 +1145,13 @@
+ b L(du5_loop)
+ .align 4
+ L(du5_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du5_loop)
+@@ -951,23 +1163,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du5_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 40
++ sldi 8,6, 64-40
++#else
+ sldi 0,7, 40
+ srdi 8,6, 64-40
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 40
++ sldi 8,6, 64-40
++#else
+ sldi 0,7, 40
+ srdi 8,6, 64-40
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -977,9 +1209,14 @@
+ .align 4
+ L(du5_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 40
++ sldi 8,7, 64-40
++#else
+ sldi 0,6, 40
+ srdi 8,7, 64-40
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -988,13 +1225,23 @@
+ bf 30,L(du6_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 48
++ sldi 8,6, 64-48
++#else
+ sldi 0,7, 48
+ srdi 8,6, 64-48
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -1003,8 +1250,13 @@
+ blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du6_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -1015,8 +1267,13 @@
+ b L(du6_loop)
+ .align 4
+ L(du6_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du6_loop)
+@@ -1028,23 +1285,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du6_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 48
++ sldi 8,6, 64-48
++#else
+ sldi 0,7, 48
+ srdi 8,6, 64-48
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 48
++ sldi 8,6, 64-48
++#else
+ sldi 0,7, 48
+ srdi 8,6, 64-48
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -1054,9 +1331,14 @@
+ .align 4
+ L(du6_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 48
++ sldi 8,7, 64-48
++#else
+ sldi 0,6, 48
+ srdi 8,7, 64-48
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+
+@@ -1065,13 +1347,23 @@
+ bf 30,L(du7_1dw)
+
+ /* there are at least two DWs to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 56
++ sldi 8,6, 64-56
++#else
+ sldi 0,7, 56
+ srdi 8,6, 64-56
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,8(4)
+@@ -1080,8 +1372,13 @@
+ blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */
+ bf 31,L(du7_loop)
+ /* there is a third DW to copy */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
++#endif
+ or 0,0,8
+ std 0,0(4)
+ mr 6,7
+@@ -1092,8 +1389,13 @@
+ b L(du7_loop)
+ .align 4
+ L(du7_1dw):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
++#endif
+ addi 5,5,16
+ or 0,0,8
+ bf 31,L(du7_loop)
+@@ -1105,23 +1407,43 @@
+ .align 4
+ /* copy 32 bytes at a time */
+ L(du7_loop):
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
++#endif
+ or 0,0,8
+ ld 6,0(5)
+ std 0,0(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 56
++ sldi 8,6, 64-56
++#else
+ sldi 0,7, 56
+ srdi 8,6, 64-56
++#endif
+ or 0,0,8
+ ld 7,8(5)
+ std 0,8(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
++#endif
+ or 0,0,8
+ ld 6,16(5)
+ std 0,16(4)
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,7, 56
++ sldi 8,6, 64-56
++#else
+ sldi 0,7, 56
+ srdi 8,6, 64-56
++#endif
+ or 0,0,8
+ ld 7,24(5)
+ std 0,24(4)
+@@ -1131,12 +1453,17 @@
+ .align 4
+ L(du7_fini):
+ /* calculate and store the final DW */
++#ifdef __LITTLE_ENDIAN__
++ srdi 0,6, 56
++ sldi 8,7, 64-56
++#else
+ sldi 0,6, 56
+ srdi 8,7, 64-56
+- or 0,0,8
++#endif
++ or 0,0,8
+ std 0,0(4)
+ b L(du_done)
+-
++
+ .align 4
+ L(du_done):
+ rldicr 0,31,0,60
+@@ -1144,9 +1471,9 @@
+ beq cr1,0f /* If the tail is 0 bytes we are done! */
+
+ add 3,3,0
+- add 12,12,0
++ add 12,12,0
+ /* At this point we have a tail of 0-7 bytes and we know that the
+- destiniation is double word aligned. */
++ destination is double word aligned. */
+ 4: bf 29,2f
+ lwz 6,0(12)
+ addi 12,12,4
+@@ -1165,5 +1492,5 @@
+ ld 31,-8(1)
+ ld 3,-16(1)
+ blr
+-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
++END_GEN_TB (memcpy,TB_TOCLESS)
+ libc_hidden_builtin_def (memcpy)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:05:40.000000000 -0500
+@@ -1,5 +1,5 @@
+ /* Optimized memcpy implementation for PowerPC64/POWER7.
+- Copyright (C) 2010, 2011 Free Software Foundation, Inc.
++ Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+@@ -18,425 +18,366 @@
+ <http://www.gnu.org/licenses/>. */
+
+ #include <sysdep.h>
+-#include <bp-sym.h>
+-#include <bp-asm.h>
+
+
+ /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
++#define dst 11 /* Use r11 so r3 kept unchanged. */
++#define src 4
++#define cnt 5
++
+ .machine power7
+-EALIGN (BP_SYM (memcpy), 5, 0)
++EALIGN (memcpy, 5, 0)
+ CALL_MCOUNT 3
+
+- cmpldi cr1,5,31
++ cmpldi cr1,cnt,31
+ neg 0,3
+- std 3,-16(1)
+- std 31,-8(1)
+- cfi_offset(31,-8)
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+- andi. 11,3,7 /* Check alignment of DST. */
+-
++#ifdef __LITTLE_ENDIAN__
++/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
++ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
++ loop is only used for quadword aligned copies. */
++ andi. 10,3,15
++ clrldi 11,4,60
++#else
++ andi. 10,3,7 /* Check alignment of DST. */
++ clrldi 11,4,61 /* Check alignment of SRC. */
++#endif
++ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+- clrldi 10,4,61 /* Check alignment of SRC. */
+- cmpld cr6,10,11 /* SRC and DST alignments match? */
+- mr 12,4
+- mr 31,5
++ mr dst,3
+ bne cr6,L(copy_GE_32_unaligned)
++ beq L(aligned_copy)
+
+- srdi 9,5,3 /* Number of full quadwords remaining. */
+-
+- beq L(copy_GE_32_aligned_cont)
+-
+- clrldi 0,0,61
+- mtcrf 0x01,0
+- subf 31,0,5
+-
+- /* Get the SRC aligned to 8 bytes. */
+-
+-1: bf 31,2f
+- lbz 6,0(12)
+- addi 12,12,1
+- stb 6,0(3)
+- addi 3,3,1
+-2: bf 30,4f
+- lhz 6,0(12)
+- addi 12,12,2
+- sth 6,0(3)
+- addi 3,3,2
+-4: bf 29,0f
+- lwz 6,0(12)
+- addi 12,12,4
+- stw 6,0(3)
+- addi 3,3,4
+-0:
+- clrldi 10,12,61 /* Check alignment of SRC again. */
+- srdi 9,31,3 /* Number of full doublewords remaining. */
+-
+-L(copy_GE_32_aligned_cont):
+-
+- clrldi 11,31,61
+- mtcrf 0x01,9
+-
+- srdi 8,31,5
+- cmpldi cr1,9,4
+- cmpldi cr6,11,0
+- mr 11,12
++ mtocrf 0x01,0
++#ifdef __LITTLE_ENDIAN__
++ clrldi 0,0,60
++#else
++ clrldi 0,0,61
++#endif
+
+- /* Copy 1~3 doublewords so the main loop starts
+- at a multiple of 32 bytes. */
+-
+- bf 30,1f
+- ld 6,0(12)
+- ld 7,8(12)
+- addi 11,12,16
+- mtctr 8
+- std 6,0(3)
+- std 7,8(3)
+- addi 10,3,16
+- bf 31,4f
+- ld 0,16(12)
+- std 0,16(3)
+- blt cr1,3f
+- addi 11,12,24
+- addi 10,3,24
+- b 4f
+-
+- .align 4
+-1: /* Copy 1 doubleword and set the counter. */
+- mr 10,3
+- mtctr 8
+- bf 31,4f
+- ld 6,0(12)
+- addi 11,12,8
+- std 6,0(3)
+- addi 10,3,8
++/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
++1:
++ bf 31,2f
++ lbz 6,0(src)
++ addi src,src,1
++ stb 6,0(dst)
++ addi dst,dst,1
++2:
++ bf 30,4f
++ lhz 6,0(src)
++ addi src,src,2
++ sth 6,0(dst)
++ addi dst,dst,2
++4:
++ bf 29,8f
++ lwz 6,0(src)
++ addi src,src,4
++ stw 6,0(dst)
++ addi dst,dst,4
++8:
++#ifdef __LITTLE_ENDIAN__
++ bf 28,16f
++ ld 6,0(src)
++ addi src,src,8
++ std 6,0(dst)
++ addi dst,dst,8
++16:
++#endif
++ subf cnt,0,cnt
+
++/* Main aligned copy loop. Copies 128 bytes at a time. */
+ L(aligned_copy):
+- /* Main aligned copy loop. Copies up to 128-bytes at a time. */
+- .align 4
+-4:
+- /* check for any 32-byte or 64-byte lumps that are outside of a
+- nice 128-byte range. R8 contains the number of 32-byte
+- lumps, so drop this into the CR, and use the SO/EQ bits to help
+- handle the 32- or 64- byte lumps. Then handle the rest with an
+- unrolled 128-bytes-at-a-time copy loop. */
+- mtocrf 1,8
+- li 6,16 # 16() index
+- li 7,32 # 32() index
+- li 8,48 # 48() index
+-
+-L(aligned_32byte):
+- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+- bns cr7,L(aligned_64byte)
+- lxvd2x 6,0,11
+- lxvd2x 7,11,6
+- addi 11,11,32
+- stxvd2x 6,0,10
+- stxvd2x 7,10,6
+- addi 10,10,32
+-
+-L(aligned_64byte):
+- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+- bne cr7,L(aligned_128setup)
+- lxvd2x 6,0,11
+- lxvd2x 7,11,6
+- lxvd2x 8,11,7
+- lxvd2x 9,11,8
+- addi 11,11,64
+- stxvd2x 6,0,10
+- stxvd2x 7,10,6
+- stxvd2x 8,10,7
+- stxvd2x 9,10,8
+- addi 10,10,64
+-
+-L(aligned_128setup):
+- /* Set up for the 128-byte at a time copy loop. */
+- srdi 8,31,7
+- cmpdi 8,0 # Any 4x lumps left?
+- beq 3f # if not, move along.
+- lxvd2x 6,0,11
+- lxvd2x 7,11,6
+- mtctr 8 # otherwise, load the ctr and begin.
+- li 8,48 # 48() index
++ li 6,16
++ li 7,32
++ li 8,48
++ mtocrf 0x02,cnt
++ srdi 12,cnt,7
++ cmpdi 12,0
++ beq L(aligned_tail)
++ lxvd2x 6,0,src
++ lxvd2x 7,src,6
++ mtctr 12
+ b L(aligned_128loop)
+
++ .align 4
+ L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+- lxvd2x 6,0,11
+- lxvd2x 7,11,6
++ lxvd2x 6,0,src
++ lxvd2x 7,src,6
+ L(aligned_128loop):
+- lxvd2x 8,11,7
+- lxvd2x 9,11,8
+- stxvd2x 6,0,10
+- addi 11,11,64
+- stxvd2x 7,10,6
+- stxvd2x 8,10,7
+- stxvd2x 9,10,8
+- lxvd2x 6,0,11
+- lxvd2x 7,11,6
+- addi 10,10,64
+- lxvd2x 8,11,7
+- lxvd2x 9,11,8
+- addi 11,11,64
+- stxvd2x 6,0,10
+- stxvd2x 7,10,6
+- stxvd2x 8,10,7
+- stxvd2x 9,10,8
+- addi 10,10,64
++ lxvd2x 8,src,7
++ lxvd2x 9,src,8
++ stxvd2x 6,0,dst
++ addi src,src,64
++ stxvd2x 7,dst,6
++ stxvd2x 8,dst,7
++ stxvd2x 9,dst,8
++ lxvd2x 6,0,src
++ lxvd2x 7,src,6
++ addi dst,dst,64
++ lxvd2x 8,src,7
++ lxvd2x 9,src,8
++ addi src,src,64
++ stxvd2x 6,0,dst
++ stxvd2x 7,dst,6
++ stxvd2x 8,dst,7
++ stxvd2x 9,dst,8
++ addi dst,dst,64
+ bdnz L(aligned_128head)
+
+-3:
+- /* Check for tail bytes. */
+- rldicr 0,31,0,60
+- mtcrf 0x01,31
+- beq cr6,0f
+-
+-.L9:
+- add 3,3,0
+- add 12,12,0
+-
+- /* At this point we have a tail of 0-7 bytes and we know that the
+- destination is doubleword-aligned. */
+-4: /* Copy 4 bytes. */
+- bf 29,2f
+-
+- lwz 6,0(12)
+- addi 12,12,4
+- stw 6,0(3)
+- addi 3,3,4
+-2: /* Copy 2 bytes. */
+- bf 30,1f
+-
+- lhz 6,0(12)
+- addi 12,12,2
+- sth 6,0(3)
+- addi 3,3,2
+-1: /* Copy 1 byte. */
+- bf 31,0f
+-
+- lbz 6,0(12)
+- stb 6,0(3)
+-0: /* Return original DST pointer. */
+- ld 31,-8(1)
+- ld 3,-16(1)
++L(aligned_tail):
++ mtocrf 0x01,cnt
++ bf 25,32f
++ lxvd2x 6,0,src
++ lxvd2x 7,src,6
++ lxvd2x 8,src,7
++ lxvd2x 9,src,8
++ addi src,src,64
++ stxvd2x 6,0,dst
++ stxvd2x 7,dst,6
++ stxvd2x 8,dst,7
++ stxvd2x 9,dst,8
++ addi dst,dst,64
++32:
++ bf 26,16f
++ lxvd2x 6,0,src
++ lxvd2x 7,src,6
++ addi src,src,32
++ stxvd2x 6,0,dst
++ stxvd2x 7,dst,6
++ addi dst,dst,32
++16:
++ bf 27,8f
++ lxvd2x 6,0,src
++ addi src,src,16
++ stxvd2x 6,0,dst
++ addi dst,dst,16
++8:
++ bf 28,4f
++ ld 6,0(src)
++ addi src,src,8
++ std 6,0(dst)
++ addi dst,dst,8
++4: /* Copies 4~7 bytes. */
++ bf 29,L(tail2)
++ lwz 6,0(src)
++ stw 6,0(dst)
++ bf 30,L(tail5)
++ lhz 7,4(src)
++ sth 7,4(dst)
++ bflr 31
++ lbz 8,6(src)
++ stb 8,6(dst)
++ /* Return original DST pointer. */
+ blr
+
+- /* Handle copies of 0~31 bytes. */
+- .align 4
++
++/* Handle copies of 0~31 bytes. */
++ .align 4
+ L(copy_LT_32):
+- cmpldi cr6,5,8
+- mr 12,4
+- mtcrf 0x01,5
++ mr dst,3
++ cmpldi cr6,cnt,8
++ mtocrf 0x01,cnt
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+- clrrdi 11,4,2
+- andi. 0,8,3
+- cmpldi cr1,5,16
+- mr 10,5
++ andi. 0,8,3
++ cmpldi cr1,cnt,16
+ beq L(copy_LT_32_aligned)
+
+- /* Force 4-bytes alignment for SRC. */
+- mtocrf 0x01,0
+- subf 10,0,5
+-2: bf 30,1f
+-
+- lhz 6,0(12)
+- addi 12,12,2
+- sth 6,0(3)
+- addi 3,3,2
+-1: bf 31,L(end_4bytes_alignment)
+-
+- lbz 6,0(12)
+- addi 12,12,1
+- stb 6,0(3)
+- addi 3,3,1
++ /* Force 4-byte alignment for SRC. */
++ mtocrf 0x01,0
++ subf cnt,0,cnt
++2:
++ bf 30,1f
++ lhz 6,0(src)
++ addi src,src,2
++ sth 6,0(dst)
++ addi dst,dst,2
++1:
++ bf 31,L(end_4bytes_alignment)
++ lbz 6,0(src)
++ addi src,src,1
++ stb 6,0(dst)
++ addi dst,dst,1
+
+- .align 4
++ .align 4
+ L(end_4bytes_alignment):
+- cmpldi cr1,10,16
+- mtcrf 0x01,10
++ cmpldi cr1,cnt,16
++ mtocrf 0x01,cnt
+
+ L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+- lwz 6,0(12)
+- lwz 7,4(12)
+- stw 6,0(3)
+- lwz 8,8(12)
+- stw 7,4(3)
+- lwz 6,12(12)
+- addi 12,12,16
+- stw 8,8(3)
+- stw 6,12(3)
+- addi 3,3,16
++ lwz 6,0(src)
++ lwz 7,4(src)
++ stw 6,0(dst)
++ lwz 8,8(src)
++ stw 7,4(dst)
++ lwz 6,12(src)
++ addi src,src,16
++ stw 8,8(dst)
++ stw 6,12(dst)
++ addi dst,dst,16
+ 8: /* Copy 8 bytes. */
+- bf 28,4f
++ bf 28,L(tail4)
++ lwz 6,0(src)
++ lwz 7,4(src)
++ addi src,src,8
++ stw 6,0(dst)
++ stw 7,4(dst)
++ addi dst,dst,8
++
++ .align 4
++/* Copies 4~7 bytes. */
++L(tail4):
++ bf 29,L(tail2)
++ lwz 6,0(src)
++ stw 6,0(dst)
++ bf 30,L(tail5)
++ lhz 7,4(src)
++ sth 7,4(dst)
++ bflr 31
++ lbz 8,6(src)
++ stb 8,6(dst)
++ /* Return original DST pointer. */
++ blr
+
+- lwz 6,0(12)
+- lwz 7,4(12)
+- addi 12,12,8
+- stw 6,0(3)
+- stw 7,4(3)
+- addi 3,3,8
+-4: /* Copy 4 bytes. */
+- bf 29,2f
+-
+- lwz 6,0(12)
+- addi 12,12,4
+- stw 6,0(3)
+- addi 3,3,4
+-2: /* Copy 2-3 bytes. */
++ .align 4
++/* Copies 2~3 bytes. */
++L(tail2):
+ bf 30,1f
+-
+- lhz 6,0(12)
+- sth 6,0(3)
+- bf 31,0f
+- lbz 7,2(12)
+- stb 7,2(3)
+- ld 3,-16(1)
++ lhz 6,0(src)
++ sth 6,0(dst)
++ bflr 31
++ lbz 7,2(src)
++ stb 7,2(dst)
+ blr
+
+- .align 4
+-1: /* Copy 1 byte. */
+- bf 31,0f
++ .align 4
++L(tail5):
++ bflr 31
++ lbz 6,4(src)
++ stb 6,4(dst)
++ blr
+
+- lbz 6,0(12)
+- stb 6,0(3)
+-0: /* Return original DST pointer. */
+- ld 3,-16(1)
++ .align 4
++1:
++ bflr 31
++ lbz 6,0(src)
++ stb 6,0(dst)
++ /* Return original DST pointer. */
+ blr
+
+- /* Handles copies of 0~8 bytes. */
+- .align 4
++
++/* Handles copies of 0~8 bytes. */
++ .align 4
+ L(copy_LE_8):
+- bne cr6,4f
++ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+- lwz 6,0(4)
+- lwz 7,4(4)
+- stw 6,0(3)
+- stw 7,4(3)
+- ld 3,-16(1) /* Return original DST pointers. */
++ lwz 6,0(src)
++ lwz 7,4(src)
++ stw 6,0(dst)
++ stw 7,4(dst)
+ blr
+
+- .align 4
+-4: /* Copies 4~7 bytes. */
+- bf 29,2b
+
+- lwz 6,0(4)
+- stw 6,0(3)
+- bf 30,5f
+- lhz 7,4(4)
+- sth 7,4(3)
+- bf 31,0f
+- lbz 8,6(4)
+- stb 8,6(3)
+- ld 3,-16(1)
+- blr
+-
+- .align 4
+-5: /* Copy 1 byte. */
+- bf 31,0f
+-
+- lbz 6,4(4)
+- stb 6,4(3)
+-
+-0: /* Return original DST pointer. */
+- ld 3,-16(1)
+- blr
+-
+- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+- SRC is not. Use aligned quadword loads from SRC, shifted to realign
+- the data, allowing for aligned DST stores. */
+- .align 4
++/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
++ SRC is not. Use aligned quadword loads from SRC, shifted to realign
++ the data, allowing for aligned DST stores. */
++ .align 4
+ L(copy_GE_32_unaligned):
+- clrldi 0,0,60 /* Number of bytes until the 1st
+- quadword. */
+- andi. 11,3,15 /* Check alignment of DST (against
+- quadwords). */
+- srdi 9,5,4 /* Number of full quadwords remaining. */
++ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
++#ifndef __LITTLE_ENDIAN__
++ andi. 10,3,15 /* Check alignment of DST (against quadwords). */
++#endif
++ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+- /* SRC is not quadword aligned, get it aligned. */
++ /* DST is not quadword aligned, get it aligned. */
+
+- mtcrf 0x01,0
+- subf 31,0,5
++ mtocrf 0x01,0
++ subf cnt,0,cnt
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+-1: /* Copy 1 byte. */
++1:
+ bf 31,2f
+-
+- lbz 6,0(12)
+- addi 12,12,1
+- stb 6,0(3)
+- addi 3,3,1
+-2: /* Copy 2 bytes. */
++ lbz 6,0(src)
++ addi src,src,1
++ stb 6,0(dst)
++ addi dst,dst,1
++2:
+ bf 30,4f
+-
+- lhz 6,0(12)
+- addi 12,12,2
+- sth 6,0(3)
+- addi 3,3,2
+-4: /* Copy 4 bytes. */
++ lhz 6,0(src)
++ addi src,src,2
++ sth 6,0(dst)
++ addi dst,dst,2
++4:
+ bf 29,8f
+-
+- lwz 6,0(12)
+- addi 12,12,4
+- stw 6,0(3)
+- addi 3,3,4
+-8: /* Copy 8 bytes. */
++ lwz 6,0(src)
++ addi src,src,4
++ stw 6,0(dst)
++ addi dst,dst,4
++8:
+ bf 28,0f
+-
+- ld 6,0(12)
+- addi 12,12,8
+- std 6,0(3)
+- addi 3,3,8
++ ld 6,0(src)
++ addi src,src,8
++ std 6,0(dst)
++ addi dst,dst,8
+ 0:
+- clrldi 10,12,60 /* Check alignment of SRC. */
+- srdi 9,31,4 /* Number of full quadwords remaining. */
++ srdi 9,cnt,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+ L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+- clrldi 11,31,60
+- li 6,16 /* Index for 16-bytes offsets. */
++ clrldi 10,cnt,60
++ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+- cmpldi cr1,11,0
+- srdi 8,31,5 /* Setup the loop counter. */
+- mr 10,3
+- mr 11,12
+- mtcrf 0x01,9
+- cmpldi cr6,9,1
+- lvsl 5,0,12
+- lvx 3,0,12
+- bf 31,L(setup_unaligned_loop)
+-
+- /* Copy another 16 bytes to align to 32-bytes due to the loop . */
+- lvx 4,12,6
+- vperm 6,3,4,5
+- addi 11,12,16
+- addi 10,3,16
+- stvx 6,0,3
++ cmpldi cr1,10,0
++ srdi 8,cnt,5 /* Setup the loop counter. */
++ mtocrf 0x01,9
++ cmpldi cr6,9,1
++#ifdef __LITTLE_ENDIAN__
++ lvsr 5,0,src
++#else
++ lvsl 5,0,src
++#endif
++ lvx 3,0,src
++ li 0,0
++ bf 31,L(setup_unaligned_loop)
++
++ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
++ lvx 4,src,6
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
++ addi src,src,16
++ stvx 6,0,dst
++ addi dst,dst,16
+ vor 3,4,4
++ clrrdi 0,src,60
+
+ L(setup_unaligned_loop):
+- mtctr 8
+- ble cr6,L(end_unaligned_loop)
++ mtctr 8
++ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+- .align 4
++ .align 4
+ L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+@@ -444,63 +385,56 @@
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+- lvx 4,11,6 /* vr4 = r11+16. */
+- vperm 6,3,4,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr6. */
+- lvx 3,11,7 /* vr3 = r11+32. */
+- vperm 10,4,3,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr10. */
+- addi 11,11,32
+- stvx 6,0,10
+- stvx 10,10,6
+- addi 10,10,32
+-
++ lvx 4,src,6
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
++ lvx 3,src,7
++#ifdef __LITTLE_ENDIAN__
++ vperm 10,3,4,5
++#else
++ vperm 10,4,3,5
++#endif
++ addi src,src,32
++ stvx 6,0,dst
++ stvx 10,dst,6
++ addi dst,dst,32
+ bdnz L(unaligned_loop)
+
+- .align 4
++ clrrdi 0,src,60
++
++ .align 4
+ L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+- rldicr 0,31,0,59
+- mtcrf 0x01,31
+- beq cr1,0f
++ mtocrf 0x01,cnt
++ beqlr cr1
+
+- add 3,3,0
+- add 12,12,0
++ add src,src,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+-8: /* Copy 8 bytes. */
++ /* Copy 8 bytes. */
+ bf 28,4f
+-
+- lwz 6,0(12)
+- lwz 7,4(12)
+- addi 12,12,8
+- stw 6,0(3)
+- stw 7,4(3)
+- addi 3,3,8
+-4: /* Copy 4 bytes. */
+- bf 29,2f
+-
+- lwz 6,0(12)
+- addi 12,12,4
+- stw 6,0(3)
+- addi 3,3,4
+-2: /* Copy 2~3 bytes. */
+- bf 30,1f
+-
+- lhz 6,0(12)
+- addi 12,12,2
+- sth 6,0(3)
+- addi 3,3,2
+-1: /* Copy 1 byte. */
+- bf 31,0f
+-
+- lbz 6,0(12)
+- stb 6,0(3)
+-0: /* Return original DST pointer. */
+- ld 31,-8(1)
+- ld 3,-16(1)
++ lwz 6,0(src)
++ lwz 7,4(src)
++ addi src,src,8
++ stw 6,0(dst)
++ stw 7,4(dst)
++ addi dst,dst,8
++4: /* Copy 4~7 bytes. */
++ bf 29,L(tail2)
++ lwz 6,0(src)
++ stw 6,0(dst)
++ bf 30,L(tail5)
++ lhz 7,4(src)
++ sth 7,4(dst)
++ bflr 31
++ lbz 8,6(src)
++ stb 8,6(dst)
++ /* Return original DST pointer. */
+ blr
+
+-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
++END_GEN_TB (memcpy,TB_TOCLESS)
+ libc_hidden_builtin_def (memcpy)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500
+@@ -367,13 +367,21 @@
+ mr 11,12
+ mtcrf 0x01,9
+ cmpldi cr6,9,1
+- lvsl 5,0,12
++#ifdef __LITTLE_ENDIAN__
++ lvsr 5,0,12
++#else
++ lvsl 5,0,12
++#endif
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop . */
+ lvx 4,12,6
+- vperm 6,3,4,5
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+@@ -393,11 +401,17 @@
+ vector instructions though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+- vperm 6,3,4,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr6. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 6,4,3,5
++#else
++ vperm 6,3,4,5
++#endif
+ lvx 3,11,7 /* vr3 = r11+32. */
+- vperm 10,4,3,5 /* Merge the correctly-aligned portions
+- of vr3/vr4 into vr10. */
++#ifdef __LITTLE_ENDIAN__
++ vperm 10,3,4,5
++#else
++ vperm 10,4,3,5
++#endif
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
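
Most of this patch follows a single pattern: under __LITTLE_ENDIAN__ the integer shift directions are swapped (sld/srd, slw/srw, sldi/srdi), and the VMX path switches lvsl to lvsr while reversing the first two vperm operands, so the merge of two aligned loads still yields the bytes in ascending address order. The C sketch below is an illustrative aid only, not part of the patch: it shows why the shift directions flip in the doubleword-merge sequences of the unaligned copy loops. The function and variable names are hypothetical, and __LITTLE_ENDIAN__ is assumed to be the compiler predefine the assembly tests.

#include <stdint.h>

/* Hypothetical helper mirroring the shift/shift/or sequence in the
   unaligned copy loops.  w0 is the aligned doubleword at the lower
   address, w1 the next one, and off (1-7) is the source misalignment
   in bytes.  The result is the doubleword that starts 'off' bytes into
   w0, i.e. what a single unaligned 8-byte load would have returned.  */
static inline uint64_t
merge_unaligned (uint64_t w0, uint64_t w1, unsigned off)
{
  unsigned sh  = off * 8;   /* shift count the assembly keeps in r10 */
  unsigned inv = 64 - sh;   /* complementary count kept in r9        */
#ifdef __LITTLE_ENDIAN__
  /* LE: the lowest-addressed byte is the least-significant byte, so the
     wanted tail of w0 sits in its high bits and must move down, while the
     head of w1 moves up -- hence the srd/sld pairs in the patched code.  */
  return (w0 >> sh) | (w1 << inv);
#else
  /* BE: the lowest-addressed byte is the most-significant byte, so the
     directions are reversed -- the original sld/srd sequence.  */
  return (w0 << sh) | (w1 >> inv);
#endif
}

The same reasoning carries over to the 32-bit slw/srw variants, and in the POWER7 vector loops the lvsr plus swapped-operand vperm computes the equivalent byte permutation for 16-byte chunks.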