author | Chris Packham <judge.packham@gmail.com> | 2021-05-18 08:46:49 (GMT)
---|---|---
committer | GitHub <noreply@github.com> | 2021-05-18 08:46:49 (GMT)
commit | 6d008334bcfa76f8b46e61d9edb6dd5335cd6632 (patch) |
tree | cd137ec7ab048fa32049a4322c10a0e27ba80c20 /packages/glibc/2.17/0053-glibc-ppc64le-31.patch |
parent | f284f4149518de6e8c403a9392be8e817bfab2e8 (diff) |
parent | 0088351811bf442aa2e7d35c564f36ca67a8a699 (diff) |
Merge pull request #1510 from messense/glibc-ppc64le-patches
Add GLIBC 2.17 support to powerpc64le-unknown-linux-gnu
Diffstat (limited to 'packages/glibc/2.17/0053-glibc-ppc64le-31.patch')
-rw-r--r-- | packages/glibc/2.17/0053-glibc-ppc64le-31.patch | 2943 |
1 files changed, 2943 insertions, 0 deletions
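The patch that follows adds `__LITTLE_ENDIAN__` variants throughout glibc's PowerPC `memcpy`/`mempcpy`: wherever two aligned (double)words are merged to form one misaligned (double)word, the shift pair swaps direction (`sld`/`srd` become `srd`/`sld`, `slwi`/`srwi` likewise), and the `vperm` operand order is reversed in the VSX path. As a rough editorial illustration only (not part of the patch or of glibc), here is a minimal C sketch of that merge step; it assumes a GCC/Clang-style `__BYTE_ORDER__` predefine instead of the target-specific `__LITTLE_ENDIAN__` macro the assembly tests, and a misalignment of 1-7 bytes so both 64-bit shifts stay well defined.

```c
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Merge two aligned 64-bit loads into the doubleword that starts `off`
   bytes (1..7) into the first one -- the operation the patched assembly
   performs with sld/srd on big-endian and srd/sld on little-endian.  */
static uint64_t merge(uint64_t w0, uint64_t w1, unsigned off)
{
    unsigned shift = off * 8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    /* Little-endian: lower addresses live in the low-order bits, so drop
       `off` bytes with a right shift and pull the rest in from the next
       word with a left shift.  */
    return (w0 >> shift) | (w1 << (64 - shift));
#else
    /* Big-endian: lower addresses live in the high-order bits, so the
       shift directions are reversed -- the sld/srd pairs in the patch.  */
    return (w0 << shift) | (w1 >> (64 - shift));
#endif
}

int main(void)
{
    unsigned char buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (unsigned char) i;

    uint64_t w0, w1, ref;
    memcpy(&w0, buf, 8);        /* aligned load of bytes 0..7   */
    memcpy(&w1, buf + 8, 8);    /* aligned load of bytes 8..15  */
    memcpy(&ref, buf + 3, 8);   /* what a misaligned load gives */

    printf("%s\n", merge(w0, w1, 3) == ref ? "match" : "mismatch");
    return 0;
}
```

The same reasoning is behind the vector changes in the patch: on little-endian, `lvsr` plus reversed `vperm` inputs select the same logical bytes that `lvsl` with the original operand order selects on big-endian.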
diff --git a/packages/glibc/2.17/0053-glibc-ppc64le-31.patch b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch new file mode 100644 index 0000000..de90661 --- /dev/null +++ b/packages/glibc/2.17/0053-glibc-ppc64le-31.patch @@ -0,0 +1,2943 @@ +# commit 759cfef3ac4c07dba1ece0bbc1207e099348816d +# Author: Alan Modra <amodra@gmail.com> +# Date: Sat Aug 17 18:47:22 2013 +0930 +# +# PowerPC LE memcpy +# http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html +# +# LIttle-endian support for memcpy. I spent some time cleaning up the +# 64-bit power7 memcpy, in order to avoid the extra alignment traps +# power7 takes for little-endian. It probably would have been better +# to copy the linux kernel version of memcpy. +# +# * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support. +# * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise. +# * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better +# use of regs. Use power7 mtocrf. Tidy function tails. +# +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -205,15 +205,28 @@ + blt cr6,5f + srwi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmplwi cr1,10,16 +@@ -341,13 +354,23 @@ + bf 30,1f + + /* there are at least two words to copy, so copy them */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 1st src word to left align it in R0 */ + srw 8,7,9 /* shift 2nd src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + lwz 6,8(5) /* load the 3rd src word */ + stw 0,0(4) /* store the 1st dst word */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 /* now left align 2nd src word into R0 */ + srw 8,6,9 /* shift 3rd src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + lwz 7,12(5) + stw 0,4(4) /* store the 2nd dst word */ +@@ -355,8 +378,13 @@ + addi 5,5,16 + bf 31,4f + /* there is a third word to copy, so copy it */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 3rd src word to left align it in R0 */ + srw 8,7,9 /* shift 4th src word to right align it in R8 */ ++#endif + or 0,0,8 /* or them to get word to store */ + stw 0,0(4) /* store 3rd dst word */ + mr 6,7 +@@ -366,8 +394,13 @@ + b 4f + .align 4 + 1: ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 /* shift 1st src word to left align it in R0 */ + srw 8,7,9 /* shift 2nd src word to right align it in R8 */ ++#endif + addi 5,5,8 + or 0,0,8 /* or them to get word to store */ + bf 31,4f +@@ -380,23 +413,43 @@ + .align 4 + 4: + /* copy 16 bytes at a time */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 
0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + lwz 6,0(5) + stw 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 + srw 8,6,9 ++#endif + or 0,0,8 + lwz 7,4(5) + stw 0,4(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + lwz 6,8(5) + stw 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,7,10 ++ slw 8,6,9 ++#else + slw 0,7,10 + srw 8,6,9 ++#endif + or 0,0,8 + lwz 7,12(5) + stw 0,12(4) +@@ -405,8 +458,13 @@ + bdnz+ 4b + 8: + /* calculate and store the final word */ ++#ifdef __LITTLE_ENDIAN__ ++ srw 0,6,10 ++ slw 8,7,9 ++#else + slw 0,6,10 + srw 8,7,9 ++#endif + or 0,0,8 + stw 0,0(4) + 3: +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -221,15 +221,28 @@ + blt cr6,5f + srwi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmplwi cr1,10,16 +@@ -579,7 +592,11 @@ + lwz 6,-1(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,8 ++#else + slwi 6,6,8 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. */ + blt cr5,L(wdu1_32tail) + mtctr 8 +@@ -587,8 +604,12 @@ + + lwz 8,3(4) + lwz 7,4(4) ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else + /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + b L(wdu1_loop32x) + .align 4 + L(wdu1_loop32): +@@ -597,8 +618,12 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else + /* Equivalent to srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + L(wdu1_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -615,7 +640,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,8 ++#else + slwi 6,8,8 ++#endif + bdnz+ L(wdu1_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -626,8 +655,12 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,3(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,24,32 ++#else ++/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ + rlwimi 6,8,8,(32-8),31 ++#endif + b L(wdu_32tailx) + + L(wdu2_32): +@@ -635,7 +668,11 @@ + lwz 6,-2(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,16 ++#else + slwi 6,6,16 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. 
*/ + blt cr5,L(wdu2_32tail) + mtctr 8 +@@ -643,8 +680,11 @@ + + lwz 8,2(4) + lwz 7,4(4) +-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + b L(wdu2_loop32x) + .align 4 + L(wdu2_loop32): +@@ -653,8 +693,11 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) +-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + L(wdu2_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -672,7 +715,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,16 ++#else + slwi 6,8,16 ++#endif + bdnz+ L(wdu2_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -683,8 +730,11 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,2(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,16,32 ++#else + rlwimi 6,8,16,(32-16),31 ++#endif + b L(wdu_32tailx) + + L(wdu3_32): +@@ -692,7 +742,11 @@ + lwz 6,-3(4) + cmplwi cr6,31,4 + srwi 8,31,5 /* calculate the 32 byte loop count */ ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,6,24 ++#else + slwi 6,6,24 ++#endif + clrlwi 31,31,27 /* The remaining bytes, < 32. */ + blt cr5,L(wdu3_32tail) + mtctr 8 +@@ -700,8 +754,11 @@ + + lwz 8,1(4) + lwz 7,4(4) +-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + b L(wdu3_loop32x) + .align 4 + L(wdu3_loop32): +@@ -710,8 +767,11 @@ + lwz 7,4(4) + stw 10,-8(3) + stw 11,-4(3) +-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + L(wdu3_loop32x): + lwz 10,8(4) + lwz 11,12(4) +@@ -728,7 +788,11 @@ + stw 6,16(3) + stw 7,20(3) + addi 3,3,32 ++#ifdef __LITTLE_ENDIAN__ ++ srwi 6,8,24 ++#else + slwi 6,8,24 ++#endif + bdnz+ L(wdu3_loop32) + stw 10,-8(3) + stw 11,-4(3) +@@ -739,8 +803,11 @@ + blt cr6,L(wdu_4tail) + /* calculate and store the final word */ + lwz 8,1(4) +-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */ ++#ifdef __LITTLE_ENDIAN__ ++ rldimi 6,8,8,32 ++#else + rlwimi 6,8,24,(32-24),31 ++#endif + b L(wdu_32tailx) + .align 4 + L(wdu_32tailx): +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -385,7 +385,7 @@ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 +@@ -437,13 +437,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmplwi cr6,9,1 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else + lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else + vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -463,11 +471,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. 
*/ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -327,7 +327,7 @@ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + + mtcrf 0x01,0 + subf 31,0,5 +@@ -379,13 +379,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmplwi cr6,9,1 +- lvsl 5,0,12 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else ++ lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 +- vperm 6,3,4,5 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -405,11 +413,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/memcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -214,15 +214,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -330,7 +343,11 @@ + ld 7,8(5) + subfic 9,10,64 + beq 2f ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++#else + sld 0,6,10 ++#endif + cmpldi 11,1 + mr 6,7 + addi 4,4,-8 +@@ -338,15 +355,25 @@ + b 1f + 2: addi 5,5,8 + .align 4 ++#ifdef __LITTLE_ENDIAN__ ++0: srd 0,6,10 ++ sld 8,7,9 ++#else + 0: sld 0,6,10 + srd 8,7,9 ++#endif + cmpldi 11,2 + ld 6,8(5) + or 0,0,8 + addi 11,11,-2 + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++1: sld 8,6,9 ++#else + sld 0,7,10 + 1: srd 8,6,9 ++#endif + or 0,0,8 + beq 8f + ld 7,16(5) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcpy.S 2014-05-29 13:05:51.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64. +- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2003-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -17,26 +17,24 @@ + <http://www.gnu.org/licenses/>. 
*/ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. + +- Memcpy handles short copies (< 32-bytes) using a binary move blocks +- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled +- with the appropriate combination of byte and halfword load/stores. +- There is minimal effort to optimize the alignment of short moves. ++ Memcpy handles short copies (< 32-bytes) using a binary move blocks ++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled ++ with the appropriate combination of byte and halfword load/stores. ++ There is minimal effort to optimize the alignment of short moves. + The 64-bit implementations of POWER3 and POWER4 do a reasonable job +- of handling unligned load/stores that do not cross 32-byte boundries. ++ of handling unaligned load/stores that do not cross 32-byte boundaries. + + Longer moves (>= 32-bytes) justify the effort to get at least the + destination doubleword (8-byte) aligned. Further optimization is +- posible when both source and destination are doubleword aligned. ++ possible when both source and destination are doubleword aligned. + Each case has a optimized unrolled loop. */ + + .machine power4 +-EALIGN (BP_SYM (memcpy), 5, 0) ++EALIGN (memcpy, 5, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 +@@ -44,20 +42,20 @@ + std 3,-16(1) + std 31,-8(1) + cfi_offset(31,-8) +- andi. 11,3,7 /* check alignement of dst. */ ++ andi. 11,3,7 /* check alignment of dst. */ + clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */ +- clrldi 10,4,61 /* check alignement of src. */ ++ clrldi 10,4,61 /* check alignment of src. */ + cmpldi cr6,5,8 + ble- cr1,.L2 /* If move < 32 bytes use short move code. */ +- cmpld cr6,10,11 ++ cmpld cr6,10,11 + mr 12,4 + srdi 9,5,3 /* Number of full double words remaining. */ + mtcrf 0x01,0 + mr 31,5 + beq .L0 +- ++ + subf 31,0,5 +- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */ ++ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */ + 1: bf 31,2f + lbz 6,0(12) + addi 12,12,1 +@@ -74,17 +72,17 @@ + stw 6,0(3) + addi 3,3,4 + 0: +- clrldi 10,12,61 /* check alignement of src again. */ ++ clrldi 10,12,61 /* check alignment of src again. */ + srdi 9,31,3 /* Number of full double words remaining. */ +- +- /* Copy doublewords from source to destination, assumpting the ++ ++ /* Copy doublewords from source to destination, assuming the + destination is aligned on a doubleword boundary. + + At this point we know there are at least 25 bytes left (32-7) to copy. +- The next step is to determine if the source is also doubleword aligned. ++ The next step is to determine if the source is also doubleword aligned. + If not branch to the unaligned move code at .L6. which uses + a load, shift, store strategy. +- ++ + Otherwise source and destination are doubleword aligned, and we can + the optimized doubleword copy loop. */ + .L0: +@@ -97,14 +95,14 @@ + Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration. + If the copy is not an exact multiple of 32 bytes, 1-3 + doublewords are copied as needed to set up the main loop. After +- the main loop exits there may be a tail of 1-7 bytes. These byte are ++ the main loop exits there may be a tail of 1-7 bytes. These byte are + copied a word/halfword/byte at a time as needed to preserve alignment. 
*/ + + srdi 8,31,5 + cmpldi cr1,9,4 + cmpldi cr6,11,0 + mr 11,12 +- ++ + bf 30,1f + ld 6,0(12) + ld 7,8(12) +@@ -115,7 +113,7 @@ + addi 10,3,16 + bf 31,4f + ld 0,16(12) +- std 0,16(3) ++ std 0,16(3) + blt cr1,3f + addi 11,12,24 + addi 10,3,24 +@@ -129,7 +127,7 @@ + addi 11,12,8 + std 6,0(3) + addi 10,3,8 +- ++ + .align 4 + 4: + ld 6,0(11) +@@ -144,7 +142,7 @@ + std 0,24(10) + addi 10,10,32 + bdnz 4b +-3: ++3: + + rldicr 0,31,0,60 + mtcrf 0x01,31 +@@ -152,9 +150,9 @@ + .L9: + add 3,3,0 + add 12,12,0 +- ++ + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(12) + addi 12,12,4 +@@ -173,29 +171,29 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +- +-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 +- bytes. Each case is handled without loops, using binary (1,2,4,8) +- tests. +- ++ ++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 ++ bytes. Each case is handled without loops, using binary (1,2,4,8) ++ tests. ++ + In the short (0-8 byte) case no attempt is made to force alignment +- of either source or destination. The hardware will handle the +- unaligned load/stores with small delays for crossing 32- 64-byte, and ++ of either source or destination. The hardware will handle the ++ unaligned load/stores with small delays for crossing 32- 64-byte, and + 4096-byte boundaries. Since these short moves are unlikely to be +- unaligned or cross these boundaries, the overhead to force ++ unaligned or cross these boundaries, the overhead to force + alignment is not justified. +- ++ + The longer (9-31 byte) move is more likely to cross 32- or 64-byte + boundaries. Since only loads are sensitive to the 32-/64-byte +- boundaries it is more important to align the source then the ++ boundaries it is more important to align the source then the + destination. If the source is not already word aligned, we first +- move 1-3 bytes as needed. Since we are only word aligned we don't +- use double word load/stores to insure that all loads are aligned. ++ move 1-3 bytes as needed. Since we are only word aligned we don't ++ use double word load/stores to insure that all loads are aligned. + While the destination and stores may still be unaligned, this + is only an issue for page (4096 byte boundary) crossing, which + should be rare for these short moves. The hardware handles this +- case automatically with a small delay. */ +- ++ case automatically with a small delay. */ ++ + .align 4 + .L2: + mtcrf 0x01,5 +@@ -216,15 +214,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -258,11 +269,11 @@ + lwz 6,0(12) + addi 12,12,4 + stw 6,0(3) +- addi 3,3,4 ++ addi 3,3,4 + 2: /* Move 2-3 bytes. */ + bf 30,1f + lhz 6,0(12) +- sth 6,0(3) ++ sth 6,0(3) + bf 31,0f + lbz 7,2(12) + stb 7,2(3) +@@ -283,8 +294,8 @@ + mr 12,4 + bne cr6,4f + /* Would have liked to use use ld/std here but the 630 processors are +- slow for load/store doubles that are not at least word aligned. +- Unaligned Load/Store word execute with only a 1 cycle penaltity. */ ++ slow for load/store doubles that are not at least word aligned. 
++ Unaligned Load/Store word execute with only a 1 cycle penalty. */ + lwz 6,0(4) + lwz 7,4(4) + stw 6,0(3) +@@ -299,14 +310,14 @@ + 6: + bf 30,5f + lhz 7,4(4) +- sth 7,4(3) ++ sth 7,4(3) + bf 31,0f + lbz 8,6(4) + stb 8,6(3) + ld 3,-16(1) + blr + .align 4 +-5: ++5: + bf 31,0f + lbz 6,4(4) + stb 6,4(3) +@@ -336,13 +347,23 @@ + bf 30,1f + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -351,8 +372,13 @@ + blt cr6,8f /* if total DWs = 3, then bypass loop */ + bf 31,4f + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -363,8 +389,13 @@ + b 4f + .align 4 + 1: ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,4f +@@ -375,23 +406,44 @@ + addi 4,4,8 + .align 4 + /* copy 32 bytes at a time */ +-4: sld 0,6,10 ++4: ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else ++ sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,7,10 ++ sld 8,6,9 ++#else + sld 0,7,10 + srd 8,6,9 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -401,9 +453,14 @@ + .align 4 + 8: + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srd 0,6,10 ++ sld 8,7,9 ++#else + sld 0,6,10 + srd 8,7,9 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + 3: + rldicr 0,31,0,60 +@@ -413,5 +470,5 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power6/memcpy.S 2014-05-29 13:05:27.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64. +- Copyright (C) 2003, 2006, 2007, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2003-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -17,52 +17,50 @@ + <http://www.gnu.org/licenses/>. */ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. + +- Memcpy handles short copies (< 32-bytes) using a binary move blocks +- (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled +- with the appropriate combination of byte and halfword load/stores. +- There is minimal effort to optimize the alignment of short moves. ++ Memcpy handles short copies (< 32-bytes) using a binary move blocks ++ (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled ++ with the appropriate combination of byte and halfword load/stores. ++ There is minimal effort to optimize the alignment of short moves. 
+ The 64-bit implementations of POWER3 and POWER4 do a reasonable job +- of handling unligned load/stores that do not cross 32-byte boundries. ++ of handling unaligned load/stores that do not cross 32-byte boundaries. + + Longer moves (>= 32-bytes) justify the effort to get at least the + destination doubleword (8-byte) aligned. Further optimization is +- posible when both source and destination are doubleword aligned. +- Each case has a optimized unrolled loop. +- +- For POWER6 unaligned loads will take a 20+ cycle hicup for any ++ possible when both source and destination are doubleword aligned. ++ Each case has a optimized unrolled loop. ++ ++ For POWER6 unaligned loads will take a 20+ cycle hiccup for any + L1 cache miss that crosses a 32- or 128-byte boundary. Store +- is more forgiving and does not take a hicup until page or +- segment boundaries. So we require doubleword alignment for ++ is more forgiving and does not take a hiccup until page or ++ segment boundaries. So we require doubleword alignment for + the source but may take a risk and only require word alignment + for the destination. */ + + .machine "power6" +-EALIGN (BP_SYM (memcpy), 7, 0) ++EALIGN (memcpy, 7, 0) + CALL_MCOUNT 3 + + cmpldi cr1,5,31 + neg 0,3 + std 3,-16(1) + std 31,-8(1) +- andi. 11,3,7 /* check alignement of dst. */ ++ andi. 11,3,7 /* check alignment of dst. */ + clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */ +- clrldi 10,4,61 /* check alignement of src. */ ++ clrldi 10,4,61 /* check alignment of src. */ + cmpldi cr6,5,8 + ble- cr1,.L2 /* If move < 32 bytes use short move code. */ + mtcrf 0x01,0 +- cmpld cr6,10,11 ++ cmpld cr6,10,11 + srdi 9,5,3 /* Number of full double words remaining. */ + beq .L0 +- ++ + subf 5,0,5 +- /* Move 0-7 bytes as needed to get the destination doubleword alligned. +- Duplicate some code to maximize fall-throught and minimize agen delays. */ ++ /* Move 0-7 bytes as needed to get the destination doubleword aligned. ++ Duplicate some code to maximize fall-through and minimize agen delays. */ + 1: bf 31,2f + lbz 6,0(4) + stb 6,0(3) +@@ -78,7 +76,7 @@ + lwz 6,1(4) + stw 6,1(3) + b 0f +- ++ + 2: bf 30,4f + lhz 6,0(4) + sth 6,0(3) +@@ -86,26 +84,26 @@ + lwz 6,2(4) + stw 6,2(3) + b 0f +- ++ + 4: bf 29,0f + lwz 6,0(4) + stw 6,0(3) +-0: ++0: + /* Add the number of bytes until the 1st doubleword of dst to src and dst. */ + add 4,4,0 + add 3,3,0 +- +- clrldi 10,4,61 /* check alignement of src again. */ ++ ++ clrldi 10,4,61 /* check alignment of src again. */ + srdi 9,5,3 /* Number of full double words remaining. */ +- +- /* Copy doublewords from source to destination, assumpting the ++ ++ /* Copy doublewords from source to destination, assuming the + destination is aligned on a doubleword boundary. + + At this point we know there are at least 25 bytes left (32-7) to copy. +- The next step is to determine if the source is also doubleword aligned. ++ The next step is to determine if the source is also doubleword aligned. + If not branch to the unaligned move code at .L6. which uses + a load, shift, store strategy. +- ++ + Otherwise source and destination are doubleword aligned, and we can + the optimized doubleword copy loop. */ + .align 4 +@@ -123,14 +121,14 @@ + the main loop exits there may be a tail of 1-7 bytes. These byte + are copied a word/halfword/byte at a time as needed to preserve + alignment. +- ++ + For POWER6 the L1 is store-through and the L2 is store-in. The + L2 is clocked at half CPU clock so we can store 16 bytes every + other cycle. 
POWER6 also has a load/store bypass so we can do +- load, load, store, store every 2 cycles. +- ++ load, load, store, store every 2 cycles. ++ + The following code is sensitive to cache line alignment. Do not +- make any change with out first making sure thay don't result in ++ make any change with out first making sure they don't result in + splitting ld/std pairs across a cache line. */ + + mtcrf 0x02,5 +@@ -273,7 +271,7 @@ + std 8,16+96(10) + std 0,24+96(10) + ble cr5,L(das_loop_e) +- ++ + mtctr 12 + .align 4 + L(das_loop2): +@@ -326,10 +324,10 @@ + .align 4 + L(das_tail): + beq cr1,0f +- ++ + L(das_tail2): + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(4) + stw 6,0(3) +@@ -344,7 +342,7 @@ + lbz 6,4(4) + stb 6,4(3) + b 0f +- ++ + 2: bf 30,1f + lhz 6,0(4) + sth 6,0(3) +@@ -352,7 +350,7 @@ + lbz 6,2(4) + stb 6,2(3) + b 0f +- ++ + 1: bf 31,0f + lbz 6,0(4) + stb 6,0(3) +@@ -361,7 +359,7 @@ + ld 3,-16(1) + blr + +-/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 ++/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 + bytes. Each case is handled without loops, using binary (1,2,4,8) + tests. + +@@ -402,15 +400,28 @@ + blt cr6,5f + srdi 7,6,16 + bgt cr6,3f ++#ifdef __LITTLE_ENDIAN__ ++ sth 7,0(3) ++#else + sth 6,0(3) ++#endif + b 7f + .align 4 + 3: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,24 ++ stb 6,0(3) ++ sth 7,1(3) ++#else + stb 7,0(3) + sth 6,1(3) ++#endif + b 7f + .align 4 + 5: ++#ifdef __LITTLE_ENDIAN__ ++ rotlwi 6,6,8 ++#endif + stb 6,0(3) + 7: + cmpldi cr1,10,16 +@@ -421,7 +432,7 @@ + /* At least 6 bytes left and the source is word aligned. This allows + some speculative loads up front. */ + /* We need to special case the fall-through because the biggest delays +- are due to address computation not being ready in time for the ++ are due to address computation not being ready in time for the + AGEN. */ + lwz 6,0(12) + lwz 7,4(12) +@@ -452,7 +463,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p8): /* less then 8 bytes left. */ ++L(dus_tail16p8): /* less than 8 bytes left. */ + beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */ + cmpldi cr1,10,20 + bf 29,L(dus_tail16p2) +@@ -466,7 +477,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p4): /* less then 4 bytes left. */ ++L(dus_tail16p4): /* less than 4 bytes left. */ + addi 12,12,24 + addi 3,3,24 + bgt cr0,L(dus_tail2) +@@ -474,7 +485,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */ ++L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */ + addi 12,12,16 + addi 3,3,16 + b L(dus_tail2) +@@ -499,7 +510,7 @@ + ld 3,-16(1) + blr + .align 4 +-L(dus_tail8p4): /* less then 4 bytes left. */ ++L(dus_tail8p4): /* less than 4 bytes left. */ + addi 12,12,8 + addi 3,3,8 + bgt cr1,L(dus_tail2) +@@ -510,14 +521,14 @@ + .align 4 + L(dus_tail4): /* Move 4 bytes. */ + /* r6 already loaded speculatively. If we are here we know there is +- more then 4 bytes left. So there is no need to test. */ ++ more than 4 bytes left. So there is no need to test. */ + addi 12,12,4 + stw 6,0(3) + addi 3,3,4 + L(dus_tail2): /* Move 2-3 bytes. */ + bf 30,L(dus_tail1) + lhz 6,0(12) +- sth 6,0(3) ++ sth 6,0(3) + bf 31,L(dus_tailX) + lbz 7,2(12) + stb 7,2(3) +@@ -537,7 +548,7 @@ + .LE8: + mr 12,4 + bne cr6,L(dus_4) +-/* Exactly 8 bytes. We may cross a 32-/128-byte boundry and take a ~20 ++/* Exactly 8 bytes. 
We may cross a 32-/128-byte boundary and take a ~20 + cycle delay. This case should be rare and any attempt to avoid this + would take most of 20 cycles any way. */ + ld 6,0(4) +@@ -552,7 +563,7 @@ + stw 6,0(3) + bf 30,L(dus_5) + lhz 7,4(4) +- sth 7,4(3) ++ sth 7,4(3) + bf 31,L(dus_0) + lbz 8,6(4) + stb 8,6(3) +@@ -590,20 +601,31 @@ + bge cr0, L(du4_do) + blt cr5, L(du1_do) + beq cr5, L(du2_do) +- b L(du3_do) +- ++ b L(du3_do) ++ + .align 4 + L(du1_do): + bf 30,L(du1_1dw) + + /* there are at least two DWs to copy */ ++ /* FIXME: can combine last shift and "or" into "rldimi" */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -612,8 +634,13 @@ + blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du1_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -624,8 +651,13 @@ + b L(du1_loop) + .align 4 + L(du1_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du1_loop) +@@ -637,23 +669,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du1_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 8 ++ sldi 8,6, 64-8 ++#else + sldi 0,7, 8 + srdi 8,6, 64-8 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -663,9 +715,14 @@ + .align 4 + L(du1_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 8 ++ sldi 8,7, 64-8 ++#else + sldi 0,6, 8 + srdi 8,7, 64-8 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -674,13 +731,23 @@ + bf 30,L(du2_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -689,8 +756,13 @@ + blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du2_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -701,8 +773,13 @@ + b L(du2_loop) + .align 4 + L(du2_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du2_loop) +@@ -714,23 +791,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du2_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) 
++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 16 ++ sldi 8,6, 64-16 ++#else + sldi 0,7, 16 + srdi 8,6, 64-16 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -740,9 +837,14 @@ + .align 4 + L(du2_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 16 ++ sldi 8,7, 64-16 ++#else + sldi 0,6, 16 + srdi 8,7, 64-16 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -751,13 +853,23 @@ + bf 30,L(du3_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -766,8 +878,13 @@ + blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du3_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -778,8 +895,13 @@ + b L(du3_loop) + .align 4 + L(du3_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du3_loop) +@@ -791,23 +913,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du3_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 24 ++ sldi 8,6, 64-24 ++#else + sldi 0,7, 24 + srdi 8,6, 64-24 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -817,9 +959,14 @@ + .align 4 + L(du3_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 24 ++ sldi 8,7, 64-24 ++#else + sldi 0,6, 24 + srdi 8,7, 64-24 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -834,13 +981,23 @@ + bf 30,L(du4_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -849,8 +1006,13 @@ + blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du4_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -861,8 +1023,13 @@ + b L(du4_loop) + .align 4 + L(du4_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du4_loop) +@@ -874,23 +1041,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du4_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 
8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 32 ++ sldi 8,6, 64-32 ++#else + sldi 0,7, 32 + srdi 8,6, 64-32 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -900,9 +1087,14 @@ + .align 4 + L(du4_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 32 ++ sldi 8,7, 64-32 ++#else + sldi 0,6, 32 + srdi 8,7, 64-32 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -911,13 +1103,23 @@ + bf 30,L(du5_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -926,8 +1128,13 @@ + blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du5_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -938,8 +1145,13 @@ + b L(du5_loop) + .align 4 + L(du5_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du5_loop) +@@ -951,23 +1163,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du5_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 40 ++ sldi 8,6, 64-40 ++#else + sldi 0,7, 40 + srdi 8,6, 64-40 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -977,9 +1209,14 @@ + .align 4 + L(du5_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 40 ++ sldi 8,7, 64-40 ++#else + sldi 0,6, 40 + srdi 8,7, 64-40 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -988,13 +1225,23 @@ + bf 30,L(du6_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -1003,8 +1250,13 @@ + blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du6_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -1015,8 +1267,13 @@ + b L(du6_loop) + .align 4 + L(du6_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du6_loop) +@@ -1028,23 +1285,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du6_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 
8,7, 64-48 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 48 ++ sldi 8,6, 64-48 ++#else + sldi 0,7, 48 + srdi 8,6, 64-48 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -1054,9 +1331,14 @@ + .align 4 + L(du6_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 48 ++ sldi 8,7, 64-48 ++#else + sldi 0,6, 48 + srdi 8,7, 64-48 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) + +@@ -1065,13 +1347,23 @@ + bf 30,L(du7_1dw) + + /* there are at least two DWs to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,8(4) +@@ -1080,8 +1372,13 @@ + blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */ + bf 31,L(du7_loop) + /* there is a third DW to copy */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + std 0,0(4) + mr 6,7 +@@ -1092,8 +1389,13 @@ + b L(du7_loop) + .align 4 + L(du7_1dw): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + addi 5,5,16 + or 0,0,8 + bf 31,L(du7_loop) +@@ -1105,23 +1407,43 @@ + .align 4 + /* copy 32 bytes at a time */ + L(du7_loop): ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,0(5) + std 0,0(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,8(5) + std 0,8(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 ++#endif + or 0,0,8 + ld 6,16(5) + std 0,16(4) ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,7, 56 ++ sldi 8,6, 64-56 ++#else + sldi 0,7, 56 + srdi 8,6, 64-56 ++#endif + or 0,0,8 + ld 7,24(5) + std 0,24(4) +@@ -1131,12 +1453,17 @@ + .align 4 + L(du7_fini): + /* calculate and store the final DW */ ++#ifdef __LITTLE_ENDIAN__ ++ srdi 0,6, 56 ++ sldi 8,7, 64-56 ++#else + sldi 0,6, 56 + srdi 8,7, 64-56 +- or 0,0,8 ++#endif ++ or 0,0,8 + std 0,0(4) + b L(du_done) +- ++ + .align 4 + L(du_done): + rldicr 0,31,0,60 +@@ -1144,9 +1471,9 @@ + beq cr1,0f /* If the tail is 0 bytes we are done! */ + + add 3,3,0 +- add 12,12,0 ++ add 12,12,0 + /* At this point we have a tail of 0-7 bytes and we know that the +- destiniation is double word aligned. */ ++ destination is double word aligned. */ + 4: bf 29,2f + lwz 6,0(12) + addi 12,12,4 +@@ -1165,5 +1492,5 @@ + ld 31,-8(1) + ld 3,-16(1) + blr +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcpy.S 2014-05-29 13:05:40.000000000 -0500 +@@ -1,5 +1,5 @@ + /* Optimized memcpy implementation for PowerPC64/POWER7. 
+- Copyright (C) 2010, 2011 Free Software Foundation, Inc. ++ Copyright (C) 2010-2014 Free Software Foundation, Inc. + Contributed by Luis Machado <luisgpm@br.ibm.com>. + This file is part of the GNU C Library. + +@@ -18,425 +18,366 @@ + <http://www.gnu.org/licenses/>. */ + + #include <sysdep.h> +-#include <bp-sym.h> +-#include <bp-asm.h> + + + /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. */ + ++#define dst 11 /* Use r11 so r3 kept unchanged. */ ++#define src 4 ++#define cnt 5 ++ + .machine power7 +-EALIGN (BP_SYM (memcpy), 5, 0) ++EALIGN (memcpy, 5, 0) + CALL_MCOUNT 3 + +- cmpldi cr1,5,31 ++ cmpldi cr1,cnt,31 + neg 0,3 +- std 3,-16(1) +- std 31,-8(1) +- cfi_offset(31,-8) + ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move + code. */ + +- andi. 11,3,7 /* Check alignment of DST. */ +- ++#ifdef __LITTLE_ENDIAN__ ++/* In little-endian mode, power7 takes an alignment trap on any lxvd2x ++ or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy ++ loop is only used for quadword aligned copies. */ ++ andi. 10,3,15 ++ clrldi 11,4,60 ++#else ++ andi. 10,3,7 /* Check alignment of DST. */ ++ clrldi 11,4,61 /* Check alignment of SRC. */ ++#endif ++ cmpld cr6,10,11 /* SRC and DST alignments match? */ + +- clrldi 10,4,61 /* Check alignment of SRC. */ +- cmpld cr6,10,11 /* SRC and DST alignments match? */ +- mr 12,4 +- mr 31,5 ++ mr dst,3 + bne cr6,L(copy_GE_32_unaligned) ++ beq L(aligned_copy) + +- srdi 9,5,3 /* Number of full quadwords remaining. */ +- +- beq L(copy_GE_32_aligned_cont) +- +- clrldi 0,0,61 +- mtcrf 0x01,0 +- subf 31,0,5 +- +- /* Get the SRC aligned to 8 bytes. */ +- +-1: bf 31,2f +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 +-2: bf 30,4f +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-4: bf 29,0f +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-0: +- clrldi 10,12,61 /* Check alignment of SRC again. */ +- srdi 9,31,3 /* Number of full doublewords remaining. */ +- +-L(copy_GE_32_aligned_cont): +- +- clrldi 11,31,61 +- mtcrf 0x01,9 +- +- srdi 8,31,5 +- cmpldi cr1,9,4 +- cmpldi cr6,11,0 +- mr 11,12 ++ mtocrf 0x01,0 ++#ifdef __LITTLE_ENDIAN__ ++ clrldi 0,0,60 ++#else ++ clrldi 0,0,61 ++#endif + +- /* Copy 1~3 doublewords so the main loop starts +- at a multiple of 32 bytes. */ +- +- bf 30,1f +- ld 6,0(12) +- ld 7,8(12) +- addi 11,12,16 +- mtctr 8 +- std 6,0(3) +- std 7,8(3) +- addi 10,3,16 +- bf 31,4f +- ld 0,16(12) +- std 0,16(3) +- blt cr1,3f +- addi 11,12,24 +- addi 10,3,24 +- b 4f +- +- .align 4 +-1: /* Copy 1 doubleword and set the counter. */ +- mr 10,3 +- mtctr 8 +- bf 31,4f +- ld 6,0(12) +- addi 11,12,8 +- std 6,0(3) +- addi 10,3,8 ++/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ ++1: ++ bf 31,2f ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 ++2: ++ bf 30,4f ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++4: ++ bf 29,8f ++ lwz 6,0(src) ++ addi src,src,4 ++ stw 6,0(dst) ++ addi dst,dst,4 ++8: ++#ifdef __LITTLE_ENDIAN__ ++ bf 28,16f ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 ++16: ++#endif ++ subf cnt,0,cnt + ++/* Main aligned copy loop. Copies 128 bytes at a time. */ + L(aligned_copy): +- /* Main aligned copy loop. Copies up to 128-bytes at a time. */ +- .align 4 +-4: +- /* check for any 32-byte or 64-byte lumps that are outside of a +- nice 128-byte range. 
R8 contains the number of 32-byte +- lumps, so drop this into the CR, and use the SO/EQ bits to help +- handle the 32- or 64- byte lumps. Then handle the rest with an +- unrolled 128-bytes-at-a-time copy loop. */ +- mtocrf 1,8 +- li 6,16 # 16() index +- li 7,32 # 32() index +- li 8,48 # 48() index +- +-L(aligned_32byte): +- /* if the SO bit (indicating a 32-byte lump) is not set, move along. */ +- bns cr7,L(aligned_64byte) +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- addi 11,11,32 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- addi 10,10,32 +- +-L(aligned_64byte): +- /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */ +- bne cr7,L(aligned_128setup) +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- addi 11,11,64 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- addi 10,10,64 +- +-L(aligned_128setup): +- /* Set up for the 128-byte at a time copy loop. */ +- srdi 8,31,7 +- cmpdi 8,0 # Any 4x lumps left? +- beq 3f # if not, move along. +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- mtctr 8 # otherwise, load the ctr and begin. +- li 8,48 # 48() index ++ li 6,16 ++ li 7,32 ++ li 8,48 ++ mtocrf 0x02,cnt ++ srdi 12,cnt,7 ++ cmpdi 12,0 ++ beq L(aligned_tail) ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ mtctr 12 + b L(aligned_128loop) + ++ .align 4 + L(aligned_128head): + /* for the 2nd + iteration of this loop. */ +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 + L(aligned_128loop): +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- stxvd2x 6,0,10 +- addi 11,11,64 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- lxvd2x 6,0,11 +- lxvd2x 7,11,6 +- addi 10,10,64 +- lxvd2x 8,11,7 +- lxvd2x 9,11,8 +- addi 11,11,64 +- stxvd2x 6,0,10 +- stxvd2x 7,10,6 +- stxvd2x 8,10,7 +- stxvd2x 9,10,8 +- addi 10,10,64 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ stxvd2x 6,0,dst ++ addi src,src,64 ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ addi dst,dst,64 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ addi src,src,64 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ addi dst,dst,64 + bdnz L(aligned_128head) + +-3: +- /* Check for tail bytes. */ +- rldicr 0,31,0,60 +- mtcrf 0x01,31 +- beq cr6,0f +- +-.L9: +- add 3,3,0 +- add 12,12,0 +- +- /* At this point we have a tail of 0-7 bytes and we know that the +- destination is doubleword-aligned. */ +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2 bytes. */ +- bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 31,-8(1) +- ld 3,-16(1) ++L(aligned_tail): ++ mtocrf 0x01,cnt ++ bf 25,32f ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ lxvd2x 8,src,7 ++ lxvd2x 9,src,8 ++ addi src,src,64 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ stxvd2x 8,dst,7 ++ stxvd2x 9,dst,8 ++ addi dst,dst,64 ++32: ++ bf 26,16f ++ lxvd2x 6,0,src ++ lxvd2x 7,src,6 ++ addi src,src,32 ++ stxvd2x 6,0,dst ++ stxvd2x 7,dst,6 ++ addi dst,dst,32 ++16: ++ bf 27,8f ++ lxvd2x 6,0,src ++ addi src,src,16 ++ stxvd2x 6,0,dst ++ addi dst,dst,16 ++8: ++ bf 28,4f ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 ++4: /* Copies 4~7 bytes. */ ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ + blr + +- /* Handle copies of 0~31 bytes. */ +- .align 4 ++ ++/* Handle copies of 0~31 bytes. 
*/ ++ .align 4 + L(copy_LT_32): +- cmpldi cr6,5,8 +- mr 12,4 +- mtcrf 0x01,5 ++ mr dst,3 ++ cmpldi cr6,cnt,8 ++ mtocrf 0x01,cnt + ble cr6,L(copy_LE_8) + + /* At least 9 bytes to go. */ + neg 8,4 +- clrrdi 11,4,2 +- andi. 0,8,3 +- cmpldi cr1,5,16 +- mr 10,5 ++ andi. 0,8,3 ++ cmpldi cr1,cnt,16 + beq L(copy_LT_32_aligned) + +- /* Force 4-bytes alignment for SRC. */ +- mtocrf 0x01,0 +- subf 10,0,5 +-2: bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: bf 31,L(end_4bytes_alignment) +- +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 ++ /* Force 4-byte alignment for SRC. */ ++ mtocrf 0x01,0 ++ subf cnt,0,cnt ++2: ++ bf 30,1f ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++1: ++ bf 31,L(end_4bytes_alignment) ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 + +- .align 4 ++ .align 4 + L(end_4bytes_alignment): +- cmpldi cr1,10,16 +- mtcrf 0x01,10 ++ cmpldi cr1,cnt,16 ++ mtocrf 0x01,cnt + + L(copy_LT_32_aligned): + /* At least 6 bytes to go, and SRC is word-aligned. */ + blt cr1,8f + + /* Copy 16 bytes. */ +- lwz 6,0(12) +- lwz 7,4(12) +- stw 6,0(3) +- lwz 8,8(12) +- stw 7,4(3) +- lwz 6,12(12) +- addi 12,12,16 +- stw 8,8(3) +- stw 6,12(3) +- addi 3,3,16 ++ lwz 6,0(src) ++ lwz 7,4(src) ++ stw 6,0(dst) ++ lwz 8,8(src) ++ stw 7,4(dst) ++ lwz 6,12(src) ++ addi src,src,16 ++ stw 8,8(dst) ++ stw 6,12(dst) ++ addi dst,dst,16 + 8: /* Copy 8 bytes. */ +- bf 28,4f ++ bf 28,L(tail4) ++ lwz 6,0(src) ++ lwz 7,4(src) ++ addi src,src,8 ++ stw 6,0(dst) ++ stw 7,4(dst) ++ addi dst,dst,8 ++ ++ .align 4 ++/* Copies 4~7 bytes. */ ++L(tail4): ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ ++ blr + +- lwz 6,0(12) +- lwz 7,4(12) +- addi 12,12,8 +- stw 6,0(3) +- stw 7,4(3) +- addi 3,3,8 +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2-3 bytes. */ ++ .align 4 ++/* Copies 2~3 bytes. */ ++L(tail2): + bf 30,1f +- +- lhz 6,0(12) +- sth 6,0(3) +- bf 31,0f +- lbz 7,2(12) +- stb 7,2(3) +- ld 3,-16(1) ++ lhz 6,0(src) ++ sth 6,0(dst) ++ bflr 31 ++ lbz 7,2(src) ++ stb 7,2(dst) + blr + +- .align 4 +-1: /* Copy 1 byte. */ +- bf 31,0f ++ .align 4 ++L(tail5): ++ bflr 31 ++ lbz 6,4(src) ++ stb 6,4(dst) ++ blr + +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 3,-16(1) ++ .align 4 ++1: ++ bflr 31 ++ lbz 6,0(src) ++ stb 6,0(dst) ++ /* Return original DST pointer. */ + blr + +- /* Handles copies of 0~8 bytes. */ +- .align 4 ++ ++/* Handles copies of 0~8 bytes. */ ++ .align 4 + L(copy_LE_8): +- bne cr6,4f ++ bne cr6,L(tail4) + + /* Though we could've used ld/std here, they are still + slow for unaligned cases. */ + +- lwz 6,0(4) +- lwz 7,4(4) +- stw 6,0(3) +- stw 7,4(3) +- ld 3,-16(1) /* Return original DST pointers. */ ++ lwz 6,0(src) ++ lwz 7,4(src) ++ stw 6,0(dst) ++ stw 7,4(dst) + blr + +- .align 4 +-4: /* Copies 4~7 bytes. */ +- bf 29,2b + +- lwz 6,0(4) +- stw 6,0(3) +- bf 30,5f +- lhz 7,4(4) +- sth 7,4(3) +- bf 31,0f +- lbz 8,6(4) +- stb 8,6(3) +- ld 3,-16(1) +- blr +- +- .align 4 +-5: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,4(4) +- stb 6,4(3) +- +-0: /* Return original DST pointer. */ +- ld 3,-16(1) +- blr +- +- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but +- SRC is not. Use aligned quadword loads from SRC, shifted to realign +- the data, allowing for aligned DST stores. 
*/ +- .align 4 ++/* Handle copies of 32+ bytes where DST is aligned (to quadword) but ++ SRC is not. Use aligned quadword loads from SRC, shifted to realign ++ the data, allowing for aligned DST stores. */ ++ .align 4 + L(copy_GE_32_unaligned): +- clrldi 0,0,60 /* Number of bytes until the 1st +- quadword. */ +- andi. 11,3,15 /* Check alignment of DST (against +- quadwords). */ +- srdi 9,5,4 /* Number of full quadwords remaining. */ ++ clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ ++#ifndef __LITTLE_ENDIAN__ ++ andi. 10,3,15 /* Check alignment of DST (against quadwords). */ ++#endif ++ srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + beq L(copy_GE_32_unaligned_cont) + +- /* SRC is not quadword aligned, get it aligned. */ ++ /* DST is not quadword aligned, get it aligned. */ + +- mtcrf 0x01,0 +- subf 31,0,5 ++ mtocrf 0x01,0 ++ subf cnt,0,cnt + + /* Vector instructions work best when proper alignment (16-bytes) + is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ +-1: /* Copy 1 byte. */ ++1: + bf 31,2f +- +- lbz 6,0(12) +- addi 12,12,1 +- stb 6,0(3) +- addi 3,3,1 +-2: /* Copy 2 bytes. */ ++ lbz 6,0(src) ++ addi src,src,1 ++ stb 6,0(dst) ++ addi dst,dst,1 ++2: + bf 30,4f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-4: /* Copy 4 bytes. */ ++ lhz 6,0(src) ++ addi src,src,2 ++ sth 6,0(dst) ++ addi dst,dst,2 ++4: + bf 29,8f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-8: /* Copy 8 bytes. */ ++ lwz 6,0(src) ++ addi src,src,4 ++ stw 6,0(dst) ++ addi dst,dst,4 ++8: + bf 28,0f +- +- ld 6,0(12) +- addi 12,12,8 +- std 6,0(3) +- addi 3,3,8 ++ ld 6,0(src) ++ addi src,src,8 ++ std 6,0(dst) ++ addi dst,dst,8 + 0: +- clrldi 10,12,60 /* Check alignment of SRC. */ +- srdi 9,31,4 /* Number of full quadwords remaining. */ ++ srdi 9,cnt,4 /* Number of full quadwords remaining. */ + + /* The proper alignment is present, it is OK to copy the bytes now. */ + L(copy_GE_32_unaligned_cont): + + /* Setup two indexes to speed up the indexed vector operations. */ +- clrldi 11,31,60 +- li 6,16 /* Index for 16-bytes offsets. */ ++ clrldi 10,cnt,60 ++ li 6,16 /* Index for 16-bytes offsets. */ + li 7,32 /* Index for 32-bytes offsets. */ +- cmpldi cr1,11,0 +- srdi 8,31,5 /* Setup the loop counter. */ +- mr 10,3 +- mr 11,12 +- mtcrf 0x01,9 +- cmpldi cr6,9,1 +- lvsl 5,0,12 +- lvx 3,0,12 +- bf 31,L(setup_unaligned_loop) +- +- /* Copy another 16 bytes to align to 32-bytes due to the loop . */ +- lvx 4,12,6 +- vperm 6,3,4,5 +- addi 11,12,16 +- addi 10,3,16 +- stvx 6,0,3 ++ cmpldi cr1,10,0 ++ srdi 8,cnt,5 /* Setup the loop counter. */ ++ mtocrf 0x01,9 ++ cmpldi cr6,9,1 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,src ++#else ++ lvsl 5,0,src ++#endif ++ lvx 3,0,src ++ li 0,0 ++ bf 31,L(setup_unaligned_loop) ++ ++ /* Copy another 16 bytes to align to 32-bytes due to the loop. */ ++ lvx 4,src,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif ++ addi src,src,16 ++ stvx 6,0,dst ++ addi dst,dst,16 + vor 3,4,4 ++ clrrdi 0,src,60 + + L(setup_unaligned_loop): +- mtctr 8 +- ble cr6,L(end_unaligned_loop) ++ mtctr 8 ++ ble cr6,L(end_unaligned_loop) + + /* Copy 32 bytes at a time using vector instructions. */ +- .align 4 ++ .align 4 + L(unaligned_loop): + + /* Note: vr6/vr10 may contain data that was already copied, +@@ -444,63 +385,56 @@ + some portions again. This is faster than having unaligned + vector instructions though. */ + +- lvx 4,11,6 /* vr4 = r11+16. 
*/ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ +- lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ +- addi 11,11,32 +- stvx 6,0,10 +- stvx 10,10,6 +- addi 10,10,32 +- ++ lvx 4,src,6 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif ++ lvx 3,src,7 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif ++ addi src,src,32 ++ stvx 6,0,dst ++ stvx 10,dst,6 ++ addi dst,dst,32 + bdnz L(unaligned_loop) + +- .align 4 ++ clrrdi 0,src,60 ++ ++ .align 4 + L(end_unaligned_loop): + + /* Check for tail bytes. */ +- rldicr 0,31,0,59 +- mtcrf 0x01,31 +- beq cr1,0f ++ mtocrf 0x01,cnt ++ beqlr cr1 + +- add 3,3,0 +- add 12,12,0 ++ add src,src,0 + + /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ +-8: /* Copy 8 bytes. */ ++ /* Copy 8 bytes. */ + bf 28,4f +- +- lwz 6,0(12) +- lwz 7,4(12) +- addi 12,12,8 +- stw 6,0(3) +- stw 7,4(3) +- addi 3,3,8 +-4: /* Copy 4 bytes. */ +- bf 29,2f +- +- lwz 6,0(12) +- addi 12,12,4 +- stw 6,0(3) +- addi 3,3,4 +-2: /* Copy 2~3 bytes. */ +- bf 30,1f +- +- lhz 6,0(12) +- addi 12,12,2 +- sth 6,0(3) +- addi 3,3,2 +-1: /* Copy 1 byte. */ +- bf 31,0f +- +- lbz 6,0(12) +- stb 6,0(3) +-0: /* Return original DST pointer. */ +- ld 31,-8(1) +- ld 3,-16(1) ++ lwz 6,0(src) ++ lwz 7,4(src) ++ addi src,src,8 ++ stw 6,0(dst) ++ stw 7,4(dst) ++ addi dst,dst,8 ++4: /* Copy 4~7 bytes. */ ++ bf 29,L(tail2) ++ lwz 6,0(src) ++ stw 6,0(dst) ++ bf 30,L(tail5) ++ lhz 7,4(src) ++ sth 7,4(dst) ++ bflr 31 ++ lbz 8,6(src) ++ stb 8,6(dst) ++ /* Return original DST pointer. */ + blr + +-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) ++END_GEN_TB (memcpy,TB_TOCLESS) + libc_hidden_builtin_def (memcpy) +diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S +--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 ++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/mempcpy.S 2014-05-29 13:04:56.000000000 -0500 +@@ -367,13 +367,21 @@ + mr 11,12 + mtcrf 0x01,9 + cmpldi cr6,9,1 +- lvsl 5,0,12 ++#ifdef __LITTLE_ENDIAN__ ++ lvsr 5,0,12 ++#else ++ lvsl 5,0,12 ++#endif + lvx 3,0,12 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop . */ + lvx 4,12,6 +- vperm 6,3,4,5 ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + addi 11,12,16 + addi 10,3,16 + stvx 6,0,3 +@@ -393,11 +401,17 @@ + vector instructions though. */ + + lvx 4,11,6 /* vr4 = r11+16. */ +- vperm 6,3,4,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr6. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 6,4,3,5 ++#else ++ vperm 6,3,4,5 ++#endif + lvx 3,11,7 /* vr3 = r11+32. */ +- vperm 10,4,3,5 /* Merge the correctly-aligned portions +- of vr3/vr4 into vr10. */ ++#ifdef __LITTLE_ENDIAN__ ++ vperm 10,3,4,5 ++#else ++ vperm 10,4,3,5 ++#endif + addi 11,11,32 + stvx 6,0,10 + stvx 10,10,6 |